In [10]:
import requests # for making standard html requests
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup # magical tool for parsing html data
import json # for parsing data
import pandas as pd
import csv
import random
import time
import glob

In [3]:
# Build list of user agents to avoid being blocked.
user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
               "Mozilla/5.0"]
containers = []

# Pull 10 pages of listings for list of suburbs in Cork city area.
for location in ['douglas','ballincollig','carrigaline','bishopstown','glasheen','blackrock','rochestown',
                 'lehenaghmore','ballinlough','ballintemple','wilton','model-farm-road']:
    for offset in range(0,60,20): # 20 listings per page
        user_agent = random.sample(user_agents,1)
        url = "http://www.daft.ie/cork-city/houses-for-sale/" + location + "/?offset=" + str(offset)
        req = Request(url , headers={'User-Agent': str(user_agent)})
        webpage = urlopen(req).read()
        page_soup = soup(webpage, "html.parser")
        containers += page_soup.findAll("div","AdCard__adCardContainer AdCard__adCardContainer--premium")
        containers += page_soup.findAll("div","AdCard__adCardContainer AdCard__adCardContainer--standard")
        time.sleep(5) # Wait 5 seconds between hitting site
        print(location,offset, len(containers))

douglas 0 18
douglas 20 38
douglas 40 58
ballincollig 0 74
ballincollig 20 92
ballincollig 40 92
carrigaline 0 110
carrigaline 20 130
carrigaline 40 138
bishopstown 0 157
bishopstown 20 162
bishopstown 40 162
glasheen 0 177
glasheen 20 177
glasheen 40 177
blackrock 0 196
blackrock 20 201
blackrock 40 201
rochestown 0 221
rochestown 20 223
rochestown 40 223
lehenaghmore 0 230
lehenaghmore 20 230
lehenaghmore 40 230
ballinlough 0 238
ballinlough 20 238
ballinlough 40 238
ballintemple 0 244
ballintemple 20 244
ballintemple 40 244
wilton 0 250
wilton 20 250
wilton 40 250
model-farm-road 0 262
model-farm-road 20 262
model-farm-road 40 262


In [40]:
# For each kind of listing, create a list of containers.
# containers = page_soup.findAll("div","AdCard__adCardContainer AdCard__adCardContainer--premium")
# containers += page_soup.findAll("div","AdCard__adCardContainer AdCard__adCardContainer--standard")

# Check all previous scrapes. If there is a new listing in this run then pull additional details from listing page
df = pd.concat([pd.read_csv(f) for f in glob.glob('cork_property_prices*.csv')], ignore_index = True)
df = df.drop_duplicates(subset='listing_id', keep='last') # Dedup on listing_id and keep first record.
listings = set(df['listing_id'])

# Write data to an array for output to csv later
rows = []

for idx, container in enumerate(containers):
    listing = {}
    # Check if the listing is "valid", ads will not have a price section.
    if container.find('strong', "PropertyInformationCommonStyles__costAmountCopy") is None:
        print("Ad Detected!")
    else: 
        listing['listing_id'] = container.find('a',"PropertyInformationCommonStyles__addressCopy--link")["href"].split("-")[-1][:-1]
        listing['url'] = container.find('a',"PropertyInformationCommonStyles__addressCopy--link")["href"]
        listing['location'] = container.find('a',"PropertyInformationCommonStyles__propertyPrice--link")["href"].split("/")[3]
        listing['address'] = container.find('a',"PropertyInformationCommonStyles__addressCopy--link").getText().replace(",","|")
        listing['beds'] = container.find('div', "QuickPropertyDetails__iconCopy").getText()
        listing['bathrooms'] = container.find('div', "QuickPropertyDetails__iconCopy--WithBorder").getText()
        listing['cost'] = container.find('strong', "PropertyInformationCommonStyles__costAmountCopy").getText().replace("€","")
        listing['num_pics'] = container.find('span', "PropertyImage__picturesAmountCopy").getText()
        listing['property_type'] = container.find('div', "QuickPropertyDetails__propertyType").getText().replace("\n","").strip()
        try:
            listing['ber'] = container.find("img",{"class":"PropertyImage__berImage"}).get_attribute_list("src")[0][-6:-4]
        except:
            listing['ber'] = 'NA'
        rows.append(list(listing.values())) # Add listing to rows list for output.
        
        if int(listing['listing_id']) not in listings:
            eircode = page_soup.find('div', "PropertyMainInformation__eircode").getText().split(":")[1].strip()
            propertyOverviewDetails = page_soup.find('div', "PropertyOverview__propertyOverviewDetails").getText()
            if "Overall Floor Area" in propertyOverviewDetails:
                floor_area = propertyOverviewDetails.split("Overall Floor Area:")[1].strip().split(" ")[0]
        else:
            # Old listing
            listing['eircode'] = ""
            listing['floor_area'] = 0 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Ad Detected!
New Listing
Ad Detected!
Ad Detected!
New Listing
Ad Detected!
Ad Detected!
New Listing
Ad Detected!
Ad Detected!
New Listing


In [41]:
len(rows)

255

In [52]:
# Check all previous scrapes. If there is a new listing in this run then pull additional details from the page of that listing
df = pd.concat([pd.read_csv(f) for f in glob.glob('cork_property_prices*.csv')], ignore_index = True)
df = df.drop_duplicates(subset='listing_id', keep='last') # Dedup on listing_id and keep first record.
listings = set(df['listing_id'])

for row in rows: 
    if int(row[0]) not in listings:
        url = "http://www.daft.ie/" + row[1]
        print(url)
        req = Request(url , headers={'User-Agent': str(user_agent)})
        webpage = urlopen(req).read()
        page_soup = soup(webpage, "html.parser")
        eircode = page_soup.find('div', "PropertyMainInformation__eircode").getText().split(":")[1].strip()
        propertyOverviewDetails = page_soup.find('div', "PropertyOverview__propertyOverviewDetails").getText()
        if "Overall Floor Area" in propertyOverviewDetails:
            floor_area = propertyOverviewDetails.split("Overall Floor Area:")[1].strip().split(" ")[0]
            print(floor_area)
        time.sleep(5)
        floor = float(floor_area)
# Get eircode <div class="PropertyMainInformation__eircode">
# Get floor area <span class="PropertyOverview__floorArea">Overall Floor Area:</span>


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


http://www.daft.ie//cork/houses-for-sale/douglas/1-well-road-homes-churchyard-lane-douglas-cork-2600677/
74.3
http://www.daft.ie//cork/houses-for-sale/ballincollig/46-tuairin-glas-greenfield-ballincollig-cork-2600311/
http://www.daft.ie//cork/houses-for-sale/bishopstown/maryville-48-rossa-avenue-bishopstown-cork-2419284/
158.18
http://www.daft.ie//cork/houses-for-sale/cork-city/54-wilton-road-cork-city-cork-2544464/
109


In [5]:

# Output to flat file for modeling 
#filename = "cork_property_prices.csv"
filename = "cork_property_prices_" + str(round(time.time())) + ".csv"

# field names  
fields = ['listing_id', 'url', 'location', 'address', 'beds', 'bathrooms', 'cost', 'num_pics', 'property_type', 'ber'] 

# writing to csv file  
with open(filename, 'w') as csvfile:  
    # creating a csv writer object  
    csvwriter = csv.writer(csvfile)  
        
    # writing the fields  
    csvwriter.writerow(fields)  
        
    # writing the data rows  
    csvwriter.writerows(rows) 