This webscrapper uses selenium to access the main Irish property website, Daft.ie, to extract the daily list of properties whith the search criteria. The list contains a number of data points that would be usefull for a machine learning model aimed at predicting property prices. The data is then exported to a local folder.

In [2]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait 
import pandas as pd
import numpy as np
from random import random

In [3]:
#the website and search parameters
url = 'https://www.daft.ie/property-for-sale/galway-city?numBeds_from=2&salePrice_to=200000'

#intiate the selenium webdriver
driver = webdriver.Chrome()
driver.get(url)

#navigate through the intial pop up windows
driver.switch_to.active_element

button = driver.find_elements_by_class_name( "cc-modal__btn.cc-modal__btn--daft")

button[1].click()

driver.switch_to.active_element

driver.find_element_by_class_name('styles__CloseContainer-qea560-4.LGmOf').click()

driver.switch_to.active_element

<selenium.webdriver.remote.webelement.WebElement (session="741caeaf5cc46f79fc91684a79ce6351", element="9c0ea9e3-43a1-4c3a-b308-143f7fe9c818")>

In [4]:
# find the lowest level granualrity class that contains all of the data required on the search results page
adds = driver.find_elements_by_class_name('Card__Content-x1sjdn-9.iEbIAZ')

#find the url associated with each search result, to be used to navigate to the details page and extract further information.
urlclass = driver.find_elements_by_css_selector('.SearchPage__Result-gg133s-2.itNYNv [href]')

In [5]:
#data poitns from the search page
price = []
address =[]
beds =[]
baths = []
size = []
htype = []
urls = []

Next is the collection of the high level data points by interating through the web objects captured in adds
Unfortuately there was no consistent class or xpath for the address field, however the line seperations where consistent when the overall class was extracted to text which allowed me to use the index to navigate to the correct data for each list.

In [6]:
for i in adds:
    text = i.text
    dlist = text.splitlines()
    price.append(dlist[0])
    address.append(dlist[1])
    beds.append(dlist[2])
    baths.append(dlist[3])
    if 'm²' not in dlist[4]:
        size.append(None)
        htype.append(dlist[4])
    else:
        size.append(dlist[4])
        htype.append(dlist[5])

In [7]:
#iterate through the web objects in urlclass and extract the url string
for i in urlclass:
    urltext = i.get_attribute('href')
    urls.append(urltext)

In [8]:
#create a data frame and do a little initial data cleaning.
df = pd.DataFrame( {'price' : price , 'address' : address , 'beds' : beds , 'baths': baths , 'size' : size , 'htype' : htype , 'urls' : urls} )  
#remove test adds  
df = df[( df['address'].str.contains('Testing ') == False)]
#format price
df['price'] = df['price'].str.extract('([0-9]+,[0-9]+)', expand=True)
df['price'] = df['price'].str.replace(',' , '').astype('int64')
#format beds
df['beds'] = df['beds'].str.replace('Bed' , '').astype('int64')
#format baths
df['baths'] = df['baths'].str.replace('Bath' , '').astype('int64')
#format size
df['size'] = df['size'].str.replace('m²' , '' ).astype('float')

In [9]:
#data points from the individual property pages.
pdesc = []
pprop = []
pgps = []

In [11]:
# using the urls reset the driver to each one individually and extract the description , properties and GPS coordinates.
dfurls = df['urls']

for i in dfurls:
    driver.get(i)
    
    try:
        desc= driver.find_element_by_class_name('PropertyPage__StandardParagraph-sc-14jmnho-8.kDFIyQ').text
        pdesc.append(desc)
    except:
        pdesc.append(None)
        
    try:
        prop= driver.find_element_by_class_name('PropertyDetailsList__PropertyDetailsListContainer-sc-1cjwtjz-0.bnzQrB').text
        pprop.append(prop)
    except:
        pprop.append(None)
        
    try:
        gps = driver.find_element_by_css_selector('.NewButton__ButtonContainer-yem86a-4.dFKaNf.button-container [href]')
        gpsurl = gps.get_attribute('href')
        pgps.append(gpsurl)
    except:
        pgps.append(None)

In [12]:
#add to the dataframe
df['desc'] = pdesc
df['pprop'] = pprop
df['pgps'] = pgps

# format gps
df['pgps'] = df['pgps'].str.extract('(loc:.*)' )
df['pgps'] = df['pgps'].str.replace('loc:' , '')
df['pgps'] = df['pgps'].str.replace('\+-' , ' ')
df['lat'] = df['pgps'].str.extract('(^.*)\s').astype('float')
df['long'] = df['pgps'].str.extract('\s(.*)').astype('float')


In [14]:
df.head()

Unnamed: 0,price,address,beds,baths,size,htype,urls,desc,pprop,pgps,lat,long
0,200000,"Apartment 21, Té­ Luaghaidh, Gleann Na Ré­, Re...",2,1,64.0,Apartment,https://www.daft.ie/for-sale/apartment-apartme...,Ocean Property Management are excited to offer...,Fully managed property with Ocean Property Man...,53.273146 9.001582,53.273146,9.001582
1,180000,"9 Cluain Ard, Ballybrit, Co. Galway",3,2,,Semi-D,https://www.daft.ie/for-sale/semi-detached-hou...,Semi detached three bedroom house. \nRefurbish...,,53.287977 9.001229,53.287977,9.001229
2,190000,"26 Henry Street, Co. Galway",2,2,65.0,Terrace,https://www.daft.ie/for-sale/terraced-house-26...,This City centre terraced property is certainl...,ATTENTION INVESTORS & BUILDERS.\nTwo bedroom t...,53.270772 9.059481,53.270772,9.059481
3,195000,"236 Gleann Na Ri, Murrough, Renmore, Co. Galway",2,2,73.0,Apartment,https://www.daft.ie/for-sale/apartment-236-gle...,O’Donnellan & Joyce Auctioneers are pleased to...,Spacious apartment residence\nOverlooking larg...,53.272133 9.004803,53.272133,9.004803
4,195000,"307 Tirellan Heights, Headford Road, Co. Galway",3,1,87.0,Semi-D,https://www.daft.ie/for-sale/semi-detached-hou...,FOR SALE BY PRIVATE TREATY\n\nO'Donnellan & Jo...,,53.291675 9.044638,53.291675,9.044638
