## Daft Scraping

### Get individual rental ad  URLs
Pagination is done using the 'offset' property in the search results URL, so we can use that browse through the results pages.  
Daft displays 20 results per page, hence the value of '20' in this line:
       `for offset in range(0, number_of_adds, 20):`
    

In [1]:
from bs4 import BeautifulSoup
import urllib

daftresults_urlroot = 'http://www.daft.ie/dublin/apartments-for-rent/?s%5Bignored_agents%5D%5B0%5D=5732&s%5Bignored_agents%5D%5B1%5D=428&s%5Bignored_agents%5D%5B2%5D=1551&s%5Bsort_by%5D=date&s%5Bsort_type%5D=d&offset='
allAdUrls = []
number_of_adds = 100

def getAdUrls(pageresults):
    
    adURLs = []
    for result in pageresults:
        adURLs.append("http://www.daft.ie" + result.a["href"])
    return adURLs

for offset in range(0, number_of_adds, 20):
    results_html = urllib.request.urlopen(daftresults_urlroot + str(offset)).read()
    soup = BeautifulSoup(results_html, "html5lib")
    results = soup.find_all("div", class_="search_result_title_box")
    allAdUrls = allAdUrls + getAdUrls(results)

print(allAdUrls)
print('Current number of Dublin rental ads: ' + str(len(allAdUrls)))
number_of_adds = len(allAdUrls)


['http://www.daft.ie/dublin/apartments-for-rent/tallaght/57-kiltipper-gate-kiltipper-way-tallaght-dublin-1772738/', 'http://www.daft.ie/dublin/apartments-for-rent/arbour-hill/7-temple-house-temple-st-west-arbour-hill-dublin-1772723/', 'http://www.daft.ie/dublin/apartments-for-rent/harolds-cross/2-manor-villas-harolds-cross-dublin-1772774/', 'http://www.daft.ie/dublin/apartments-for-rent/stillorgan/70-patrician-villas-stillorgan-dublin-1772739/', 'http://www.daft.ie/dublin/apartments-for-rent/lucan/larkfield-square-lucan-lucan-dublin-1772718/', 'http://www.daft.ie/dublin/apartments-for-rent/rathcoole/21-eaton-tce-rathcoole-rathcoole-dublin-1772719/', 'http://www.daft.ie/dublin/apartments-for-rent/dublin-1/kirkpatrick-housespencer-dock-dublin-1-dublin-1772717/', 'http://www.daft.ie/dublin/apartments-for-rent/dublin-7/blackhall-court-blackhall-green-dublin-7-dublin-1772709/', 'http://www.daft.ie/dublin/apartments-for-rent/booterstown/booterstown-wood-booterstown-avenue-booterstown-dublin-

### Download individual Ad pages

This section loops through the list of individual rental ad URLs, and downloads them into a 'daftpages' directory. 

#### Skip if you download and extract the zipped/tared archive instead

In [11]:
# Rental Ad URLs are now in this array: allAdUrls
# Loop through and download

for idx,adUrl in enumerate(allAdUrls):
    filename = adUrl.split('/')[-2]
    urllib.request.urlretrieve(adUrl, 'daftpages/'+ filename + '.html')

### Zip up pages

In [18]:
import tarfile
import datetime
import os

current_date = datetime.datetime.now().isoformat()
tar_file_name = 'daftpages_' + current_date + '.tar'
source_dir = 'daftpages/'
with tarfile.open(tar_file_name, "w:gz") as tar:
    tar.add(source_dir, arcname=os.path.basename(source_dir))

### Start scraping

In [20]:
import pandas as pd
import re
import json
import csv


num_of_rows = number_of_adds
data_csv = 'data/scraped_data.csv'
all_orig_field_names = [
    'property_id',
    'property_category',
    'property_title',
    'property_type',
    'seller_name',
    'seller_id',
    'seller_type',
    'open_viewing',
    'no_of_photos',
    'available_from',
    'lease_units',
    'available_for',    
    'area',
    'county',
    'latitude',
    'longitude',    
    'furnished',
    'bathrooms',   
    'beds',   
    'facility',    
    'environment',
    'published_date',
    'page_name',
    'platform',
    'currency',
    'price_frequency',
    'price'
]
all_facilities=[
    'Parking', 
    'Cable Television', 
    'Dryer', 
    'Garden / Patio / Balcony', 
    'Washing Machine', 
    'Serviced Property', 
    'Pets Allowed', 
    'Wheelchair Access', 
    'Central Heating', 
    'Microwave', 
    'Smoking', 
    'Dishwasher', 
    'House Alarm', 
    'Internet'
]

with open(data_csv, 'w') as csvfile:
    for idx,daft_filename in enumerate(os.listdir('daftpages/')):
        try:
            adpage_html = open('daftpages/' + daft_filename).read()
            soup = BeautifulSoup(adpage_html, "html5lib")
        except:
            # seems like some pages have encoding issues?
            print('issue reading in page daftpages/' + daft_filename + '. Skipping this Ad.')
            continue

        #print(soup)
        # There is a handy javascrupt json dictionary on those daft pages, listing key features of the add
        # To get this data, find all script tags, then get the contents of the 10ths tag found (seems to be the 10th.
        # Now, this seems to be a bit brittle, need to find a way to target this better than just hope it'll always be 
        # the 10th script tag on the page; But maybe for now it's enough)
        scriptdata = soup.find_all('script', type='text/javascript')    
        trackingparams = scriptdata[10].get_text()
        trackingparams = trackingparams.replace('\u20ac','')

        try:
            feature_str = "{" + str(re.search('\\{(.+?)\\}', trackingparams).group(1)) + "}"
        except AttributeError:
            feature_str = "{}"
    
        ad_data = json.loads(feature_str)  
        
        field_names = ad_data.keys()
        
        facilities = ad_data['facility'].split(',')
        facilties_dict = dict.fromkeys(all_facilities)
        for facility in facilities:
            if facility in all_facilities:
                facilties_dict[facility] = True
 

        # check for missing fields (mostly seller_id and seller_name), and add them with empty vals if required
        missing_fiels = set(all_orig_field_names) - set(field_names)
        for missing in missing_fiels:
            ad_data[missing] = ""
        
        ad_data.update(facilties_dict)
        
        all_field_names = all_orig_field_names + all_facilities
        
        writer = csv.DictWriter(csvfile, fieldnames=all_field_names)
        if idx == 0: 
            writer.writeheader()
        writer.writerow(ad_data)
        


issue reading in page daftpages/daft_ad_108.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_167.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_170.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_173.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_180.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_181.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_182.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_25.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_257.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_351.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_398.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_503.html. Skipping this Ad.
issue reading in page daftpages/the-plaza-swords-dublin-1772641.html. Skipping this Ad.


### Create Pandas Dataframe from CSV

In [19]:


data_csv = 'data/scraped_data.csv'
df = pd.read_csv(data_csv)

drop some not very useful columns
df = df.drop('environment', 1)
df = df.drop('page_name', 1)
df = df.drop('platform', 1)
df = df.drop('property_category', 1)
df.head()

Unnamed: 0,1772670,rental,"1 BED, Tallaght Cross West, Tallaght, Dublin 24",apartment,IRES,9871,agent,no,3,2017-10-01,...,True.1,Unnamed: 32,Unnamed: 33,Unnamed: 34,True.2,True.3,Unnamed: 37,True.4,Unnamed: 39,Unnamed: 40
0,1772585,rental,"139 Grand Central, Sandyford, Dublin 18",apartment,IRES,9871.0,agent,no,6,2017-12-05,...,,,,,,,,,,
1,1772257,rental,"17 Kearns Court, Kilmainham, Dublin 8",apartment,,,private,no,8,2017-10-01,...,True,True,,,True,True,,,,True
2,1772612,rental,"199 Tyrconnell Road, Inchicore, Dublin 8",apartment,,,private,no,7,2017-09-15,...,True,,,,True,True,,True,,
3,1772467,rental,"2 BED, Tallaght Cross West, Tallaght, Dublin 24",apartment,IRES,9871.0,agent,no,3,2017-09-12,...,True,,,,True,True,,,,
4,1772774,rental,"2 manor villas, Harold's Cross, Dublin 6",apartment,,,private,no,7,2017-09-12,...,True,,,,True,True,True,,,True


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622 entries, 0 to 621
Data columns (total 37 columns):
property_id                 622 non-null int64
property_title              622 non-null object
property_type               622 non-null object
seller_name                 472 non-null object
seller_id                   472 non-null float64
seller_type                 622 non-null object
open_viewing                622 non-null object
no_of_photos                622 non-null int64
available_from              622 non-null object
lease_units                 622 non-null object
available_for               622 non-null int64
area                        622 non-null object
county                      622 non-null object
latitude                    622 non-null float64
longitude                   622 non-null float64
furnished                   622 non-null object
bathrooms                   622 non-null int64
beds                        622 non-null int64
facility                    581 n

In [9]:
df.set_index('property_id').index.get_duplicates()

[1765333, 1767907, 1770435, 1771252]