## Daft Scraping

### Get individual rental ad  URLs
Pagination is done using the 'offset' property in the search results URL, so we can use that browse through the results pages.  
Daft displays 20 results per page, hence the value of '20' in this line:
       `for offset in range(0, number_of_adds, 20):`
    

In [1]:
from bs4 import BeautifulSoup
import urllib

daftresults_urlroot = 'http://www.daft.ie/dublin/apartments-for-rent/?s%5Bignored_agents%5D%5B0%5D=5732&s%5Bignored_agents%5D%5B1%5D=428&s%5Bignored_agents%5D%5B2%5D=1551&s%5Bsort_by%5D=date&s%5Bsort_type%5D=d&offset='
allAdUrls = []
number_of_adds = 300

def getAdUrls(pageresults):
    
    adURLs = []
    for result in pageresults:
        adURLs.append("http://www.daft.ie" + result.a["href"])
    return adURLs

for offset in range(0, number_of_adds, 20):
    results_html = urllib.request.urlopen(daftresults_urlroot + str(offset)).read()
    soup = BeautifulSoup(results_html, "html5lib")
    results = soup.find_all("div", class_="search_result_title_box")
    allAdUrls = allAdUrls + getAdUrls(results)

#print(allAdUrls)
print('Current number of Dublin rental ads: ' + str(len(allAdUrls)))
number_of_adds = len(allAdUrls)


Current number of Dublin rental ads: 300


### Download individual Ad pages

This section loops through the list of individual rental ad URLs, and downloads them into a 'daftpages' directory. 

#### Skip if you download and extract the zipped/tared archive instead

In [2]:
# Rental Ad URLs are now in this array: allAdUrls
# Loop through and download

for idx,adUrl in enumerate(allAdUrls):
    filename = adUrl.split('/')[-2]
    urllib.request.urlretrieve(adUrl, 'daftpages/'+ filename + '.html')

### Zip up pages

In [3]:
import tarfile
import datetime
import os

current_date = datetime.datetime.now().isoformat()
tar_file_name = 'daftpages_' + current_date + '.tar'
source_dir = 'daftpages/'
with tarfile.open(tar_file_name, "w:gz") as tar:
    tar.add(source_dir, arcname=os.path.basename(source_dir))

### Start scraping

In [4]:
import pandas as pd
import re
import json
import csv


num_of_rows = number_of_adds
data_csv = 'data/scraped_data.csv'
all_orig_field_names = [
    'property_id',
    'property_category',
    'property_title',
    'property_type',
    'seller_name',
    'seller_id',
    'seller_type',
    'open_viewing',
    'no_of_photos',
    'available_from',
    'lease_units',
    'available_for',    
    'area',
    'county',
    'latitude',
    'longitude',    
    'furnished',
    'bathrooms',   
    'beds',   
    'facility',    
    'environment',
    'published_date',
    'page_name',
    'platform',
    'currency',
    'price_frequency',
    'price'
]
all_facilities=[
    'Parking', 
    'Cable Television', 
    'Dryer', 
    'Garden / Patio / Balcony', 
    'Washing Machine', 
    'Serviced Property', 
    'Pets Allowed', 
    'Wheelchair Access', 
    'Central Heating', 
    'Microwave', 
    'Smoking', 
    'Dishwasher', 
    'House Alarm', 
    'Internet'
]

with open(data_csv, 'w') as csvfile:
    for idx,daft_filename in enumerate(os.listdir('daftpages/')):
        try:
            adpage_html = open('daftpages/' + daft_filename).read()
            soup = BeautifulSoup(adpage_html, "html5lib")
        except:
            # seems like some pages have encoding issues?
            print('issue reading in page daftpages/' + daft_filename + '. Skipping this Ad.')
            continue

        #print(soup)
        # There is a handy javascrupt json dictionary on those daft pages, listing key features of the add
        # To get this data, find all script tags, then get the contents of the 10ths tag found (seems to be the 10th.
        # Now, this seems to be a bit brittle, need to find a way to target this better than just hope it'll always be 
        # the 10th script tag on the page; But maybe for now it's enough)
        scriptdata = soup.find_all('script', type='text/javascript')    
        trackingparams = scriptdata[10].get_text()
        trackingparams = trackingparams.replace('\u20ac','')

        try:
            feature_str = "{" + str(re.search('\\{(.+?)\\}', trackingparams).group(1)) + "}"
        except AttributeError:
            feature_str = "{}"
    
        ad_data = json.loads(feature_str)  
        
        field_names = ad_data.keys()
        
        facilities = ad_data['facility'].split(',')
        facilties_dict = dict.fromkeys(all_facilities)
        for facility in facilities:
            if facility in all_facilities:
                facilties_dict[facility] = True
 

        # check for missing fields (mostly seller_id and seller_name), and add them with empty vals if required
        missing_fiels = set(all_orig_field_names) - set(field_names)
        for missing in missing_fiels:
            ad_data[missing] = ""
        
        ad_data.update(facilties_dict)
        
        all_field_names = all_orig_field_names + all_facilities
        
        writer = csv.DictWriter(csvfile, fieldnames=all_field_names)
        if idx == 0: 
            writer.writeheader()
        writer.writerow(ad_data)
        


issue reading in page daftpages/26-marlborough-street-dublin-1-dublin-1-dublin-1774504.html. Skipping this Ad.
issue reading in page daftpages/cathedral-court-dublin-2-dublin-1771875.html. Skipping this Ad.
issue reading in page daftpages/city-gate-st-augustines-st-dublin-8-dublin-1771830.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_108.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_167.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_170.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_173.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_180.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_181.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_182.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_25.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_257.html. Skipping this Ad.
issue reading in page daftpages/daft_ad_351.html. Skipping this Ad.
issue r

### Create Pandas Dataframe from CSV

In [5]:


data_csv = 'data/scraped_data.csv'
df = pd.read_csv(data_csv)

#drop some not very useful columns
df = df.drop('environment', 1)
df = df.drop('page_name', 1)
df = df.drop('platform', 1)
df = df.drop('property_category', 1)
df.head()

Unnamed: 0,property_id,property_title,property_type,seller_name,seller_id,seller_type,open_viewing,no_of_photos,available_from,lease_units,...,Washing Machine,Serviced Property,Pets Allowed,Wheelchair Access,Central Heating,Microwave,Smoking,Dishwasher,House Alarm,Internet
0,1772670,"1 BED, Tallaght Cross West, Tallaght, Dublin 24",apartment,IRES,9871.0,agent,no,3,2017-10-01,months,...,True,,,,True,True,,True,,
1,1773305,"1 Palace Street, Dublin 2, Dublin 2",apartment,Herbert Property Services,7549.0,agent,no,6,2017-09-15,months,...,True,,,,True,True,,,,True
2,1773577,"10 Clarinda House, Clarinda Park West, Dun Lao...",apartment,,,private,no,8,2017-09-18,months,...,True,,,,True,True,,,,True
3,1772865,"109 Geraldstown Wood, Santry, Dublin 9",apartment,KELLY BRADSHAW DALTON,11.0,agent,yes,6,2017-09-13,months,...,True,,,,True,True,,True,,
4,1771138,"11 saunders house, spencer dock, Dublin 1, Dub...",apartment,,,private,no,10,2017-09-16,months,...,True,,,,,,,,,
