In [45]:
import os
import urllib
from requests.utils import quote
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from urllib.parse import urlparse
import copy 
import time
import json
import pandas as pd
import re

In [2]:
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC

In [3]:
from retrying import retry

## Set-up

We start by setting up Chromedriver.


In [4]:
options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": os.getcwd(),
    "executable_path": r"C:\Users\Dom_W\Documents\Development\Selenium\chromedriver.exe",
}
options.add_experimental_option("prefs", prefs)

# I have created special user profile for scraping it won't open existing one.
options.add_argument(
    "user-data-dir=C:\\Users\\Dom_W\\AppData\\Local\\Google\\Chrome\\User Data\\Profile 6"
)  # Path to your chrome profile

Boot up Chrome

In [5]:
driver = webdriver.Chrome(chrome_options=options)
# You might need to update ChromeDriver here.

## Functions for crawling

In [15]:
def extract_href_from_list_of_elements(element_list):
    if isinstance(element_list, list):
        return [element.get_attribute("href") for element in element_list]
    return element_list.get_attribute("href")

In [7]:
def extract_text_from_list_of_elements(element_list):
    if isinstance(element_list, list):
        return [element.text for element in element_list]
    return element_list.text

We need to do two things. We need to crawl and to scape. We can separate those two things out. 

We're going to use Selenium to get around any crawl protection as it will just prompt me and I can fill it in.

In [8]:
def do_we_end_pagination(driver,ending_class):
    # Check if we've reached the end. If we have then time to bail.
    try:
        end_button = driver.find_element_by_css_selector(ending_class)
        return True
    except NoSuchElementException as e:
        return False

In [9]:
def extract_properties(driver, output_list):
    # All property elements
    all_properties = driver.find_elements_by_css_selector(
        ".propertyCard-details .propertyCard-link"
    )

    # Get links
    all_property_hrefs = extract_href_from_list_of_elements(all_properties)

    # Dump into the output list.
    output_list.extend(all_property_hrefs)

In [10]:
def crawl_and_extract(output_list):
    # Make sure the page is loaded.

    # Page loads with ajax on subsequent clicks so we need to wait for loading.

    # If it presents a captcha solve in the 20 seconds you've got.
    for x in range(40):
        WebDriverWait(driver, 20).until_not(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, ".propertyCard-title"), "Loading Property"
            )
        )
    extract_properties(driver, output_list)
    

##  Run the crawl.

In [79]:
starting_search = "https://www.rightmove.co.uk/property-to-rent/find.html?propertyTypes=flat&keywords=&includeLetAgreed=false&dontShow=houseShare%2Cretirement&channel=RENT&index=0&mustHave=garden&primaryDisplayPropertyType=flats&retirement=false&houseFlatShare=false&maxBedrooms=2&sortType=6&minPrice=1000&viewType=LIST&maxPrice=1750&radius=0.0&propFeature=Garden&locationIdentifier=REGION%5E87490"

In [80]:
driver.get(starting_search)

In [81]:
property_output_list = []
more_pages = True

In [82]:
counter = 1
while more_pages:
    print("Page {}.".format(counter))
    
    # We wrap this in generic error handling because selenium just loves to fail
    # for weird reasons.
    print("Extracting page.")
    crawl_and_extract(property_output_list)    

    # Then got to the next page.
    driver.find_element_by_css_selector(".pagination-direction--next").click()

    # A quick sleep to look slightly less suspicious
    time.sleep(2)
    
    counter+= 1
    
    # Time to stop?
    if do_we_end_pagination(driver, "button.pagination-direction--next[disabled]"):
        extract_properties(driver, property_output_list)
        break   

Page 1.
Extracting page.
Page 2.
Extracting page.
Page 3.
Extracting page.
Page 4.
Extracting page.
Page 5.
Extracting page.
Page 6.
Extracting page.
Page 7.
Extracting page.
Page 8.
Extracting page.
Page 9.
Extracting page.
Page 10.
Extracting page.
Page 11.
Extracting page.
Page 12.
Extracting page.
Page 13.
Extracting page.
Page 14.
Extracting page.
Page 15.
Extracting page.
Page 16.
Extracting page.
Page 17.
Extracting page.
Page 18.
Extracting page.
Page 19.
Extracting page.
Page 20.
Extracting page.
Page 21.
Extracting page.
Page 22.
Extracting page.
Page 23.
Extracting page.
Page 24.
Extracting page.
Page 25.
Extracting page.
Page 26.
Extracting page.
Page 27.
Extracting page.
Page 28.
Extracting page.
Page 29.
Extracting page.
Page 30.
Extracting page.
Page 31.
Extracting page.
Page 32.
Extracting page.
Page 33.
Extracting page.
Page 34.
Extracting page.
Page 35.
Extracting page.
Page 36.
Extracting page.
Page 37.
Extracting page.
Page 38.
Extracting page.
Page 39.
Extracting p

In [84]:
len(property_output_list)

1100

## Functions for indexing 

In [85]:
def try_to_extract_list(driver, css_selector):
    try:
        output = extract_text_from_list_of_elements(
            driver.find_elements_by_css_selector(css_selector)
        )
    except NoSuchElementException as e:
        print(e)
        output = []
    
    return output

In [86]:
def try_to_extract_str(driver, css_selector):
    try:
        output = driver.find_element_by_css_selector(css_selector).text
    except NoSuchElementException as e:
        print(e)
        output = []
    
    return output

In [87]:
def extract_data(url):
    # No ajax cleverness here. Just all the reqs. The webdriver inbuilt wait
    # is pretty great.
    driver.get(url)

    # We'll put our wait to be less suspicious and be patient here.
    time.sleep(2)

    # Now we get all the stuff we want:
    price = try_to_extract_str(driver, current_rm_css_class["price"])
    address = try_to_extract_str(driver, current_rm_css_class["address"])
    
    # No way to filter with CSS so we do it with python.
    all_property_aspects = try_to_extract_list(driver, current_rm_css_class["bedrooms"])
    bedrooms = next(filter(lambda i: re.search("bedroom", i, re.IGNORECASE), all_property_aspects))
    
    # Key features is often missing.
    key_features = try_to_extract_list(driver, current_rm_css_class["key_features"])
    property_description = try_to_extract_str(driver, current_rm_css_class["property_description"])
    
    # Stitch together stations for easier parsing
    nearby_stations = try_to_extract_list(driver, current_rm_css_class["nearby_stations_name"])
    nearby_stations_distance = try_to_extract_list(driver, current_rm_css_class["nearby_stations_distance"])
    
    nearby_station_zip = list(zip(nearby_stations, nearby_stations_distance))


    return {
        "url":rm_property,
        "price": price,
        "address": address,
        "bedrooms": bedrooms,
        "key_features": key_features,
        "property_description": property_description,
        "nearby_stations": nearby_station_zip,
        "nearest_station": nearby_station_zip[0]
    }

In [88]:
current_rm_css_class = {
    "price": "._1gfnqJ3Vtd1z40MlC0MzXu",
    "address": 'h1[itemprop="streetAddress"]',
    "bedrooms": "._1u12RxIYGx3c84eaGxI6_b",
    "key_features": ".lIhZ24u1NHMa5Y6gDH90A",
    "property_description": ".OD0O7FWw1TjbTD4sdRi1_",
    "nearby_stations_name": ".cGDiWU3FlTjqSs-F1LwK4",
    "nearby_stations_distance": "._1ZY603T1ryTT3dMgGkM7Lg"
}

## Actually running

Run and save.

In [89]:
output_property_data_scrape = []

In [None]:
for rm_property in property_output_list:
    try:
        print("Scraping: {}".format(rm_property))
        properties = extract_data(rm_property)
        output_property_data_scrape.append(properties)
    except Exception as e:
        print(e)

Backup for when Windows decides to randomly update.

In [93]:
with open('property_details_2.json', 'w') as outfile:
    json.dump(output_property_data_scrape, outfile)

In [94]:
df = pd.DataFrame(output_property_data_scrape)

Brief bit of post processing to find our gardens.

In [113]:
df['key_features_str'] = df['key_features'].apply(lambda x: ",".join(x))

In [138]:
df['has_garden_private'] = (df['key_features_str'].str.contains(r"(shared|private) [Gg]arden\b") | df['property_description'].str.contains(r"(shared|private) [Gg]arden\b"))

  return func(self, *args, **kwargs)


About 40% don't actually have gardens.

In [139]:
df['has_garden_private'].value_counts()

False    851
True     126
Name: has_garden_private, dtype: int64

In [116]:
df.to_csv("output_property_details_flats_1.csv")

In [140]:
df.to_clipboard()