# Exploring Rightmove data

> Aim: Query and explore the Rightmove data.

## Load modules

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [14]:
# go to https://www.rightmove.co.uk/house-prices.html
# enter your specified search
# go to page 2
# copy the URL up to *Number= and replace the URL below

url = "https://www.rightmove.co.uk/house-prices/w3-9jj.html?radius=0.5&soldIn=5&tenure=FREEHOLD&pageNumber="

def get_sold_property_info_postcode(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    n_pages = soup.find_all("div", class_="dsrm_dropdown_section")[0].find_all("span")[1]
    n_pages = int(n_pages.text.replace("of ", ""))

    print(f"There's {n_pages} of results.")

    time.sleep(3)

    properties_info_pages = []

    for i in range(n_pages):

        print(f"scraping page {i}")
        response = requests.get(url + str(i + 1) , headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        properties_info_pages.append(get_sold_property_info(soup))
        
        time.sleep(3)
    
    properties_info_pages = pd.concat(properties_info_pages)

    return properties_info_pages

def get_sold_property_info(soup):
    property_cards = soup.find_all("a", attrs={"data-testid": "propertyCard"})

    properties_info = []
    
    for property_card in property_cards:
        address = property_card.find("h2").text

        property_type = property_card.find_all("div", attrs={"aria-label": re.compile(r"property type:", re.IGNORECASE)})
        if property_type:
            property_type = property_type[0].text.replace("Property Type: ", "")
        else:
            property_type = ""

        bedrooms = property_card.find_all("div", attrs={"aria-label": re.compile(r"bedrooms", re.IGNORECASE)})
        if bedrooms:
            bedrooms = int(bedrooms[0].text.replace("Bedrooms: ", ""))
        else:
            bedrooms = pd.NA

        dates, prices = extract_dates_prices(property_card)

        property_info = pd.DataFrame({
            "property_type": property_type,
            "address": address,
            "date": dates,
            "price": prices,
            "bedrooms": bedrooms,
        })

        properties_info.append(property_info)
    
    properties_info = pd.concat(properties_info).reset_index(drop=True)

    return properties_info

def extract_dates_prices(property_card):
    prices_dates = property_card.find_all("td")[2:]

    dates = []
    prices = []

    for i, p in enumerate(prices_dates):
        # Reached the end of available dates.
        if p.text == "":
            break

        price = p.text.replace("\x00", "")
        
        if i % 2 == 0:
            dates.append(price)
        else:
            assert price[0] == "£"
            prices.append(int(price[1:].replace(",", "")))
    
    return dates, prices

In [15]:
properties_info_pages = get_sold_property_info_postcode(url)

There's 13 of results.
scraping page 0
scraping page 1
scraping page 2
scraping page 3
scraping page 4
scraping page 5
scraping page 6
scraping page 7
scraping page 8
scraping page 9
scraping page 10
scraping page 11
scraping page 12


In [16]:
properties_info_pages.reset_index(drop=True).to_csv("../data/02-explore_right_move/w3_9jj_0.5_mile_flat_sold.csv", index=False)