# Collecting Data From Rightmove Listings

Before extracting the target data, I use this notebook to practice using a hidden API on the QS University Rankings website, thereby gauging its functionality.

## Import Packages

In [603]:
# Standard library: async, serialisation, file and system operations, typing
import asyncio
import json
import os
import sys
from pprint import pprint
from typing import List
from typing import Optional
from typing import TypedDict

# Web - Scraping and API Requests
import jmespath
import parsel
import requests
from httpx import AsyncClient, Response
from parsel import Selector

# Data Manipulation and Analysis
import pandas as pd

# Database Connection
from sqlalchemy import create_engine

In [604]:
pd.set_option('display.max_columns', None) # Display all columns in any given DataFrame

In [605]:
# This allows one to reload the custom package without having to install it again
%load_ext autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [606]:
# this allows one to reload the custom package without having to install it again
%autoreload 1

sys.path.insert(0,'../src/')

# https://stackoverflow.com/questions/70898150/jupyter-autoreload-workflow/73623267#73623267
# My custom package
%aimport rental_utils

In [607]:
# Load SQL Magic for Jupyter Notebooks
%load_ext sql
%config SqlMagic.displaylimit = None
%config SqlMagic.autocommit=True # for engines that do not support autocommit

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


## Set Up An Initial Request

### Establish Headers

In [608]:
# Shared async HTTP client: every request it sends carries browser-like headers
# so the site is less likely to reject the scraper
client = AsyncClient(headers={"User-Agent": "Mozilla/5.0", "Accept": "text/html"})

### Define a function that parses rightmove property data to only get the relevant fields

In [609]:
class PropertyResult(TypedDict):
    """Schema of one parsed property record, as returned by ``parse_property``.

    Each key below matches a key of the field-name -> JMESPath map inside
    ``parse_property``; the value is whatever that expression selects from the
    raw property payload.
    NOTE(review): a value is presumably None when its path is absent from the
    payload (e.g. ``price_sqft`` is empty in the sample output) -- confirm.
    """
    # --- listing identity and status ---
    id: str
    available: bool  # read from status.published
    archived: bool  # read from status.archived
    phone: str  # read from contactInfo.telephoneNumbers.localNumber
    # --- basic property facts ---
    bedrooms: int
    bathrooms: int
    type: str  # read from transactionType (e.g. "RENT" in the sample output)
    property_type: str  # read from propertySubType
    tags: list
    # --- listing text ---
    description: str  # read from text.description
    title: str  # read from text.pageTitle
    subtitle: str  # read from text.propertyPhrase
    # --- pricing (kept as display strings, e.g. "£4,400 pcm") ---
    price: str  # read from prices.primaryPrice
    price_sqft: str  # read from prices.pricePerSqFt
    # --- location ---
    address: dict
    latitude: float
    longitude: float
    # --- details and media ---
    features: list  # read from keyFeatures
    history: dict  # read from listingHistory
    photos: list  # list of {url, caption} dicts built from images
    floorplans: list  # list of {url, caption} dicts
    # --- agent information ---
    agency: dict  # subset of the customer object (id, branch, company, address, ...)
    industryAffiliations: list  # affiliation names only
    # --- surroundings and size ---
    nearest_airports: list  # list of {name, distance} dicts
    nearest_stations: list  # list of {name, distance} dicts
    sizings: list  # list of {unit, min, max} dicts
    brochures: list


def parse_property(data) -> PropertyResult:
    """Reduce a raw Rightmove property payload to the PropertyResult fields.

    Every output key is looked up in ``data`` with its own JMESPath
    expression via ``jmespath.search``; the lookups are independent of
    each other.
    """
    # output field name -> JMESPath expression evaluated against `data`
    field_paths = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId, 
            branch: branchName, 
            company: companyName, 
            address: displayAddress, 
            commercial: commercial, 
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    return {
        field: jmespath.search(expression, data)
        for field, expression in field_paths.items()
    }

### Define a Function that Will Find JSON objects in text, and generate decoded JSON data

In [610]:
def find_json_objects(text: str, decoder=json.JSONDecoder()):
    """Find JSON objects in text, and generate decoded JSON data.

    Scans ``text`` left to right. At every ``{`` it tries to decode a JSON
    value starting there; on success the decoded value is yielded and the
    scan resumes right after it, on failure the scan resumes one character
    further on.

    Args:
        text: arbitrary text that may contain embedded JSON objects.
        decoder: the ``json.JSONDecoder`` used for decoding.

    Yields:
        Each decoded JSON object, in order of appearance.
    """
    pos = 0
    while True:
        start = text.find("{", pos)
        if start == -1:
            return
        try:
            # Decode in place from `start` instead of slicing `text[start:]`,
            # which would copy the remaining text on every attempt.
            # With an explicit index, `end` is an absolute offset into `text`.
            result, end = decoder.raw_decode(text, start)
        except ValueError:
            # Not valid JSON at this brace: retry from the next character
            pos = start + 1
            continue
        # Yield outside the try so a ValueError raised into the generator
        # by the consumer is not mistaken for a decode failure.
        yield result
        pos = end

### Define a function that will find the PAGE_MODEL javascript variable and extract it 

In [611]:
# This function will find the PAGE_MODEL javascript variable and extract it 
def extract_property(response: Response) -> Optional[dict]:
    """extract property data from rightmove PAGE_MODEL javascript variable

    Args:
        response: HTTP response whose ``text`` is the HTML of a property page.

    Returns:
        The ``propertyData`` object of the page's PAGE_MODEL, or ``None``
        (after printing a message) when the page has no PAGE_MODEL script or
        that script holds no usable ``propertyData``.
    """
    selector = Selector(response.text)
    script = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not script:
        print(f"page {response.url} is not a property listing page")
        return None
    # Only the first JSON object in the script is needed, so stop decoding
    # there instead of materialising every object found in the script.
    page_model = next(find_json_objects(script), None)
    if not isinstance(page_model, dict) or "propertyData" not in page_model:
        # Treat a malformed PAGE_MODEL like a non-listing page rather than
        # failing with IndexError / KeyError.
        print(f"page {response.url} has no propertyData in its PAGE_MODEL")
        return None
    return page_model["propertyData"]

### Define the primary scraping function that takes urls and returns the data

In [612]:
async def scrape_properties(
    urls: List[str],
    output_path: str = "../data/rightmove_properties.json",
) -> List[dict]:
    """
    Scrape Rightmove property listings from a list of URLs,
    parse relevant fields, and save all results to a single JSON file.

    Args:
        urls: property page URLs to fetch with the shared ``client``.
        output_path: file the parsed records are written to as one JSON
            array; its parent directory is created when missing.

    Returns:
        The parsed property dictionaries, in the order the responses
        completed (not necessarily the order of ``urls``). Pages from which
        ``extract_property`` returns ``None`` are skipped.
    """
    # Prepare asynchronous GET requests for all URLs using the shared client
    pending_requests = [client.get(url) for url in urls]

    # List to store parsed property data
    properties = []

    # Asynchronously process each response as it completes
    for next_completed in asyncio.as_completed(pending_requests):
        response = await next_completed

        property_data = extract_property(response)
        if property_data is None:
            # Not a listing page: skip it instead of storing a record
            # parsed from None.
            continue

        properties.append(parse_property(property_data))

    # Make sure the target directory exists so the scraped data is not lost
    # to a missing-folder error after all requests have finished.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save all parsed properties as a single JSON array to disk
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(properties, f, indent=2)

    # Return the list of parsed property dictionaries
    return properties




### Define a running script

In [613]:
async def run():
    """Scrape two sample Rightmove listings and print the parsed records as indented JSON."""
    sample_urls = [
        "https://www.rightmove.co.uk/properties/163907069#/",
        "https://www.rightmove.co.uk/properties/163907291#/",
    ]
    scraped = await scrape_properties(sample_urls)

    print(json.dumps(scraped, indent=2))

In [614]:
# Run the sample scrape, then close the shared HTTP client.
# NOTE(review): once `client` is closed it presumably cannot send further
# requests, so re-running this cell likely requires re-running the cell that
# creates `client` first -- confirm.
await run()
await client.aclose()

[
  {
    "id": "163907069",
    "available": true,
    "archived": false,
    "phone": "020 3872 4834",
    "bedrooms": 2,
    "bathrooms": 1,
    "type": "RENT",
    "property_type": "Flat",
    "tags": [],
    "description": "A charming split level two bedroom apartment to rent with access to a private roof terrace in the centre of East Dulwich.<br /><br />\rLocated on this great road just off the vibrant Lordship Lane is this charming split level two bedroom apartment to rent. With beautiful views over London and a large decked private roof terrace. Features include a spacious reception room and separate fully fitted kitchen. Available end of July. <br /><br />Additional Information<br /><br />Holding Deposit: \u00a3553.84 (1 week)*<br />Tenancy Deposit: \u00a32769.23 (5 weeks)*<br />Council Tax Band: D<br /><br />*The deposit amounts are approximate and will vary depending on the final rent agreed.<br /><br />",
    "title": "2 bedroom flat for rent in Bassano Street, London, SE22

In [593]:
# Reload the records from the JSON file that scrape_properties writes,
# so the steps below work from the file on disk
with open("../data/rightmove_properties.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [601]:
display(data[0])

{'id': '163907291',
 'available': True,
 'archived': False,
 'phone': '020 3835 3553',
 'bedrooms': 2,
 'bathrooms': 2,
 'type': 'RENT',
 'property_type': 'Apartment',
 'tags': [],
 'description': 'Located on the 6th floor, is this gorgeous two bedroom, two bathroom corner unit offering jaw dropping views of Thames City & The Shard. Spread over 807 Sq.Ft this lateral apartment sits on the best plot position of the building, with an over sized living room that is bathed in light and further heightened with access to a private balcony makes for the perfect in-out flow. <br /><br />Apartment Features:<br />- Corner apartment <br />- Direct view of The Shard<br />- Designed by London-based duo Benningen & Lloyd<br />- Rare marble, crafted tiles with mood lit bathrooms<br />- Wine cooler <br />- Underfloor heating and comfort cooling<br /><br />Resident Exclusive Amenities:<br />- Access to the infamous Sky Pool <br />- Orangery rooftop <br />- 24/7 concierge <br />- State of the art gym <b

### Normalise the JSON so that it turns into a dataframe

In [595]:
# max_level=0: only the top-level keys become columns; nested dicts and lists
# (address, history, nearest_stations, ...) stay as single cell values
data_norm = pd.json_normalize(data, max_level = 0)
data_norm.head()

Unnamed: 0,id,available,archived,phone,bedrooms,bathrooms,type,property_type,tags,description,title,subtitle,price,price_sqft,address,latitude,longitude,features,history,photos,floorplans,agency,industryAffiliations,nearest_airports,nearest_stations,sizings,brochures
0,163907291,True,False,020 3835 3553,2,2,RENT,Apartment,[],"Located on the 6th floor, is this gorgeous two...","2 bedroom apartment for rent in The Modern, Em...",2 bedroom apartment,"£4,400 pcm",,"{'displayAddress': 'The Modern, Embassy Garden...",51.482636,-0.129653,"[Available Now, Virtual tour available, Fully ...",{'listingUpdateReason': 'Added yesterday'},[{'url': 'https://media.rightmove.co.uk/265k/2...,[{'url': 'https://media.rightmove.co.uk/265k/2...,"{'id': 264404, 'branch': 'London', 'company': ...",[],[],"[{'name': 'Nine Elms Station', 'distance': 0.1...","[{'unit': 'ha', 'min': 0.01, 'max': 0.01}, {'u...",[]
1,163907069,True,False,020 3872 4834,2,1,RENT,Flat,[],A charming split level two bedroom apartment t...,"2 bedroom flat for rent in Bassano Street, Lon...",2 bedroom flat,"£2,400 pcm",,"{'displayAddress': 'Bassano Street, London, SE...",51.456039,-0.075196,"[-\tTwo double bedrooms, - Spaciou...",{'listingUpdateReason': 'Added yesterday'},[{'url': 'https://media.rightmove.co.uk/58k/57...,[{'url': 'https://media.rightmove.co.uk/58k/57...,"{'id': 57994, 'branch': 'East Dulwich', 'compa...",[Property Redress Scheme],[],"[{'name': 'East Dulwich Station', 'distance': ...",[],[]


### Filter out only the desired columns

In [596]:
def filter_df(df=None):
    """Build a DataFrame from parsed property records and keep the columns of interest.

    Args:
        df: list of property dicts (as produced by ``parse_property``).
            When omitted, the notebook-level ``data`` is used; it is looked
            up at call time, so the function can be defined before ``data``
            exists and always sees its current value.

    Returns:
        DataFrame with one row per record and only the base columns; nested
        dict/list values are kept as single cells (``max_level=0``).

    Raises:
        KeyError: if any base column is missing from the records.
    """
    if df is None:
        df = data
    results_df = pd.json_normalize(df, max_level=0)
    # regex=False makes '.' a literal dot on every pandas version; with the
    # regex default of older releases it would match (and replace) any character.
    results_df.columns = (
        results_df.columns
        .str.lower()
        .str.replace('.', '_', regex=False)
        .str.replace(' ', '_', regex=False)
    )
    base_cols = [
        'id',
        'available',
        'archived',
        'bedrooms',
        'bathrooms',
        'property_type',
        'description',
        'title',
        'subtitle',
        'price',
        'price_sqft',
        'latitude',
        'longitude',
        'nearest_stations',
        'sizings',
        'history',
        'address',
        'features'
    ]
    return results_df[base_cols]


filtered_df = filter_df(data)
filtered_df.head()

Unnamed: 0,id,available,archived,bedrooms,bathrooms,property_type,description,title,subtitle,price,price_sqft,latitude,longitude,nearest_stations,sizings,history,address,features
0,163907291,True,False,2,2,Apartment,"Located on the 6th floor, is this gorgeous two...","2 bedroom apartment for rent in The Modern, Em...",2 bedroom apartment,"£4,400 pcm",,51.482636,-0.129653,"[{'name': 'Nine Elms Station', 'distance': 0.1...","[{'unit': 'ha', 'min': 0.01, 'max': 0.01}, {'u...",{'listingUpdateReason': 'Added yesterday'},"{'displayAddress': 'The Modern, Embassy Garden...","[Available Now, Virtual tour available, Fully ..."
1,163907069,True,False,2,1,Flat,A charming split level two bedroom apartment t...,"2 bedroom flat for rent in Bassano Street, Lon...",2 bedroom flat,"£2,400 pcm",,51.456039,-0.075196,"[{'name': 'East Dulwich Station', 'distance': ...",[],{'listingUpdateReason': 'Added yesterday'},"{'displayAddress': 'Bassano Street, London, SE...","[-\tTwo double bedrooms, - Spaciou..."


### Explode the columns that are lists


In [597]:
def flatten_df(filtered_df):
    """Flatten the nested columns of a filtered listings frame into scalar columns.

    The input is not modified. In the result:
      * ``nearest_stations`` becomes ``station_{1,2,3}_name`` / ``station_{1,2,3}_dist``
        (first three stations, padded with None);
      * ``sizings`` becomes ``size_min_sqm`` / ``size_max_sqm`` (first entry whose
        unit is ``'sqm'``, otherwise None);
      * ``history`` is expanded into lower-cased columns prefixed with ``history_``;
      * ``address`` is expanded into lower-cased columns without a prefix.
    The four original nested columns are dropped.

    Args:
        filtered_df: DataFrame with ``nearest_stations``, ``sizings``,
            ``history`` and ``address`` columns.

    Returns:
        A new DataFrame with the same rows and index as ``filtered_df``.
    """
    df = filtered_df.copy()
    max_stations = 3

    def extract_stations(stations, key):
        """Return `key` of the first `max_stations` stations, padded with None."""
        if not isinstance(stations, list):
            return [None] * max_stations
        values = [s.get(key) if isinstance(s, dict) else None for s in stations[:max_stations]]
        return values + [None] * (max_stations - len(values))

    def extract_sqm(sizings):
        """Return [min, max] of the first sizing measured in 'sqm', else [None, None]."""
        if isinstance(sizings, list):
            for sizing in sizings:
                if isinstance(sizing, dict) and sizing.get('unit') == 'sqm':
                    return [sizing.get('min'), sizing.get('max')]
        return [None, None]

    def expand_column(column, extractor, names):
        """Turn one list-valued column into several scalar columns, keeping df's index."""
        # Building the frame once (instead of a per-row pd.Series via apply)
        # also works when df has no rows.
        rows = [extractor(value) for value in df[column]]
        return pd.DataFrame(rows, index=df.index, columns=names)

    def normalize_dicts(column, prefix=''):
        """Flatten one dict-valued column into lower-cased columns, keeping df's index."""
        # Missing / non-dict cells become empty dicts so they yield an all-NaN
        # row instead of breaking the normalisation.
        records = [value if isinstance(value, dict) else {} for value in df[column]]
        flat = pd.json_normalize(records)
        # json_normalize numbers rows 0..n-1; restore df's index so the final
        # concat lines rows up even when df's index is not a default range.
        flat.index = df.index
        flat.columns = [f'{prefix}{str(col).lower()}' for col in flat.columns]
        return flat

    station_names = expand_column(
        'nearest_stations',
        lambda stations: extract_stations(stations, 'name'),
        ['station_1_name', 'station_2_name', 'station_3_name'],
    )
    station_dists = expand_column(
        'nearest_stations',
        lambda stations: extract_stations(stations, 'distance'),
        ['station_1_dist', 'station_2_dist', 'station_3_dist'],
    )
    sizes = expand_column('sizings', extract_sqm, ['size_min_sqm', 'size_max_sqm'])

    history_df = normalize_dicts('history', prefix='history_')
    # Address columns are only lower-cased (no prefix), e.g. 'displayaddress'
    address_df = normalize_dicts('address')

    # Concatenate the new columns and drop the originals
    return pd.concat(
        [
            df.drop(columns=['nearest_stations', 'sizings', 'history', 'address']),
            station_names,
            station_dists,
            sizes,
            history_df,
            address_df,
        ],
        axis=1,
    )




flattened_df = flatten_df(filtered_df)
flattened_df.head()

Unnamed: 0,id,available,archived,bedrooms,bathrooms,property_type,description,title,subtitle,price,price_sqft,latitude,longitude,features,station_1_name,station_2_name,station_3_name,station_1_dist,station_2_dist,station_3_dist,size_min_sqm,size_max_sqm,history_listingupdatereason,displayaddress,countrycode,deliverypointid,ukcountry,outcode,incode
0,163907291,True,False,2,2,Apartment,"Located on the 6th floor, is this gorgeous two...","2 bedroom apartment for rent in The Modern, Em...",2 bedroom apartment,"£4,400 pcm",,51.482636,-0.129653,"[Available Now, Virtual tour available, Fully ...",Nine Elms Station,Vauxhall Station,Pimlico Station,0.196516,0.347126,0.485662,75.0,75.0,Added yesterday,"The Modern, Embassy Gardens, SW11",GB,107171035.0,England,SW11,7AY
1,163907069,True,False,2,1,Flat,A charming split level two bedroom apartment t...,"2 bedroom flat for rent in Bassano Street, Lon...",2 bedroom flat,"£2,400 pcm",,51.456039,-0.075196,"[-\tTwo double bedrooms, - Spaciou...",East Dulwich Station,North Dulwich Station,Denmark Hill Station,0.444656,0.600147,1.048489,,,Added yesterday,"Bassano Street, London, SE22",GB,,England,SE22,8RY


### Select only the most essential columns

In [602]:
# Keep just the headline attributes of each listing
core_df = flattened_df[
    [
        'id', 'bedrooms', 'bathrooms', 'price',
        'latitude', 'longitude',
        'station_1_name', 'station_1_dist',
        'size_min_sqm', 'size_max_sqm',
        'displayaddress',
    ]
]

In [599]:
def clean_price(df):
    """Return a copy of ``df`` whose ``price`` column is numeric.

    Price strings such as ``'£4,400 pcm'`` are converted to floats
    (``4400.0``); values without any number become NaN. A ``price`` column
    that is already numeric is returned unchanged, so the function can be
    applied to its own output.

    Args:
        df: DataFrame with a ``price`` column.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    # Make a copy to avoid modifying the original DataFrame
    df = df.copy()
    if pd.api.types.is_numeric_dtype(df['price']):
        # Already cleaned: the .str accessor would fail on numeric data
        return df
    df['price'] = (
        df['price']
        # Drop thousands separators so the number is one contiguous match
        .str.replace(',', '', regex=False)
        # First number in the text, keeping a decimal part if present; the
        # currency sign and any suffix are ignored by the pattern.
        # expand=False returns a Series rather than a one-column DataFrame.
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
        .astype(float)
    )
    return df

In [600]:
# Display the cleaned frame. clean_price works on a copy and the result is not
# assigned here, so core_df itself still holds the raw price strings.
clean_price(core_df)

Unnamed: 0,id,bedrooms,bathrooms,price,latitude,longitude,station_1_name,station_1_dist,size_min_sqm,size_max_sqm,displayaddress
0,163907291,2,2,4400.0,51.482636,-0.129653,Nine Elms Station,0.196516,75.0,75.0,"The Modern, Embassy Gardens, SW11"
1,163907069,2,1,2400.0,51.456039,-0.075196,East Dulwich Station,0.444656,,,"Bassano Street, London, SE22"
