# Collecting Data From Rightmove Listings

Before extracting the target data, I use this notebook to practice using a hidden API on the QS University Rankings website, as a way of gauging how it works.

## Import Packages

In [1]:
# Web - Scraping and API Requests
import requests
from httpx import AsyncClient, Response
from parsel import Selector
import parsel
import jmespath
import asyncio

# Data Manipulation and Analysis
import pandas as pd
from pprint import pprint 
import json
from typing import List
from typing import TypedDict

# Database Connection
from sqlalchemy import create_engine

# File and System Operations
import os
import sys

In [2]:
pd.options.display.max_columns = None  # no column limit: show every column of a displayed DataFrame

## Set Up An Initial Request

### Establish Headers

In [3]:
# Shared async HTTP client. The browser-like headers make requests look like
# ordinary page loads, to avoid being blocked.
_browser_headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "text/html",
}
client = AsyncClient(headers=_browser_headers)

### Define a function that parses Rightmove property data to keep only the relevant fields

In [4]:
class PropertyResult(TypedDict):
    """Shape of one parsed Rightmove listing, as built by ``parse_property``.

    Every key here matches a key of the ``parse_map`` in ``parse_property``,
    which pairs it with a JMESPath expression evaluated against the raw
    listing data. The annotations describe the expected value types;
    presumably a field whose path is absent from the data comes back as
    ``None`` instead -- TODO confirm against real listings.
    """
    id: str  # listing id ("id")
    available: bool  # "status.published"
    archived: bool  # "status.archived"
    phone: str  # "contactInfo.telephoneNumbers.localNumber"
    bedrooms: int
    bathrooms: int
    type: str  # "transactionType"
    property_type: str  # "propertySubType"
    tags: list
    description: str  # "text.description"
    title: str  # "text.pageTitle"
    subtitle: str  # "text.propertyPhrase"
    price: str  # "prices.primaryPrice"
    price_sqft: str  # "prices.pricePerSqFt"
    address: dict
    latitude: float  # "location.latitude"
    longitude: float  # "location.longitude"
    features: list  # "keyFeatures"
    history: dict  # "listingHistory"
    photos: list  # one {url, caption} dict per entry of "images"
    floorplans: list  # one {url, caption} dict per entry of "floorplans"
    # projection of "customer" with keys: id, branch, company, address,
    # commercial, buildToRent, isNew
    agency: dict
    industryAffiliations: list  # the "name" of each industry affiliation
    nearest_airports: list  # one {name, distance} dict per entry of "nearestAirports"
    nearest_stations: list  # one {name, distance} dict per entry of "nearestStations"
    sizings: list  # one {unit, min, max} dict per entry of "sizings"
    brochures: list


def parse_property(data) -> PropertyResult:
    """Reduce raw Rightmove property data to the fields of ``PropertyResult``.

    Each output key is paired with a JMESPath expression; the expression is
    searched against ``data`` and whatever the search returns is stored under
    that key.
    """
    # output field name -> JMESPath expression evaluated against the raw data
    field_paths = {
        "id": "id",
        "available": "status.published",
        "archived": "status.archived",
        "phone": "contactInfo.telephoneNumbers.localNumber",
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "type": "transactionType",
        "property_type": "propertySubType",
        "tags": "tags",
        "description": "text.description",
        "title": "text.pageTitle",
        "subtitle": "text.propertyPhrase",
        "price": "prices.primaryPrice",
        "price_sqft": "prices.pricePerSqFt",
        "address": "address",
        "latitude": "location.latitude",
        "longitude": "location.longitude",
        "features": "keyFeatures",
        "history": "listingHistory",
        "photos": "images[*].{url: url, caption: caption}",
        "floorplans": "floorplans[*].{url: url, caption: caption}",
        "agency": """customer.{
            id: branchId, 
            branch: branchName, 
            company: companyName, 
            address: displayAddress, 
            commercial: commercial, 
            buildToRent: buildToRent,
            isNew: isNewHomeDeveloper
        }""",
        "industryAffiliations": "industryAffiliations[*].name",
        "nearest_airports": "nearestAirports[*].{name: name, distance: distance}",
        "nearest_stations": "nearestStations[*].{name: name, distance: distance}",
        "sizings": "sizings[*].{unit: unit, min: minimumSize, max: maximumSize}",
        "brochures": "brochures",
    }
    return {
        field: jmespath.search(expression, data)
        for field, expression in field_paths.items()
    }

### Define a Function that Will Find JSON objects in text, and generate decoded JSON data

In [5]:
def find_json_objects(text: str, decoder=json.JSONDecoder()):
    """Find JSON objects in text, and generate decoded JSON data.

    Scans ``text`` left to right. At every ``{`` it tries to decode a JSON
    value starting there: on success the decoded object is yielded and the
    scan resumes right after it; on failure the scan resumes one character
    further on.

    Args:
        text: Arbitrary text that may contain embedded JSON objects.
        decoder: Decoder used for parsing. The default instance is created
            once and shared across calls.

    Yields:
        Each decoded JSON object, in order of appearance in ``text``.
    """
    pos = 0
    while True:
        start = text.find("{", pos)
        if start == -1:
            break
        try:
            # Decode in place from `start`. Slicing (text[start:]) would copy
            # the remainder of the text at every "{", which is quadratic on
            # large scripts. With an explicit index, `end` is an absolute
            # position in `text`.
            result, end = decoder.raw_decode(text, start)
        except ValueError:
            # Not valid JSON at this brace; retry from the next character.
            pos = start + 1
        else:
            yield result
            pos = end

### Define a function that will find the PAGE_MODEL JavaScript variable and extract it

In [6]:
def extract_property(response: Response) -> dict:
    """Extract property data from the Rightmove PAGE_MODEL JavaScript variable.

    Locates the ``<script>`` element whose text contains ``PAGE_MODEL = `` and
    decodes the first JSON object found in it.

    Args:
        response: HTTP response for a property page.

    Returns:
        The ``propertyData`` entry of the decoded object, or ``None`` (after
        printing a notice) when the page has no PAGE_MODEL script or the
        script contains no decodable JSON object.

    Raises:
        KeyError: If the decoded object has no ``propertyData`` key.
    """
    selector = Selector(response.text)
    script_text = selector.xpath("//script[contains(.,'PAGE_MODEL = ')]/text()").get()
    if not script_text:
        print(f"page {response.url} is not a property listing page")
        return None
    # Only the first object is needed, so stop decoding as soon as it is
    # found instead of materialising every JSON object in the script.
    page_model = next(find_json_objects(script_text), None)
    if page_model is None:
        print(f"page {response.url} has no decodable PAGE_MODEL data")
        return None
    return page_model["propertyData"]

### Define the primary scraping function that takes urls and returns the data

In [7]:
async def scrape_properties(
    urls: List[str],
    output_path: str = "../../data/rightmove_select.json",
) -> List[dict]:
    """Scrape Rightmove property listings and save the parsed results as JSON.

    All URLs are requested concurrently through the shared ``client``. Each
    response is handled as soon as it completes, so the results are in
    completion order, not necessarily the order of ``urls``.

    Args:
        urls: Property page URLs to fetch.
        output_path: File that receives all parsed properties as a single
            JSON array. Defaults to the location this notebook has always
            written to.

    Returns:
        One parsed property dict per URL that turned out to be a listing
        page. Pages for which ``extract_property`` returns ``None`` are
        skipped, so they do not produce a record whose fields are all
        ``None``.
    """
    # One GET coroutine per URL, all issued through the shared client
    pending = [client.get(url) for url in urls]

    properties = []
    for next_done in asyncio.as_completed(pending):
        response = await next_done

        property_data = extract_property(response)
        if property_data is None:
            # Not a listing page; extract_property has already printed a
            # notice, so just leave it out of the results.
            continue

        properties.append(parse_property(property_data))

    # Save all parsed properties as a single JSON array to disk
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(properties, f, indent=2)

    return properties




### Define a running script
This saves out the data

In [8]:
async def run():
    """Scrape two sample Rightmove listings and print the parsed data as JSON."""
    sample_urls = [
        "https://www.rightmove.co.uk/properties/163907069#/",
        "https://www.rightmove.co.uk/properties/163907291#/",
    ]
    scraped = await scrape_properties(sample_urls)

    print(json.dumps(scraped, indent=2))

In [9]:
await run()
await client.aclose()

[
  {
    "id": "163907069",
    "available": true,
    "archived": false,
    "phone": "020 3872 4834",
    "bedrooms": 2,
    "bathrooms": 1,
    "type": "RENT",
    "property_type": "Flat",
    "tags": [],
    "description": "A charming split level two bedroom apartment to rent with access to a private roof terrace in the centre of East Dulwich.<br /><br />\rLocated on this great road just off the vibrant Lordship Lane is this charming split level two bedroom apartment to rent. With beautiful views over London and a large decked private roof terrace. Features include a spacious reception room and separate fully fitted kitchen. Available end of July. <br /><br />Additional Information<br /><br />Holding Deposit: \u00a3553.84 (1 week)*<br />Tenancy Deposit: \u00a32769.23 (5 weeks)*<br />Council Tax Band: D<br /><br />*The deposit amounts are approximate and will vary depending on the final rent agreed.<br /><br />",
    "title": "2 bedroom flat for rent in Bassano Street, London, SE22