# Extracting Data From Multiple Properties on Rightmove

## Import Packages

In [35]:
# Web - Scraping and API Requests
import requests
from httpx import AsyncClient, Response
from parsel import Selector
import parsel
import jmespath
import asyncio
from urllib.parse import urlencode

# Data Manipulation and Analysis
import pandas as pd
from pprint import pprint 
import json
from typing import List
from typing import TypedDict

# Database Connection
from sqlalchemy import create_engine

# File and System Operations
import os
import sys

## Other Setup

In [36]:
# Never truncate columns when a DataFrame is rendered
pd.options.display.max_columns = None

In [37]:
# Load IPython's autoreload extension so the custom package can be reloaded without having to install it again
%load_ext autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
# Autoreload mode 1: before running code, reload only the modules registered with %aimport
%autoreload 1

# Put ../src/ first on the import path so the custom package is found there
sys.path.insert(0,'../src/')

# Import the custom package and sub-packages
%aimport rental_utils
%aimport rental_utils.functions
%aimport rental_utils.sql_queries


## Make Requests

### Set Up HTTP Headers to Avoid Being Blocked by the API

In [39]:
# Shared async HTTP client used by find_locations and scrape_search below.
# Browser-like headers are sent to avoid being blocked.
client = AsyncClient(headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept": "application/json",  # Ask for JSON responses from the API endpoints
    "Referer": "https://www.rightmove.co.uk/",  # Helps mimic browser use
})


### Define a function that takes a location string and looks up its location identifier(s)

In [40]:
async def find_locations(query: str) -> List[str]:
    """
    Resolve a free-text place name to Rightmove location identifiers.

    The query is upper-cased, cut into two-character chunks joined by "/"
    (the path layout the typeahead endpoint is called with), and requested
    through the module-level ``client``. Identifiers are returned in the
    order the API lists them.
    """
    upper_query = query.upper()
    # Two-character chunks, e.g. "LONDON" -> ["LO", "ND", "ON"]
    chunks = [upper_query[start:start + 2] for start in range(0, len(upper_query), 2)]
    # Drop any slash left at either end so the path has no empty outer segment
    path = "/".join(chunks).strip("/")
    endpoint = f"https://www.rightmove.co.uk/typeAhead/uknostreet/{path}/"

    reply = await client.get(endpoint)
    payload = json.loads(reply.text)
    return [entry["locationIdentifier"] for entry in payload["typeAheadLocations"]]

#### Find the location identifiers for London and Cornwall

In [41]:
# Keep the first identifier returned for each place name.
# NOTE(review): London is resolved with the packaged rental_utils.functions.find_locations,
# while Cornwall uses the find_locations defined in this notebook — confirm the mix is intentional.
london_id = (await rental_utils.functions.find_locations("london"))[0]
cornwall_id = (await find_locations("cornwall"))[0]


print(cornwall_id)
print(london_id)


REGION^61294
REGION^87490


### Define a Function that Makes the Request to Rightmove's Hidden API

This requires calling the hidden API endpoint (the base URL) and passing the search parameters in the query string.

In [None]:
async def scrape_search(location_id: str, total_results: int = 250) -> List[dict]:
    """
    Scrape rental property listings from Rightmove's search API for one location.

    The first page is fetched on its own; every further page is requested
    concurrently and its listings are appended in page (offset) order, so the
    server-side sort order is preserved in the returned list.

    Args:
        location_id: Rightmove location identifier, e.g. "REGION^87490".
        total_results: Upper bound on the result offsets that are requested.
            One extra page is requested for every multiple of the page size
            below this value (and below the API cap). The returned list is not
            truncated to this number and may be longer: the sample run in this
            notebook returned 275 entries from 11 pages.

    Returns:
        The raw entries of the "properties" key from every fetched page.

    Raises:
        httpx.HTTPStatusError: if any page responds with a non-2xx status.
    """
    RESULTS_PER_PAGE = 24
    # Offsets at or beyond this value are never requested (Rightmove's API result limit)
    MAX_API_RESULTS = 1000

    def make_url(offset: int) -> str:
        """Build the search URL whose page starts at result number `offset`."""
        params = {
            "areaSizeUnit": "sqm",  # the units for the size of each property
            "channel": "RENT",  # BUY or RENT - for my purposes, rent is the most relevant
            "currencyCode": "GBP",  # chosen currency
            "includeSSTC": "false",  # presumably excludes 'sold subject to contract' listings - TODO confirm
            "index": offset,  # the number of the search result/property displayed at the start of the page
            "isFetching": "false",
            "locationIdentifier": location_id,  # the location we wish to search for
            "numberOfPropertiesPerPage": RESULTS_PER_PAGE,
            "radius": "0.0",  # how far away we are allowed to be from the geographical boundaries of the region
            "sortType": "6",  # the sorting mechanism for search results
            "viewType": "LIST",  # how results appear
        }
        return "https://www.rightmove.co.uk/api/_search?" + urlencode(params)

    # Fetch the first page up front; fail loudly on a blocked or failed request
    # instead of surfacing it later as a JSON/KeyError.
    first_page = await client.get(make_url(0))
    first_page.raise_for_status()
    results = first_page.json()["properties"]

    # Schedule the remaining pages. The 'index' parameter is the starting
    # property for each page; capping the range keeps offsets below the API limit.
    pending_pages = []
    last_offset_bound = min(total_results, MAX_API_RESULTS)
    for offset in range(RESULTS_PER_PAGE, last_offset_bound, RESULTS_PER_PAGE):
        print(f"Scheduling request for offset: {offset}")
        pending_pages.append(client.get(make_url(offset)))

    # gather() runs the requests concurrently but returns responses in the
    # order they were scheduled, keeping the result order deterministic.
    for response in await asyncio.gather(*pending_pages):
        response.raise_for_status()
        results.extend(response.json()["properties"])

    # Report how many listings were collected across all pages
    print(f"Found {len(results)} properties")
    return results




### Scrape Multiple Pages of Results, Each with Multiple Properties

The results are then saved to a large JSON file, which is cleaned and analysed in separate notebooks.

In [47]:
london_results = await scrape_search(london_id)
with open("../data/rightmove_properties.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(london_results, indent=2))



Scheduling request for offset: 24
Scheduling request for offset: 48
Scheduling request for offset: 72
Scheduling request for offset: 96
Scheduling request for offset: 120
Scheduling request for offset: 144
Scheduling request for offset: 168
Scheduling request for offset: 192
Scheduling request for offset: 216
Scheduling request for offset: 240
Found 275 properties
