# Clean the Property Data Gathered

## Import Packages

In [48]:
# Web - Scraping and API Requests
import requests
from httpx import AsyncClient, Response
from parsel import Selector
import parsel
import jmespath
import asyncio

# Data Manipulation and Analysis
import pandas as pd
from pprint import pprint 
import json
from typing import List
from typing import TypedDict

# Database Connection
from sqlalchemy import create_engine

# File and System Operations
import os
import sys

In [49]:
# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

In [50]:
# Load IPython's autoreload extension so that edits to the custom package are picked up
# without reinstalling it or restarting the kernel.
# (Keep comments on their own line: a line magic receives the rest of its line as its argument.)
%load_ext autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
# Mode 1: before each cell runs, reload only the modules registered with %aimport below
%autoreload 1

# Put the project's src/ directory first on the import path so the custom package resolves.
# The path is relative, so it depends on the directory the kernel was started in.
sys.path.insert(0,'../src/')

# https://stackoverflow.com/questions/70898150/jupyter-autoreload-workflow/73623267#73623267
# My custom package: import it and mark it for automatic reloading
%aimport rental_utils

In [52]:
# Load SQL Magic for Jupyter Notebooks
%load_ext sql
# Do not cap the number of rows shown for a query result
%config SqlMagic.displaylimit = None
%config SqlMagic.autocommit=True # for engines that do not support autocommit

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


## Clean the data for selected URLs

In [53]:
# Load the gathered Rightmove property records from disk.
# The encoding is given explicitly so the read does not depend on the platform default.
with open("../data/rightmove_properties.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [54]:
# Peek at the first record to see the raw, nested structure before normalising it
display(data[0])

{'id': 163628153,
 'bedrooms': 0,
 'bathrooms': 1,
 'numberOfImages': 10,
 'numberOfFloorplans': 0,
 'numberOfVirtualTours': 0,
 'summary': 'A quirky studio apartment situated on the raised ground floor of a stucco fronted period house. The propery benefits from having high ceilings throughout, open plan kitchen, mezzanine sleeping area and a three piece shower room. Available immediately offered furnished. Rent includes all utilities...',
 'displayAddress': 'Belsize Park, London, NW3',
 'countryCode': 'GB',
 'location': {'latitude': 51.548427, 'longitude': -0.171794},
 'propertyImages': {'images': [{'url': '107k/106225/163628153/106225_BEP210149_L_IMG_00_0000.jpeg',
    'caption': 'Picture No. 01',
    'srcUrl': 'https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/107k/106225/163628153/106225_BEP210149_L_IMG_00_0000_max_476x317.jpeg'},
   {'url': '107k/106225/163628153/106225_BEP210149_L_IMG_01_0000.jpeg',
    'caption': 'Picture No. 10',
    'srcUrl': 'https://media.rightmove.co.uk:

### Normalise the JSON so that it turns into a DataFrame

In [55]:
# Flatten only one level of nesting (e.g. location -> location.latitude / location.longitude);
# deeper structures such as propertyImages.images stay as list/dict objects in a single column.
data_norm = pd.json_normalize(data, max_level = 1)
# Show a single row to inspect the resulting column names
data_norm.head(1)

Unnamed: 0,id,bedrooms,bathrooms,numberOfImages,numberOfFloorplans,numberOfVirtualTours,summary,displayAddress,countryCode,propertySubType,premiumListing,featuredProperty,distance,transactionType,commercial,development,residential,students,auction,feesApply,feesApplyText,displaySize,showOnMap,propertyUrl,contactUrl,staticMapUrl,channel,firstVisibleDate,keywords,keywordMatchType,saved,hidden,onlineViewingsAvailable,hasBrandPlus,displayStatus,enquiredTimestamp,enquiryAddedTimestamp,enquiryCalledTimestamp,heading,isRecent,enhancedListing,addedOrReduced,formattedBranchName,formattedDistance,propertyTypeFullDescription,location.latitude,location.longitude,propertyImages.images,propertyImages.mainImageSrc,propertyImages.mainMapImageSrc,listingUpdate.listingUpdateReason,listingUpdate.listingUpdateDate,price.amount,price.frequency,price.currencyCode,price.displayPrices,customer.branchId,customer.brandPlusLogoURI,customer.contactTelephone,customer.branchDisplayName,customer.branchName,customer.brandTradingName,customer.branchLandingPageUrl,customer.development,customer.showReducedProperties,customer.commercial,customer.showOnMap,customer.enhancedListing,customer.developmentContent,customer.buildToRent,customer.buildToRentBenefits,customer.brandPlusLogoUrl,productLabel.productLabelText,productLabel.spotlightLabel,lozengeModel.matchingLozenges
0,163628153,0,1,10,0,0,A quirky studio apartment situated on the rais...,"Belsize Park, London, NW3",GB,Apartment,False,True,,rent,False,False,True,False,False,True,,,True,/properties/163628153#/?channel=RES_LET,/property-to-rent/contactBranch.html?propertyI...,,RENT,2025-06-21T11:00:44Z,[],no_keyword,False,False,False,True,,,,,Featured Property,False,False,Added on 21/06/2025,"by Stones Residential, Belsize Park",,Studio apartment,51.548427,-0.171794,[{'url': '107k/106225/163628153/106225_BEP2101...,https://media.rightmove.co.uk:443/dir/crop/10:...,https://media.rightmove.co.uk:443/dir/crop/10:...,new,2025-06-21T11:06:13Z,1500,monthly,GBP,"[{'displayPrice': '£1,500 pcm', 'displayPriceQ...",106225,/company/clogo_19244_0005.jpeg,020 3909 6480,"Stones Residential, Belsize Park",Belsize Park,Stones Residential,/estate-agents/agent/Stones-Residential/Belsiz...,False,True,False,True,False,,False,[],https://media.rightmove.co.uk:443/company/clog...,,False,[]


### Keep only the desired columns

In [57]:
def filter_df(df=None):
    """Return a copy of ``df`` restricted to the listing columns of interest.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Normalised listings frame. When omitted, the notebook-level
        ``data_norm`` is looked up at *call* time. (A default written as
        ``df=data_norm`` is evaluated once, when the ``def`` runs, so it keeps
        pointing at a stale frame after the normalisation cell is re-run and
        fails outright if ``data_norm`` does not exist yet.)

    Returns
    -------
    pandas.DataFrame
        Independent copy holding only the columns listed in ``base_cols``,
        in that order. Later in-place edits do not touch ``df``.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``df``.
    NameError
        If ``df`` is omitted and no global ``data_norm`` is defined.
    """
    if df is None:
        # Resolve the notebook global lazily so the current frame is always used
        df = data_norm

    base_cols = [
        'id',
        'bedrooms',
        'bathrooms',
        'numberOfImages',
        'displayAddress',
        'location.latitude',
        'location.longitude',
        'propertySubType',
        'listingUpdate.listingUpdateReason',
        'listingUpdate.listingUpdateDate',
        'price.amount',
        'price.frequency',
        'premiumListing',
        'featuredProperty',
        'transactionType',
        'students',
        'displaySize',
        'propertyUrl',
        'firstVisibleDate',
        'addedOrReduced',
        'propertyTypeFullDescription'
    ]
    columns_of_interest = base_cols
    # .copy() detaches the result from the source frame
    return df[columns_of_interest].copy()


# Pass the normalised frame explicitly rather than relying on filter_df's default argument
filtered_df = filter_df(data_norm)
# Preview the first rows of the trimmed frame
filtered_df.head()

Unnamed: 0,id,bedrooms,bathrooms,numberOfImages,displayAddress,location.latitude,location.longitude,propertySubType,listingUpdate.listingUpdateReason,listingUpdate.listingUpdateDate,price.amount,price.frequency,premiumListing,featuredProperty,transactionType,students,displaySize,propertyUrl,firstVisibleDate,addedOrReduced,propertyTypeFullDescription
0,163628153,0,1,10,"Belsize Park, London, NW3",51.548427,-0.171794,Apartment,new,2025-06-21T11:06:13Z,1500,monthly,False,True,rent,False,,/properties/163628153#/?channel=RES_LET,2025-06-21T11:00:44Z,Added on 21/06/2025,Studio apartment
1,163931060,1,1,10,"Ravensbury Court, Bishopsford Road, Morden, Lo...",51.389634,-0.19673,Flat,new,2025-06-28T19:35:01Z,1300,monthly,False,False,rent,False,,/properties/163931060#/?channel=RES_LET,2025-06-28T19:29:09Z,Added today,1 bedroom flat
2,163930976,4,2,11,"Queens Road, Walthamstow, E17",51.576702,-0.028471,End of Terrace,new,2025-06-28T19:14:03Z,4350,monthly,False,False,rent,False,,/properties/163930976#/?channel=RES_LET,2025-06-28T19:08:16Z,Added today,4 bedroom end of terrace house
3,162597587,1,1,15,"Nine Elms Lane, London, SW8",51.484241,-0.12758,Flat,price_reduced,2025-06-28T19:12:16Z,4000,monthly,False,False,rent,False,,/properties/162597587#/?channel=RES_LET,2025-05-29T17:21:19Z,Reduced today,1 bedroom flat
4,163930820,1,1,7,"Brooklands Court, Cavendish Road, Brondesbury,...",51.544548,-0.204789,Apartment,new,2025-06-28T18:34:04Z,380,weekly,False,False,rent,False,47 sq. m.,/properties/163930820#/?channel=RES_LET,2025-06-28T18:28:54Z,Added today,1 bedroom apartment


AttributeError: 'str' object has no attribute 'values'

### Flatten the columns that hold lists and nested dictionaries


In [None]:
def flatten_df(filtered_df):
    """Flatten the nested station, sizing, history and address columns.

    Works on a copy; ``filtered_df`` itself is not modified.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Frame containing the columns ``nearest_stations`` and ``sizings``
        (each cell a list of dicts, or a non-list for "missing") and
        ``history`` and ``address`` (each cell a dict, or None/NaN).

    Returns
    -------
    pandas.DataFrame
        The input without the four nested columns, plus:

        * ``station_{1,2,3}_name`` / ``station_{1,2,3}_dist`` - the ``name``
          and ``distance`` of the first three stations (None-padded);
        * ``size_min_sqm`` / ``size_max_sqm`` - ``min`` / ``max`` of the first
          sizing whose ``unit`` is ``'sqm'``;
        * the keys of ``history``, lower-cased and prefixed with ``history_``;
        * the keys of ``address``, lower-cased and *not* prefixed.

        The original index is preserved.

    Raises
    ------
    KeyError
        If any of the four nested columns is absent.
    """
    nested_cols = ['nearest_stations', 'sizings', 'history', 'address']
    absent = [col for col in nested_cols if col not in filtered_df.columns]
    if absent:
        # Fail early with every missing column named, not only the first one touched
        raise KeyError(f'flatten_df requires the nested columns {absent}')

    df = filtered_df.copy()

    def extract_stations(stations, key):
        """Return ``key`` of the first three stations, padded with None to length 3."""
        if not isinstance(stations, list):  # missing / malformed cell
            return [None] * 3
        picked = [s.get(key) if isinstance(s, dict) else None for s in stations[:3]]
        return picked + [None] * (3 - len(picked))

    def extract_sqm(sizings):
        """Return ``[min, max]`` of the first sizing with unit 'sqm', else ``[None, None]``."""
        if isinstance(sizings, list):
            for s in sizings:
                if isinstance(s, dict) and s.get('unit') == 'sqm':
                    return [s.get('min'), s.get('max')]
        return [None, None]

    def add_columns(names, rows):
        """Assign one new column per name from equally long row lists."""
        # Building each column once avoids creating a pd.Series per row, and
        # also works when the frame has no rows at all.
        for pos, name in enumerate(names):
            df[name] = [row[pos] for row in rows]

    def is_missing(value):
        """True for None, pd.NA and float NaN."""
        return value is None or value is pd.NA or (isinstance(value, float) and value != value)

    def normalise_dicts(column, prefix=''):
        """Flatten a column of dicts into a frame aligned with ``column``'s index."""
        # json_normalize cannot digest None/NaN entries; an empty dict yields an all-NaN row instead
        records = [{} if is_missing(v) else v for v in column]
        flat = pd.json_normalize(records)
        # json_normalize numbers its rows 0..n-1; restore the real index so that the
        # column-wise concat below lines rows up correctly on a filtered/re-indexed frame
        flat.index = column.index
        flat.columns = [f'{prefix}{str.lower(col)}' for col in flat.columns]
        return flat

    # Nearest stations: names first, then distances (keeps the established column order)
    stations = df['nearest_stations']
    add_columns(['station_1_name', 'station_2_name', 'station_3_name'],
                [extract_stations(cell, 'name') for cell in stations])
    add_columns(['station_1_dist', 'station_2_dist', 'station_3_dist'],
                [extract_stations(cell, 'distance') for cell in stations])

    # Floor area in square metres
    add_columns(['size_min_sqm', 'size_max_sqm'],
                [extract_sqm(cell) for cell in df['sizings']])

    # Nested dictionaries: history keys get a 'history_' prefix, address keys are only lower-cased
    history_df = normalise_dicts(df['history'], prefix='history_')
    address_df = normalise_dicts(df['address'])

    # Concatenate the new columns and drop the originals
    df = pd.concat([df.drop(columns=nested_cols), history_df, address_df], axis=1)

    return df




# NOTE(review): flatten_df reads 'nearest_stations', 'sizings', 'history' and 'address', none of which
# are in the column list kept by filter_df - this call looks like it expects a differently shaped frame
# (the saved output below has columns such as 'price_sqft' and 'features'). TODO confirm the intended input.
flattened_df = flatten_df(filtered_df)
# Preview the first rows of the flattened frame
flattened_df.head()

Unnamed: 0,id,available,archived,bedrooms,bathrooms,property_type,description,title,subtitle,price,price_sqft,latitude,longitude,features,station_1_name,station_2_name,station_3_name,station_1_dist,station_2_dist,station_3_dist,size_min_sqm,size_max_sqm,history_listingupdatereason,displayaddress,countrycode,deliverypointid,ukcountry,outcode,incode
0,163907069,True,False,2,1,Flat,A charming split level two bedroom apartment t...,"2 bedroom flat for rent in Bassano Street, Lon...",2 bedroom flat,"£2,400 pcm",,51.456039,-0.075196,"[-\tTwo double bedrooms, - Spaciou...",East Dulwich Station,North Dulwich Station,Denmark Hill Station,0.444656,0.600147,1.048489,,,Added yesterday,"Bassano Street, London, SE22",GB,,England,SE22,8RY
1,163907291,True,False,2,2,Apartment,"Located on the 6th floor, is this gorgeous two...","2 bedroom apartment for rent in The Modern, Em...",2 bedroom apartment,"£4,400 pcm",,51.482636,-0.129653,"[Available Now, Virtual tour available, Fully ...",Nine Elms Station,Vauxhall Station,Pimlico Station,0.196516,0.347126,0.485662,75.0,75.0,Added yesterday,"The Modern, Embassy Gardens, SW11",GB,107171035.0,England,SW11,7AY


### Select only the most essential columns

In [None]:
# Keep the identifier, room counts, price, coordinates, nearest station, floor area and display address.
# The names follow the flattened schema (e.g. the lower-cased 'displayaddress'); a KeyError here means
# flattened_df does not carry that schema.
core_df = flattened_df[['id',
                       'bedrooms',
                       'bathrooms',
                       'price',
                       'latitude',
                       'longitude',
                       'station_1_name',
                       'station_1_dist',
                       'size_min_sqm',
                       'size_max_sqm',
                       'displayaddress']]

In [None]:
def clean_price(df):
    """Return a copy of ``df`` whose ``price`` column is a float.

    Text prices such as ``'£2,400 pcm'`` are stripped of the pound sign and
    thousands separators, and the first number found (including any decimal
    part) is kept. Values without a number (e.g. ``'POA'``) and missing values
    become NaN. A ``price`` column that is already numeric is only cast to
    float, so running the function on its own output is safe.

    NOTE(review): the billing period in the text (e.g. 'pcm') is discarded, so
    prices quoted for different periods are not made comparable here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``price`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``price`` as float64; ``df`` itself is untouched.

    Raises
    ------
    KeyError
        If ``df`` has no ``price`` column.
    """
    # Make a copy to avoid modifying the original DataFrame
    df = df.copy()
    price = df['price']

    if pd.api.types.is_numeric_dtype(price):
        # Already cleaned (or numeric from the start): nothing to parse
        df['price'] = price.astype(float)
        return df

    df['price'] = (
        price
        .str.replace('£', '', regex=False)
        .str.replace(',', '', regex=False)
        # expand=False returns a Series; the optional group keeps decimals like '.50'
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
        .astype(float)
    )
    return df

In [None]:
# Keep the cleaned frame under its own name so later cells can use the numeric prices,
# and preview only the first rows instead of rendering the whole frame.
core_df_clean = clean_price(core_df)
core_df_clean.head()

Unnamed: 0,id,bedrooms,bathrooms,price,latitude,longitude,station_1_name,station_1_dist,size_min_sqm,size_max_sqm,displayaddress
0,163907069,2,1,2400.0,51.456039,-0.075196,East Dulwich Station,0.444656,,,"Bassano Street, London, SE22"
1,163907291,2,2,4400.0,51.482636,-0.129653,Nine Elms Station,0.196516,75.0,75.0,"The Modern, Embassy Gardens, SW11"
