# Clean the Property Data Gathered

## Import Packages

In [22]:
# Web - Scraping and API Requests
import requests
from httpx import AsyncClient, Response
from parsel import Selector
import parsel
import jmespath
import asyncio

# Data Manipulation and Analysis
import pandas as pd
from pprint import pprint 
import json
from typing import List
from typing import TypedDict

# Database Connection
from sqlalchemy import create_engine

# File and System Operations
import os
import sys

In [23]:
# Render every column of a DataFrame instead of truncating wide frames
pd.options.display.max_columns = None

In [24]:
# Load IPython's autoreload extension so that edits to the custom package
# are picked up without having to reinstall it or restart the kernel
%load_ext autoreload 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Autoreload mode 1: before each cell runs, reload only the modules registered with %aimport
%autoreload 1

# Put the project's source directory first on the import path so the custom package can be found
sys.path.insert(0,'../src/')

# Import the custom package and its functions module, marking both for autoreload
%aimport rental_utils
%aimport rental_utils.functions

In [26]:
# Load SQL Magic for Jupyter Notebooks
%load_ext sql
# Presumably lifts the cap on the number of result rows displayed — confirm against the SQL magic docs
%config SqlMagic.displaylimit = None
# NOTE(review): the SQL magic docs pair this comment with autocommit=False; confirm that True is intended here
%config SqlMagic.autocommit=True # for engines that do not support autocommit

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


## Clean the data for selected urls

In [27]:
# Read the saved Rightmove listings file and parse it into Python objects
with open("../data/rightmove_properties.json", mode="r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())

In [28]:
# Peek at the first record to see the raw, nested structure before normalising it
display(data[0])

{'id': 161816171,
 'bedrooms': 2,
 'bathrooms': 1,
 'numberOfImages': 9,
 'numberOfFloorplans': 1,
 'numberOfVirtualTours': 0,
 'summary': 'A spacious two-bedroom flat on the top floor provides comfortable living accommodation.',
 'displayAddress': 'Rosendale Road London SE21',
 'countryCode': 'GB',
 'location': {'latitude': 51.4331, 'longitude': -0.09298},
 'propertyImages': {'images': [{'url': '69k/68804/161816171/68804_P300092_IMG_00_0000.jpeg',
    'caption': 'ROSENDALE ROAD, 1...',
    'srcUrl': 'https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/69k/68804/161816171/68804_P300092_IMG_00_0000_max_476x317.jpeg'},
   {'url': '69k/68804/161816171/68804_P300092_IMG_01_0000.jpeg',
    'caption': 'ROSENDALE ROAD, 1...',
    'srcUrl': 'https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/69k/68804/161816171/68804_P300092_IMG_01_0000_max_476x317.jpeg'},
   {'url': '69k/68804/161816171/68804_P300092_IMG_02_0000.jpeg',
    'caption': 'ROSENDALE ROAD, 1...',
    'srcUrl': 'https://media.rig

### Normalise the JSON so that it turns into a DataFrame

In [29]:
# Flatten the records into a table, expanding nested dicts one level deep
# (e.g. `location` becomes `location.latitude` / `location.longitude`)
data_norm = pd.json_normalize(data=data, max_level=1)
data_norm.head(n=1)

Unnamed: 0,id,bedrooms,bathrooms,numberOfImages,numberOfFloorplans,numberOfVirtualTours,summary,displayAddress,countryCode,propertySubType,premiumListing,featuredProperty,distance,transactionType,commercial,development,residential,students,auction,feesApply,feesApplyText,displaySize,showOnMap,propertyUrl,contactUrl,staticMapUrl,channel,firstVisibleDate,keywords,keywordMatchType,saved,hidden,onlineViewingsAvailable,hasBrandPlus,displayStatus,enquiredTimestamp,enquiryAddedTimestamp,enquiryCalledTimestamp,heading,isRecent,enhancedListing,addedOrReduced,formattedBranchName,formattedDistance,propertyTypeFullDescription,location.latitude,location.longitude,propertyImages.images,propertyImages.mainImageSrc,propertyImages.mainMapImageSrc,listingUpdate.listingUpdateReason,listingUpdate.listingUpdateDate,price.amount,price.frequency,price.currencyCode,price.displayPrices,customer.branchId,customer.brandPlusLogoURI,customer.contactTelephone,customer.branchDisplayName,customer.branchName,customer.brandTradingName,customer.branchLandingPageUrl,customer.development,customer.showReducedProperties,customer.commercial,customer.showOnMap,customer.enhancedListing,customer.developmentContent,customer.buildToRent,customer.buildToRentBenefits,customer.brandPlusLogoUrl,productLabel.productLabelText,productLabel.spotlightLabel,lozengeModel.matchingLozenges
0,161816171,2,1.0,9,1,0,A spacious two-bedroom flat on the top floor p...,Rosendale Road London SE21,GB,Flat,False,True,,rent,False,False,True,False,False,True,Hamptons Fees and Charges in England\n\nPermit...,,True,/properties/161816171#/?channel=RES_LET,/property-to-rent/contactBranch.html?propertyI...,,RENT,2025-05-12T12:45:41Z,[],no_keyword,False,False,False,True,,,,,Featured Property,False,False,Added on 12/05/2025,"by Hamptons, Dulwich",,2 bedroom flat,51.4331,-0.09298,[{'url': '69k/68804/161816171/68804_P300092_IM...,https://media.rightmove.co.uk:443/dir/crop/10:...,https://media.rightmove.co.uk:443/dir/crop/10:...,new,2025-05-12T12:51:06Z,2200,monthly,GBP,"[{'displayPrice': '£2,200 pcm', 'displayPriceQ...",68804,/brand/brand_rmchoice_logo_25218_0003.jpeg,020 3869 0297,"Hamptons, Dulwich",Dulwich,Hamptons,/estate-agents/agent/Hamptons/Dulwich-68804.html,False,True,False,True,False,,False,[],https://media.rightmove.co.uk:443/brand/brand_...,,False,[]


### Keep only the desired columns

In [30]:
def filter_df(df: pd.DataFrame, columns: "List[str] | None" = None) -> pd.DataFrame:
    """
    Keep only the columns relevant for property analysis.

    Args:
        df (pd.DataFrame): The DataFrame to filter.
        columns (List[str] | None): Columns to keep, in output order. When
            omitted, the default property-analysis columns listed in the
            function body are used.

    Returns:
        pd.DataFrame: An independent copy of ``df`` restricted to the selected
        columns, in the order they were requested.

    Raises:
        KeyError: If any requested column is absent from ``df``; the message
            names every missing column.
    """
    # Default selection used when the caller does not supply `columns`
    default_cols = [
        'id',
        'bedrooms',
        'bathrooms',
        'numberOfImages',
        'displayAddress',
        'location.latitude',
        'location.longitude',
        'propertySubType',
        'listingUpdate.listingUpdateReason',
        'listingUpdate.listingUpdateDate',
        'price.amount',
        'price.frequency',
        'premiumListing',
        'featuredProperty',
        'transactionType',
        'students',
        'displaySize',
        'propertyUrl',
        'firstVisibleDate',
        'addedOrReduced',
        'propertyTypeFullDescription'
    ]
    columns_of_interest = default_cols if columns is None else list(columns)

    # Fail early with a message that lists exactly which columns are absent
    missing = [col for col in columns_of_interest if col not in df.columns]
    if missing:
        raise KeyError(f"filter_df: columns missing from DataFrame: {missing}")

    # Return a copy so later in-place edits on the result never touch
    # (or raise SettingWithCopyWarning about) the input frame
    return df[columns_of_interest].copy()


# Apply the column filter to the normalised listings and preview the first rows
filtered_df = filter_df(df=data_norm)
filtered_df.head(n=5)

Unnamed: 0,id,bedrooms,bathrooms,numberOfImages,displayAddress,location.latitude,location.longitude,propertySubType,listingUpdate.listingUpdateReason,listingUpdate.listingUpdateDate,price.amount,price.frequency,premiumListing,featuredProperty,transactionType,students,displaySize,propertyUrl,firstVisibleDate,addedOrReduced,propertyTypeFullDescription
0,161816171,2,1.0,9,Rosendale Road London SE21,51.4331,-0.09298,Flat,new,2025-05-12T12:51:06Z,2200,monthly,False,True,rent,False,,/properties/161816171#/?channel=RES_LET,2025-05-12T12:45:41Z,Added on 12/05/2025,2 bedroom flat
1,163971062,2,1.0,11,"Marquess Road, London, N1",51.545995,-0.088096,Flat,new,2025-06-30T15:31:13Z,3000,monthly,False,False,rent,False,80 sq. m.,/properties/163971062#/?channel=RES_LET,2025-06-30T15:25:34Z,Added today,2 bedroom flat
2,163971071,1,1.0,8,"Sevington Street, London, W9",51.524532,-0.192148,Apartment,new,2025-06-30T15:31:13Z,1800,monthly,False,False,rent,False,,/properties/163971071#/?channel=RES_LET,2025-06-30T15:25:41Z,Added today,1 bedroom apartment
3,163971074,3,2.0,17,"Highgate Road, Highgate, NW5",51.55626,-0.146437,Maisonette,new,2025-06-30T15:31:13Z,980,weekly,False,False,rent,False,,/properties/163971074#/?channel=RES_LET,2025-06-30T15:25:44Z,Added today,3 bedroom maisonette
4,163971038,2,1.0,5,"Maybank House, E17",51.5581,-0.00654,Flat,new,2025-06-30T15:31:12Z,1750,monthly,False,False,rent,False,,/properties/163971038#/?channel=RES_LET,2025-06-30T15:25:17Z,Added today,2 bedroom flat


In [31]:
# Smoke-test the packaged copy of filter_df (rental_utils.functions) on the already-filtered frame;
# the displayed result shows the same columns as the notebook version
rental_utils.functions.filter_df(df=filtered_df)

Unnamed: 0,id,bedrooms,bathrooms,numberOfImages,displayAddress,location.latitude,location.longitude,propertySubType,listingUpdate.listingUpdateReason,listingUpdate.listingUpdateDate,price.amount,price.frequency,premiumListing,featuredProperty,transactionType,students,displaySize,propertyUrl,firstVisibleDate,addedOrReduced,propertyTypeFullDescription
0,161816171,2,1.0,9,Rosendale Road London SE21,51.4331,-0.09298,Flat,new,2025-05-12T12:51:06Z,2200,monthly,False,True,rent,False,,/properties/161816171#/?channel=RES_LET,2025-05-12T12:45:41Z,Added on 12/05/2025,2 bedroom flat
1,163971062,2,1.0,11,"Marquess Road, London, N1",51.545995,-0.088096,Flat,new,2025-06-30T15:31:13Z,3000,monthly,False,False,rent,False,80 sq. m.,/properties/163971062#/?channel=RES_LET,2025-06-30T15:25:34Z,Added today,2 bedroom flat
2,163971071,1,1.0,8,"Sevington Street, London, W9",51.524532,-0.192148,Apartment,new,2025-06-30T15:31:13Z,1800,monthly,False,False,rent,False,,/properties/163971071#/?channel=RES_LET,2025-06-30T15:25:41Z,Added today,1 bedroom apartment
3,163971074,3,2.0,17,"Highgate Road, Highgate, NW5",51.55626,-0.146437,Maisonette,new,2025-06-30T15:31:13Z,980,weekly,False,False,rent,False,,/properties/163971074#/?channel=RES_LET,2025-06-30T15:25:44Z,Added today,3 bedroom maisonette
4,163971038,2,1.0,5,"Maybank House, E17",51.5581,-0.00654,Flat,new,2025-06-30T15:31:12Z,1750,monthly,False,False,rent,False,,/properties/163971038#/?channel=RES_LET,2025-06-30T15:25:17Z,Added today,2 bedroom flat
5,163721564,0,1.0,8,"Lighterman Point, 3 New Village Avenue, London...",51.513176,-0.000335,Flat,price_reduced,2025-06-30T15:30:17Z,1600,monthly,False,False,rent,False,,/properties/163721564#/?channel=RES_LET,2025-06-24T11:55:03Z,Reduced today,Studio flat
6,86726619,1,1.0,18,"Rendel House, Goodluck Hope, London, E14",51.509024,0.007284,Apartment,new,2025-06-30T15:30:09Z,535,weekly,False,False,rent,False,51 sq. m.,/properties/86726619#/?channel=RES_LET,2024-01-09T19:05:03Z,Added today,1 bedroom apartment
7,163970990,0,1.0,10,"Anchor House, St George Wharf, London",51.486236,-0.124846,Apartment,new,2025-06-30T15:30:07Z,496,weekly,False,False,rent,False,34 sq. m.,/properties/163970990#/?channel=RES_LET,2025-06-30T15:24:43Z,Added today,Studio apartment
8,163969877,1,,2,"Cheyne Avenue, London, TW2",51.44548,-0.37638,Terraced,new,2025-06-30T15:30:07Z,950,monthly,False,False,rent,True,,/properties/163969877#/?channel=RES_LET,2025-06-30T15:24:43Z,Added today,1 bedroom terraced house
9,163970711,2,1.0,8,"Twickenham Gateway, Bradshaw Close, London, TW1",51.45047,-0.33002,Flat,new,2025-06-30T15:30:04Z,2700,monthly,False,False,rent,False,,/properties/163970711#/?channel=RES_LET,2025-06-30T15:24:20Z,Added today,2 bedroom flat


In [32]:
# Persist the filtered listings as CSV.
# NOTE(review): to_csv writes the row index as an unnamed first column by default —
# confirm downstream readers expect it (otherwise pass index=False)
filtered_df.to_csv("../data/properties.csv")