# Clean the Property Data Gathered

## Import Packages

In [1]:
# Web - Scraping and API Requests
import requests
from httpx import AsyncClient, Response
from parsel import Selector
import parsel
import jmespath
import asyncio

# Data Manipulation and Analysis
import pandas as pd
from pprint import pprint 
import json
from typing import List
from typing import TypedDict

# Database Connection
from sqlalchemy import create_engine
from sqlalchemy import inspect, text

# File and System Operations
import os
import sys

In [2]:
pd.set_option('display.max_columns', None) # Display all columns in any given DataFrame

In [3]:
# This allows one to reload the custom package without having to install it again
%load_ext autoreload 

In [4]:
# Autoreload mode 1: before running each cell, re-import only the modules
# registered with %aimport below, so edits to the custom package are picked up
# without reinstalling it or restarting the kernel
%autoreload 1

# Put the local source directory first on the import path so that
# rental_utils can be imported from ../src/
sys.path.insert(0,'../src/')

# Import the custom package and sub-packages
%aimport rental_utils
%aimport rental_utils.functions
%aimport rental_utils.sql_queries

In [5]:
# Load SQL Magic for Jupyter Notebooks
%load_ext sql
# Do not cap the number of result rows that the SQL magic displays
%config SqlMagic.displaylimit = None
%config SqlMagic.autocommit=True # for engines that do not support autocommit

## Clean the data for selected urls

In [6]:
with open("../data/rightmove_properties.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [7]:
display(data[0])

{'id': 163818515,
 'bedrooms': 5,
 'bathrooms': 3,
 'numberOfImages': 24,
 'numberOfFloorplans': 1,
 'numberOfVirtualTours': 0,
 'summary': "A stunning five-bedroom family home located on one of Knightsbridge's most beautiful and private garden squares, SW7. This delightful property boasts two reception rooms, open-plan kitchen, dining /family area, conservatory leading to a private patio garden, study area, master bedroom with en...",
 'displayAddress': 'Trevor Square, Knightsbridge SW7',
 'countryCode': 'GB',
 'location': {'latitude': 51.500536, 'longitude': -0.164367},
 'propertyImages': {'images': [{'url': '129k/128314/163818515/128314_17896_IMG_00_0000.jpeg',
    'caption': 'First Floor Receptio',
    'srcUrl': 'https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/129k/128314/163818515/128314_17896_IMG_00_0000_max_476x317.jpeg'},
   {'url': '129k/128314/163818515/128314_17896_IMG_01_0000.jpeg',
    'caption': 'Raised Ground Floor ',
    'srcUrl': 'https://media.rightmove.co.uk:443

### Normalise the JSON so that it turns into a DataFrame

In [8]:
# max_level = 1 flattens only the first level of nesting (e.g. location.latitude);
# anything nested deeper is left as a raw object inside its column
data_norm = pd.json_normalize(data, max_level = 1)
data_norm.head(1)

Unnamed: 0,id,bedrooms,bathrooms,numberOfImages,numberOfFloorplans,numberOfVirtualTours,summary,displayAddress,countryCode,propertySubType,premiumListing,featuredProperty,distance,transactionType,commercial,development,residential,students,auction,feesApply,feesApplyText,displaySize,showOnMap,propertyUrl,contactUrl,staticMapUrl,channel,firstVisibleDate,keywords,keywordMatchType,saved,hidden,onlineViewingsAvailable,hasBrandPlus,displayStatus,enquiredTimestamp,enquiryAddedTimestamp,enquiryCalledTimestamp,heading,isRecent,enhancedListing,addedOrReduced,formattedBranchName,formattedDistance,propertyTypeFullDescription,location.latitude,location.longitude,propertyImages.images,propertyImages.mainImageSrc,propertyImages.mainMapImageSrc,listingUpdate.listingUpdateReason,listingUpdate.listingUpdateDate,price.amount,price.frequency,price.currencyCode,price.displayPrices,customer.branchId,customer.brandPlusLogoURI,customer.contactTelephone,customer.branchDisplayName,customer.branchName,customer.brandTradingName,customer.branchLandingPageUrl,customer.development,customer.showReducedProperties,customer.commercial,customer.showOnMap,customer.enhancedListing,customer.developmentContent,customer.buildToRent,customer.buildToRentBenefits,customer.brandPlusLogoUrl,productLabel.productLabelText,productLabel.spotlightLabel,lozengeModel.matchingLozenges
0,163818515,5,3,24,1,0,A stunning five-bedroom family home located on...,"Trevor Square, Knightsbridge SW7",GB,House,False,True,,rent,False,False,True,False,False,True,Permitted payments and tenant protection infor...,,True,/properties/163818515#/?channel=RES_LET,/property-to-rent/contactBranch.html?propertyI...,,RENT,2025-06-26T10:36:08Z,[],no_keyword,False,False,False,True,,,,,Featured Property,False,False,Added on 26/06/2025,"by Farrar & Co, Chelsea - Lettings",,5 bedroom house,51.500536,-0.164367,[{'url': '129k/128314/163818515/128314_17896_I...,https://media.rightmove.co.uk:443/dir/crop/10:...,https://media.rightmove.co.uk:443/dir/crop/10:...,new,2025-06-26T10:42:04Z,2750,weekly,GBP,"[{'displayPrice': '£11,917 pcm', 'displayPrice...",128314,/company/clogo_32153_0000.jpg,020 3835 4312,"Farrar & Co, Chelsea - Lettings",Chelsea - Lettings,Farrar & Co,/estate-agents/agent/Farrar-and-Co/Chelsea---L...,False,True,False,True,False,,False,[],https://media.rightmove.co.uk:443/company/clog...,,False,[]


### Filter out only the desired columns

In [9]:
def filter_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the columns relevant for property analysis and add ``price_per_bed``.

    Args:
        df (pd.DataFrame): The normalised listings DataFrame. It must contain
            every column listed in ``columns_of_interest`` below.

    Returns:
        pd.DataFrame: A new DataFrame (the input is left unmodified) holding the
            selected columns plus ``price_per_bed``, computed as
            ``price.amount / bedrooms``. Rows whose ``bedrooms`` value is 0 get
            ``NaN`` instead of ``inf``.

    Raises:
        KeyError: If any of the expected columns is missing from ``df``.
    """
    # Columns to keep in the filtered DataFrame (extend or modify if needed)
    columns_of_interest = [
        'id',
        'bedrooms',
        'bathrooms',
        'numberOfImages',
        'displayAddress',
        'location.latitude',
        'location.longitude',
        'propertySubType',
        'listingUpdate.listingUpdateReason',
        'listingUpdate.listingUpdateDate',
        'price.amount',
        'price.frequency',
        'premiumListing',
        'featuredProperty',
        'transactionType',
        'students',
        'displaySize',
        'propertyUrl',
        'firstVisibleDate',
        'addedOrReduced',
        'propertyTypeFullDescription'
    ]
    # .copy() yields an independent frame, so adding a column below neither
    # touches ``df`` nor triggers SettingWithCopyWarning
    filtered_df = df[columns_of_interest].copy()
    # A bedroom count of 0 would turn the division into inf; mask zeros to NaN
    # so the ratio is simply "not available" for those rows
    bedrooms = filtered_df["bedrooms"].where(filtered_df["bedrooms"] != 0)
    # Create a price per bedroom column
    filtered_df["price_per_bed"] = filtered_df["price.amount"] / bedrooms
    return filtered_df


filtered_df = filter_df(data_norm)
filtered_df.head()

Unnamed: 0,id,bedrooms,bathrooms,numberOfImages,displayAddress,location.latitude,location.longitude,propertySubType,listingUpdate.listingUpdateReason,listingUpdate.listingUpdateDate,price.amount,price.frequency,premiumListing,featuredProperty,transactionType,students,displaySize,propertyUrl,firstVisibleDate,addedOrReduced,propertyTypeFullDescription,price_per_bed
0,163818515,5,3,24,"Trevor Square, Knightsbridge SW7",51.500536,-0.164367,House,new,2025-06-26T10:42:04Z,2750,weekly,False,True,rent,False,,/properties/163818515#/?channel=RES_LET,2025-06-26T10:36:08Z,Added on 26/06/2025,5 bedroom house,550.0
1,163977209,2,2,9,"Anson Road, London",51.555435,-0.128682,Flat,new,2025-06-30T16:41:03Z,2600,monthly,False,False,rent,False,,/properties/163977209#/?channel=RES_LET,2025-06-30T16:35:10Z,Added today,2 bedroom flat,1300.0
2,163977203,2,1,13,"Tufnell Park Road, London",51.557703,-0.127115,Flat,new,2025-06-30T16:41:03Z,2400,monthly,False,False,rent,False,,/properties/163977203#/?channel=RES_LET,2025-06-30T16:35:09Z,Added today,2 bedroom flat,1200.0
3,163977179,1,1,5,"Elm Grove Road, London, W5",51.506504,-0.297362,Flat Share,new,2025-06-30T16:40:04Z,950,monthly,False,False,rent,False,,/properties/163977179#/?channel=RES_LET,2025-06-30T16:34:56Z,Added today,1 bedroom flat share,950.0
4,163977128,3,2,13,"Montpelier Road, Finchley",51.599838,-0.180289,Terraced,new,2025-06-30T16:40:01Z,2850,monthly,False,False,rent,False,,/properties/163977128#/?channel=RES_LET,2025-06-30T16:34:04Z,Added today,3 bedroom terraced house,950.0


In [10]:
# NOTE(review): this re-applies the packaged rental_utils.functions.filter_df to the
# frame already produced by the notebook-local filter_df above; presumably the two
# implementations are meant to be equivalent — confirm, and keep only one of them
filtered_df = rental_utils.functions.filter_df(df=filtered_df)


### Define a Function that Cleans Certain Column Names

In [11]:
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Give the flattened-JSON columns shorter, SQL-friendly names.

    Dotted names produced by normalising nested JSON (for example
    ``location.latitude``) are replaced with plain identifiers; columns that
    are not listed below keep their current name.

    Args:
        df (pd.DataFrame): The DataFrame whose columns should be renamed.

    Returns:
        pd.DataFrame: A renamed copy of ``df``; the input is left untouched.
    """
    # (dotted name, replacement) pairs for every column being renamed
    dotted_to_plain = dict(
        (
            ("location.latitude", "latitude"),
            ("location.longitude", "longitude"),
            ("listingUpdate.listingUpdateReason", "listingUpdateReason"),
            ("listingUpdate.listingUpdateDate", "listingUpdateDate"),
            ("price.amount", "priceAmount"),
            ("price.frequency", "priceFrequency"),
        )
    )
    renamed_df = df.rename(columns=dotted_to_plain)
    return renamed_df

In [12]:
# Rename via the packaged implementation; the notebook-local clean_column_names
# defined above is an alternative that is not used here
clean_df = rental_utils.functions.clean_column_names(filtered_df)
clean_df.dtypes

id                               int64
bedrooms                         int64
bathrooms                        int64
numberOfImages                   int64
displayAddress                  object
latitude                       float64
longitude                      float64
propertySubType                 object
listingUpdateReason             object
listingUpdateDate               object
priceAmount                      int64
priceFrequency                  object
premiumListing                    bool
featuredProperty                  bool
transactionType                 object
students                          bool
displaySize                     object
propertyUrl                     object
firstVisibleDate                object
addedOrReduced                  object
propertyTypeFullDescription     object
price_per_bed                  float64
dtype: object

# NOTE(review): this exports filtered_df (original dotted column names), not the
# renamed clean_df, and to_csv writes the index as an extra column by default —
# confirm both are intended
filtered_df.to_csv("../data/properties.csv")

### Create a Database and Save The Data to It

#### Create the Database if It Has Not Been Already

In [13]:
# Import the sub-module under a short alias to keep the calls below concise
from rental_utils import sql_queries as sqlq

# Build the engine for the database file at ../data/properties.db
engine = sqlq.get_sql_engine("../data/properties.db")

# Open and immediately close a connection — presumably so that the database
# file gets created if it does not exist yet; TODO confirm
with engine.connect() as conn:
    pass

#### Save the Data to the Database as a New Table

In [None]:


## Rebuild the table from scratch inside one transaction.
## engine.begin() commits when the block exits cleanly and rolls back on error;
## a plain engine.connect() block never commits these statements explicitly
## (assumes `engine` is a SQLAlchemy Engine — confirm against get_sql_engine).
with engine.begin() as connection:
    ## Execute the drop table query (Drop the old table and start a fresh one)
    connection.execute(text(sqlq.DROP_PROPERTIES_TABLE_SQL_QUERY))
    ## Execute the CREATE TABLE query to create a blank table
    connection.execute(text(sqlq.CREATE_TABLE_SQL_QUERY))

## Save the dataframe into that table, extending it by default
sqlq.make_table(clean_df, "properties_data", engine)


### Check if it can be Extracted OK

In [15]:
# Read the data back through the stored SELECT query to confirm that what was
# written can be loaded into a DataFrame again
with engine.connect() as connection:
    rightmove_data = pd.read_sql(text(sqlq.GET_PROPERTIES_DATA_SQL_QUERY), connection)
rightmove_data.head()

Unnamed: 0,id,price_per_bed,travel_time,distance,bedrooms,bathrooms,numberOfImages,displayAddress,latitude,longitude,propertySubType,listingUpdateReason,listingUpdateDate,priceAmount,priceFrequency,premiumListing,featuredProperty,transactionType,students,displaySize,propertyUrl,firstVisibleDate,addedOrReduced,propertyTypeFullDescription
0,161782598,1150.0,,,2,2,10,"Martello Street, Hackney, London",51.542839,-0.057569,Flat,price_reduced,2025-06-30T16:34:40Z,2300,monthly,0,0,rent,0,65 sq. m.,/properties/161782598#/?channel=RES_LET,2025-05-10T13:15:57Z,Reduced today,2 bedroom flat
1,161786591,766.666667,,,3,2,12,"Penrhyn Avenue, London, E17 5BJ",51.59803,-0.020891,End of Terrace,price_reduced,2025-06-30T16:35:49Z,2300,monthly,1,0,rent,0,90 sq. m.,/properties/161786591#/?channel=RES_LET,2025-05-10T16:30:55Z,Reduced today,3 bedroom end of terrace house
2,162252746,346.0,,,2,1,4,"Long Street, London, Shoredtich, E2",51.52905,-0.07643,Apartment,price_reduced,2025-06-30T16:35:02Z,692,weekly,0,0,rent,0,,/properties/162252746#/?channel=RES_LET,2025-05-21T15:30:03Z,Reduced today,2 bedroom apartment
3,162753398,800.0,,,3,1,12,"Folkestone Road, London",51.585121,-0.01614,Flat,price_reduced,2025-06-30T16:36:12Z,2400,monthly,1,0,rent,0,80 sq. m.,/properties/162753398#/?channel=RES_LET,2025-06-02T20:53:51Z,Reduced today,3 bedroom flat
4,163298408,1025.0,,,2,3,7,"Hoptree Close, N12",51.61738,-0.18552,Flat,price_reduced,2025-06-30T16:35:11Z,2050,monthly,0,0,rent,0,,/properties/163298408#/?channel=RES_LET,2025-06-13T15:34:39Z,Reduced today,2 bedroom flat
