In [2]:
# Import libraries
import hashlib
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Scraping script: walk the Rightmove "to rent" result pages for London and collect
# address, price, description, bedroom count and bathroom count for every property card.
base_url = "https://www.rightmove.co.uk/property-to-rent/find.html?searchLocation=London&useLocationIdentifier=true&locationIdentifier=REGION%5E87490&rent=To+rent&radius=0.0&_includeLetAgreed=on&includeLetAgreed=false&index="
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-GB,en;q=0.9",
    "Referer": "https://www.google.com/",
    "Connection": "keep-alive",
}

PAGE_COUNT = 84            # number of result pages to request
LISTINGS_PER_PAGE = 24     # Rightmove paginates in multiples of 24
REQUEST_TIMEOUT = 30       # seconds before a stalled request is abandoned


def _text_or_default(tag, default):
    """Return the stripped text of `tag`, or `default` when the tag was not found."""
    return tag.get_text(strip=True) if tag is not None else default


def _span_text_or_default(container, default="0"):
    """Return the text of the first <span> inside `container`, or `default`.

    Falls back to `default` both when the container is missing and when it holds
    no <span>, so an unexpected card layout cannot raise AttributeError.
    """
    span = container.find("span") if container is not None else None
    return span.get_text(strip=True) if span is not None else default


addresses = []
prices = []
descriptions = []
bedrooms = []
bathrooms = []

for page_number in range(PAGE_COUNT):
    url = base_url + str(page_number * LISTINGS_PER_PAGE)

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Skip this page rather than aborting the whole scrape
        print(f"Request for page {page_number} failed: {exc}")
        continue

    if not response.ok:
        # An error page contains no listings; report it instead of parsing it silently
        print(f"Page {page_number} returned HTTP {response.status_code}, skipping")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    listings_section = soup.find("section", class_="ResultsList_resultsSection__MVSi7")
    property_cards = listings_section.find_all("div", class_="PropertyCard_propertyCardContainer__VSRSA") if listings_section else []

    for card in property_cards:
        # Scrape address
        addresses.append(_text_or_default(card.find("address"), "No address found"))

        # Scrape price
        price_tag = card.find("div", class_="PropertyPrice_price__VL65t")
        prices.append(_text_or_default(price_tag, "No price found"))

        # Scrape description
        desc_tag = card.find("p", class_="PropertyCardSummary_summary__oIv57")
        descriptions.append(_text_or_default(desc_tag, "No description found"))

        # Scrape bed count ("0" when the card has no bed container)
        bed_tag = card.find("div", class_="PropertyInformation_bedContainer___rN7d")
        bedrooms.append(_span_text_or_default(bed_tag))

        # Scrape bathroom count ("0" when the card has no bath container)
        bath_tag = card.find("div", class_="PropertyInformation_bathContainer__ut8VY")
        bathrooms.append(_span_text_or_default(bath_tag))

    print(f"Scraped {len(addresses)} listings so far")




Scraped 24 listings so far
Scraped 48 listings so far
Scraped 72 listings so far
Scraped 96 listings so far
Scraped 120 listings so far
Scraped 144 listings so far
Scraped 168 listings so far
Scraped 192 listings so far
Scraped 216 listings so far
Scraped 240 listings so far
Scraped 264 listings so far
Scraped 288 listings so far
Scraped 312 listings so far
Scraped 336 listings so far
Scraped 360 listings so far
Scraped 384 listings so far
Scraped 408 listings so far
Scraped 432 listings so far
Scraped 456 listings so far
Scraped 480 listings so far
Scraped 504 listings so far
Scraped 528 listings so far
Scraped 552 listings so far
Scraped 576 listings so far
Scraped 600 listings so far
Scraped 624 listings so far
Scraped 648 listings so far
Scraped 672 listings so far
Scraped 696 listings so far
Scraped 720 listings so far
Scraped 744 listings so far
Scraped 768 listings so far
Scraped 792 listings so far
Scraped 816 listings so far
Scraped 840 listings so far
Scraped 864 listings so 

In [4]:
# Assemble the five scraped lists into a single DataFrame, one row per listing
listing_columns = {
    "address": addresses,
    "price": prices,
    "description": descriptions,
    "bedrooms": bedrooms,
    "bathrooms": bathrooms,
}
df = pd.DataFrame(listing_columns)

# Show the last few rows as a quick sanity check
print(df.tail())

                                        address       price  \
1003             Wandsworth Bridge Road, Fulham  £1,699 pcm   
1004  One Park Drive, Canary Wharf, London, E14  £3,800 pcm   
1005     Kensington High Street, Kensington, W8  £1,777 pcm   
1006               Ansleigh Place, Notting Hill  £8,996 pcm   
1007                Cleveland Square, Bayswater  £3,792 pcm   

                                            description bedrooms bathrooms  
1003  A one double bedroom top floor flat set on Wan...        1         1  
1004  ** Available Now ** Experience luxury living i...        1         1  
1005  ** Landlord pay for the gas central heating/ho...        0         1  
1006  A spectacular three-bedroom architect designed...        3         3  
1007  A modern and well presented top floor two bedr...        2         2  


In [5]:
# Remove duplicates
# Shape before de-duplication
print(df.shape)

# Rows are duplicates only when every column matches (pass `subset=[...]` to
# drop_duplicates to compare on specific columns instead); afterwards rebuild
# a contiguous 0..n-1 index.
df = df.drop_duplicates().reset_index(drop=True)

# Shape after de-duplication
print(df.shape)

(1008, 5)
(996, 5)


In [None]:
# The scraper stores room counts as strings ("", "0", "1", ...). Convert them to
# numbers first so that numeric comparisons and replacements actually match;
# empty or non-numeric text becomes NaN.
df['bathrooms'] = pd.to_numeric(df['bathrooms'], errors='coerce')
df['bedrooms'] = pd.to_numeric(df['bedrooms'], errors='coerce')

# Replace missing bathroom counts with 1, as these listings are mostly studios
df['bathrooms'] = df['bathrooms'].fillna(1).astype(int)

# Replace 0 (or missing) bedrooms with 1 as well, most likely a studio
df['bedrooms'] = df['bedrooms'].fillna(0).replace(0, 1).astype(int)

# Show any listings still reporting zero bathrooms
print(df[df['bathrooms'] == 0])


Empty DataFrame
Columns: [address, price, description, bedrooms, bathrooms]
Index: []


In [7]:
# Drop newline characters embedded in the scraped address text
address_without_newlines = df['address'].str.replace("\n", "", regex=False)
df['address'] = address_without_newlines

In [8]:
# Read the Google Geocoding API key from the environment so the secret is never
# stored in the notebook. Set GOOGLE_MAPS_API_KEY before running this cell.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked.
API_KEY = os.environ["GOOGLE_MAPS_API_KEY"]
def get_coordinates(address):
    """Geocode `address` with the Google Geocoding API.

    Parameters
    ----------
    address : str
        Free-text address to look up.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the first result, or ``(None, None)`` when
        the request fails, the response is not valid JSON, the API status is
        not ``"OK"``, or no results are returned.
    """
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "address": address,
        "key": API_KEY
    }

    try:
        # A timeout stops a single stalled connection from hanging the whole run
        response = requests.get(geocode_url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: treat as "could not geocode"
        return None, None

    if data.get("status") != "OK" or not data.get("results"):
        return None, None

    # Extract latitude and longitude from the first (best) match
    location = data["results"][0]["geometry"]["location"]
    return location["lat"], location["lng"]
    
# Geocode every address; each (lat, lng) tuple is expanded into two columns
coordinates = df['address'].apply(lambda addr: pd.Series(get_coordinates(addr)))
df[['latitude', 'longitude']] = coordinates

# Preview the first few latitudes
print(df['latitude'].head())

0    51.518926
1    51.359806
2    51.564169
3    51.529622
4    51.429787
Name: latitude, dtype: float64


In [9]:
# Rows whose address could not be geocoded have a missing latitude
null_coordinates = df.loc[df['latitude'].isna()]

# Remove those rows by index label, then rebuild a contiguous index
df = df.drop(index=null_coordinates.index).reset_index(drop=True)

In [10]:
def extract_and_remove_postcode(address):
    """Split a UK postcode out of a free-text address.

    Matches an outward code (e.g. "E14", "SW1A") optionally followed by the
    inward code (e.g. "E14 9SH"), case-insensitively.

    Parameters
    ----------
    address : str
        Address text that may contain a postcode.

    Returns
    -------
    tuple
        ``(postcode, address)``. `postcode` is the first match, upper-cased with
        single spacing, and `address` has every match removed, runs of
        whitespace collapsed and surrounding whitespace stripped. When nothing
        matches, ``(None, address)`` is returned with the address unchanged.

    Notes
    -----
    The pattern is deliberately loose: tokens that merely look like a postcode
    (such as a flat number "B2") are matched too.
    """
    # Outward code, then an optional inward code so full postcodes are removed whole
    postcode_pattern = re.compile(
        r'\b[A-Z]{1,2}\d+[A-Z]?(?:\s*\d[A-Z]{2})?\b',
        re.IGNORECASE,
    )

    postcode_match = postcode_pattern.search(address)
    if postcode_match is None:
        return None, address

    # Normalise to upper case with single spaces, e.g. "nw1  6xe" -> "NW1 6XE"
    postcode = " ".join(postcode_match.group().upper().split())

    # Replace with a space (not nothing) so neighbouring words are not fused,
    # then tidy up the whitespace that leaves behind
    remainder = postcode_pattern.sub(" ", address)
    remainder = re.sub(r"\s{2,}", " ", remainder).strip()
    return postcode, remainder
    
# Split each address into (postcode, address-without-postcode) and store both
postcode_and_address = df['address'].apply(
    lambda raw_address: pd.Series(extract_and_remove_postcode(raw_address))
)
df[['postcode', 'address']] = postcode_and_address

# Show the updated DataFrame
print(df)

                                   address       price  \
0             Merchant Square, Paddington,  £9,966 pcm   
1                     Grove Avenue, Sutton  £1,675 pcm   
2          Wilberforce Road, Finsbury Park  £2,500 pcm   
3             Strathmore Court, Park Road,  £8,790 pcm   
4                     Graveney Road London  £2,650 pcm   
..                                     ...         ...   
981         Wandsworth Bridge Road, Fulham  £1,699 pcm   
982  One Park Drive, Canary Wharf, London,  £3,800 pcm   
983    Kensington High Street, Kensington,  £1,777 pcm   
984           Ansleigh Place, Notting Hill  £8,996 pcm   
985            Cleveland Square, Bayswater  £3,792 pcm   

                                           description bedrooms bathrooms  \
0    A stunning interior designed apartment with wa...        4         3   
1    **AVAILABLE APRIL**Leaders are pleased to offe...        2         1   
2    This rather beautiful first floor maisonette w...        2         

In [11]:
# Use the Google Geocoding API to extract postcodes for addresses that don't contain one.
# The key is read from the environment so the secret is never stored in the notebook;
# set GOOGLE_MAPS_API_KEY before running this cell.
API_KEY = os.environ["GOOGLE_MAPS_API_KEY"]

def get_postcode(address):
    """Look up a postcode for `address` with the Google Geocoding API.

    The query sent is ``"1 {address}, UK"``.
    NOTE(review): the leading "1" presumably nudges the geocoder towards a
    specific building so that a postcode is returned; confirm.

    Parameters
    ----------
    address : str
        Address text without a postcode.

    Returns
    -------
    str or None
        The ``long_name`` of the first ``postal_code`` component found in the
        results, or ``None`` (after printing a message) when the request fails,
        the response is not valid JSON, the status is not ``"OK"``, or no
        result carries a postcode.
    """
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {
        "address": f"1 {address}, UK",
        "key": API_KEY
    }

    try:
        # A timeout stops a single stalled connection from hanging the whole loop
        response = requests.get(geocode_url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: fall through to the "not found" path
        data = {}

    if data.get("status") == "OK":
        # Return the first postal_code component across all results
        for result in data.get("results", []):
            for component in result.get("address_components", []):
                if "postal_code" in component.get("types", []):
                    return component["long_name"]

    print(f"No postcode found for {address}")  # Debugging statement
    return None
    
# Look up a postcode for every row that still lacks one, writing hits back into df
rows_missing_postcode = df[df['postcode'].isna()]

for index, address in rows_missing_postcode['address'].items():
    print(f"\n📝 Processing row {index}: {address}")  # Print row being processed

    postcode = get_postcode(address)

    if not postcode:
        print(f"⚠️ Could not find postcode for {address}")  # Warning if postcode not found
        continue

    df.at[index, "postcode"] = postcode
    print(f"✅ Postcode updated in DataFrame: {postcode} for {address}")  # Confirmation message

# Rows for which no postcode could be found, even after the API lookup
null_postcode = df.loc[df['postcode'].isna()]

# Remove those rows by index label, then rebuild a contiguous index
df = df.drop(index=null_postcode.index).reset_index(drop=True)


📝 Processing row 1: Grove Avenue, Sutton
✅ Postcode updated in DataFrame: SM1 2DA for Grove Avenue, Sutton

📝 Processing row 2: Wilberforce Road, Finsbury Park
✅ Postcode updated in DataFrame: N4 2SN for Wilberforce Road, Finsbury Park

📝 Processing row 9: Hill Street, Mayfair
✅ Postcode updated in DataFrame: W1J for Hill Street, Mayfair

📝 Processing row 14: John Harrison Way, North Greenwich
✅ Postcode updated in DataFrame: SE10 0BL for John Harrison Way, North Greenwich

📝 Processing row 21: Duke Street, London
✅ Postcode updated in DataFrame: W1U 3EA for Duke Street, London

📝 Processing row 24: Queen's Gate Place, London
✅ Postcode updated in DataFrame: SW7 5PE for Queen's Gate Place, London

📝 Processing row 25: Coxwell Boulevard, London
✅ Postcode updated in DataFrame: NW9 4AB for Coxwell Boulevard, London

📝 Processing row 26: Harrowdene Gardens, Teddington
✅ Postcode updated in DataFrame: TW11 0DH for Harrowdene Gardens, Teddington

📝 Processing row 29: Flat 10 16 - 18 Warwic

In [12]:
# Cleaning address column
def clean_address(address):
    """Tidy an address string.

    Strips surrounding whitespace, then any trailing commas, then removes
    every colon, and returns the result.
    """
    return address.strip().rstrip(",").replace(":", "")

# Tidy every address element-wise
df['address'] = df['address'].map(clean_address)

def clean_price(price):
    """Convert a price string such as "£1,699 pcm" to an integer.

    Removes every "£" and "," character, then parses the first
    whitespace-separated token as an int.
    """
    without_symbols = price.translate(str.maketrans("", "", "£,"))
    amount_token = without_symbols.split()[0]
    return int(amount_token)

# Convert every price string to an integer
df['price'] = df['price'].map(clean_price)

# Inspect the first 30 rows and confirm the resulting column dtypes
print(df.head(30))
print(df.dtypes)

                                           address  price  \
0                      Merchant Square, Paddington   9966   
1                             Grove Avenue, Sutton   1675   
2                  Wilberforce Road, Finsbury Park   2500   
3                      Strathmore Court, Park Road   8790   
4                             Graveney Road London   2650   
5        Moluccas Point, 24 Canal Approach, London   2200   
6       Strathmore Court, Park Road, St Johns Wood  10500   
7                                             Oval   4000   
8                             Fountain Road London   2650   
9                             Hill Street, Mayfair   3727   
10          Penthouse, Strathmore Court, Park Road  10030   
11  Streatleigh Court, Streatham High Road, London   3500   
12      Strathmore Court, Park Road, St Johns Wood   6990   
13                            Upper Street, London   2150   
14              John Harrison Way, North Greenwich   1700   
15                   Sou

In [None]:
# Create property id using hashing
def create_property_id(row):
    """Return an MD5 hex digest identifying a listing.

    The digest is computed over the row's 'latitude', 'longitude' and 'price'
    values joined with underscores.
    """
    digest_source = f"{row['latitude']}_{row['longitude']}_{row['price']}".encode()
    hasher = hashlib.md5()
    hasher.update(digest_source)
    return hasher.hexdigest()

# Compute the id row by row and attach it as a new column
property_ids = df.apply(create_property_id, axis=1)
df['property_id'] = property_ids
print(df['property_id'])

# Strip carriage returns from the id, since a stray \r makes SQL joins awkward.
# A hex digest contains only hexadecimal characters, so this is a safeguard.
df['property_id'] = df['property_id'].str.replace("\r", "", regex=True)

0      c03d82e0169b9646f9628c766a820a96
1      0eaca9c01a72e47b55656d34229fbbf1
2      0e91ce654b6fd59d84c47e7c66fcb6ad
3      6afcfb6a633839bcb514e31960abbe15
4      45112861947e9869c0cff0d2b041d13b
                     ...               
977    340a452241e98c08eed32732da02181b
978    439595d7e8259e368b64d5cb06a3a60a
979    19cd1f1ec1bdcab9551e17b17768c270
980    f097356ceac5c44860af0dc4dee9f915
981    4e85d09779c4d6c9c6cebe5e50acfc17
Name: property_id, Length: 982, dtype: object


In [14]:
# Final pass: drop rows that are identical across every column, then report the shape
df = df.drop_duplicates()
print(df.shape)


(982, 9)


In [15]:
# Write the cleaned listings to CSV without the row index
output_csv = "london_flats.csv"
df.to_csv(output_csv, index=False)