# Raw Preprocessing for Real Estate Data
This notebook transforms the landing real estate data to its raw form, outputting the final dataframe to the `raw` data directory

In [1]:
# Import dependencies
import pandas as pd
import re
import numpy as np
import ast
import geopy

In [2]:
# Import Real Estate Data
LANDING_DATA_DIR = f"../../data/landing"
RAW_DATA_DIR = f"../../data/raw"
file_path = f'{LANDING_DATA_DIR}/domain_scraped_data (1).csv'
df = pd.read_csv(file_path)
df.count()

Unnamed: 0           12227
name                 12227
cost_text            12227
coordinates          12227
rooms                12227
parking              12227
property_type        12117
desc                 12226
property_features    12227
internal_area          274
land_area              698
dtype: int64

## Reformat columns 

In [3]:
#Rename Link Column
df = df.rename(columns={'Unnamed: 0': 'listing_link'})

#Rename Address Column
df = df.rename(columns={'name': 'address'})

In [4]:
# Check for duplicates
duplicate_rows = df[df.duplicated()]
num_duplicates = len(duplicate_rows)

# Print the number of duplicate rows
print("Number of Duplicate Rows:", num_duplicates)

Number of Duplicate Rows: 0


## Price extraction
The cost is split into price per week, month and year columns using RegEx.

### Define Functions

In [5]:
def extract_price(text):
    """
    Function extracts numeric values from the provided text

    Args:
        text (string): the text from which numeric values are to be extracted

    Returns:
        int: the numeric value(s) from the provided text
    """
    
    # Remove commas from the text
    text = text.replace(',', '')
    
    match = re.search(r'\d+', text)
    if match:
        return int(match.group())
    else:
        return None

def extract_price_with_month(text):
    """
    Function extracts the price with month

    Args:
        text (str): the text from which the price is to be extracted
    
    Returns:
        int: the price within the provided string
    """
    
    # Remove commas from the text
    text = text.replace(',', '')
    
    match = re.findall(r'\d+', text)
    if match:
        if len(match) >= 2:
            num_lst = [float(x) for x in match]

            if (min(num_lst) < 8 and 
                ('week' in text or 'w' in text or 'pw' in text)):
                return max(num_lst) * 52 / 12
            return max(num_lst)    
        return int(match[0])
    else:
        return None

def extract_price_with_week(text):
    """
    Function extracts the price, handling cases with both 'week' and 'month' 
    in the description as weekly prices.

    Args:
        text (str): the text from which the price is to be extracted
    
    Returns:
        int: the price within the provided string
    """
    
    match = re.search(r'\d+', text)
    if match:
        return int(match.group())
    else:
        return None

### Apply to data

In [6]:
# Apply the function to create 'price_per_week', 'price_per_year', and 
# 'price_per_month' columns
df['price_per_year'] = df['cost_text'].apply(
    lambda x: extract_price(x) if 
    re.search(r'(annually|p\.a\.|year|\bpa\b|p\.a)', x, re.IGNORECASE) 
    else None
)
df['price_per_month'] = df['cost_text'].apply(
    lambda x: extract_price_with_month(x) if
    re.search(r'(month|mth|pcm|\bmo\b)', x, re.IGNORECASE) else None
)

df['price_per_week'] = df['cost_text'].apply(
    lambda x: extract_price_with_week(x) 
    if re.search(r'week', x, re.IGNORECASE) and 
    re.search(r'month', x, re.IGNORECASE) else None
)

# Check if all 'price_per' columns are empty
all_price_columns_empty = df[['price_per_year', 'price_per_month']]. \
    isna().all(axis=1)

# Fill 'price_per_week' with the price if all 'price_per' columns are empty
df['price_per_week'] = df.apply(lambda row: extract_price(row['cost_text']) 
                                if all_price_columns_empty[row.name] 
                                else None, axis=1)

In [7]:
df.count()

listing_link         12227
address              12227
cost_text            12227
coordinates          12227
rooms                12227
parking              12227
property_type        12117
desc                 12226
property_features    12227
internal_area          274
land_area              698
price_per_year           5
price_per_month        608
price_per_week       11206
dtype: int64

In [8]:
# Save current dataframe into raw data folder 
file_path = f'{RAW_DATA_DIR}/current_realestate_data.csv'
df.to_csv(file_path, index=False)

## Check Reasonability of Rent Prices
We must check whether rent prices are witin a reasonable range or not.

In [9]:
# Define the reasonable range
min_rent = 50
max_rent = 5000000

# Check if rent price column values are outside the reasonable range
invalid_weekly_rent= ~((df['price_per_week'] >= min_rent) & 
                       (df['price_per_week'] <= max_rent))
invalid_monthly_rent= ~((df['price_per_month'] >= min_rent) & 
                        (df['price_per_month'] <= max_rent))
invalid_yearly_rent= ~((df['price_per_year'] >= min_rent) & 
                       (df['price_per_year'] <= max_rent))

# Filter the DataFrame to get rows with invalid ages
invalid_weekly_rent_df = df[invalid_weekly_rent]
invalid_monthly_rent_df = df[invalid_monthly_rent]
invalid_yearly_rent_df = df[invalid_yearly_rent]

# Get distinct values from invalid entries
distinct_values_weekly = invalid_weekly_rent_df['price_per_week'].unique()
distinct_values_monthly = invalid_monthly_rent_df['price_per_month'].unique()
distinct_values_yearly = invalid_yearly_rent_df['price_per_year'].unique()

# Print the distinct values
print("Distinct values in weekly")
print(distinct_values_weekly)
print("Distinct values in 'monthly")
print(distinct_values_monthly)
print("Distinct values in 'yearly")
print(distinct_values_yearly)

Distinct values in weekly
[           nan 2.00000000e+00 4.60000000e+01 4.77796377e+08
 4.00000000e+01 1.00000000e+00 2.80000000e+01 3.20000000e+01
 2.50000000e+01 3.30000000e+01 4.00000000e+00 3.40000000e+01]
Distinct values in 'monthly
[nan 40.]
Distinct values in 'yearly
[nan]


## Convert Area to Number

In [10]:
# Define function
def extract_numeric_area(area_text):
    """
    Function extracts numeric values from the `Area` column

    Args:
        area_text (str): the string from which numeric values must be extracted

    Returns:
        int: the number extracted from the string
    """
    
    if pd.notna(area_text):
        digits = ''.join(filter(str.isdigit, area_text))
        if digits:
            return int(digits)

    return None

# Apply the function to create a new 'numeric_area' column
df['numeric_area'] = df['internal_area'].map(extract_numeric_area)

## Determine Validity of Postcodes

In [11]:
# Create postcode column
df['postcode'] = df['address'].str.extract(r'(\d{4})$')
df.head()

# Check if the 'postcode' column values consist of four digits
valid_postcodes_mask = df['postcode'].str.match(r'^\d{4}$')

# Create a new DataFrame with only rows where the postcode is invalid
invalid_postcodes_df = df[~valid_postcodes_mask]

# Get the distinct invalid postcodes
distinct_invalid_postcodes = invalid_postcodes_df['postcode'].unique()

# Print the distinct invalid postcodes
print("Distinct Invalid Postcodes:")
print(distinct_invalid_postcodes)

Distinct Invalid Postcodes:
[]


## Check Validity of Area

In [12]:
# Define the reasonable range
min_area = 0
max_area= 1000000

# Check if area column values are outside the reasonable range
invalid_area = ~((df['numeric_area'] >= min_area) & 
                 (df['numeric_area'] <= max_area))

# Filter the DataFrame to get rows with invalid ages
invalid_area_df = df[invalid_area]

# Get distinct values from 'Column1'
distinct_values_area = invalid_area_df['numeric_area'].unique()

# Print the distinct values
print("Distinct values in area")
print(distinct_values_area)

Distinct values in area
[nan]


## Extract Number of Parking Spaces

In [13]:
# Function to extract the number of parking spaces
def extract_parking_info(parking_text):
    """
    Function extracts the number of parking spaces from the provided text.

    Args:
        parking_text (str): the text from which the number of parking spaces
                            will be extractes

    Returns:
        int: the number of parking spaces
    """
    
    if pd.notna(parking_text):
        
        # Extract all digits from the text using a regular expression
        digits = ''.join(filter(str.isdigit, parking_text))
        
        if digits:
            return int(digits)
    
    return None

# Apply the function to create a new 'parking_spaces' column
df['parking_spaces'] = df['parking'].map(extract_parking_info)

## Validity of Number of Parking Spaces

In [14]:
# Define the reasonable range
min_parking = 0
max_parking = 100000

# Check if values in parking spaces column are outside the reasonable range
invalid_parking = ~((df['parking_spaces'] >= min_parking) & 
                    (df['parking_spaces'] <= max_parking))

# Filter the DataFrame to get rows with invalid ages
invalid_parking_df = df[invalid_parking]

# Get distinct values from 'Column1'
distinct_values_parking = invalid_parking_df['parking_spaces'].unique()

# Print the distinct values
print("Distinct values in 'parking")
print(distinct_values_parking)

Distinct values in 'parking
[nan]


## Preprocess Bedrooms and Bathrooms

In [15]:
# Function to extract the number of beds and baths
def extract_rooms_info(rooms_text):
    """
    Function extracts the number of beds and baths from the provided text

    Args:
        rooms_text (str): the string from which the number of beds is to be 
                            extracted

    Returns:
        int: _description_
    """

    rooms_info = rooms_text.split()
    
    # Handle cases with commas in the number
    bedrooms_info = rooms_info[0].replace(',', '')
    
    # Extract the 3rd character (index 2) if there are enough elements
    bedrooms = int(bedrooms_info[2]) \
        if len(bedrooms_info) > 2 and bedrooms_info[2].isdigit() else None

    if len(rooms_info) > 2:
        
        # Handle cases with commas in the number
        bathrooms_info = rooms_info[2].replace(',', '')
        
        # Extract the 3rd character (index 2) if there are enough elements
        bathrooms = int(bathrooms_info[1]) \
            if len(bathrooms_info) > 1 and bathrooms_info[1].isdigit() else None
    else:
        bathrooms = None
    
    return bedrooms, bathrooms

# Apply function to create 'bedrooms' and 'bathrooms' columns, skipping header
df['bedrooms'], df['bathrooms'] = zip(*df['rooms'].map(extract_rooms_info))

# Use applymap to cast numeric values to int, preserving None and NaN values
df = df.applymap(lambda x: int(x) \
    if pd.notna(x) and isinstance(x, (int, float)) else x)

## Validity of Bedrooms and Bathrooms Columns

In [16]:
# Define the reasonable range
min_room = 0
max_room = 100

# Check if the bedrooms/bathroom column values are outside the reasonable range
invalid_bedrooms = ~((df['bedrooms'] >= min_room) & 
                     (df['bedrooms'] <= max_room))
invalid_bathrooms = ~((df['bathrooms'] >= min_room) & 
                      (df['bathrooms'] <= max_room))

# Filter the DataFrame to get rows with invalid ages
invalid_bedrooms_df = df[invalid_bedrooms]
invalid_bathrooms_df = df[invalid_bathrooms]

# Get distinct values from invalid entries
distinct_values_bedrooms = invalid_bedrooms_df['bedrooms'].unique()
distinct_values_bathrooms = invalid_bathrooms_df['bathrooms'].unique()

# Print the distinct values
print("Distinct values in 'bedrooms")
print(distinct_values_bedrooms)
print("Distinct values in 'bathrooms")
print(distinct_values_bathrooms)

Distinct values in 'bedrooms
[nan]
Distinct values in 'bathrooms
[nan]


In [17]:
# Reorder Columns 
df.count()

listing_link         12227
address              12227
cost_text            12227
coordinates          12227
rooms                12227
parking              12227
property_type        12117
desc                 12226
property_features    12227
internal_area          274
land_area              698
price_per_year           5
price_per_month        608
price_per_week       11206
numeric_area           274
postcode             12227
parking_spaces       10139
bedrooms             12194
bathrooms            12194
dtype: int64

In [18]:
# Save the DataFrame to a CSV file
save_file_path = f'{RAW_DATA_DIR}/raw_real_estate_data.csv'
df.to_csv(save_file_path, index=False)