Splitting is done first, before any preprocessing, to avoid leaking information from the test set into the training data.

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np 
import sklearn 
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


# Real estate 2023 

In [2]:
## Feature distribution and overview
CURATED_DATA_DIR = "../data/curated"
df = pd.read_csv(f"{CURATED_DATA_DIR}/real_estate_2023.csv")

# Non-null count per column. With 11660 instances in total, any column
# reporting fewer has missing values: property_type, parking_spaces,
# bedrooms and bathrooms.
non_null_counts = df.count()
print(non_null_counts)

# Display the listings whose bedroom count is missing.
missing_bedrooms = df['bedrooms'].isna()
df.loc[missing_bedrooms]

property_type                   11573
price_per_week                  11660
postcode                        11660
parking_spaces                   9685
bedrooms                        11648
bathrooms                       11648
furnished_found                 11660
schools_count                   11660
hospital_count                  11660
open_space_count                11660
public_transport_stops_count    11660
bus_stops_count                 11660
train_stops_count               11660
crime_count                     11660
distance_to_cbd                 11660
Population                      11660
Income                          11660
dtype: int64


Unnamed: 0,property_type,price_per_week,postcode,parking_spaces,bedrooms,bathrooms,furnished_found,schools_count,hospital_count,open_space_count,public_transport_stops_count,bus_stops_count,train_stops_count,crime_count,distance_to_cbd,Population,Income
2001,House,450.0,3024,,,,False,5.0,0.0,124.0,100.0,99.0,1.0,1234.0,33.895038,35914.84,2022.98
3142,Carspace,270.0,3051,,,,False,1.0,14.0,53.0,32.0,9.0,1.0,4517.0,1.217566,16352.84,1747.5
3288,Studio,587.0,3053,,,,False,2.0,2.0,35.0,52.0,38.0,0.0,1129.0,0.719035,18257.82,1179.32
3336,Apartment / Unit / Flat,230.769231,3053,,,,False,2.0,2.0,35.0,52.0,38.0,0.0,1129.0,0.586397,18257.82,1179.32
4295,House,1154.0,3076,,,,False,11.0,1.0,339.0,158.0,157.0,1.0,632.0,18.61358,37144.82,1727.64
5738,Carspace,190.0,3141,,,,False,5.0,0.0,49.0,67.0,32.0,2.0,865.0,4.19576,26978.24,2199.02
6053,House,430.0,3150,,,,False,21.0,2.0,251.0,276.0,274.0,2.0,1031.0,19.914675,63253.42,1933.8
6373,House,730.769231,3162,,,,False,0.0,5.0,3.0,27.0,12.0,0.0,1611.0,9.721373,18395.44,2429.2
9147,Vacant land,350.0,3340,,,,False,7.0,0.0,7.0,0.0,0.0,0.0,2066.0,48.19956,26256.74,1838.84
9709,Vacant land,538.461538,3523,,,,False,2.0,1.0,0.0,0.0,0.0,0.0,664.0,121.760233,3999.76,1031.36


# Split
The split is stratified by postcode, so that the training and test sets are each representative of the postcode distribution in the full dataset.

In [3]:
# Some postcodes have only 1 advertised property --> unable to stratify on them.
# `agg` keeps the per-column non-null counts for each postcode.
agg = df.groupby("postcode").count()
# Count ROWS per postcode with size(): unlike count() on a single column, it
# does not skip rows where that column happens to be missing, so a postcode is
# flagged as a singleton exactly when it has one listing.
postcode_sizes = df.groupby("postcode").size()
one_instance_postcode = list(postcode_sizes[postcode_sizes == 1].index)
print(one_instance_postcode)


# Stratified sampling: listings from 1-advertised-property postcodes are held
# out of the split and added to the training set afterwards.
# Index labels of the one-instance listings:
exclude_ind = list(df[df['postcode'].isin(one_instance_postcode)].index)
print(exclude_ind)
exclude_set = df.loc[exclude_ind]
print(f"exclude_set.shape: {exclude_set.shape}")
remaining_set = df.drop(exclude_ind)
print(f"remaining_set.shape: {remaining_set.shape}")

# Stratified 80/20 split on postcode; the fixed random_state keeps it reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    remaining_set.drop("price_per_week", axis = 1), 
    remaining_set['price_per_week'], 
    test_size = 0.2, random_state = 42, 
    stratify = remaining_set['postcode'])

# Results: append the held-out singleton listings to the training set only.
x_train = pd.concat([x_train, exclude_set.drop("price_per_week",axis = 1)])
y_train = pd.concat([y_train, exclude_set['price_per_week']])

# Sanity checks: every listing lands in exactly one split, and features and
# targets stay aligned in length.
assert len(x_train) + len(x_test) == len(df), "split sizes do not add up"
assert len(x_train) == len(y_train), "train features/targets misaligned"
assert len(x_test) == len(y_test), "test features/targets misaligned"

print(f"x_train.shape: {x_train.shape}")
print(f"y_train.shape: {y_train.shape}")
print(f"x_test.shape: {x_test.shape}")
print(f"y_test.shape: {y_test.shape}")

[3091, 3260, 3331, 3579, 3620, 3988]
[4523, 8758, 8848, 9937, 9946, 11617]
exclude_set.shape: (6, 17)
remaining_set.shape: (11654, 17)
x_train.shape: (9329, 16)
y_train.shape: (9329,)
x_test.shape: (2331, 16)
y_test.shape: (2331,)


In [4]:
def save_csv(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, omitting the index.

    Parameters
    ----------
    df : object with a ``to_csv`` method (e.g. pandas DataFrame or Series)
        The data to write.
    file_path : str, path-like, or file-like buffer
        Destination passed through to ``to_csv``. When it is a filesystem
        path, any missing parent directories are created first.
    """
    # Only filesystem paths have a parent directory to create; buffers and
    # open file handles are handed to to_csv untouched.
    if isinstance(file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(file_path)
        # A bare file name has an empty dirname; makedirs("") would raise.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(file_path, index = False)
# Persist each split next to the curated source file, in the order:
# train features, test features, train targets, test targets.
split_outputs = [
    (x_train, f"{CURATED_DATA_DIR}/real_estate_2023_train_x.csv"),
    (x_test, f"{CURATED_DATA_DIR}/real_estate_2023_test_x.csv"),
    (y_train, f"{CURATED_DATA_DIR}/real_estate_2023_train_y.csv"),
    (y_test, f"{CURATED_DATA_DIR}/real_estate_2023_test_y.csv"),
]
for split_data, destination in split_outputs:
    save_csv(split_data, file_path = destination)