In [19]:
# !pip3 install category_encoders

In [24]:
!pip3 install scikit-learn



# Libraries

In [127]:
import glob
import math

import numpy as np
import pandas as pd

from collections import Counter
from pprint import pprint

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
# from category_encoders import OneHotEncoder
# from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

# Load Data

In [128]:
# Sort the matches: glob returns paths in arbitrary (filesystem) order, and the
# row order of df_combined feeds the seeded train/test split further down.
file_paths = sorted(glob.glob('Kaggle_HDB/*.csv'))
if not file_paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files matched 'Kaggle_HDB/*.csv'")

# One DataFrame per CSV; 'month' is parsed to datetime so .dt accessors work later.
df = []
for file in file_paths:
    print(file)
    df_temp = pd.read_csv(file, parse_dates=['month'])
    df.append(df_temp)

# 3. Concatenate all DataFrames into one
# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
df_combined = pd.concat(df, ignore_index=True)
df_combined.head()

Kaggle_HDB\resale-flat-prices-based-on-approval-date-1990-1999.csv
Kaggle_HDB\resale-flat-prices-based-on-approval-date-2000-feb-2012.csv
Kaggle_HDB\resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv
Kaggle_HDB\resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv
Kaggle_HDB\resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,
1,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,
2,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,
3,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0,
4,1990-01-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,


# Preprocessing

In [129]:
# def round_to_multiple(x, m=5):
#     return m * round(float(x) / m)

def round_to_multiple(x, m=5):
    """Round ``x`` UP to the next multiple of ``m``.

    ``x`` is coerced to ``float`` before dividing. A value that already sits on
    a multiple of ``m`` is returned as that multiple.

    Returns ``int(ceil(x / m)) * m`` (an ``int`` whenever ``m`` is an ``int``).
    """
    multiples_needed = math.ceil(float(x) / m)
    return int(multiples_needed) * m

In [130]:
# Preprocess floor_area_sqm
# - Bucket the floor area by rounding it UP to the next multiple of 10
#   (round_to_multiple is called with m=10).
df_combined['floor_area_sqm'] = df_combined['floor_area_sqm'].map(
    lambda area: round_to_multiple(area, 10)
)

In [131]:
# Preprocess resale_price
# - Round the price UP to the next multiple of 50,000, then express the result
#   in thousands (the column ends up holding ints such as 50, 100, 150, ...).
# - Not idempotent: re-running this cell feeds the already-converted values
#   back through the same rounding, so restart the kernel before re-running.
df_combined['resale_price'] = df_combined['resale_price'].map(
    lambda price: round_to_multiple(price, 50000) // 1000
)

In [132]:
# Preprocess flat_model
def preprocess_flat_model(x):
    """Lower-case a flat model name and merge all maisonette variants.

    Any name containing 'maisonette' (checked after lower-casing) becomes the
    single label 'maisonette'; every other name is returned lower-cased.
    """
    lowered = x.lower()
    return 'maisonette' if 'maisonette' in lowered else lowered
    
# Normalise every flat_model value in place (the function is passed directly;
# no wrapper lambda is needed).
df_combined['flat_model'] = df_combined['flat_model'].apply(preprocess_flat_model)

In [None]:
# Preprocess storey_range
def preprocess_storey_avg(x):
    """Return the middle storey of a range string such as '10 TO 12'.

    The string is lower-cased and split on ' to '; the first two parts are
    parsed as ints, averaged, and the average is truncated to an ``int``.
    """
    bounds = x.lower().split(' to ')
    upper, lower = int(bounds[1]), int(bounds[0])
    return int((upper + lower) / 2)
    
def preprocess_storey_height(x):
    """Classify a storey range string ('04 TO 06') as low / mid / high floor.

    The midpoint of the two bounds decides the label:
    below 5 -> 'low floor', 10 or above -> 'high floor', otherwise 'mid floor'.
    """
    bounds = x.lower().split(' to ')
    midpoint = (int(bounds[1]) + int(bounds[0])) / 2
    if midpoint >= 10:
        return 'high floor'
    if midpoint < 5:
        return 'low floor'
    return 'mid floor'
        
# Derive storey features from the raw 'storey_range' text:
# - storey_range_avg:    truncated midpoint of the range
# - storey_range_height: 'low floor' / 'mid floor' / 'high floor' label
# - storey_low / storey_high: the two bounds as ints (split on upper-case ' TO ')
df_combined['storey_range_avg'] = df_combined['storey_range'].apply(preprocess_storey_avg)
df_combined['storey_range_height'] = df_combined['storey_range'].apply(preprocess_storey_height)
df_combined[['storey_low', 'storey_high']] = (
    df_combined['storey_range']
    .str.split(' TO ', expand=True)
    .astype(int)
)

In [134]:
# Preprocess lease_commence_date
def lease_to_years(s):
    """Convert a 'years-months' string into fractional years.

    Parameters
    ----------
    s : str
        Two integers separated by a single '-', e.g. '61-04'.

    Returns
    -------
    float
        ``years + months / 12.0``, or ``np.nan`` when ``s`` cannot be parsed
        that way (non-string input such as a missing value, a wrong number of
        parts, or non-numeric parts).

    NOTE(review): every value that is not in 'Y-M' form silently becomes NaN;
    confirm that 'remaining_lease' really uses this format in the source CSVs.
    """
    try:
        y, m = map(int, s.split('-'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: input has no .split (e.g. float NaN, int).
        # TypeError:      .split exists but rejects a str separator (e.g. bytes).
        # ValueError:     not exactly two parts, or a part is not an integer.
        # Anything else (KeyboardInterrupt, SystemExit, ...) is left to propagate.
        return np.nan
    return y + m / 12.0

# remaining_lease_yr:  'remaining_lease' parsed to fractional years (NaN when unparseable)
# flat_age:            calendar year of the sale month minus the lease commencement year
# flat_age_normalized: flat_age rounded UP to the next multiple of 5
df_combined['remaining_lease_yr'] = df_combined['remaining_lease'].map(lease_to_years)
df_combined['flat_age'] = df_combined['month'].dt.year.sub(df_combined['lease_commence_date'])
df_combined['flat_age_normalized'] = df_combined['flat_age'].map(
    lambda age: round_to_multiple(age, 5)
)


In [135]:
# Preprocess flat_type
def preprocess_flat_type(x):
    """Return ``x`` with every hyphen replaced by a space.

    Makes spellings such as 'MULTI-GENERATION' and 'MULTI GENERATION' identical.
    """
    return x.replace('-', ' ')

# Apply the hyphen normalisation to every flat_type value (function passed
# directly; no wrapper lambda is needed).
df_combined['flat_type'] = df_combined['flat_type'].apply(preprocess_flat_type)

In [None]:
# Columns whose value distribution is printed below.
# Not inspected here: storey_range, storey_range_avg, lease_commence_date,
# remaining_lease, remaining_lease_yr.
list_columns = [
    'town',
    'flat_type',
    'storey_range_height',
    'storey_low',
    'storey_high',
    'floor_area_sqm',
    'flat_model',
    'flat_age',
    'flat_age_normalized',
    'resale_price',
]

# For each column: a blank line, the column name, then its value -> count mapping.
for col in list_columns:
    print(f'\n{col}')
    value_counts = Counter(df_combined[col])
    pprint(value_counts)


town
Counter({'TAMPINES': 73101,
         'YISHUN': 63309,
         'BEDOK': 61400,
         'JURONG WEST': 60708,
         'WOODLANDS': 58735,
         'ANG MO KIO': 48237,
         'HOUGANG': 45476,
         'BUKIT BATOK': 40444,
         'CHOA CHU KANG': 33388,
         'BUKIT MERAH': 30518,
         'PASIR RIS': 30267,
         'TOA PAYOH': 28525,
         'QUEENSTOWN': 25922,
         'CLEMENTI': 25755,
         'GEYLANG': 25679,
         'BUKIT PANJANG': 24241,
         'KALLANG/WHAMPOA': 24030,
         'SENGKANG': 23903,
         'JURONG EAST': 22864,
         'SERANGOON': 21125,
         'BISHAN': 19522,
         'PUNGGOL': 12336,
         'SEMBAWANG': 11016,
         'MARINE PARADE': 7388,
         'CENTRAL AREA': 6345,
         'BUKIT TIMAH': 2283,
         'LIM CHU KANG': 64})

flat_type
Counter({'4 ROOM': 309314,
         '3 ROOM': 272580,
         '5 ROOM': 170408,
         'EXECUTIVE': 62641,
         '2 ROOM': 9863,
         '1 ROOM': 1273,
         'MULTI GENERATION':

In [None]:
# 4. Handle missing values
# Fill gaps with the column median and assign the result back to the frame.
# Calling fillna(inplace=True) on df_combined[col] acts on a selected column
# object (chained assignment): pandas warns about it, and under Copy-on-Write
# df_combined itself is left unchanged.
# If a column is entirely NaN its median is NaN too, so nothing gets filled.
for col in ['remaining_lease_yr', 'flat_age']:
    df_combined[col] = df_combined[col].fillna(df_combined[col].median())

In [None]:
# 8. Encode categorical variables
cat_cols = ['town', 'flat_type', 'flat_model', 'storey_low', 'storey_high']
ordinal_cols = ['storey_low', 'storey_high']

# Up to this point `df` is the Python list of per-file frames built in the
# load cell, so indexing it with column names raises. Start the modelling
# frame from a copy of the combined data; this also makes the cell re-runnable.
df = df_combined.copy()

# Label encode ordinal cols
ord_enc = OrdinalEncoder()
df[ordinal_cols] = ord_enc.fit_transform(df[ordinal_cols])

# One-hot encode nominal categories (first level dropped as the reference)
nom_nom_cols = ['town', 'flat_type', 'flat_model']
nominal_dummies = pd.get_dummies(df[nom_nom_cols], drop_first=True)
df = pd.concat([df.drop(columns=nom_nom_cols), nominal_dummies], axis=1)

# 9. Final cleanup & feature–target split
# Target, identifiers and raw date columns are never features.
non_feature_cols = ['resale_price', 'month', 'block', 'street_name', 'lease_commence_date']
# Raw text columns that were only inputs to the engineered features; the
# estimators cannot consume strings. 'Unnamed: 0' shows up in the displayed
# head -- if it is a real column it looks like a per-file row index (TODO
# confirm). errors='ignore' keeps this step working whether or not they exist.
leftover_cols = ['Unnamed: 0', 'storey_range', 'storey_range_height', 'remaining_lease']
X = df.drop(columns=non_feature_cols).drop(columns=leftover_cols, errors='ignore')
y = df['resale_price']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import shap

# 1. Split into train/test (20% held out; fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Define and tune models (example: Random Forest)
# The grid has 2 * 3 * 2 = 12 candidates; with cv=5 that is 60 fits, run on
# all available cores (n_jobs=-1).
rf = RandomForestRegressor(random_state=42)
params = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_leaf=[1, 3],
)
gs = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
gs.fit(X_train, y_train)
best_rf = gs.best_estimator_
print("Best RF params:", gs.best_params_)

# 3. Compare to baseline: Linear Regression
lr = LinearRegression().fit(X_train, y_train)

# 4. Evaluate both models on test set
def evaluate(model, X_t, y_t, name="model"):
    """Print MAE, RMSE and R² for ``model`` on a dataset and return its predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_t : features to predict on
    y_t : true target values matching ``X_t``
    name : label used in the printed summary line

    Returns
    -------
    The array returned by ``model.predict(X_t)``.
    """
    pred = model.predict(X_t)
    mae = mean_absolute_error(y_t, pred)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error was deprecated and later removed from scikit-learn,
    # so passing it raises TypeError on current releases. For a single target
    # (as here) sqrt(MSE) is the same number.
    rmse = mean_squared_error(y_t, pred) ** 0.5
    r2 = r2_score(y_t, pred)
    print(f"{name} → MAE: {mae:.2f}, RMSE: {rmse:.2f}, R²: {r2:.3f}")
    return pred

pred_lr = evaluate(lr, X_test, y_test, name="LinearRegression")
pred_rf = evaluate(best_rf, X_test, y_test, name="RandomForest")

# 5. SHAP interpretation for best model
explainer = shap.Explainer(best_rf, X_train)
shap_values = explainer(X_test)

# Let every plot call show itself (the default). With show=False each call
# leaves its drawing on the current matplotlib figure, so three calls in one
# cell end up painted on top of each other.
shap.summary_plot(shap_values, X_test)  # global importance
shap.plots.beeswarm(shap_values)
shap.plots.bar(shap_values.abs)

# (Optional) Save test predictions
# One row per test sample: the features, the true target, and the RF prediction.
results = X_test.assign(actual=y_test, pred_rf=pred_rf)
results.to_csv("test_predictions.csv", index=False)
