In [15]:
# Import all the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline

In [16]:
# Load the feature engineered dataset
#
# low_memory=False makes pandas parse each column in one pass instead of in
# chunks, so a column's dtype is inferred from the whole file and cannot end
# up mixed (e.g. some chunks parsed as numbers and others as strings).
# NOTE(review): the saved run echoed this line the way a pandas warning does,
# presumably a mixed-type DtypeWarning - confirm that it no longer appears.

df = pd.read_csv('HDB_Resale_Prices_Features_Engineered.csv', low_memory=False)

  df = pd.read_csv('HDB_Resale_Prices_Features_Engineered.csv')


In [17]:
# Cast 'postal_code' to str so that every value in the column is a string

df = df.astype({'postal_code': str})

In [18]:
# Check null counts and data types
#
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193875 entries, 0 to 193874
Data columns (total 96 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   month                               193875 non-null  object 
 1   town                                193875 non-null  object 
 2   flat_type                           193875 non-null  object 
 3   block                               193875 non-null  object 
 4   street_name                         193875 non-null  object 
 5   floor_area_sqm                      193875 non-null  float64
 6   flat_model                          193875 non-null  object 
 7   remaining_lease                     193875 non-null  object 
 8   resale_price                        193875 non-null  float64
 9   year                                193875 non-null  int64  
 10  quarter                             193875 non-null  int64  
 11  storey_avg                

In [19]:
# Identify numeric and categorical features

# Numeric predictors: every int64/float64 column except the target
numeric_features = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col != 'resale_price'
]

# Categorical predictors: every object/category column
categorical_features = list(df.select_dtypes(include=['object', 'category']).columns)

In [20]:
# Split the dataset into training and testing sets (80% train / 20% test)

y = df['resale_price']
x = df.drop(columns='resale_price')

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

print(len(x_train), len(x_test))

155100 38775


In [21]:
# Define a custom RMSE function

def rmse(y_true,y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The square root of the mean squared error, rounded to 2 decimals.
    """
    mse = mean_squared_error(y_true,y_pred)
    root = np.sqrt(mse)
    return round(root,2)

# Function to train model and display R2 and RMSE

def results(train_x,train_y,test_x,test_y,model,model_name):
    """Fit a model, score it on the test set and print the scores.

    Args:
        train_x: Training features passed to ``model.fit``.
        train_y: Training targets passed to ``model.fit``.
        test_x: Test features passed to ``model.predict``.
        test_y: Test targets the predictions are scored against.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        model_name: Label used as the prefix of the printed lines.

    Returns:
        Tuple ``(r2, rmse)``: R² rounded to 2 decimals and RMSE rounded to
        0 decimals, both computed on the test set.
    """
    # Train on the training split, then predict the held-out split
    model.fit(train_x,train_y)
    predictions = model.predict(test_x)

    # Score the held-out predictions
    model_r2 = round(r2_score(test_y,predictions),2)
    model_rmse = round(rmse(test_y,predictions),0)

    print(f"{model_name} R²:   {model_r2}")
    print(f"{model_name} RMSE: {model_rmse}")

    scores = (model_r2, model_rmse)
    return scores

In [22]:
# Preprocessor for the ML pipeline:
# standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' keeps transform from failing on categories that
# were not present when the encoder was fitted.

numeric_step = ('num',StandardScaler(),numeric_features)
categorical_step = ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step,categorical_step])

In [23]:
# Linear Regression Model: preprocessing followed by ordinary least squares

linreg_model = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression()),
])

# Evaluate Linear Regression Model

linreg_r2,linreg_rmse = results(
    train_x=x_train,
    train_y=y_train,
    test_x=x_test,
    test_y=y_test,
    model=linreg_model,
    model_name='Linear Regression',
)

Linear Regression R²:   0.96
Linear Regression RMSE: 34593.0


In [None]:
# Random Forest Model: same preprocessing, followed by a random forest regressor

rf_params = dict(
    n_estimators=400,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,  # fixed seed for reproducible forests
    n_jobs=-1,        # use all available cores
    verbose=0,
)

rf_model = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',RandomForestRegressor(**rf_params)),
])

# Evaluate Random Forest Model

rf_r2,rf_rmse = results(
    train_x=x_train,
    train_y=y_train,
    test_x=x_test,
    test_y=y_test,
    model=rf_model,
    model_name='Random Forest',
)

Random Forest R²:   0.97
Random Forest RMSE: 27833.0


In [25]:
# DataFrame to compare model performance
#
# Bound to `model_comparison` rather than `results`: `results` is the name of
# the train-and-evaluate helper function defined earlier in this notebook, and
# rebinding it to a DataFrame would make any later re-run of the model cells
# fail with "TypeError: 'DataFrame' object is not callable".

model_comparison = pd.DataFrame({
    'Model': ['Linear Regression','Random Forest'],
    'R2': [linreg_r2,rf_r2],
    'RMSE': [linreg_rmse,rf_rmse]
})

# Display the model results

print(model_comparison)

               Model    R2     RMSE
0  Linear Regression  0.96  34593.0
1      Random Forest  0.97  27833.0


In [26]:
# Since Random Forest performed better, we will analyze its feature importances

# Pull the fitted one-hot encoder out of the pipeline's preprocessor and ask it
# for the names of the columns it generated

fitted_preprocessor = rf_model.named_steps['preprocessor']
ohe = fitted_preprocessor.named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(categorical_features)

# Feature names in the order the preprocessor lists its transformers:
# numeric columns first, then the one-hot columns

all_feature_names = np.concatenate((numeric_features,ohe_feature_names))

# Importances from the fitted Random Forest regressor

rf_reg = rf_model.named_steps['regressor']
importances = rf_reg.feature_importances_

# Tabulate name/importance pairs and rank them, most important first

feature_df = pd.DataFrame({'feature':all_feature_names,'importance':importances})
feature_df = feature_df.sort_values(by='importance',ascending=False)

In [27]:
# Show the feature importance table (sorted by importance, descending)
print(feature_df)

                      feature  importance
0              floor_area_sqm    0.425893
13     pt_time_to_cbd_min_adj    0.166683
1                        year    0.131115
25        lease_commence_year    0.093945
12         pt_time_to_cbd_min    0.011263
...                       ...         ...
7937       postal_code_530695    0.000000
7943       postal_code_530702    0.000000
12756      postal_code_760361    0.000000
7954       postal_code_530804    0.000000
12818      postal_code_760602    0.000000

[14450 rows x 2 columns]


In [28]:
# Save the feature importance table to CSV, without the row index
feature_df.to_csv("feature_importances.csv", index=False)