In [42]:
# Import all the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline

In [43]:
# Load the feature-engineered dataset.
# NOTE(review): the stray `df = pd.read_csv(...)` echo in this cell's output looks
# like the tail of a pandas warning (presumably a mixed-type DtypeWarning) - confirm.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# 'postal_code' is read as text up front: it is an identifier, and parsing it
# as a number would drop any leading zeros.

df = pd.read_csv(
    'HDB_Resale_Prices_Features_Engineered.csv',
    dtype={'postal_code': str},
    low_memory=False,
)

  df = pd.read_csv('HDB_Resale_Prices_Features_Engineered.csv')


In [44]:
# Cast 'postal_code' to str so that every value in the column is a string

df = df.astype({'postal_code': str})

In [45]:
# Check null counts and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194170 entries, 0 to 194169
Data columns (total 86 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   month                               194170 non-null  object 
 1   town                                194170 non-null  object 
 2   flat_type                           194170 non-null  object 
 3   block                               194170 non-null  object 
 4   street_name                         194170 non-null  object 
 5   storey_range                        194170 non-null  object 
 6   floor_area_sqm                      194170 non-null  float64
 7   flat_model                          194170 non-null  object 
 8   lease_commence_date                 194170 non-null  int64  
 9   remaining_lease                     194170 non-null  object 
 10  resale_price                        194170 non-null  float64
 11  postal_code               

In [46]:
# Build the feature lists by dtype.
# The target column 'resale_price' is kept out of the numeric list.
# Columns of any other dtype (e.g. bool, datetime) end up in neither list.

numeric_features = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col != 'resale_price'
]

categorical_features = list(
    df.select_dtypes(include=['object', 'category']).columns
)

In [47]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable

y = df['resale_price']
x = df.drop(columns=['resale_price'])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the number of rows in each partition

print(len(x_train), len(x_test))

155336 38834


In [48]:
# Custom RMSE helper

def rmse(y_true,y_pred):
    """Return the root mean squared error between y_true and y_pred, rounded to 2 decimals."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return round(root, 2)

# Function to train a model and display its R2 and RMSE

def results(x_train,y_train,x_test,y_test,model,model_name):
    """Fit ``model`` on the training data and report its test-set R² and RMSE.

    Parameters
    ----------
    x_train, y_train
        Training features and target, passed to ``model.fit``.
    x_test, y_test
        Held-out features and target used for scoring.
    model
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    model_name : str
        Label used as the prefix of the two printed lines.

    Returns
    -------
    tuple
        ``(r2, rmse)`` where R² is rounded to 2 decimals and RMSE to 0 decimals.
    """
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    model_r2 = round(r2_score(y_test,y_pred),2)
    # Round the raw RMSE exactly once. Going through rmse() first would round to
    # 2 decimals and then again to 0, and that double rounding can tip a value
    # the wrong way (e.g. 34801.4951 -> 34801.5 -> 34802 instead of 34801).
    model_rmse = round(np.sqrt(mean_squared_error(y_test,y_pred)),0)

    print(f"{model_name} R²:   {model_r2}")
    print(f"{model_name} RMSE: {model_rmse}")

    return model_r2, model_rmse

In [49]:
# Column-wise preprocessing used by the model pipelines:
#   - numeric columns are standardised
#   - categorical columns are one-hot encoded (handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

In [50]:
# Linear Regression pipeline: shared preprocessing followed by LinearRegression

linreg_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# Fit the pipeline and report its test-set metrics

linreg_r2, linreg_rmse = results(
    x_train,
    y_train,
    x_test,
    y_test,
    model=linreg_model,
    model_name='Linear Regression',
)

Linear Regression R²:   0.96
Linear Regression RMSE: 34802.0


In [None]:
# Random Forest pipeline: shared preprocessing followed by RandomForestRegressor

rf_params = dict(
    n_estimators=200,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,  # fixed seed
    n_jobs=-1,
    verbose=0,
)

rf_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(**rf_params)),
    ]
)

# Fit the pipeline and report its test-set metrics

rf_r2, rf_rmse = results(
    x_train,
    y_train,
    x_test,
    y_test,
    model=rf_model,
    model_name='Random Forest',
)

: 

In [None]:
# DataFrame to compare model performance.
# It is stored as `results_df` so that it does not rebind `results`, the
# evaluation helper called by the model cells; shadowing that function with a
# DataFrame would make those cells fail with a TypeError if they were re-run.

results_df = pd.DataFrame({
    'Model': ['Linear Regression','Random Forest'],
    'R2': [linreg_r2,rf_r2],
    'RMSE': [linreg_rmse,rf_rmse]
})

# Display the model results

print(results_df)

               Model    R2     RMSE
0  Linear Regression  0.96  34802.0
1      Random Forest  0.98  27633.0


In [None]:
# Random Forest scored better, so inspect which features it relies on

# Fitted regressor and its per-feature importances

rf_reg = rf_model.named_steps['regressor']
importances = rf_reg.feature_importances_

# Names of the one-hot encoded columns, taken from the fitted encoder

ohe = rf_model.named_steps['preprocessor'].named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(categorical_features)

# Numeric names first, then encoded names: the same order in which the
# preprocessor lists its 'num' and 'cat' transformers

all_feature_names = np.concatenate([numeric_features, ohe_feature_names])

# Pair every feature with its importance, most important first

feature_df = (
    pd.DataFrame({'feature': all_feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Save the ranked feature importances to disk, leaving out the index column
feature_df.to_csv(path_or_buf="feature_importances.csv", index=False)