Evaluating Regression methods:
    - Linear Regression
    - XGBoost Regressor
    - Random Forest Regressor
    
To ensure model isn't overfitted, we need to perform cross-validation -> average R-Squared isn't too far away from previous R-squared

In [1]:
# import relevant library
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Scikit-learn
import sklearn
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

# statsmodel
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# check xgboost version
import xgboost as xg

# import randomForest
from sklearn.ensemble import RandomForestRegressor

# import K-neighbors
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# read csv file
url = "transformed_cluster_56.csv"
Clustering_HDB_sector_code_56_df = pd.read_csv(url)

#### Features to use:
    - remaining_months
    - storey_avg
    - nearest_station_distance
    - distance_from_CBD
    - nearest_mall_distance

In [3]:
Clustering_HDB_sector_code_56_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,sector_code,distance_from_CBD,nearest_station,nearest_station_distance,nearest_mall,nearest_mall_distance,adjusted_price_per_sqm,storey_avg,remaining_months,KMeanscluster
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,...,56,8.685298,Ang Mo Kio,0.960938,AMK Hub,1.017286,6619.458211,11,736,4
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,...,56,9.789287,Mayflower,0.189871,Broadway Plaza,0.867983,4684.382419,2,727,3
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.908694,Lentor,0.535118,Broadway Plaza,1.528024,4909.232776,2,749,1
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,...,56,9.169984,Ang Mo Kio,0.932844,myVillage At Serangoon Garden,0.892900,4892.424109,5,745,4
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.949497,Lentor,0.501153,Broadway Plaza,1.571906,4965.445364,2,749,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5925,2022-08,ANG MO KIO,EXECUTIVE,613,ANG MO KIO AVE 4,04 TO 06,149.0,Apartment,1996,72 years 10 months,...,56,10.597248,Yio Chu Kang,0.727962,Broadway Plaza,1.035896,6174.496644,5,874,1
5926,2022-10,ANG MO KIO,EXECUTIVE,614,ANG MO KIO AVE 4,07 TO 09,149.0,Apartment,1996,72 years 08 months,...,56,10.544589,Yio Chu Kang,0.763189,Broadway Plaza,1.004970,7382.550336,8,872,1
5927,2022-08,ANG MO KIO,EXECUTIVE,533,ANG MO KIO AVE 5,01 TO 03,149.0,Adjoined flat,1980,56 years 07 months,...,56,10.035255,Ang Mo Kio,0.658269,Jubilee Square,0.632316,5771.812081,2,679,2
5928,2022-04,ANG MO KIO,EXECUTIVE,504,ANG MO KIO AVE 8,07 TO 09,163.0,Adjoined flat,1980,57 years 05 months,...,56,10.110655,Ang Mo Kio,0.602717,Jubilee Square,0.360208,6064.880666,8,689,2


In [4]:
Cluster0 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 0]
Cluster1 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 1]
Cluster2 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 2]
Cluster3 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 3]
Cluster4 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 4]


#### Checking for multicollinearity

In [5]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# This is needed to calculate VIF in the next step.
X_train1 = sm.add_constant(X_train)
# For each column (variable) in the above DataFrame
for i in range(X_train1.shape[1]):
    
    # Calculate VIF for that variable
    v = vif(X_train1.values, i)
    
    # First column is constant
    if i == 0:
        print("VIF for intercept :", v)
        
    # All other columns contain predictor variables
    else:
        print("VIF for {} :{}".format(X_train1.columns[i],round(v,4)))

VIF for intercept : 445.8637423178209
VIF for remaining_months :1.3343
VIF for storey_avg :1.3387
VIF for nearest_station_distance :1.2833
VIF for distance_from_CBD :1.3801
VIF for nearest_mall_distance :1.332


In [6]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  239085.55716779552
mean_absolute_error :  372.96375758071196
model R² value :  0.8774861732618909


#### Linear Regression across entire sector code 56 has MAE of 378

In [7]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  263985.55573893513
mean_absolute_error :  403.3390973953011
model R² value :  0.8663557305132116


In [8]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  193549.78517898687
mean_absolute_error :  345.1481869869139
model R² value :  0.6958727832143268


In [9]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  204514.89642300946
mean_absolute_error :  354.6396008898314
model R² value :  0.9332842243827277


In [10]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  231512.3476792973
mean_absolute_error :  382.6470223097119
model R² value :  0.7922167639988925


In [11]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  191028.77121497644
mean_absolute_error :  339.01373539377323
model R² value :  0.8460383894448051


#### Linear Regression across entire sector code 56 of each individual cluster has vastly improved results for 4 out of 5 clusters!

### Training using XGBoost Regressor

In [12]:
# checking for significance
X = X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  169933.14976101334
mean_absolute_error :  313.279429668505
model R² value :  0.9066868942913728


#### XGBoost Regressor produces much better results than Linear Regression

In [13]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  214971.06848109027
mean_absolute_error :  356.2396330561676
model R² value :  0.9110032000710252


In [14]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  168233.90150484978
mean_absolute_error :  289.79955072030447
model R² value :  0.7099870824225063


In [15]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  183458.75226285006
mean_absolute_error :  315.09329409860976
model R² value :  0.9372997510427828


In [16]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  133522.8338802947
mean_absolute_error :  288.1723040362041
model R² value :  0.912612115771835


In [17]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  147379.45143787217
mean_absolute_error :  300.65539098391883
model R² value :  0.913066158437645


#### Similarly, splitting sector 56 produces slightly better results

### Training using Random Forest Regressor

In [18]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  190307.90115902637
mean_absolute_error :  327.67264603315596
model R² value :  0.910831184482949


In [19]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  217568.63135948312
mean_absolute_error :  356.3834340038786
model R² value :  0.8996545421998033


In [20]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  202337.9633014565
mean_absolute_error :  329.99625759316126
model R² value :  0.695057881647011


In [21]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  186791.45460130705
mean_absolute_error :  322.03808363848947
model R² value :  0.9300552281100587


In [22]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  136649.84391190225
mean_absolute_error :  277.6450946078516
model R² value :  0.8761195710215327


In [23]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  159658.80740922337
mean_absolute_error :  309.497504135252
model R² value :  0.9015932574960821


### Training using KNeighbours

In [24]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  301245.3967826815
mean_absolute_error :  418.83555352240654
model R² value :  0.8405896373773101


In [25]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  357088.79431251605
mean_absolute_error :  461.98623003168603
model R² value :  0.840426725859983


In [26]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  254510.51292806288
mean_absolute_error :  374.94624497868284
model R² value :  0.5910433254001879


In [27]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  284155.07964279415
mean_absolute_error :  409.13133211962474
model R² value :  0.9064750622636195


In [28]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  287577.679250603
mean_absolute_error :  408.72682337098576
model R² value :  0.7524684989392835


In [29]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  239016.39578429915
mean_absolute_error :  379.9558191288294
model R² value :  0.8419202204688159


#### After comparing the different models, XGBoost produces the best results. We'll do cross-validation to ensure our model isn't overfitted

In [30]:
from sklearn.model_selection import KFold

X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Instantiation
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)
  
# # Fitting the model
# xgb_r.fit(X_train, y_train)
  
# RMSE Computation

cross_validation = RepeatedKFold(n_splits=10, n_repeats=3)
scores = cross_val_score(xgb_r, X, y, scoring='neg_mean_absolute_error', cv=cross_validation, n_jobs=-1)
cv_scores = cross_val_score(xgb_r, X, y, cv=5, scoring='r2')

# # Predict the model
xgb_r.fit(X_train, y_train)
pred = xgb_r.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f"These are the stats for cross validation using xgboost:\n")
print(scores)
print("cv_scores")
print(cv_scores)
print("rmse")
print(rmse)

These are the stats for cross validation using xgboost:

[-296.22544423 -306.76930717 -304.26187405 -316.29123311 -317.74794564
 -313.72457226 -324.26621236 -302.08509683 -305.43590841 -298.3923692
 -291.02115419 -281.1168422  -326.8437982  -307.4772699  -311.34420576
 -324.42940105 -321.61221276 -320.91622157 -310.72077344 -308.42536715
 -302.33806987 -315.23029051 -308.18303588 -318.78827616 -316.49594694
 -308.21545687 -307.71018363 -303.03011158 -316.64881364 -292.89308897]
cv_scores
[0.83863663 0.9060953  0.8951502  0.90981281 0.91623048]
rmse
401.44100307415744


### To conclude:
#### XGBoost performs best on Sector Code 56, and clustering the flats before performing regression improves score

In [31]:
Clustering_HDB_sector_code_56_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,sector_code,distance_from_CBD,nearest_station,nearest_station_distance,nearest_mall,nearest_mall_distance,adjusted_price_per_sqm,storey_avg,remaining_months,KMeanscluster
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,...,56,8.685298,Ang Mo Kio,0.960938,AMK Hub,1.017286,6619.458211,11,736,4
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,...,56,9.789287,Mayflower,0.189871,Broadway Plaza,0.867983,4684.382419,2,727,3
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.908694,Lentor,0.535118,Broadway Plaza,1.528024,4909.232776,2,749,1
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,...,56,9.169984,Ang Mo Kio,0.932844,myVillage At Serangoon Garden,0.892900,4892.424109,5,745,4
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.949497,Lentor,0.501153,Broadway Plaza,1.571906,4965.445364,2,749,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5925,2022-08,ANG MO KIO,EXECUTIVE,613,ANG MO KIO AVE 4,04 TO 06,149.0,Apartment,1996,72 years 10 months,...,56,10.597248,Yio Chu Kang,0.727962,Broadway Plaza,1.035896,6174.496644,5,874,1
5926,2022-10,ANG MO KIO,EXECUTIVE,614,ANG MO KIO AVE 4,07 TO 09,149.0,Apartment,1996,72 years 08 months,...,56,10.544589,Yio Chu Kang,0.763189,Broadway Plaza,1.004970,7382.550336,8,872,1
5927,2022-08,ANG MO KIO,EXECUTIVE,533,ANG MO KIO AVE 5,01 TO 03,149.0,Adjoined flat,1980,56 years 07 months,...,56,10.035255,Ang Mo Kio,0.658269,Jubilee Square,0.632316,5771.812081,2,679,2
5928,2022-04,ANG MO KIO,EXECUTIVE,504,ANG MO KIO AVE 8,07 TO 09,163.0,Adjoined flat,1980,57 years 05 months,...,56,10.110655,Ang Mo Kio,0.602717,Jubilee Square,0.360208,6064.880666,8,689,2


In [32]:
xgb_r

In [33]:
from numpy import asarray

row = [736, 11, 0.960938, 8.685298, 1.017286]
new_data = asarray([row])
yhat = xgb_r.predict(new_data)

In [34]:
yhat

array([5987.1475], dtype=float32)

#### adjusted value was 6619.458 per sqm, predicted 6621.418

In [35]:
# saving trained XGBoost model for future use

import pickle

with open('sector_code_56_cluster_4_model.pkl', 'wb') as f:
    pickle.dump(xgb_r, f)