Evaluating Regression methods:
    - Linear Regression
    - XGBoost Regressor
    - Random Forest Regressor
    
To ensure model isn't overfitted, we need to perform cross-validation -> average R-Squared isn't too far away from previous R-squared

In [1]:
# import relevant library
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Scikit-learn
import sklearn
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

# statsmodel
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# check xgboost version
import xgboost as xg

# import randomForest
from sklearn.ensemble import RandomForestRegressor

# import K-neighbors
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# read csv file
url = "transformed_cluster_56.csv"
Clustering_HDB_sector_code_56_df = pd.read_csv(url)

#### Features to use:
    - remaining_months
    - storey_avg
    - nearest_station_distance
    - distance_from_CBD
    - nearest_mall_distance

In [3]:
Clustering_HDB_sector_code_56_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,sector_code,distance_from_CBD,nearest_station,nearest_station_distance,nearest_mall,nearest_mall_distance,adjusted_price_per_sqm,storey_avg,remaining_months,KMeanscluster
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,...,56,8.685298,Ang Mo Kio,0.960938,AMK Hub,1.017286,6619.458211,11,736,4
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,...,56,9.789287,Mayflower,0.189871,Broadway Plaza,0.867983,4684.382419,2,727,3
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.908694,Lentor,0.535118,Broadway Plaza,1.528024,4909.232776,2,749,1
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,...,56,9.169984,Ang Mo Kio,0.932844,myVillage At Serangoon Garden,0.892900,4892.424109,5,745,4
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.949497,Lentor,0.501153,Broadway Plaza,1.571906,4965.445364,2,749,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5925,2022-08,ANG MO KIO,EXECUTIVE,613,ANG MO KIO AVE 4,04 TO 06,149.0,Apartment,1996,72 years 10 months,...,56,10.597248,Yio Chu Kang,0.727962,Broadway Plaza,1.035896,6174.496644,5,874,1
5926,2022-10,ANG MO KIO,EXECUTIVE,614,ANG MO KIO AVE 4,07 TO 09,149.0,Apartment,1996,72 years 08 months,...,56,10.544589,Yio Chu Kang,0.763189,Broadway Plaza,1.004970,7382.550336,8,872,1
5927,2022-08,ANG MO KIO,EXECUTIVE,533,ANG MO KIO AVE 5,01 TO 03,149.0,Adjoined flat,1980,56 years 07 months,...,56,10.035255,Ang Mo Kio,0.658269,Jubilee Square,0.632316,5771.812081,2,679,2
5928,2022-04,ANG MO KIO,EXECUTIVE,504,ANG MO KIO AVE 8,07 TO 09,163.0,Adjoined flat,1980,57 years 05 months,...,56,10.110655,Ang Mo Kio,0.602717,Jubilee Square,0.360208,6064.880666,8,689,2


In [4]:
Cluster0 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 0]
Cluster1 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 1]
Cluster2 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 2]
Cluster3 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 3]
Cluster4 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 4]


#### Checking for multicollinearity

In [5]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# This is needed to calculate VIF in the next step.
X_train1 = sm.add_constant(X_train)
# For each column (variable) in the above DataFrame
for i in range(X_train1.shape[1]):
    
    # Calculate VIF for that variable
    v = vif(X_train1.values, i)
    
    # First column is constant
    if i == 0:
        print("VIF for intercept :", v)
        
    # All other columns contain predictor variables
    else:
        print("VIF for {} :{}".format(X_train1.columns[i],round(v,4)))

VIF for intercept : 443.0305443991785
VIF for remaining_months :1.3424
VIF for storey_avg :1.3459
VIF for nearest_station_distance :1.2944
VIF for distance_from_CBD :1.391
VIF for nearest_mall_distance :1.3377


In [6]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  236722.53080185395
mean_absolute_error :  378.3894173628105
model R² value :  0.8743278559279626


#### Linear Regression across entire sector code 56 has MAE of 374

In [7]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  276956.5738999814
mean_absolute_error :  405.95126253586267
model R² value :  0.8753414093081994


In [8]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  187865.48125822665
mean_absolute_error :  333.775037484697
model R² value :  0.6267808033871868


In [9]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  202965.0258425292
mean_absolute_error :  345.91742143391224
model R² value :  0.9312749025083071


In [10]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  225168.6074828546
mean_absolute_error :  358.42422312673585
model R² value :  0.7814817225331376


In [11]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  208176.68404984567
mean_absolute_error :  356.7029348710452
model R² value :  0.8492454288314382


#### Linear Regression across entire sector code 56 of each individual cluster has vastly improved results for 4 out of 5 clusters!

### Training using XGBoost Regressor

In [12]:
# checking for significance
X = X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  186610.8474242807
mean_absolute_error :  328.01478355411655
model R² value :  0.9107162128161452


#### XGBoost Regressor produces much better results than Linear Regression

In [13]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  224574.21332764448
mean_absolute_error :  367.1909933858317
model R² value :  0.9088135055420359


In [14]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  136314.8985344042
mean_absolute_error :  279.6965967914935
model R² value :  0.7681904782605948


In [15]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  141325.03061825148
mean_absolute_error :  283.8713629486638
model R² value :  0.9514185692265659


In [16]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  126201.57565060981
mean_absolute_error :  272.2959341795653
model R² value :  0.8981271499030334


In [17]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  156128.07116728695
mean_absolute_error :  304.76844746577234
model R² value :  0.8893268302126314


#### Similarly, splitting sector 56 produces slightly better results

### Training using Random Forest Regressor

In [18]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  183417.54045009564
mean_absolute_error :  325.1205512875998
model R² value :  0.9045092941859436


In [19]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  241936.73829774174
mean_absolute_error :  366.8890069959915
model R² value :  0.8924779325302808


In [20]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  163818.33599055637
mean_absolute_error :  304.14834739760624
model R² value :  0.7219885193333558


In [21]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  166646.84166241103
mean_absolute_error :  313.0119903907928
model R² value :  0.9438428772930381


In [22]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  137280.5779852535
mean_absolute_error :  295.52125244659527
model R² value :  0.8880323870403097


In [23]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  146399.3054362795
mean_absolute_error :  295.72211841634663
model R² value :  0.9082609851147817


### Training using KNeighbours

In [24]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  283007.77043094457
mean_absolute_error :  407.8312357509677
model R² value :  0.8560937621766785


In [25]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  358387.88796747953
mean_absolute_error :  473.9542765388079
model R² value :  0.8437381829644433


In [26]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  219662.51764080074
mean_absolute_error :  356.6030383281706
model R² value :  0.6366671805690556


In [27]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  233654.71296783106
mean_absolute_error :  375.93132393469193
model R² value :  0.9176996799041572


In [28]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  271414.8101938254
mean_absolute_error :  387.56625654597445
model R² value :  0.7632400265794208


In [29]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  226969.44768790607
mean_absolute_error :  359.0401738261424
model R² value :  0.8438323188793909


#### After comparing the different models, XGBoost produces the best results. We'll do cross-validation to ensure our model isn't overfitted

In [31]:
from sklearn.model_selection import KFold

X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Instantiation
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)
  
# # Fitting the model
# xgb_r.fit(X_train, y_train)
  
# RMSE Computation

cross_validation = RepeatedKFold(n_splits=10, n_repeats=3)
scores = cross_val_score(xgb_r, X, y, scoring='neg_mean_absolute_error', cv=cross_validation, n_jobs=-1)
cv_scores = cross_val_score(xgb_r, X, y, cv=5, scoring='r2')

# # Predict the model
xgb_r.fit(X_train, y_train)
pred = xgb_r.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f"These are the stats for cross validation using xgboost:\n")
print(scores)
print("cv_scores")
print(cv_scores)
print("rmse")
print(rmse)

These are the stats for cross validation using xgboost:

[-302.00422991 -299.01023185 -317.03314669 -305.83053951 -319.68353082
 -317.23653401 -301.74289743 -312.75874566 -314.6220513  -314.10489171
 -313.43245463 -299.5888082  -312.4859449  -302.0549481  -305.27424666
 -326.87277102 -299.83160679 -298.07211    -322.3736634  -316.00055466
 -312.09683156 -294.68692686 -315.64416591 -306.80067049 -316.96451991
 -305.35379948 -310.28887984 -303.32723434 -314.45254562 -320.6199618 ]
cv_scores
[0.83863663 0.9060953  0.8951502  0.90981281 0.91623048]
rmse
407.4361098499947


### To conclude:
#### XGBoost performs best on Sector Code 56, and clustering the flats before performing regression improves score

In [40]:
Clustering_HDB_sector_code_56_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,sector_code,distance_from_CBD,nearest_station,nearest_station_distance,nearest_mall,nearest_mall_distance,adjusted_price_per_sqm,storey_avg,remaining_months,KMeanscluster
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,...,56,8.685298,Ang Mo Kio,0.960938,AMK Hub,1.017286,6619.458211,11,736,4
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,...,56,9.789287,Mayflower,0.189871,Broadway Plaza,0.867983,4684.382419,2,727,3
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.908694,Lentor,0.535118,Broadway Plaza,1.528024,4909.232776,2,749,1
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,...,56,9.169984,Ang Mo Kio,0.932844,myVillage At Serangoon Garden,0.892900,4892.424109,5,745,4
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.949497,Lentor,0.501153,Broadway Plaza,1.571906,4965.445364,2,749,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5925,2022-08,ANG MO KIO,EXECUTIVE,613,ANG MO KIO AVE 4,04 TO 06,149.0,Apartment,1996,72 years 10 months,...,56,10.597248,Yio Chu Kang,0.727962,Broadway Plaza,1.035896,6174.496644,5,874,1
5926,2022-10,ANG MO KIO,EXECUTIVE,614,ANG MO KIO AVE 4,07 TO 09,149.0,Apartment,1996,72 years 08 months,...,56,10.544589,Yio Chu Kang,0.763189,Broadway Plaza,1.004970,7382.550336,8,872,1
5927,2022-08,ANG MO KIO,EXECUTIVE,533,ANG MO KIO AVE 5,01 TO 03,149.0,Adjoined flat,1980,56 years 07 months,...,56,10.035255,Ang Mo Kio,0.658269,Jubilee Square,0.632316,5771.812081,2,679,2
5928,2022-04,ANG MO KIO,EXECUTIVE,504,ANG MO KIO AVE 8,07 TO 09,163.0,Adjoined flat,1980,57 years 05 months,...,56,10.110655,Ang Mo Kio,0.602717,Jubilee Square,0.360208,6064.880666,8,689,2


In [41]:
xgb_r

In [44]:
from numpy import asarray

row = [736, 11, 0.960938, 8.685298, 1.017286]
new_data = asarray([row])
yhat = xgb_r.predict(new_data)

In [45]:
yhat

array([6621.418], dtype=float32)

#### adjusted value was 6619.458 per sqm, predicted 6621.418