Evaluating Regression methods:
    - Linear Regression
    - XGBoost Regressor
    - Random Forest Regressor
    
To ensure model isn't overfitted, we need to perform cross-validation -> average R-Squared isn't too far away from previous R-squared

In [53]:
# import relevant library
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Scikit-learn
import sklearn
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

# statsmodel
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# check xgboost version
import xgboost as xg

# import randomForest
from sklearn.ensemble import RandomForestRegressor

# import K-neighbors
from sklearn.neighbors import KNeighborsRegressor

In [19]:
# read csv file
url = "transformed_cluster_56.csv"
Clustering_HDB_sector_code_56_df = pd.read_csv(url)

#### Features to use:
    - remaining_months
    - storey_avg
    - nearest_station_distance
    - distance_from_CBD
    - nearest_mall_distance

In [20]:
Clustering_HDB_sector_code_56_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,sector_code,distance_from_CBD,nearest_station,nearest_station_distance,nearest_mall,nearest_mall_distance,adjusted_price_per_sqm,storey_avg,remaining_months,KMeanscluster
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,...,56,8.685298,Ang Mo Kio,0.960938,AMK Hub,1.017286,6619.458211,11,736,4
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,...,56,9.789287,Mayflower,0.189871,Broadway Plaza,0.867983,4684.382419,2,727,3
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.908694,Lentor,0.535118,Broadway Plaza,1.528024,4909.232776,2,749,1
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,...,56,9.169984,Ang Mo Kio,0.932844,myVillage At Serangoon Garden,0.892900,4892.424109,5,745,4
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,56,10.949497,Lentor,0.501153,Broadway Plaza,1.571906,4965.445364,2,749,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5925,2022-08,ANG MO KIO,EXECUTIVE,613,ANG MO KIO AVE 4,04 TO 06,149.0,Apartment,1996,72 years 10 months,...,56,10.597248,Yio Chu Kang,0.727962,Broadway Plaza,1.035896,6174.496644,5,874,1
5926,2022-10,ANG MO KIO,EXECUTIVE,614,ANG MO KIO AVE 4,07 TO 09,149.0,Apartment,1996,72 years 08 months,...,56,10.544589,Yio Chu Kang,0.763189,Broadway Plaza,1.004970,7382.550336,8,872,1
5927,2022-08,ANG MO KIO,EXECUTIVE,533,ANG MO KIO AVE 5,01 TO 03,149.0,Adjoined flat,1980,56 years 07 months,...,56,10.035255,Ang Mo Kio,0.658269,Jubilee Square,0.632316,5771.812081,2,679,2
5928,2022-04,ANG MO KIO,EXECUTIVE,504,ANG MO KIO AVE 8,07 TO 09,163.0,Adjoined flat,1980,57 years 05 months,...,56,10.110655,Ang Mo Kio,0.602717,Jubilee Square,0.360208,6064.880666,8,689,2


In [29]:
Cluster0 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 0]
Cluster1 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 1]
Cluster2 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 2]
Cluster3 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 3]
Cluster4 = Clustering_HDB_sector_code_56_df[Clustering_HDB_sector_code_56_df['KMeanscluster'] == 4]


#### Checking for multicollinearity

In [21]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# This is needed to calculate VIF in the next step.
X_train1 = sm.add_constant(X_train)
# For each column (variable) in the above DataFrame
for i in range(X_train1.shape[1]):
    
    # Calculate VIF for that variable
    v = vif(X_train1.values, i)
    
    # First column is constant
    if i == 0:
        print("VIF for intercept :", v)
        
    # All other columns contain predictor variables
    else:
        print("VIF for {} :{}".format(X_train1.columns[i],round(v,4)))

VIF for intercept : 451.01267616572073
VIF for remaining_months :1.3179
VIF for storey_avg :1.317
VIF for nearest_station_distance :1.3076
VIF for distance_from_CBD :1.3978
VIF for nearest_mall_distance :1.3338


In [22]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  236247.07739913694
mean_absolute_error :  374.70844548093123
model R² value :  0.873671216721844


#### Linear Regression across entire sector code 56 has MAE of 374

In [33]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  291596.8051917944
mean_absolute_error :  422.2549147390067
model R² value :  0.8672714714049731


In [34]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  182225.88364948393
mean_absolute_error :  339.4750364974232
model R² value :  0.6709875908624479


In [35]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  185987.8979819276
mean_absolute_error :  333.4766520182286
model R² value :  0.9344226218692135


In [36]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  237475.82536231086
mean_absolute_error :  372.6043550913554
model R² value :  0.7802264111353444


In [37]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# training model
baseline_model = LinearRegression()

# fitting the model
baseline_model.fit(X_train, y_train)

# making predictions
predictions = baseline_model.predict(X_test)

# model evaluation
print('Model errors :')
print('mean_squared_error : ', mean_squared_error(y_test, predictions))
print('mean_absolute_error : ', mean_absolute_error(y_test, predictions))
print('model R² value : ', sklearn.metrics.r2_score(y_test, predictions))

Model errors :
mean_squared_error :  198085.07080875008
mean_absolute_error :  346.4703709759377
model R² value :  0.8678672973182502


#### Linear Regression across entire sector code 56 of each individual cluster has vastly improved results for 4 out of 5 clusters!

### Training using XGBoost Regressor

In [39]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  165041.89575430058
mean_absolute_error :  308.2890594025533
model R² value :  0.909995263333648


#### XGBoost Regressor produces much better results than Linear Regression

In [40]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  203646.13556128985
mean_absolute_error :  345.00813635317445
model R² value :  0.9095968725535628


In [41]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  141737.16106943984
mean_absolute_error :  293.1262725721621
model R² value :  0.8009086139782926


In [42]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  174466.90129277852
mean_absolute_error :  316.1876311596868
model R² value :  0.9354105974168794


In [43]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  143928.05894686273
mean_absolute_error :  287.4815092132056
model R² value :  0.9024451884113861


In [44]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# defining model
xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

# fit and predict
xgb_r.fit(X_train,y_train)
y_pred = xgb_r.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  128537.96309183515
mean_absolute_error :  282.51634568606045
model R² value :  0.8833212444129164


#### Similarly, splitting sector 56 produces slightly better results

### Training using Random Forest Regressor

In [51]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  188200.0634350149
mean_absolute_error :  329.599087947229
model R² value :  0.9079262082641457


In [46]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  192947.40139211426
mean_absolute_error :  326.2086428980727
model R² value :  0.9116723540613596


In [47]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  183320.73564368708
mean_absolute_error :  327.598862570316
model R² value :  0.6469074840463774


In [48]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  174633.4616044088
mean_absolute_error :  318.37291973087474
model R² value :  0.9337686596478572


In [49]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  145213.48485690265
mean_absolute_error :  293.22467942696414
model R² value :  0.916942076356694


In [50]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
rfr = RandomForestRegressor(max_depth=23)

# fit and predict
rfr.fit(X_train,y_train)
y_pred = rfr.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  154533.2344957356
mean_absolute_error :  311.58466242909077
model R² value :  0.8877871527312745


### Training using KNeighbours

In [54]:
# checking for significance
X = Clustering_HDB_sector_code_56_df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Clustering_HDB_sector_code_56_df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  299527.20311222016
mean_absolute_error :  424.58015672672013
model R² value :  0.8367759952158578


In [55]:
# checking for significance
X = Cluster0[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster0['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  386045.18483667594
mean_absolute_error :  473.1423231074284
model R² value :  0.8227514197507841


In [56]:
# checking for significance
X = Cluster1[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster1['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  236315.9032326559
mean_absolute_error :  369.2308746080097
model R² value :  0.6434605235482582


In [57]:
# checking for significance
X = Cluster2[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster2['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  269963.37596966577
mean_absolute_error :  403.7820439008112
model R² value :  0.8951357011063006


In [58]:
# checking for significance
X = Cluster3[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster3['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  221497.76335107835
mean_absolute_error :  358.2301722311132
model R² value :  0.8543587849731326


In [59]:
# checking for significance
X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = Cluster4['adjusted_price_per_sqm']
# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# model init
neigh = KNeighborsRegressor(n_neighbors=3)

# fit and predict
neigh.fit(X_train,y_train)
y_pred = neigh.predict(X_test)

# testing accuracy
# model evaluation
print('mean_squared_error : ', mean_squared_error(y_test, y_pred))
print('mean_absolute_error : ', mean_absolute_error(y_test, y_pred))
print('model R² value : ', sklearn.metrics.r2_score(y_test, y_pred))

mean_squared_error :  243396.20561177382
mean_absolute_error :  377.21464219724584
model R² value :  0.8518450475756293


#### After comparing the different models, XGBoost produces the best results. We'll do cross-validation to ensure our model isn't overfitted

In [None]:
from sklearn.model_selection import KFold

# Load the data
data = pd.read_csv('data.csv')

# Split the data into k folds
kf = KFold(n_splits=5, shuffle=True)

# Create a linear regression model
model = LinearRegression()

# Perform cross-validation
errors = []
for train_index, test_index in kf.split(data):
    X_train, X_test = data.iloc[train_index], data.iloc[test_index]
    y_train, y_test = data['target'].iloc[train_index], data['target'].iloc[test_index]

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Evaluate the model on the test data
    error = np.mean((model.predict(X_test) - y_test)**2)
    errors.append(error)

# Print the average error
print('Average error:', np.mean(errors))