Refer to 03. Regression with Sector Code 56 (Archive) in /archive, XGBoost Regressor had the best results

In [88]:
# import relevant library
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Scikit-learn
import sklearn
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

# statsmodel
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# check xgboost version
import xgboost as xg

# pickle
import os
import pickle

In [2]:
# read csv file
url = "transformed_cluster.csv"
df = pd.read_csv(url)

#### Features to use:
    - remaining_months
    - storey_avg
    - nearest_station_distance
    - distance_from_CBD
    - nearest_mall_distance

In [3]:
df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,sector_code,distance_from_CBD,nearest_station,nearest_station_distance,nearest_mall,nearest_mall_distance,adjusted_price_per_sqm,storey_avg,remaining_months,KMeanscluster
0,2017-01,BUKIT MERAH,2 ROOM,45,TELOK BLANGAH DR,07 TO 09,45.0,Improved,1976,58 years,...,10,4.850986,Telok Blangah,0.118943,Alexandra Retail Centre,0.939034,6835.034438,8,696,3
1,2017-01,BUKIT MERAH,3 ROOM,63,TELOK BLANGAH HTS,04 TO 06,73.0,New Generation,1976,58 years 08 months,...,10,4.616984,Telok Blangah,0.477042,Alexandra Retail Centre,1.069136,5812.741056,5,704,4
2,2017-01,BUKIT MERAH,3 ROOM,61,TELOK BLANGAH HTS,07 TO 09,67.0,New Generation,1976,58 years 08 months,...,10,4.480625,Telok Blangah,0.609295,Alexandra Retail Centre,1.199945,6370.760091,8,704,4
3,2017-01,BUKIT MERAH,3 ROOM,44,TELOK BLANGAH DR,07 TO 09,65.0,Improved,1976,58 years,...,10,4.764832,Telok Blangah,0.105708,Alexandra Retail Centre,1.053998,7011.007066,8,696,3
4,2017-01,BUKIT MERAH,3 ROOM,55,TELOK BLANGAH DR,07 TO 09,73.0,New Generation,1978,60 years,...,10,4.508882,Telok Blangah,0.448751,Alexandra Retail Centre,1.197843,6621.021616,8,720,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142121,2021-09,BUKIT MERAH,5 ROOM,17A,TELOK BLANGAH CRES,07 TO 09,119.0,Improved,2002,79 years 05 months,...,91,3.398333,Tiong Bahru,1.166959,Tiong Bahru Plaza,1.174917,7034.885667,8,953,1
142122,2022-03,BUKIT MERAH,4 ROOM,14A,TELOK BLANGAH CRES,04 TO 06,99.0,Model A,1999,76 years 07 months,...,91,3.484771,Tiong Bahru,1.190679,Tiong Bahru Plaza,1.193206,6653.525855,5,919,0
142123,2022-08,BUKIT MERAH,4 ROOM,14A,TELOK BLANGAH CRES,10 TO 12,100.0,Model A,1999,76 years 03 months,...,91,3.484771,Tiong Bahru,1.190679,Tiong Bahru Plaza,1.193206,6660.000000,11,915,0
142124,2022-09,BUKIT MERAH,4 ROOM,17A,TELOK BLANGAH CRES,28 TO 30,100.0,Model A,2002,78 years 05 months,...,91,3.398333,Tiong Bahru,1.166959,Tiong Bahru Plaza,1.174917,7650.000000,29,941,1


In [4]:
# checking for significance
X = df[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
y = df['adjusted_price_per_sqm']

# train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# This is needed to calculate VIF in the next step.
X_train1 = sm.add_constant(X_train)
# For each column (variable) in the above DataFrame
for i in range(X_train1.shape[1]):
    
    # Calculate VIF for that variable
    v = vif(X_train1.values, i)
    
    # First column is constant
    if i == 0:
        print("VIF for intercept :", v)
        
    # All other columns contain predictor variables
    else:
        print("VIF for {} :{}".format(X_train1.columns[i],round(v,4)))

VIF for intercept : 47.53692563235843
VIF for remaining_months :1.3227
VIF for storey_avg :1.1746
VIF for nearest_station_distance :1.1056
VIF for distance_from_CBD :1.2558
VIF for nearest_mall_distance :1.1691


In [9]:
sector_code_list = df['sector_code'].unique()
sector_code_list

array([10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 26, 27, 30, 31, 32, 33,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 50, 51, 52, 53, 54,
       55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 73, 75, 76, 79,
       80, 81, 82, 85, 90, 91], dtype=int64)

In [90]:
count = 0
for sector_code in sector_code_list:
#     print(sector_code)
    sector_code_df = df[df['sector_code'] == sector_code]
    
#     KMeanscluster_list = sector_code_df['KMeanscluster'].unique()
    
    for i in range(0, 5):
        current_cluster = sector_code_df[sector_code_df['KMeanscluster'] == i]
        
        cluster_len = len(current_cluster)
            
        count += cluster_len
        
        print(f"current sector code is {sector_code}, current cluster is {i}, sanity check is {cluster_len}")
        if cluster_len == 0:
            pass
        else:
        
            X = current_cluster[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
            y = current_cluster[['adjusted_price_per_sqm']]

            # Splitting
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

            # Instantiation
            xgb_r = xg.XGBRegressor(max_depth=5, eta=0.5, subsample=0.8, colsample_bytree=0.8,learning_rate=0.1)

            xgb_r.fit(X_train, y_train)
            # Saving trained model
            file_name = "sector_code_" + str(sector_code) + "_cluster_" + str(i) + "_model.pkl"
            print(file_name)
            
            file_path = os.path.join("model", file_name)
            with open(file_path, 'wb') as f:
                pickle.dump(xgb_r, f)

    print("====")


current sector code is 10, current cluster is 0, sanity check is 223
sector_code_10_cluster_0_model.pkl
current sector code is 10, current cluster is 1, sanity check is 159
sector_code_10_cluster_1_model.pkl
current sector code is 10, current cluster is 2, sanity check is 221
sector_code_10_cluster_2_model.pkl
current sector code is 10, current cluster is 3, sanity check is 156
sector_code_10_cluster_3_model.pkl
current sector code is 10, current cluster is 4, sanity check is 313
sector_code_10_cluster_4_model.pkl
====
current sector code is 11, current cluster is 0, sanity check is 14
sector_code_11_cluster_0_model.pkl
current sector code is 11, current cluster is 1, sanity check is 11
sector_code_11_cluster_1_model.pkl
current sector code is 11, current cluster is 2, sanity check is 0
current sector code is 11, current cluster is 3, sanity check is 0
current sector code is 11, current cluster is 4, sanity check is 0
====
current sector code is 12, current cluster is 0, sanity check i

sector_code_33_cluster_0_model.pkl
current sector code is 33, current cluster is 1, sanity check is 360
sector_code_33_cluster_1_model.pkl
current sector code is 33, current cluster is 2, sanity check is 265
sector_code_33_cluster_2_model.pkl
current sector code is 33, current cluster is 3, sanity check is 213
sector_code_33_cluster_3_model.pkl
current sector code is 33, current cluster is 4, sanity check is 117
sector_code_33_cluster_4_model.pkl
====
current sector code is 35, current cluster is 0, sanity check is 41
sector_code_35_cluster_0_model.pkl
current sector code is 35, current cluster is 1, sanity check is 137
sector_code_35_cluster_1_model.pkl
current sector code is 35, current cluster is 2, sanity check is 134
sector_code_35_cluster_2_model.pkl
current sector code is 35, current cluster is 3, sanity check is 77
sector_code_35_cluster_3_model.pkl
current sector code is 35, current cluster is 4, sanity check is 97
sector_code_35_cluster_4_model.pkl
====
current sector code is

sector_code_53_cluster_0_model.pkl
current sector code is 53, current cluster is 1, sanity check is 1281
sector_code_53_cluster_1_model.pkl
current sector code is 53, current cluster is 2, sanity check is 1073
sector_code_53_cluster_2_model.pkl
current sector code is 53, current cluster is 3, sanity check is 1366
sector_code_53_cluster_3_model.pkl
current sector code is 53, current cluster is 4, sanity check is 1795
sector_code_53_cluster_4_model.pkl
====
current sector code is 54, current cluster is 0, sanity check is 1524
sector_code_54_cluster_0_model.pkl
current sector code is 54, current cluster is 1, sanity check is 2034
sector_code_54_cluster_1_model.pkl
current sector code is 54, current cluster is 2, sanity check is 1625
sector_code_54_cluster_2_model.pkl
current sector code is 54, current cluster is 3, sanity check is 1676
sector_code_54_cluster_3_model.pkl
current sector code is 54, current cluster is 4, sanity check is 1733
sector_code_54_cluster_4_model.pkl
====
current se

sector_code_75_cluster_1_model.pkl
current sector code is 75, current cluster is 2, sanity check is 283
sector_code_75_cluster_2_model.pkl
current sector code is 75, current cluster is 3, sanity check is 617
sector_code_75_cluster_3_model.pkl
current sector code is 75, current cluster is 4, sanity check is 692
sector_code_75_cluster_4_model.pkl
====
current sector code is 76, current cluster is 0, sanity check is 1414
sector_code_76_cluster_0_model.pkl
current sector code is 76, current cluster is 1, sanity check is 2126
sector_code_76_cluster_1_model.pkl
current sector code is 76, current cluster is 2, sanity check is 1987
sector_code_76_cluster_2_model.pkl
current sector code is 76, current cluster is 3, sanity check is 2079
sector_code_76_cluster_3_model.pkl
current sector code is 76, current cluster is 4, sanity check is 2058
sector_code_76_cluster_4_model.pkl
====
current sector code is 79, current cluster is 0, sanity check is 514
sector_code_79_cluster_0_model.pkl
current sector

In [80]:
from numpy import asarray

# X = Cluster4[['remaining_months', 'storey_avg', 'nearest_station_distance', 'distance_from_CBD', 'nearest_mall_distance']]
# taking from row 2 

row = [696, 8, 0.118942995, 4.85098633804542, 0.9390336199695]
new_data = asarray([row])
yhat = xgb_r.predict(new_data)
yhat

array([6386.2363], dtype=float32)