# AdaBoost Regression

In [1]:
import re
import io

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB

#For hyperparameter tuning
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the prepared HDB resale-price table from disk and preview five random rows.
DATA_PATH = 'final_hdb_resale_prices.csv'
hdb_model_data = pd.read_csv(DATA_PATH)
hdb_model_data.sample(n=5)

Unnamed: 0.1,Unnamed: 0,month,town,flat_type,storey_range,floor_area_sqm,flat_model,resale_price,Latitude,Longitude,...,year,no.of bto,resale application,demand ratio,remaining_lease_months,address,Distance to nearest MRT,Distance to CBD,Distance to nearest mall,isMatureEstate
101944,101944,2021-04,PUNGGOL,EXECUTIVE,14,128.0,Premium Apartment,600000.0,1.398563,103.9111,...,2021,13756,26436.0,0.91,984,186 PUNGGOL CTRL,1812.429497,14298.41291,534.836412,0
93268,93268,2021-06,CHOA CHU KANG,5 ROOM,5,125.0,Improved,508000.0,1.393717,103.743763,...,2021,13756,26436.0,0.91,878,544 CHOA CHU KANG ST 52,585.264814,17002.5789,166.391281,0
142655,142655,2023-01,SENGKANG,5 ROOM,8,120.0,Improved,570000.0,1.38762,103.902159,...,2023,20000,,,903,124 RIVERVALE DR,868.719224,12767.41786,241.492299,0
66876,66876,2020-03,BEDOK,3 ROOM,5,64.0,Simplified,265000.0,1.332739,103.910912,...,2020,7314,26436.0,1.13,792,126 BEDOK RESERVOIR RD,348.09987,8551.829022,2207.754311,1
25942,25942,2018-04,TOA PAYOH,5 ROOM,2,122.0,Improved,808000.0,1.339163,103.846243,...,2018,17556,22005.0,1.02,993,121 LOR 2 TOA PAYOH,157.129106,6095.854306,885.743353,1


In [3]:
# Keep only the transactions recorded in the years used for modelling.
years_needed = [2020, 2021, 2022]

# .copy() makes the filtered result an independent frame, so the column assignment
# below writes to its own data instead of to a selection of the loaded frame
# (assigning onto such a selection can raise pandas' SettingWithCopyWarning).
hdb_model_data = hdb_model_data.loc[hdb_model_data['year'].isin(years_needed)].copy()

# Derive cpi_adjusted_price_per_sqm: the CPI-adjusted price divided by floor_area_sqm.
hdb_model_data['cpi_adjusted_price_per_sqm'] = (
    hdb_model_data['cpi_adjusted_price'] / hdb_model_data['floor_area_sqm']
)

hdb_model_data.head()

Unnamed: 0.1,Unnamed: 0,month,town,flat_type,storey_range,floor_area_sqm,flat_model,resale_price,Latitude,Longitude,...,no.of bto,resale application,demand ratio,remaining_lease_months,address,Distance to nearest MRT,Distance to CBD,Distance to nearest mall,isMatureEstate,cpi_adjusted_price_per_sqm
63275,63275,2020-01,ANG MO KIO,3 ROOM,5,73.0,New Generation,265000.0,1.365445,103.842715,...,7314,26436.0,1.13,667,208 ANG MO KIO AVE 1,908.970521,9026.295266,775.593122,1,3619.754795
63276,63276,2020-01,ANG MO KIO,3 ROOM,20,70.0,Model A,470000.0,1.365561,103.845169,...,7314,26436.0,1.13,1100,307C ANG MO KIO AVE 1,687.185319,9015.122154,561.028714,1,6695.082857
63277,63277,2020-01,ANG MO KIO,3 ROOM,2,73.0,New Generation,230000.0,1.365098,103.847381,...,7314,26436.0,1.13,676,319 ANG MO KIO AVE 1,586.98069,8949.443986,489.97866,1,3141.673973
63278,63278,2020-01,ANG MO KIO,3 ROOM,5,73.0,New Generation,280000.0,1.366197,103.841505,...,7314,26436.0,1.13,663,216 ANG MO KIO AVE 1,800.631299,9123.690385,806.304304,1,3824.646575
63279,63279,2020-01,ANG MO KIO,3 ROOM,8,68.0,New Generation,220000.0,1.372032,103.857625,...,7314,26436.0,1.13,708,556 ANG MO KIO AVE 10,927.322849,9734.443856,1059.12169,1,3226.041176


In [4]:
# Columns removed before modelling. 'cpi_adjusted_price' is the numerator of the
# derived cpi_adjusted_price_per_sqm column, so it must not remain as a feature.
columns_to_drop = [
    'Unnamed: 0', 'month', 'town', 'flat_type', 'flat_model', 'resale_price',
    'Latitude', 'Longitude', 'cpi_multiplier', 'cpi_adjusted_price', 'year',
    'demand ratio', 'address', 'Distance to nearest mall',
]

# Rebind to the reduced frame rather than mutating in place (inplace=True gives no
# speed benefit and hides the data lineage between cells).
hdb_model_data = hdb_model_data.drop(columns=columns_to_drop)
hdb_model_data.dtypes

storey_range                    int64
floor_area_sqm                float64
no.of bto                       int64
resale application            float64
remaining_lease_months          int64
Distance to nearest MRT       float64
Distance to CBD               float64
isMatureEstate                  int64
cpi_adjusted_price_per_sqm    float64
dtype: object

In [5]:
# Spot-check five randomly chosen rows of the table that remains after the column drop.
hdb_model_data.sample(n=5)

Unnamed: 0,storey_range,floor_area_sqm,no.of bto,resale application,remaining_lease_months,Distance to nearest MRT,Distance to CBD,isMatureEstate,cpi_adjusted_price_per_sqm
116437,11,91.0,20064,30370.0,996,1005.887616,19345.42387,0,5004.58989
87071,5,67.0,13756,26436.0,743,950.87938,9044.703006,1,4962.772657
122122,14,60.0,20064,30370.0,766,1216.86449,10251.36199,0,5745.924
83347,11,104.0,7314,26436.0,767,455.587303,14272.72946,1,4698.063462
89650,8,121.0,13756,26436.0,910,912.488005,12493.70237,0,4386.570248


In [6]:
# Select the target by column name instead of by position: positional slices
# (iloc[:, 0:8] / iloc[:, -1]) silently pick the wrong columns if the frame's
# column set or order ever changes.
TARGET_COLUMN = 'cpi_adjusted_price_per_sqm'

X = hdb_model_data.drop(columns=[TARGET_COLUMN])  # every remaining column is a feature
y = hdb_model_data[TARGET_COLUMN]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Set up the Random Forest regressor that AdaBoost will use as its base estimator

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Random forest that is handed to AdaBoostRegressor as the learner to boost.
regressor = RandomForestRegressor(
    n_estimators=8,       # number of trees grown in the forest
    max_depth=15,         # maximum depth (levels) each tree may reach; too shallow under-fits
    min_samples_split=2,  # minimum number of samples a node needs before it may be split
    max_features=6,       # number of features examined when searching for the best split
)

Using GridSearchCV, find the best hyperparameters for AdaBoost

In [8]:
# Boost the `regressor` configured above. The estimator is passed positionally because
# its keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old keyword was removed in 1.4; the positional form works before and after the rename.
# random_state seeds AdaBoost's resampling and the seeds it hands to each boosted learner,
# so repeated runs of the search give the same result.
model = AdaBoostRegressor(regressor, random_state=42)

# Grid of AdaBoost hyperparameter values to search exhaustively.
grid = {
    'n_estimators': [50, 100, 150, 200, 250, 300],
    'learning_rate': [0.1, 0.5, 1.0, 1.5, 2.0],
}

# 2-fold cross-validated grid search using all available cores. No `scoring` is given,
# so candidates are ranked by the estimator's own score method (R^2 for regressors).
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=2)

# Run the search; fit() returns the fitted search object itself.
grid_result = grid_search.fit(X_train, y_train)

# Summarize the best cross-validated score and the configuration that produced it.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

# Predict the held-out test rows with the best configuration found by the search.
y_pred = grid_result.predict(X_test)

Best: 0.938269 using {'learning_rate': 0.1, 'n_estimators': 300}


In [9]:
# Evaluate the tuned model on the held-out test set.
r2 = metrics.r2_score(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
rmse = np.sqrt(mse)

# Adjusted R^2 penalises R^2 for the number of predictors:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1), with n test rows and p feature columns.
n_samples, n_features = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)


print(f"r2 score: {r2}")
print(f"mean squared error: {mse}")
print(f"root mean squared error: {rmse}")
print(f"adjusted r2 score: {adjusted_r2}")

r2 score: 0.94695229820853
mean squared error: 112703.57343677609
root mean squared error: 335.71352882595613
adjusted r2 score: 0.9469250208197697
