# AdaBoost Regression

In [1]:
import re
import io

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing, train/test splitting and model evaluation
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB

#For hyperparameter tuning
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the HDB modelling dataset from the CSV file in the working directory.
hdb_model_data = pd.read_csv(filepath_or_buffer='hdb_model_data.csv')

# Preview five randomly chosen rows as the cell output.
hdb_model_data.sample(n=5)

Unnamed: 0,storey_range,floor_area_sqm,no.of bto,resale application,remaining_lease_months,Distance to nearest MRT,Distance to CBD,isMatureEstate,cpi_adjusted_price_per_sqm
42253,0.0,0.307292,0.505255,0.0,0.672755,0.271455,0.525565,0.0,0.130717
14857,0.25,0.239583,0.0,0.0,0.942161,0.181093,0.171742,1.0,0.495647
45005,0.0,0.369792,0.505255,0.0,0.499239,0.536031,0.434593,1.0,0.21131
61618,0.0,0.1875,1.0,1.0,0.683409,0.161051,0.328449,1.0,0.402826
67087,0.0,0.364583,1.0,1.0,0.383562,0.116271,0.300361,1.0,0.354611


In [3]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
hdb_model_data.dtypes

storey_range                  float64
floor_area_sqm                float64
no.of bto                     float64
resale application            float64
remaining_lease_months        float64
Distance to nearest MRT       float64
Distance to CBD               float64
isMatureEstate                float64
cpi_adjusted_price_per_sqm    float64
dtype: object

In [4]:
# Show another random draw of five rows from the loaded frame.
hdb_model_data.sample(n=5)

Unnamed: 0,storey_range,floor_area_sqm,no.of bto,resale application,remaining_lease_months,Distance to nearest MRT,Distance to CBD,isMatureEstate,cpi_adjusted_price_per_sqm
47356,0.125,0.088542,0.505255,0.0,0.127854,0.106754,0.232235,1.0,0.196749
73987,0.0,0.276042,1.0,1.0,0.487062,0.223863,0.703284,0.0,0.19298
15894,0.1875,0.088542,0.0,0.0,0.302892,0.178832,0.387596,1.0,0.136805
19480,0.0625,0.21875,0.0,0.0,0.936073,0.392546,0.633031,0.0,0.213528
24228,0.0,0.088542,0.505255,0.0,0.277017,0.258267,0.429352,1.0,0.177059


In [5]:
# Select the target by column name instead of by position, so the split keeps
# picking the right columns even if the CSV gains or reorders a column.
TARGET_COLUMN = 'cpi_adjusted_price_per_sqm'
X = hdb_model_data.drop(columns=[TARGET_COLUMN])  # every remaining column is a predictor
y = hdb_model_data[TARGET_COLUMN]                 # regression target

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity check: the split must neither drop nor duplicate rows.
assert len(X_train) + len(X_test) == len(hdb_model_data)


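A grid search is used to find the best learning rate (from 0.1 to 2.0) and the best number of estimators (from 50 to 300) for an AdaBoost regressor whose base estimator is a random forest.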

In [6]:
# Base learner that AdaBoost will boost: a small random forest.
# (RandomForestRegressor is imported in the notebook's import cell.)
regressor = RandomForestRegressor(
    n_estimators=8,       # number of trees in the forest
    max_depth=15,         # maximum depth of each tree; too shallow and the trees under-fit (high bias)
    min_samples_split=2,  # minimum number of samples a node must contain before it may be split
    max_features=6,       # number of features considered when searching for the best split
    random_state=42,      # fixed seed so bootstrap samples and feature draws are reproducible
)

In [7]:
# Define the boosted model on top of the `regressor` base learner.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was removed
# in 1.4); the positional form works on both sides of that change.
# A fixed seed makes the boosting resampling reproducible.
model = AdaBoostRegressor(regressor, random_state=42)

# Grid of AdaBoost hyperparameters to search (6 x 5 = 30 combinations).
grid = {
    'n_estimators': [50, 100, 150, 200, 250, 300],
    'learning_rate': [0.1, 0.5, 1.0, 1.5, 2.0],
}

# Grid search with 2-fold cross-validation on all available cores.
# `scoring='r2'` matches the regressor default and makes explicit that
# `best_score_` is a mean cross-validated R^2.
# NOTE(review): cv=2 keeps the search cheap but gives a noisy estimate;
# consider more folds if runtime allows.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='r2',
    n_jobs=-1,
    cv=2,
)

# Run the search; `fit` returns the fitted search object, which is refit on the
# full training set with the best parameters.
grid_result = grid_search.fit(X_train, y_train)

# Summarize the best cross-validated score and the configuration that produced it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Use the refit best model to predict the held-out test data.
y_pred = grid_result.predict(X_test)

Best: 0.938334 using {'learning_rate': 0.1, 'n_estimators': 300}


In [9]:
# Evaluate the test-set predictions.
r2 = metrics.r2_score(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
# Take the square root of the MSE directly rather than passing `squared=False`,
# which was deprecated in scikit-learn 1.4 and removed in 1.6 (and would
# recompute the MSE a second time).
rmse = np.sqrt(mse)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number of
# test samples and p the number of predictor columns.
n_samples = len(y_test)
n_features = X_test.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)


print("r2 score: " + str(r2))
print("mean squared error: " + str(mse))
print("root mean squared error: " + str(rmse))
print("adjusted r2 score: " + str(adjusted_r2))

r2 score: 0.9470179064594428
mean squared error: 0.0006093672494420253
root mean squared error: 0.024685365086261644
adjusted r2 score: 0.9469906628067674
