# AdaBoost Regression

In [1]:
import re
import io

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB

#For hyperparameter tuning
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the modelling dataset from CSV into a DataFrame
DATA_PATH = 'hdb_model_data_regression.csv'
hdb_model_data = pd.read_csv(DATA_PATH)

# Preview five randomly chosen rows as the cell output
hdb_model_data.sample(n=5)

Unnamed: 0,storey_range,floor_area_sqm,no.of bto,resale application,remaining_lease_months,Distance to nearest MRT,Distance to CBD,isMatureEstate,cpi_adjusted_price_per_sqm
19956,2,98.0,7314,26436.0,678,523.84539,7779.10428,1,4334.506531
57492,8,113.0,20064,30370.0,1137,710.344006,13578.96966,0,6712.881463
62171,11,101.0,20064,30370.0,896,761.689291,18916.84474,0,4561.762178
76968,5,120.0,20064,30370.0,638,303.15923,7391.58329,1,6739.046667
28092,14,130.0,13756,26436.0,958,968.76702,14980.20623,0,5055.0


In [3]:
# Select features by name instead of by position. The CSV carries a leftover
# row-index column ('Unnamed: 0'), so the positional slice iloc[:, 0:8] fed the
# row id to the model as a feature and silently left out 'isMatureEstate'.
TARGET_COL = 'cpi_adjusted_price_per_sqm'
NON_FEATURE_COLS = ['Unnamed: 0', TARGET_COL]

# errors='ignore' keeps this working if the index column is absent
# (e.g. when the CSV is read with index_col=0).
X = hdb_model_data.drop(columns=NON_FEATURE_COLS, errors='ignore')
y = hdb_model_data[TARGET_COL]

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Set up the Random Forest regressor used as the AdaBoost base estimator

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters of the random forest that AdaBoost uses as its base estimator
rf_params = {
    'n_estimators': 8,        # number of trees in the forest
    'max_depth': 15,          # maximum depth of each tree; too shallow and the model underfits (high bias)
    'min_samples_split': 2,   # minimum number of samples a node must hold before it may be split
    'max_features': 6,        # number of features considered when searching for the best split
}
regressor = RandomForestRegressor(**rf_params)

Use GridSearchCV to find the best hyperparameters for AdaBoost

In [5]:
# Boost the random-forest base learner. It is passed positionally because its
# keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
# (the old name was removed in 1.4); the positional form works on both.
# random_state seeds the boosting resampling and the seeds handed to each base
# estimator, so the search and the reported scores are reproducible.
model = AdaBoostRegressor(regressor, random_state=42)

# Candidate hyperparameter values; every combination is evaluated
grid = {
    'n_estimators': [50, 100, 150, 200, 250, 300],
    'learning_rate': [0.1, 0.5, 1.0, 1.5, 2.0],
}

# Exhaustive 2-fold cross-validated search, parallelised over all CPU cores
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=2)

# Run the search on the training split only
grid_result = grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best combination
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Predict the held-out test data with the best estimator
# (GridSearchCV refits it on the full training split by default)
y_pred = grid_result.predict(X_test)

Best: 0.938342 using {'learning_rate': 0.1, 'n_estimators': 300}


In [6]:
# Score the test-set predictions
r2 = metrics.r2_score(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
msle = metrics.mean_squared_log_error(y_test, y_pred)

# Derive RMSE from the MSE already computed. The `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and is scheduled for
# removal in 1.6, whereas the square root works on every version.
rmse = np.sqrt(mse)

# Adjusted R^2 penalises R^2 for the number of predictors p given n samples:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
n_samples, n_features = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

print(f"r2 score: {r2}")
print(f"mean squared error: {mse}")
print(f"mean squared log error: {msle}")
print(f"root mean squared error: {rmse}")
print(f"adjusted r2 score: {adjusted_r2}")

r2 score: 0.9470276968107928
mean squared error: 112543.38380329193
mean squared log error: 0.004041023637356333
root mean squared error: 335.4748631466923
adjusted r2 score: 0.9470004581923641
