# AdaBoost Regression

In [1]:
import re
import io

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB

#For hyperparamter tuning
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
# import csv
hdb_model_data = pd.read_csv('hdb_model_data_regression.csv')
hdb_model_data.sample(5)

Unnamed: 0,storey_range,floor_area_sqm,no.of bto,resale application,remaining_lease_months,Distance to nearest MRT,Distance to CBD,isMatureEstate,cpi_adjusted_price_per_sqm
6387,11,117.0,7314,26436.0,955,697.019712,11282.15562,1,5497.053846
68231,14,93.0,20064,30370.0,1101,917.358304,18677.67414,0,6178.412903
28813,2,113.0,13756,26436.0,1118,1406.761448,14852.14414,0,4992.371681
36857,23,120.0,13756,26436.0,636,1802.951494,7266.385596,1,7245.5
52833,5,124.0,20064,30370.0,849,1317.081362,15184.46232,1,4848.337903


In [3]:
X = hdb_model_data.iloc[:,0:8]
y= hdb_model_data.iloc[:,-1]  
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


Set Random Forest Regressor

In [4]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 8,  #no. of trees to be used in the model
                        max_depth = 15,  # maximum number of splits each tree can take. Too low the model will be trained less and have high bias
                        min_samples_split = 2, #every node have 2 subnodes
                        max_features = 6)#maximum features in each tree

Using GridSearchCV, find best parameters for Adaboost

In [5]:
# define the model with default hyperparameters
model = AdaBoostRegressor(base_estimator = regressor)

# define the grid of values to search
grid = dict()
grid['n_estimators'] = [50, 100, 150, 200, 250, 300]
grid['learning_rate'] = [0.1, 0.5, 1.0, 1.5, 2.0]

# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=2)

# execute the grid search
grid_result = grid_search.fit(X_train, y_train)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#Use the trained model to predict the test data
y_pred = grid_result.predict(X_test)

Best: 0.938364 using {'learning_rate': 0.1, 'n_estimators': 300}


In [6]:
print('Testing Data Results:')
r2 = metrics.r2_score(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
msle = metrics.mean_squared_log_error(y_test, y_pred)
rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
adjusted_r2 = 1 - (1-r2)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)


print("r2 score: " + str(r2))
print("mean squared error: " + str(mse))
print("mean squared log error: " + str(msle))
print("root mean squared error: " + str(rmse))
print("adjusted r2 score: " + str(adjusted_r2))

Testing Data Results:
r2 score: 0.9470390736745287
mean squared error: 112519.2128561211
mean squared log error: 0.0040413220107782
root mean squared error: 335.43883623713145
adjusted r2 score: 0.9470118409061392


In [7]:
y_train_pred = grid_result.predict(X_train)

print('Training Data Results:')
r2_train = metrics.r2_score(y_train, y_train_pred)
mse_train = metrics.mean_squared_error(y_train, y_train_pred)
msle_train = metrics.mean_squared_log_error(y_train, y_train_pred)
rmse_train = metrics.mean_squared_error(y_train, y_train_pred, squared=False)
adjusted_r2_train = 1 - (1-r2)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)


print("r2 score: " + str(r2_train))
print("mean squared error: " + str(mse_train))
print("mean squared log error: " + str(msle_train))
print("root mean squared error: " + str(rmse_train))
print("adjusted r2 score: " + str(adjusted_r2_train))

Training Data Results:
r2 score: 0.973183034423303
mean squared error: 57021.01520784959
mean squared log error: 0.0022891807397651077
root mean squared error: 238.7907351800936
adjusted r2 score: 0.9470322683256481
