# Gradient Boosting

In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [5]:
#read in df - this is with collinear variables removed and with one hot encoded data
hdb=pd.read_csv(r"C:\Users\Long Bing\Desktop\HDB Project\Resale_Flats_Dataset_2012_Onwards_Non_Collinear_OHE.csv")

In [6]:
hdb.head()

Unnamed: 0,floor_area_sqm,resale_price,remaining_lease,BEDOK,BISHAN,BUKIT BATOK,BUKIT MERAH,BUKIT PANJANG,BUKIT TIMAH,CENTRAL AREA,...,28 TO 30,31 TO 33,31 TO 35,34 TO 36,36 TO 40,37 TO 39,40 TO 42,43 TO 45,46 TO 48,49 TO 51
0,44,257800,66,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,44,263000,65,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,44,275000,65,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,45,260000,73,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45,226000,73,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train Test Split

In [7]:
hdb.columns

Index(['floor_area_sqm', 'resale_price', 'remaining_lease', 'BEDOK', 'BISHAN',
       'BUKIT BATOK', 'BUKIT MERAH', 'BUKIT PANJANG', 'BUKIT TIMAH',
       'CENTRAL AREA', 'CHOA CHU KANG', 'CLEMENTI', 'GEYLANG', 'HOUGANG',
       'JURONG EAST', 'JURONG WEST', 'KALLANG/WHAMPOA', 'MARINE PARADE',
       'PASIR RIS', 'PUNGGOL', 'QUEENSTOWN', 'SEMBAWANG', 'SENGKANG',
       'SERANGOON', 'TAMPINES', 'TOA PAYOH', 'WOODLANDS', 'YISHUN', '01 TO 05',
       '04 TO 06', '06 TO 10', '07 TO 09', '10 TO 12', '11 TO 15', '13 TO 15',
       '16 TO 18', '16 TO 20', '19 TO 21', '21 TO 25', '22 TO 24', '25 TO 27',
       '26 TO 30', '28 TO 30', '31 TO 33', '31 TO 35', '34 TO 36', '36 TO 40',
       '37 TO 39', '40 TO 42', '43 TO 45', '46 TO 48', '49 TO 51'],
      dtype='object')

In [8]:
X=hdb.drop('resale_price',axis=1)
y=hdb['resale_price']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=111)

# Model Training and Evaluation (using default hyperparameters)

In [48]:
gdboost = GradientBoostingRegressor()

In [49]:
gdboost.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [50]:
y_pred= gdboost.predict(X_test)

In [51]:
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

MAE: 45302.95148160297
MSE: 3649362201.4001994
RMSE: 60409.95117859473


In [52]:
metrics.r2_score(y_pred,y_test)

0.7189847856801325

In [53]:
#while the gradient boost algorithm gave quite a good R2, it is not quite as good as what we saw
#on with the Random Forest algorithem.
#we can try to tune the hyperparamters to further optimise the model.

# Hyperparameter Tuning

In [54]:
from sklearn.model_selection import RandomizedSearchCV

In [55]:
#using RandomizedSearch to find optimal parameters for the model, namely:
#1. learning_rate
#2. n_estimators
#3. max_depth
#4. min_samples_leaf
#5. min_samples_split

In [56]:
LR = {'learning_rate':[0.05,0.10,0.15,0.20,0.25],
     'n_estimators':[50,75,100,125,150,175,200,225,250],
     'max_depth':[5,7,10,15,20,25],
     'min_samples_leaf':[5,10,15,20,25],
     'min_samples_split':[10,20,30,40,50]}

In [64]:
param_tune = RandomizedSearchCV(estimator=gdboost,param_distributions=LR,scoring='r2',cv=2,n_iter=50)

In [65]:
param_tune.fit(X_train,y_train)

RandomizedSearchCV(cv=2, error_score=nan,
                   estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                             

In [68]:
pd.DataFrame(param_tune.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,50.268245,0.653451,0.785901,0.009972,150,40,5,15,0.05,"{'n_estimators': 150, 'min_samples_split': 40,...",0.918635,0.915981,0.917308,0.001327,25
1,22.997163,0.043809,0.430385,0.00748,50,10,10,25,0.25,"{'n_estimators': 50, 'min_samples_split': 10, ...",0.919063,0.917564,0.918313,0.00075,19
2,17.491392,0.044015,0.287234,0.000999,50,20,5,15,0.05,"{'n_estimators': 50, 'min_samples_split': 20, ...",0.890569,0.889222,0.889896,0.000673,45
3,86.634037,0.360162,1.402255,0.00698,225,30,10,20,0.25,"{'n_estimators': 225, 'min_samples_split': 30,...",0.916217,0.91504,0.915629,0.000588,29
4,23.578959,0.158569,0.303724,0.000497,150,40,20,7,0.25,"{'n_estimators': 150, 'min_samples_split': 40,...",0.912885,0.909325,0.911105,0.00178,33
5,54.598028,0.35281,0.764939,0.010989,250,50,15,10,0.1,"{'n_estimators': 250, 'min_samples_split': 50,...",0.917874,0.914588,0.916231,0.001643,28
6,16.662722,0.542073,0.212928,0.001499,100,40,5,7,0.1,"{'n_estimators': 100, 'min_samples_split': 40,...",0.893405,0.89181,0.892608,0.000797,44
7,22.913685,0.19656,0.28426,0.000979,200,50,10,5,0.25,"{'n_estimators': 200, 'min_samples_split': 50,...",0.905779,0.90344,0.904609,0.001169,40
8,113.451305,0.088264,1.885973,0.018975,250,50,5,25,0.1,"{'n_estimators': 250, 'min_samples_split': 50,...",0.919833,0.918807,0.91932,0.000513,16
9,47.988254,0.383718,0.791887,0.0,125,20,15,20,0.25,"{'n_estimators': 125, 'min_samples_split': 20,...",0.920592,0.918853,0.919723,0.00087,13


In [69]:
param_tune.best_params_

{'n_estimators': 125,
 'min_samples_split': 50,
 'min_samples_leaf': 10,
 'max_depth': 20,
 'learning_rate': 0.15}

In [70]:
param_tune.best_score_

0.9213700292316589

In [71]:
gboost_pred = param_tune.predict(X_test)

In [72]:
print('MAE:', metrics.mean_absolute_error(y_test, gboost_pred))
print('MSE:', metrics.mean_squared_error(y_test, gboost_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, gboost_pred)))

MAE: 27262.732291845703
MSE: 1448011946.6879272
RMSE: 38052.7521565514


In [None]:
#Gradient Boosting yield slightly better results than Random Forest, with higher R2 and lower RMSE.
#However Gradient Boosting is computationally expensive and required a much longer time
#for model training. Additionally, the parameters used might not be the most optimal yet,
#as n_iter was only set to 50. Better results might be possible with different parameter
#values and more iterations.