## Bagging Regressor

In [1]:
# importing required libraries

import pandas as pd
import numpy as np

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Hyperparameter tuning library
from sklearn.model_selection import GridSearchCV


## Bagging Regressor

from sklearn.ensemble import BaggingRegressor

In [2]:
# Load the Boston housing data as an object exposing .data, .target and
# .feature_names, which are the attributes the following cells read.

from sklearn import datasets

try:
    # Works on scikit-learn < 1.2 (the loader is deprecated since 1.0).
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    # load_boston was removed in scikit-learn 1.2. Rebuild the same arrays from
    # the original source, following the recipe given in scikit-learn's
    # deprecation notice (this path needs network access).
    from sklearn.utils import Bunch

    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        # Each record is spread over two consecutive rows of the raw file.
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    )


In [3]:
# Separate the feature matrix from the regression target

X_boston = boston.data
y_boston = boston.target

print(f'Dataset features names :{boston.feature_names!s}')
print(f'Dataset feature size :{X_boston.shape!s}')
print(f'Dataset target size :{y_boston.shape!s}')

Dataset features names :['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Dataset feature size :(506, 13)
Dataset target size :(506,)


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X_boston,
    y_boston,
    test_size=0.20,
    train_size=0.80,
    random_state=2,
)

# Report the shape of each piece of the split
for label, part in (
    ('Train feature size :', X_train),
    ('Train label size :', y_train),
    ('Test feature size :', X_test),
    ('Test label size :', y_test),
):
    print(label, part.shape)

Train feature size : (404, 13)
Train label size : (404,)
Test feature size : (102, 13)
Test label size : (102,)


In [5]:
# Baseline regressors to compare against the bagging ensemble

lr = LinearRegression()
# Seed the tree: its split selection involves randomness, so an unseeded tree
# can give a different R^2 each time the notebook is re-run.
dt = DecisionTreeRegressor(random_state=1)
# NOTE(review): SVR is later fitted on the raw (unscaled) features — confirm this is intended.
svr = SVR()

In [6]:
# Fit each baseline on the training split (order: linear, SVR, tree)
for regressor in (lr, svr, dt):
    regressor.fit(X_train, y_train)

dt  # last expression, so the cell still displays the fitted tree

DecisionTreeRegressor()

In [7]:
# Test-set predictions from each fitted baseline (order: linear, SVR, tree)
y_pred_lr, y_pred_svr, y_pred_dt = (
    regressor.predict(X_test) for regressor in (lr, svr, dt)
)

In [8]:
# Test-set R^2 of each baseline, printed in the same order as they were fitted
baseline_predictions = (
    ("LR", y_pred_lr),
    ("SVR", y_pred_svr),
    ("DT", y_pred_dt),
)
for short_name, predicted in baseline_predictions:
    print(f"R^2 score for {short_name} :", r2_score(y_test, predicted))

R^2 score for LR : 0.7789207451814428
R^2 score for SVR : 0.2749755336369747
R^2 score for DT : 0.7122129090467433


As we can see, Linear Regression achieved the highest test R^2 of the three baseline models in this run.

## Implementing Bagging with default parameters

In [9]:
# Bagging ensemble with every hyperparameter left at its default (seeded for repeatability)

bag_regressor = BaggingRegressor(random_state=1).fit(X_train, y_train)
bag_regressor  # display the fitted estimator as the cell output

BaggingRegressor(random_state=1)

In [10]:
# Predictions of the default bagging ensemble on the held-out test split
y_pred = bag_regressor.predict(X_test)

In [11]:
# R^2 of the default bagging ensemble on the training and test splits
train_r2 = bag_regressor.score(X_train, y_train)
test_r2 = bag_regressor.score(X_test, y_test)

print(f'Training Coefficient of R^2 : {train_r2:.3f}')
print(f'Test Coefficient of R^2 : {test_r2:.3f}')

Training Coefficient of R^2 : 0.978
Test Coefficient of R^2 : 0.861


As we can see, the training R^2 score (0.978) is noticeably higher than the test score (0.861), which suggests the model is overfitting. Next, we use GridSearchCV to look for hyperparameters that reduce this overfitting.

In [12]:
%%time

n_samples = boston.data.shape[0]
n_features = boston.data.shape[1]

params = {
            'base_estimator':[None, LinearRegression(), SVR(), DecisionTreeRegressor()],
            'n_estimators' :[10, 20, 40, 50, 60, 80, 100],
            'max_samples' : [0.25, 0.5, 0.8, 1.0],
            'max_features': [0.25, 0.5, 0.8, 1.0],
            'bootstrap': [True, False],
            'bootstrap_features': [True, False]
         }

bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1, n_jobs=-1), param_grid =params, cv=5, n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(X_train, y_train)

Fitting 5 folds for each of 1792 candidates, totalling 8960 fits
Wall time: 6min 23s


GridSearchCV(cv=5, estimator=BaggingRegressor(n_jobs=-1, random_state=1),
             n_jobs=-1,
             param_grid={'base_estimator': [None, LinearRegression(), SVR(),
                                            DecisionTreeRegressor()],
                         'bootstrap': [True, False],
                         'bootstrap_features': [True, False],
                         'max_features': [0.25, 0.5, 0.8, 1.0],
                         'max_samples': [0.25, 0.5, 0.8, 1.0],
                         'n_estimators': [10, 20, 40, 50, 60, 80, 100]},
             verbose=1)

In [13]:
# Summarise the search: the refitted best model's R^2 on both splits, the best
# mean cross-validated score, and the winning hyperparameters.
best_bagging = bagging_regressor_grid.best_estimator_
tuned_train_r2 = best_bagging.score(X_train, y_train)
tuned_test_r2 = best_bagging.score(X_test, y_test)

print(f'Train R^2 Score : {tuned_train_r2:.3f}')
print(f'Test R^2 Score : {tuned_test_r2:.3f}')
print(f'Best R^2 Score Through Grid Search : {bagging_regressor_grid.best_score_:.3f}')
print('Best Parameters : ', bagging_regressor_grid.best_params_)

Train R^2 Score : 0.980
Test R^2 Score : 0.870
Best R^2 Score Through Grid Search : 0.877
Best Parameters :  {'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 40}


We can then retrain the model separately with the best parameters found above and save it for later use.