In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Creating a dummy dataset

In [2]:
from sklearn.datasets import make_classification

X,y = make_classification(n_samples=9999,n_features = 10, n_informative = 5)

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=28)

In [3]:
print(X_train.shape)
print(X_test.shape)
print()
print(y_train.shape)
print(y_test.shape)

(6999, 10)
(3000, 10)

(6999,)
(3000,)


# Creating StandAlone Models and Checking their accuracy

## Decision Tree Base Model

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier(max_depth=4,random_state=28)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print("Accuracy of Decision Tree: ",np.round(accuracy_score(y_test,y_pred),4))

Accuracy of Decision Tree:  0.8077


## Bagging Classifier : Row sampling with replacement

In [5]:
from sklearn.ensemble import BaggingClassifier

clf_bagging = BaggingClassifier(estimator= DecisionTreeClassifier(random_state=28,max_depth=4),
                               n_estimators = 250,
                               max_samples = 0.3,
                               bootstrap=True,
                                random_state=28,
                               n_jobs = -1)

clf_bagging.fit(X_train,y_train)

y_pred = clf_bagging.predict(X_test)
print("Accuracy of Bagging: ",np.round(accuracy_score(y_test,y_pred),4))
print("Nos of samples used per base: ",clf_bagging.estimators_samples_[0].shape)
print("Nos of fetaures used: ",clf_bagging.estimators_features_[0].shape)

Accuracy of Bagging:  0.8147
Nos of samples used per base:  (2099,)
Nos of fetaures used:  (10,)


In [6]:
clf_bagging.estimators_features_[0]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

## Bagging Classifier: Pasting (bootstarp wo replacement)

In [7]:
from sklearn.ensemble import BaggingClassifier

clf_bagging = BaggingClassifier(estimator= DecisionTreeClassifier(random_state=28,max_depth=4),
                               n_estimators = 250,
                               max_samples = 0.3,
                               bootstrap=False,
                                random_state=28,
                               n_jobs = -1)

clf_bagging.fit(X_train,y_train)

y_pred = clf_bagging.predict(X_test)
print("Accuracy of Bagging: ",np.round(accuracy_score(y_test,y_pred),4))
print("Nos of samples used per base: ",clf_bagging.estimators_samples_[0].shape)
print("Nos of fetaures used: ",clf_bagging.estimators_features_[0].shape)

Accuracy of Bagging:  0.8133
Nos of samples used per base:  (2099,)
Nos of fetaures used:  (10,)


## Random Subspaces: Column sampling but not row sampling

In [8]:
from sklearn.ensemble import BaggingClassifier

clf_bagging = BaggingClassifier(estimator= DecisionTreeClassifier(random_state=28,max_depth=4),
                               n_estimators = 250,
                               max_samples = 1.0,
                               bootstrap=False,
                                max_features=0.5,
                                random_state=28,
                               n_jobs = -1)

clf_bagging.fit(X_train,y_train)

y_pred = clf_bagging.predict(X_test)
print("Accuracy of Bagging: ",np.round(accuracy_score(y_test,y_pred),4))
print("Nos of samples used per base: ",clf_bagging.estimators_samples_[0].shape)
print("Nos of fetaures used: ",clf_bagging.estimators_features_[0].shape)

Accuracy of Bagging:  0.8193
Nos of samples used per base:  (6999,)
Nos of fetaures used:  (5,)


## Random patches: Both Row and Column Sampling

In [9]:
from sklearn.ensemble import BaggingClassifier

clf_bagging = BaggingClassifier(estimator= DecisionTreeClassifier(random_state=28,max_depth=4),
                               n_estimators = 250,
                               max_samples = 0.3,
                               bootstrap=False,
                                 max_features=0.5,
                                random_state=28,
                               n_jobs = -1)

clf_bagging.fit(X_train,y_train)

y_pred = clf_bagging.predict(X_test)
print("Accuracy of Bagging: ",np.round(accuracy_score(y_test,y_pred),4))
print("Nos of samples used per base: ",clf_bagging.estimators_samples_[0].shape)
print("Nos of fetaures used: ",clf_bagging.estimators_features_[0].shape)

Accuracy of Bagging:  0.8217
Nos of samples used per base:  (2099,)
Nos of fetaures used:  (5,)


Both row and Column sampling gives best results when dealing with high dimensional data.  
Row sampling between 25-50% gives best results.  
Column Sampling between 0.4 to 0.7


## OOB Score

In [10]:
from sklearn.ensemble import BaggingClassifier

clf_bagging = BaggingClassifier(estimator= DecisionTreeClassifier(random_state=28,max_depth=4),
                               n_estimators = 250,
                               max_samples = 0.3,
                               bootstrap=True,
                                 max_features=0.5,
                                random_state=28,
                                oob_score= True,
                               n_jobs = -1)

clf_bagging.fit(X_train,y_train)

y_pred = clf_bagging.predict(X_test)
print("Accuracy of Bagging: ",np.round(accuracy_score(y_test,y_pred),4))
print("Nos of samples used per base: ",clf_bagging.estimators_samples_[0].shape)
print("Nos of fetaures used: ",clf_bagging.estimators_features_[0].shape)
print("OOB Accuracy: ",np.round(clf_bagging.oob_score_,4))

Accuracy of Bagging:  0.819
Nos of samples used per base:  (2099,)
Nos of fetaures used:  (5,)
OOB Accuracy:  0.821


## Grid SearchCV for best Parameters

In [11]:
from sklearn.model_selection import GridSearchCV

param_dict = {
    'n_estimators': [100,200,400],
    'max_samples': [0.2,0.4,0.6,1.0],
    'max_features': [0.2,0.4,0.6,1.0],
    'bootstrap': [True,False]
}

grid = GridSearchCV(BaggingClassifier(),param_dict,cv = 5,n_jobs = -1)

grid.fit(X_train,y_train)

In [12]:
grid.best_params_

{'bootstrap': True,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 400}

In [13]:
grid.best_score_

0.9005570305320127