In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [3]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier 
X,y= make_classification(n_samples=10000, n_features=10, n_informative=3, random_state=42)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# without bagging
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
y_pred = tree.predict(X_test)
print("Accuracy without bagging:", accuracy_score(y_test, y_pred))

Accuracy without bagging: 0.9265


classic bagging

In [6]:
bag= BaggingClassifier(
                        estimator=DecisionTreeClassifier(),
                        n_estimators=500,
                        max_samples=0.5,
                        bootstrap=True,
                        n_jobs= -1,
                        verbose=1,
                        random_state=42)

In [7]:
bag.fit(X_train, y_train)
y_pred_bag = bag.predict(X_test)
print("Accuracy with bagging:", accuracy_score(y_test, y_pred_bag))

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    3.1s remaining:   22.6s


Accuracy with bagging: 0.95


[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    3.5s finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.0s finished


pasting

In [8]:
pasting= BaggingClassifier(
                        estimator=DecisionTreeClassifier(),
                        n_estimators=500,
                        max_samples=0.5,
                        bootstrap=False,
                        n_jobs= -1,
                        verbose=1,
                        random_state=42)

In [9]:
pasting.fit(X_train, y_train)
y_pred_pasting = pasting.predict(X_test)
print("Accuracy with pasting:", accuracy_score(y_test, y_pred_pasting))

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    1.5s remaining:   11.0s


Accuracy with pasting: 0.9525


[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    1.9s finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.0s finished


random subspace

In [10]:
sub= BaggingClassifier(
                        estimator=DecisionTreeClassifier(),
                        n_estimators=500,
                        max_samples=1,
                        max_features=0.5,
                        bootstrap=False,
                        bootstrap_features=True, #with replacement
                        n_jobs= -1,
                        verbose=1,
                        random_state=42)

In [11]:
sub.fit(X_train, y_train)
y_pred_sub = sub.predict(X_test)
print("Accuracy with sub space:", accuracy_score(y_test, y_pred_sub))

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.1s


Accuracy with sub space: 0.4985


[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.0s finished


random patches

In [12]:
pac= BaggingClassifier(
                        estimator=DecisionTreeClassifier(),
                        n_estimators=500,
                        max_samples=0.4,
                        max_features=0.5,
                        bootstrap=False,
                        bootstrap_features=True, #with replacement
                        n_jobs= -1,
                        verbose=1,
                        random_state=42)

In [13]:
pac.fit(X_train, y_train)
y_pred_pac = pac.predict(X_test)
print("Accuracy with patches:", accuracy_score(y_test, y_pred_pac))

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.7s remaining:    5.7s


Accuracy with patches: 0.9415


[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    1.0s finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   2 out of  16 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:    0.1s finished


## oob score

In [None]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    random_state=42,
    n_jobs=-1,

)
bag.fit(X_train, y_train)
print("Out-of-bag score:", bag.oob_score_)  


Out-of-bag score: 0.942875


In [18]:
y_pred = bag.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))

Accuracy 0.945


In [27]:
bag.predict_proba

<bound method BaggingClassifier.predict_proba of BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, n_jobs=-1, oob_score=True, random_state=42)>

In this case (since the base estimator has a pre
dict_proba() method) the decision function returns the class probabilities for each
training instance.

In [None]:
bag.oob_decision_function_  

array([[0.07532468, 0.92467532],
       [0.        , 1.        ],
       [0.97461929, 0.02538071],
       ...,
       [0.01017812, 0.98982188],
       [0.98209719, 0.01790281],
       [0.99741602, 0.00258398]])

## bagging tips
Bagging generally gives better results than Pasting  
Good results come around the 25% to 50% row sampling mark  
Random patches and subspaces should be used while dealing with high dimensional data  
To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV

apply grid search for best params 

In [22]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_samples': [0.1, 0.25, 0.5],
    'max_features': [0.5, 0.75, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False]
}

In [26]:
search=GridSearchCV(
    estimator=BaggingClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1)
search.fit(X_train, y_train)
print("Best parameters:", search.best_params_)
search.best_score_

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'bootstrap': False, 'bootstrap_features': False, 'max_features': 0.75, 'max_samples': 0.5, 'n_estimators': 200}


np.float64(0.952625)