In [58]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [59]:
X,y = make_classification(n_samples=10000, n_features=10, n_informative=2)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=2)

In [61]:
dt = DecisionTreeClassifier(random_state=2)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
print('Accuracy score: ', accuracy_score(y_test, y_pred))

Accuracy score:  0.8555


## 1. Bagging

In [62]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5,
    bootstrap=True,
    random_state=42
    
)

bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print('Bagging score: ', accuracy_score(y_test, y_pred))

Bagging score:  0.906


## 2. Bagging Using SVM

In [63]:
bag = BaggingClassifier(
    estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=2
)
bag.fit(X_train,y_train)
y_pred = bag.predict(X_test)
print('Accuracy of SVM: ', accuracy_score(y_test, y_pred))

Accuracy of SVM:  0.9145


## 3. Pasting

In [65]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25, 
    bootstrap=False,
    random_state=2,
    verbose=1,
    n_jobs=-1
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy of Pasting: ', accuracy_score(y_test, y_pred))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    6.0s remaining:   18.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    6.2s finished


Accuracy of Pasting:  0.9095


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.1s finished


## 4. Random Subspaces

In [66]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,
    bootstrap_features=True,
    max_features = 0.5,
    random_state=2
    
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy of Random subspaces: ', accuracy_score(y_test, y_pred))

Accuracy of Random subspaces:  0.911


In [67]:
bag.estimators_samples_[0].shape

(8000,)

In [68]:
bag.estimators_features_[5].shape

(5,)

## 4. Random Patches

In [85]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    max_features=0.5,
    bootstrap_features=True,
    random_state=2,
    verbose=1,
    n_jobs=-1
)

bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('Accuracy score of random patches: ', accuracy_score(y_test, y_pred))

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    5.1s remaining:   15.4s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    5.3s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


Accuracy score of random patches:  0.9125


[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.2s finished


In [78]:
bag.estimators_samples_

[array([3314, 3532, 5663, ..., 3523, 7892, 7609], dtype=int32),
 array([5951, 6631, 3196, ..., 3369, 3939,  578], dtype=int32),
 array([   0, 7708,  385, ..., 6809, 7035, 7969], dtype=int32),
 array([1180, 6722, 3129, ..., 4072, 5397, 1001], dtype=int32),
 array([6192, 4869, 1168, ..., 6095, 6713,  643], dtype=int32),
 array([3316, 3831,  147, ..., 4205, 4272, 6892], dtype=int32),
 array([6489, 5699, 5765, ..., 4018,  968, 3880], dtype=int32),
 array([2855, 2831, 7260, ..., 6645, 7951, 1700], dtype=int32),
 array([ 537, 1627,  517, ..., 4411,  675, 6290], dtype=int32),
 array([6807, 7373, 1063, ..., 5889, 4072, 7289], dtype=int32),
 array([6078, 7870, 5240, ..., 3635,  132, 5776], dtype=int32),
 array([4194, 1988, 5419, ...,  734, 5872,  789], dtype=int32),
 array([4182, 6580, 4523, ..., 5571, 6127, 4325], dtype=int32),
 array([4733, 6916,  852, ..., 3063, 6483, 5084], dtype=int32),
 array([2451, 7653, 3563, ..., 3282,  914, 6249], dtype=int32),
 array([ 239, 3864, 5428, ..., 1291, 488

In [80]:
bag.estimators_features_

[array([3, 3, 4, 2, 9], dtype=int32),
 array([1, 4, 7, 6, 6], dtype=int32),
 array([0, 2, 8, 7, 4], dtype=int32),
 array([9, 1, 0, 1, 9], dtype=int32),
 array([5, 6, 8, 6, 2], dtype=int32),
 array([6, 2, 1, 9, 6], dtype=int32),
 array([2, 2, 7, 6, 3], dtype=int32),
 array([1, 6, 1, 0, 4], dtype=int32),
 array([8, 3, 5, 7, 1], dtype=int32),
 array([5, 1, 0, 4, 9], dtype=int32),
 array([3, 5, 6, 9, 3], dtype=int32),
 array([0, 1, 0, 4, 8], dtype=int32),
 array([4, 9, 1, 8, 6], dtype=int32),
 array([0, 7, 7, 5, 4], dtype=int32),
 array([0, 0, 4, 2, 2], dtype=int32),
 array([0, 7, 9, 2, 0], dtype=int32),
 array([1, 4, 1, 3, 1], dtype=int32),
 array([3, 0, 6, 7, 6], dtype=int32),
 array([7, 3, 6, 4, 4], dtype=int32),
 array([0, 5, 9, 1, 9], dtype=int32),
 array([8, 7, 5, 6, 6], dtype=int32),
 array([7, 7, 9, 1, 8], dtype=int32),
 array([1, 7, 1, 8, 2], dtype=int32),
 array([6, 8, 1, 5, 2], dtype=int32),
 array([4, 8, 4, 1, 6], dtype=int32),
 array([1, 7, 3, 3, 6], dtype=int32),
 array([8, 7

## 5. OOB Score

In [87]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.5,
    bootstrap=True,
    oob_score=True,
    random_state=2

)
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
print('OOB score: ', bag.oob_score_)
print('Accuracy score: ', accuracy_score(y_test, y_pred))

OOB score:  0.898625
Accuracy score:  0.9115


## Applying Grid search CV

In [89]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [104]:
parameters = {
    # 'estimators': [
    #     DecisionTreeClassifier(),
    #     LogisticRegression(max_iter=1000),
    #     SVC(probability=True)
    # ],
    'n_estimators': [50, 100],
    'max_samples': [0.5,1.0],
    'bootstrap': [True, False],
    'max_features': [0.5,1.0],
    'bootstrap_features': [True, False]
}

In [105]:
grid = GridSearchCV(BaggingClassifier(), parameters, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [100]:
grid.best_estimator_

In [101]:
grid.best_params_

{'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 0.5,
 'max_samples': 0.5,
 'n_estimators': 50}