# Bagging, Boosting, and Stacking Ensemble Methods

## Bagging

![bagging](https://upload.wikimedia.org/wikipedia/commons/c/c8/Ensemble_Bagging.svg)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
#only for mac or when you get an ssl warning
import ssl
ssl._create_default_https_context = ssl._create_unverified_context #deactivate SSL Cert Error

In [4]:
url="http://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

col_headers = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

df_diab = pd.read_csv(url, names=col_headers)
df_diab.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
len(df_diab)

768

In [6]:
# split the data into X and y
X = df_diab.drop(columns='class', axis=1)
y = df_diab['class']

In [7]:
# set up cross validation
k_folds = KFold(n_splits=10)

# build the model
DTC = DecisionTreeClassifier()

In [8]:
BC = BaggingClassifier(estimator=DTC,
                       n_estimators=80, #deploying 80 decision trees
                         random_state=12)

In [9]:
results = cross_val_score(BC, X, y, cv=k_folds)

In [10]:
results

array([0.64935065, 0.83116883, 0.72727273, 0.66233766, 0.77922078,
       0.83116883, 0.83116883, 0.85714286, 0.71052632, 0.78947368])

In [11]:
results.mean()

0.7668831168831168

In [12]:
BC.fit(X, y)

In [13]:
y_pred = BC.predict(X)

In [14]:
y_pred

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

In [15]:
accuracy_score(y,y_pred)

1.0

## Boosting

![Boosting](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/Ensemble_Boosting.svg/1920px-Ensemble_Boosting.svg.png)

In [16]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier

In [17]:
bc = load_breast_cancer()

df_bc = pd.DataFrame(bc.data, columns=bc.feature_names)
df_bc.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [18]:
df_bc.shape

(569, 30)

In [19]:
df_bc.isnull().sum().sum()

0

In [20]:
X = bc.data
y = bc.target

In [21]:
k_folds = KFold(n_splits=8, random_state=5, shuffle=True) #to use random state, you need to have shuffle=True

In [22]:
GB = GradientBoostingClassifier()

In [23]:
#give me the specified or default parameters of the model
GB.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [24]:
results = cross_val_score(GB, X, y, cv=k_folds)
results

array([0.97222222, 0.98591549, 1.        , 0.97183099, 0.95774648,
       0.98591549, 1.        , 0.88732394])

## Stacking
![stacking](https://miro.medium.com/v2/resize:fit:946/1*T-JHq4AK3dyRNi7gpn9-Xw.png)

**Stacking** is a complex ensemble learning method that works by training a meta-model to combine the predictions of multiple base models. The meta-model is trained on the predictions of the base models, and it learns to weight the predictions of each base model to produce a more accurate prediction.

In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier

# Load a dataset (e.g., Iris dataset)
data = load_wine()
X, y = data.data, data.target



In [2]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)



In [3]:
# Define 2 different base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=2)),
    ('lr', LogisticRegression(max_iter=10000, random_state=2))
]

# Define the meta-model
meta_model = LogisticRegression(max_iter=1000, random_state=2)



In [4]:
# Create the stacking classifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)

# Train the stacking model
stacking_model.fit(X_train, y_train)



In [7]:
# Make predictions
y_pred = stacking_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', round(accuracy,2))


Accuracy: 0.94


## Comparison

| Method | How it works | Pros | Cons |
|---|---|---|---|
| Bagging | Trains multiple copies of the same model on different subsets of the training data. | Reduces variance. | Can be computationally expensive. |
| Boosting | Trains multiple models sequentially, each model focusing on the errors made by the previous models. | Reduces bias. | Can be more computationally expensive than bagging. |
| Stacking | Trains a meta-model to combine the predictions of multiple base models. | Can achieve better accuracy than bagging or boosting. | Can be more computationally expensive than bagging or boosting. |