In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Hard Voting

In [7]:
from sklearn.svm import SVC

In [8]:
voting_clf = VotingClassifier(
estimators=[
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('svc', SVC(random_state=42))
])

voting_clf.fit(X_train, y_train)

In [9]:
voting_clf.named_estimators_

{'lr': LogisticRegression(random_state=42),
 'rf': RandomForestClassifier(random_state=42),
 'svc': SVC(random_state=42)}

In [10]:
for name, clf in voting_clf.named_estimators_.items():
    print(name, '=', clf.score(X_test, y_test))

lr = 0.864
rf = 0.896
svc = 0.896


In [11]:
voting_clf.predict(X_test[:1])

array([1], dtype=int64)

In [12]:
voting_clf.estimators_

[LogisticRegression(random_state=42),
 RandomForestClassifier(random_state=42),
 SVC(random_state=42)]

In [None]:
# Generating predictions for thr first sample of the test set using individual base estimators

In [13]:
[clf.predict(X_test[:1]) for clf in voting_clf.estimators_]

[array([1], dtype=int64), array([1], dtype=int64), array([0], dtype=int64)]

In [14]:
voting_clf.score(X_test, y_test)

0.912

# Soft Voting

In [18]:
# configure the VotingClassifier for soft voting and enabling probability estimators for SVC

In [19]:
voting_clf.voting = "soft"
voting_clf.named_estimators['svc'].probability = True
voting_clf.fit(X_train, y_train)

In [20]:
voting_clf.score(X_test, y_test)

0.92

# Bagging and Pasting

# Bagging (Bootstrap Aggregating):
Uses the same data multiple times: Bagging involves creating multiple bootstrap samples (random samples with replacement) from the original dataset. These samples may contain duplicate instances.

Trains different parts of the same data with the same model: Each bootstrap sample is used to train a separate instance of the same model. The final prediction is then an average or majority vote of the predictions from each model.

Pasting
Uses it once and does not repeat: Pasting also involves creating multiple samples, but without replacement. Each instance is selected only once in each sample.

Trains different parts of the same data with the same model: Similar to bagging, pasting trains separate models on different samples.

So, in both bagging and pasting, different parts of the same data are used to train different models. The main difference lies in whether the sampling is done with or without replacement.

Bagging samples with replacement, while pasting samples without replacement. Both techniques aim to reduce overfitting and improve model generalization by introducing diversity through multiple models trained on different subsets of the data.

In [22]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [25]:
# Create a BaggingClassifier with DecisionTreeClassifier as the base estimator
bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                           n_estimators=500,
                           max_samples=100,
                           n_jobs=-1,
                           random_state=42)

In [26]:
bag_clf.fit(X_train, y_train)

In [27]:
bag_clf.score(X_test, y_test)

0.904

# Out-of-Bag Evaluation

In [28]:
# Create a baggingClassifier with out-of-bag (OOB) score calculation
bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                           n_estimators=500,
                           oob_score=True,
                           n_jobs=-1,
                           random_state=42)

In [29]:
bag_clf.fit(X_train, y_train)

In [30]:
bag_clf.oob_score_

0.896

# In bagging, on average, each base model is trained on about 63% of the original dataset. This is because, during bootstrap sampling, about 37% of the data points are not included in each bootstrap sample.

When you set the oob_score parameter to True in a bagging ensemble (e.g., BaggingClassifier or BaggingRegressor in scikit-learn), it indicates that you want to evaluate the model's performance on the out-of-bag (OOB) instances – the approximately 37% of data points that were not included in the bootstrap sample for each base model.

The OOB score serves as a validation metric without the need for a separate validation set. It helps estimate how well the ensemble is likely to generalize to unseen data. This is a useful feature of bagging, providing a built-in validation mechanism during the training process.

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
y_pred = bag_clf.predict(X_test)

In [33]:
accuracy_score(y_test, y_pred)

0.92

# Random Forests

In [35]:
rnd_clf = RandomForestClassifier(n_estimators=500,
                                max_leaf_nodes=16,
                                n_jobs=-1,
                                random_state=42)

In [36]:
rnd_clf.fit(X_train, y_train)

In [37]:
y_pred_rf = rnd_clf.predict(X_test)

In [38]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(max_features='sqrt', max_leaf_nodes=16),
                           n_estimators=500,
                           n_jobs=-1,
                           random_state=42)

In [39]:
bag_clf.fit(X_train, y_train)

In [40]:
y_pred_bag = bag_clf.predict(X_test)

In [41]:
np.all(y_pred_bag == y_pred_rf)

True

# Feature Importance

In [42]:
from sklearn.datasets import load_iris

In [44]:
iris = load_iris(as_frame=True)
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)

In [45]:
rnd_clf.fit(iris.data, iris.target)

In [46]:
for score, anme in zip(rnd_clf.feature_importances_, iris.data.columns):
    print(round(score, 2) * 100, '%', name)

11.0 % svc
2.0 % svc
44.0 % svc
42.0 % svc


# Boosting

# AdaBoosting

In [48]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=30,
    learning_rate=0.5,
    random_state=42
)

In [49]:
ada_clf.fit(X_train, y_train)

In [50]:
ada_clf.score(X_test, y_test)

0.904

# Gradient Boosting

In [51]:
from sklearn.tree import DecisionTreeRegressor

In [52]:
np.random.seed(42)

In [54]:
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100)

In [55]:
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)

In [56]:
tree_reg1.fit(X, y)

In [61]:
y2 = y - tree_reg1.predict(X)

In [62]:
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=43)

In [63]:
tree_reg2.fit(X, y2)

In [64]:
y3 = y2 - tree_reg1.predict(X)

In [65]:
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=43)

In [67]:
tree_reg3.fit(X, y3)

In [73]:
X_new = np.array([[-0.4], [0.], [0.5]])
sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

array([-5.55111512e-17, -9.03979359e-02,  3.99129745e-02])

In [69]:
# Instead f writing the above manually, we can use GradientBoostingRegressor

In [70]:
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2,
                                n_estimators=3,
                                learning_rate=1.0,
                                random_state=42)

In [71]:
gbrt.fit(X, y)

# As the learning rate increases, we reduce the number of estimators so that it is possible to arrive at a reach point

When the learning rate is increased, it means that each weak learner's contribution to the ensemble is stronger. To avoid overfitting and find an optimal trade-off between accuracy and generalization, it's common to reduce the number of estimators (trees) in the ensemble.

Here's a breakdown of the idea:

Low Learning Rate (e.g., 0.01 or 0.1):
Each tree has a smaller impact on the final prediction.
Requires a larger number of trees to fit the model well.
High Learning Rate (e.g., 0.5 or 1.0):
Each tree has a more substantial impact on the final prediction.
Requires a smaller number of trees to fit the model well.
By decreasing the number of trees as the learning rate increases, you can control the complexity of the overall model. This helps prevent overfitting and ensures that the boosting algorithm converges to a good solution. This strategy is often part of the hyperparameter tuning process in gradient boosting, where the learning rate and the number of estimators are tuned together.

In scikit-learn'\s GradientBoostingRegressor or GradientBoostingClassifier, you can achieve this by adjusting the n_estimators parameter based on your chosen learning rate.

In [79]:
gbrt_best = GradientBoostingRegressor(
    max_depth=2,
    learning_rate=0.05,
    n_estimators=500,
    n_iter_no_change=10,
    random_state=42)

In [80]:
gbrt_best.fit(X, y)

# AdaBoost (Adaptive Boosting):
Takes all the data and corrects errors: AdaBoost adapts by assigning different weights to instances in the training set. It focuses more on instances that were misclassified by the previous weak learners.

Global approach to correcting errors: The entire dataset is considered during each iteration, and weights are adjusted to emphasize correcting errors made on the entire dataset.

Gradient Boosting:
Selects and corrects errors through gradients: Gradient boosting builds a sequence of weak learners, and each learner corrects the errors of the combined ensemble of the previous learners. The emphasis is on minimizing the loss function by finding the gradient of the loss with respect to the predicted values.
Local approach to correcting errors: Each new weak learner in gradient boosting focuses on the errors made by the current ensemble, rather than adjusting weights for the entire dataset. This leads to a more localized and targeted correction of errors.

In [81]:
gbrt_best.n_estimators_

92

# StackingClassifier

In [82]:
from sklearn.ensemble import StackingClassifier

In [83]:
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svm', SVC(probability=True, random_state=42))
    ],
    final_estimator = RandomForestClassifier(random_state=43), # final estimator for blending estimators
    cv=5
)

In [84]:
stacking_clf.fit(X_train, y_train)

In [85]:
stacking_clf.score(X_test, y_test)

0.928

# Mentor Hours

In [86]:
df = pd.read_csv('mnist_784.csv')

In [87]:
df

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784,class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
69996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
69997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
69998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [88]:
X = df.drop('class', axis=1)
y = df['class'].copy()

In [92]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2)

In [94]:
X_train = X[:50000]
X_valid = X[10000:]
y_train = y[:50000]
y_valid = y[10000:]

In [95]:
y_train_1 = (y_train == 1)
y_valid_1 = (y_valid == 1)

# Hard Voting

In [96]:
voting_clf = VotingClassifier(
estimators=[
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('svc', SVC(random_state=42))
])

voting_clf.fit(X_train, y_train_1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [97]:
voting_clf.named_estimators_

{'lr': LogisticRegression(random_state=42),
 'rf': RandomForestClassifier(random_state=42),
 'svc': SVC(random_state=42)}

In [98]:
for name, clf in voting_clf.named_estimators_.items():
    print(name, '=', clf.score(X_valid, y_valid_1))

lr = 0.9936666666666667
rf = 0.9989166666666667
svc = 0.99815


In [99]:
voting_clf.predict(X_valid[:1])

array([False])

In [100]:
voting_clf.estimators_

[LogisticRegression(random_state=42),
 RandomForestClassifier(random_state=42),
 SVC(random_state=42)]

In [101]:
[clf.predict(X_valid[:1]) for clf in voting_clf.estimators_]

[array([0], dtype=int64), array([0], dtype=int64), array([0], dtype=int64)]

In [102]:
voting_clf.score(X_valid, y_valid_1)

0.9982666666666666

# Soft Voting

In [103]:
voting_clf.voting = "soft"
voting_clf.named_estimators['svc'].probability = True
voting_clf.fit(X_valid, y_valid_1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [104]:
voting_clf.score(X_valid, y_valid_1)

0.9985666666666667