# Ensemble Learning

In [1]:
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

voting_clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=42)),
        ("rf", RandomForestClassifier(random_state=42)),
        ("svc", SVC(random_state=42)),
    ]
)
voting_clf.fit(X_train, y_train)

### Examine Accuracy

In [2]:
for name, clf in voting_clf.named_estimators_.items():
    print(f"{name} = {clf.score(X_test, y_test)}")

lr = 0.864
rf = 0.896
svc = 0.896


In [3]:
voting_clf.predict(X_test[:1])

array([1], dtype=int64)

In [4]:
pred = [clf.predict(X_test[:1]) for clf in voting_clf.estimators_]
print(pred)

[array([1], dtype=int64), array([1], dtype=int64), array([0], dtype=int64)]


In [5]:
voting_clf.score(X_test, y_test)

0.912

### Using Soft Voting

In [6]:
voting_clf.voting = "soft"
voting_clf.named_estimators["svc"].probability = True
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.92

### Bagging and Pasting

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag = BaggingClassifier(DecisionTreeClassifier(), n_jobs=-1, n_estimators=1000, max_samples=100, random_state=42)
bag.fit(X_train, y_train)

In [8]:
# OOB Eval
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, oob_score=True, n_jobs=-1, random_state=42)

bag_clf.fit(X_train, y_train)

bag_clf.oob_score_

0.896

In [9]:
from sklearn.metrics import accuracy_score

y_pred = bag_clf.predict(X_test)

accuracy_score(y_test, y_pred)

0.92

In [10]:
bag_clf.oob_decision_function_[:3]

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ]])

In [11]:
# Random Patches and Random Subspaces

# Random Forests
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)

rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

In [13]:
# Equivalent to the above RF
bag_clf = BaggingClassifier(DecisionTreeClassifier(max_features="sqrt", max_leaf_nodes=16), random_state=42, n_estimators=500, n_jobs=-1)

bag_clf.fit(X_train, y_train)

y_pred = bag_clf.predict(X_test)

accuracy_score(y_test, y_pred)

0.912

In [14]:
# Extra Trees. Just random forests with randomized thresholds

# Feature Importance
from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
rf_clf = RandomForestClassifier(random_state=42, n_estimators=500)
rf_clf.fit(iris.data, iris.target)

for score, name in zip(rf_clf.feature_importances_, iris.data.columns):
    print(round(score, 2), name)


0.11 sepal length (cm)
0.02 sepal width (cm)
0.44 petal length (cm)
0.42 petal width (cm)


In [19]:
from sklearn.ensemble import AdaBoostClassifier

ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), random_state=42, n_estimators=3000, learning_rate=0.5)
ab_clf.fit(X_train, y_train)
y_pred = ab_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

In [21]:
# Gradient Boosting
# Generate Quadratic Dataset
import numpy as np
from sklearn.tree import DecisionTreeRegressor

np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100)
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

In [22]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(random_state=42, max_depth=2)
tree_reg2.fit(X, y2)

In [23]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)

In [24]:
X_new = np.array([[-0.4], [0.1], [0.5]])
sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

array([0.49484029, 0.04021166, 0.75026781])

In [26]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=300, learning_rate=1.0, random_state=42)
gbrt.fit(X, y)

In [27]:
gbrt_best = GradientBoostingRegressor(max_depth=2, learning_rate=0.05, n_iter_no_change=10, random_state=42, n_estimators=500)
gbrt_best.fit(X, y)
gbrt_best.n_estimators_

92

In [34]:
# Histogram-Based Gradent Boosting. Bins the training data and optimized for large training sets
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing(as_frame=True)

print(housing)

hgb_reg = HistGradientBoostingRegressor(random_state=42)
hgb_reg.fit(housing.data, housing.target)

{'data':        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -1

In [11]:
# EX 1:
# Yes if you combine the models you may get a slightly better accuracy due to the law of large numbers. The increase would be marginal

# EX 2:
# A hard voting classifier just takes the most popular vote and a soft voting classifier takes the average

# EX 3:
# Bagging can be parallelized
# Pasting can be parallelized
# Boosting CAN NOT be parallelized
# Random forests CAN be parallelized
# Stacking can NOT be parallelized

# EX 4:
# It allows you to test on an independent data set

# EX 5:
# Extra trees use randomized thresholds which makes them more random than a RF. They are slower

# EX 6:
# You should increase the max depth and decrease the learning rate

# EX 7:
# You should increase the learning rate because you could be hitting local minima

# EX 8:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

digits = load_digits(as_frame=True)

X = digits.data
y = digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf_clf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(f"RandomForest: {accuracy_score(y_test, y_pred)}")

svc_clf = SVC(random_state=42)
svc_clf.fit(X_train, y_train)
y_pred = svc_clf.predict(X_test)
print(f"SVC: {accuracy_score(y_test, y_pred)}")

mlp_clf = MLPClassifier(random_state=42)
mlp_clf.fit(X_train, y_train)
y_pred = mlp_clf.predict(X_test)
print(f"MLP: {accuracy_score(y_test, y_pred)}") 


RandomForest: 0.9711111111111111
SVC: 0.9866666666666667
MLP: 0.9711111111111111
