**07 – Ensemble Learning and Random Forests**

In [None]:
import numpy as np
import os
np.random.seed(42)
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Moon data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Show the train and test splits side by side with identical axes so the two
# panels can be compared directly. Both panels are drawn by the same loop body
# instead of two copy-pasted blocks.
plt.figure(figsize=(12, 4))
for position, (X_part, y_part, panel_title) in enumerate(
        [(X_train, y_train, 'train data'), (X_test, y_test, 'test data')], start=1):
    plt.subplot(1, 2, position)
    # label 0 -> yellow circles, label 1 -> blue squares
    plt.plot(X_part[:, 0][y_part == 0], X_part[:, 1][y_part == 0], "yo", alpha=0.5)
    plt.plot(X_part[:, 0][y_part == 1], X_part[:, 1][y_part == 1], "bs", alpha=0.5)
    plt.title(panel_title)
    plt.axis([-1.5, 2.5, -1, 1.5])
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)
# End with plt.show() like the other plotting cells, so the cell does not echo
# the repr of the last label object.
plt.show()

# Voting classifiers

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Exercise: replace each ?? with an estimator instance; these are presumably
# the base estimators handed to the two VotingClassifier objects below.
# NOTE(review): the name rnd_clf suggests a random forest, but
# RandomForestClassifier is only imported in a later cell. If that is the
# intended estimator, it has to be imported before this cell runs (TODO confirm).
log_clf = ??
rnd_clf = ??
svm_clf = ??

# soft or hard
# Exercise: one ensemble per voting mode, as the variable names indicate.
soft_voting_clf = VotingClassifier(??)

hard_voting_clf = VotingClassifier(??)

# train soft and hard voting classifiers
# Exercise: fill in the training method; both ensembles receive the same
# X_train / y_train split.
soft_voting_clf.??(X_train, y_train)
hard_voting_clf.??(X_train, y_train)

Generate accuracy scores for the classifiers 

In [None]:
from sklearn.metrics import accuracy_score

# Exercise placeholder: the ?? / ... lines stand for code the reader writes.
# accuracy_score is imported just above for that purpose.
??
...
??
    

# Bagging ensembles & Random Forest

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# print accuracy score of a decision tree with default parameters
# Exercise: fill in each ?? (create the tree, train it, predict, print the score).
# NOTE: a later cell passes tree_clf to plot_decision_boundary, which calls
# tree_clf.predict(...), so tree_clf must be left as a trained classifier.
# NOTE(review): the third placeholder has no dot before ??, unlike the other
# method placeholders in this notebook; a method call is presumably intended.
tree_clf = ??
tree_clf.??
y_pred_tree = tree_clf??
print(??)

In [None]:
from sklearn.ensemble import BaggingClassifier
BaggingClassifier?

In [None]:
from sklearn.ensemble import BaggingClassifier

# Use BaggingClassifier for DecisionTreeClassifier

# Exercise: fill in the constructor arguments, then train and predict.
# y_pred is compared against y_test below, so it must hold one prediction per
# X_test sample.
bag_clf = BaggingClassifier(??)
bag_clf.??
y_pred = bag_clf.??

print(accuracy_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 500 estimators, each limited to 16 leaf nodes.
# NOTE: this rebinds rnd_clf; any earlier object with that name is replaced.
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
# Exercise: train the forest.
rnd_clf.??

# Exercise: predict; y_pred_rf is compared against y_test on the next line.
y_pred_rf = rnd_clf.??
print(accuracy_score(y_test, y_pred_rf))

In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):
    """Plot the regions predicted by ``clf`` over a 2-D grid, plus the data points.

    Args:
        clf: fitted model whose ``predict`` accepts an (n, 2) array.
        X: samples; columns 0 and 1 are used as the plot coordinates.
        y: labels aligned with ``X``; points labelled 0 and 1 are drawn.
        axes: ``[x1_min, x1_max, x2_min, x2_max]``; used both as the extent of
            the prediction grid and as the final axis limits. Only read here,
            never modified.
        alpha: opacity of the data markers.
        contour: if true, overlay boundary lines on top of the filled regions.
    """
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    # One row per grid point, so the classifier receives ordinary (n, 2) input.
    X_new = np.c_[x1.ravel(), x2.ravel()]
    # Back to the grid shape so the predictions line up with x1 / x2.
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    # contourf draws filled regions; the former `linewidth=10` argument is not
    # one of its options and only produced an "unused kwargs" warning.
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    # label 0 -> yellow circles, label 1 -> blue squares
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

In [None]:
# Draw the decision boundaries of the three fitted models next to each other,
# all on the same data set.
plt.figure(figsize=(16, 4))
panels = (
    (131, tree_clf, "Decision Tree"),
    (132, bag_clf, "Decision Trees with Bagging"),
    (133, rnd_clf, "Random Forest"),
)
for subplot_code, model, caption in panels:
    plt.subplot(subplot_code)
    plot_decision_boundary(model, X, y)
    plt.title(caption, fontsize=14)
plt.show()

## Feature importance

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()
# NOTE: rebinds rnd_clf to a new forest (500 estimators, no leaf-node limit).
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)

# Exercise: pass the iris features and labels.
rnd_clf.fit(??, ??)
# check feature importances
# The next cell reads rnd_clf.feature_importances_ and pairs it with
# iris["feature_names"].
rnd_clf.??

In [None]:
# Print every iris feature name next to its importance score from the forest.
feature_scores = zip(iris["feature_names"], rnd_clf.feature_importances_)
for pair in feature_scores:
    print(*pair)

# Gradient Boosting

In [None]:
# Seed NumPy's global generator so the two random draws below are repeatable.
# The rand / randn calls must stay in this order, otherwise X and y change.
np.random.seed(42)
# 100 samples with a single feature column, shifted down by 0.5.
X = np.random.rand(100, 1) - 0.5
# Quadratic target 3*x^2 plus random noise scaled by 0.05; y is 1-D (length 100).
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

# Scatter the raw data; the same axis limits are reused by the later
# prediction plots.
plt.plot(X, y, 'b.')
plt.axis([-0.5, 0.5, -0.1, 0.8])
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)

# Gradient Boosting using DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

Fit a first DecisionTreeRegressor with `max_depth=2`

In [None]:
# Exercise: create the first regressor (the text above asks for max_depth=2)
# and train it, presumably on X and y, since it is later plotted against them.
tree_reg1 = ??
tree_reg1.??

Fit a second DecisionTreeRegressor on the residual error

In [None]:
# Exercise: y2 is the target for the second tree. The later figure plots it on
# an axis labelled "y - h_1(x_1)", i.e. y minus the first tree's predictions.
y2 = y - ?? # residual error caused by the first tree
# Exercise: create the second regressor and train it on the residuals.
tree_reg2 = ??
tree_reg2.??

Fit a third DecisionTreeRegressor on the residual error

In [None]:
# Exercise: y3 is the target for the third tree. The later figure plots it on
# an axis labelled "y - h_1(x_1) - h_2(x_1)".
y3 = ?? # residual error caused by the second tree
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
# Exercise: train the third regressor on those residuals.
tree_reg3.??

Predict the value at $x_1 = 0.8$ by summing the predictions of the three trees

In [None]:
X_new = np.array([[0.8]])

In [None]:
# Ensemble output: add up the predictions of the three trees for X_new.
y_pred = sum(map(lambda stage: stage.predict(X_new), (tree_reg1, tree_reg2, tree_reg3)))

In [None]:
y_pred

In [None]:
def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    """Plot the data points and the summed prediction curve of ``regressors``.

    The curve is evaluated at 500 evenly spaced points between ``axes[0]`` and
    ``axes[1]``; the predictions of all regressors are added together.

    Args:
        regressors: iterable of fitted models exposing ``predict``.
        X: samples; only column 0 is used as the horizontal coordinate.
        y: values plotted against ``X[:, 0]``.
        axes: ``[x_min, x_max, y_min, y_max]`` axis limits.
        label: legend entry for the prediction curve.
        style: matplotlib format string for the prediction curve.
        data_style: matplotlib format string for the data points.
        data_label: legend entry for the data points.
    """
    grid = np.linspace(axes[0], axes[1], 500)
    grid_column = grid.reshape(-1, 1)  # predict expects one row per sample
    combined = sum(model.predict(grid_column) for model in regressors)
    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(grid, combined, style, label=label, linewidth=2)
    # Only draw a legend when at least one of the two labels was supplied.
    if any((label, data_label)):
        plt.legend(fontsize=16, loc="upper center")
    plt.axis(axes)

# 3x2 grid: the left column shows each tree against the target it was fitted
# to, the right column shows the running sum of the trees against the data.
plt.figure(figsize=(11, 11))

# Row 1: first tree / ensemble of one tree.
plt.subplot(3, 2, 1)
plot_predictions(
    [tree_reg1], X, y,
    axes=[-0.5, 0.5, -0.1, 0.8],
    label="$h_1(x_1)$", style="g-", data_label="Training set",
)
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Residuals and tree predictions", fontsize=16)

plt.subplot(3, 2, 2)
plot_predictions(
    [tree_reg1], X, y,
    axes=[-0.5, 0.5, -0.1, 0.8],
    label="$h(x_1) = h_1(x_1)$", data_label="Training set",
)
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Ensemble predictions", fontsize=16)

# Row 2: second tree on y2 / ensemble of two trees.
plt.subplot(3, 2, 3)
plot_predictions(
    [tree_reg2], X, y2,
    axes=[-0.5, 0.5, -0.5, 0.5],
    label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals",
)
plt.ylabel("$y - h_1(x_1)$", fontsize=16)

plt.subplot(3, 2, 4)
plot_predictions(
    [tree_reg1, tree_reg2], X, y,
    axes=[-0.5, 0.5, -0.1, 0.8],
    label="$h(x_1) = h_1(x_1) + h_2(x_1)$",
)
plt.ylabel("$y$", fontsize=16, rotation=0)

# Row 3: third tree on y3 / ensemble of three trees.
plt.subplot(3, 2, 5)
plot_predictions(
    [tree_reg3], X, y3,
    axes=[-0.5, 0.5, -0.5, 0.5],
    label="$h_3(x_1)$", style="g-", data_style="k+",
)
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=16)

plt.subplot(3, 2, 6)
plot_predictions(
    [tree_reg1, tree_reg2, tree_reg3], X, y,
    axes=[-0.5, 0.5, -0.1, 0.8],
    label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$",
)
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)

plt.show()

# GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Exercise: create the GradientBoostingRegressor and train it.
# The plot title below reads gbrt.learning_rate and gbrt.n_estimators, so the
# values chosen here are what the figure reports.
gbrt = ??
gbrt.??

plt.figure(figsize=(8,4))
# plot_predictions sums over a list of regressors, so the single model is
# wrapped in a one-element list.
plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions")
plt.title("learning_rate={}, n_estimators={}".format(gbrt.learning_rate, gbrt.n_estimators), fontsize=14)

plt.show()

## Gradient Boosting with Early stopping

In [None]:
import numpy as np
# Exercise: import the splitting helper and use it on the regression data.
# NOTE: train_test_split was already imported from sklearn.model_selection in
# the Moon data section.
from sklearn.model_selection import ?? #method to split train and test data
# NOTE(review): this rebinds X_train / y_train, replacing the moons split
# created earlier. Re-run the earlier cells before reusing the classification
# data.
X_train, X_val, y_train, y_val = ??(X, y, random_state=49)

In [None]:
# n_estimators=120 matches the 0..120 x-range hard-coded in the
# validation-error plot further down; keep the two in sync if this changes.
gbrt = GradientBoostingRegressor(max_depth=3, n_estimators=120, random_state=42)
# Exercise: train on the training part of the split created above.
gbrt.??(??, ??) #train

Measure the validation error at each stage of training and find the best number of trees.

In [None]:
from sklearn.metrics import mean_squared_error

# lists of mean squared validation errors at each stage
# use gbrt.staged_predict() to generate prediction at each stage of training

# errors is later plotted against its index and passed to np.min, so it should
# be a sequence with one error value per training stage.
errors = ??
# bst_n_estimators is later used as the x-position of the minimum and printed
# as the tree count ("%d trees").
# NOTE(review): if errors[i] belongs to the model with i + 1 trees, the index
# of the minimum is one less than the number of trees. Confirm whether a +1
# offset is needed.
bst_n_estimators = np.??(errors)

Train another GBRT with `n_estimators` set to `bst_n_estimators`

In [None]:
# Exercise: fill in the constructor arguments (the text above asks for the
# best number of trees found in the previous cell).
gbrt_best = GradientBoostingRegressor(??)

# Retrain on the training split only; gbrt_best is plotted in the final figure.
gbrt_best.fit(X_train, y_train)

In [None]:
# Smallest validation error over all stages; used to place the guide lines and
# the "Minimum" label in the plot below.
min_error = np.min(errors)

In [None]:
# Left panel: validation error per stage, with dashed guides through the minimum.
# Right panel: predictions of the model retrained with the selected tree count.
plt.figure(figsize=(11, 4))

plt.subplot(1, 2, 1)
plt.plot(errors, "b.-")
plt.plot([bst_n_estimators] * 2, [0, min_error], "k--")  # vertical guide
plt.plot([0, 120], [min_error] * 2, "k--")               # horizontal guide
plt.plot(bst_n_estimators, min_error, "ko")
plt.text(bst_n_estimators, 1.2 * min_error, "Minimum", fontsize=14, ha="center")
plt.axis([0, 120, 0, 0.01])
plt.xlabel("Number of trees")
plt.title("Validation error", fontsize=14)

plt.subplot(1, 2, 2)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("Best model (%d trees)" % bst_n_estimators, fontsize=14)

plt.show()