# Practice Lab 12: 
## Ensemble Learning
In this lab we will use ensemble methods for classification. \
Based on Chapter 7 of Aurélien Géron's book, *Hands-On Machine Learning with Scikit-Learn, Keras & TensorFlow*.\
The original code examples from the book are on GitHub [here](https://github.com/ageron/handson-ml2)

<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/dtrad/geoml_course/blob/master/Practice12_Ensembles.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
sklearn.set_config(print_changed_only=False)

### Exercise 1: 
Use the moons data set (below) to try the following three classifiers:  \
1 - RandomForestClassifier, 2 - LogisticRegression, 3 - SVC.\
Then create an ensemble using the class VotingClassifier with the three classifiers above.\
Try hard and soft voting.\
Compare the four classifiers.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# 500 noisy samples of the two-moons data set, generated with a fixed seed.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
# Split into train and test parts; no sizes are given, so train_test_split's defaults apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Create three types of models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Every model gets random_state=42 so that repeated runs give the same result.
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# This SVC is built without probability=True, which is sufficient for hard voting.
svm_clf = SVC(gamma="scale", random_state=42)

In [None]:
# Combine them using a voting model: the classifiers are passed as a list of
# (name, estimator) pairs and `voting` selects the type of voting.
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard',n_jobs=-1)

# Train the ensemble on the training split.
voting_clf.fit(X_train, y_train)

In [None]:
# Score each model on its own, then the hard-voting ensemble, with sklearn's accuracy_score.
from sklearn.metrics import accuracy_score

for clf in [log_clf, rnd_clf, svm_clf, voting_clf]:
    # fit() returns the estimator itself, so training and prediction can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

In [None]:
# Soft voting: every classifier in the list needs a predict_proba method.
# SVC does not offer it by default, so it is rebuilt with probability=True
# (try first without it and check the error message).
svm_clf = SVC(gamma="scale", probability=True, random_state=42)
soft_estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)]
votings_clf = VotingClassifier(estimators=soft_estimators, voting='soft')

# Train, predict the test split and report the accuracy.
y_pred = votings_clf.fit(X_train, y_train).predict(X_test)
print(accuracy_score(y_test, y_pred))

### Exercise 2:
Train a decision tree (DT) classifier on the moons data set many times, each time with a different random seed. \
Plot the resulting decision boundaries in separate figures, and then together in one figure, to show the effect of combining classifiers.

In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=(-1.5, 2.45, -1, 1.5), alpha=0.5, contour=True):
    """Shade the regions predicted by ``clf`` and overlay the samples ``X``, ``y``.

    Everything is drawn on the current matplotlib axes.

    Args:
        clf: fitted model; its ``predict`` method is evaluated on a 100x100 grid.
        X: samples; columns 0 and 1 are used as the two plotted coordinates.
        y: labels; samples with ``y == 0`` are drawn with the "yo" marker style
            and samples with ``y == 1`` with the "bs" marker style.
        axes: plot limits as ``(x1_min, x1_max, x2_min, x2_max)``. The default
            is an immutable tuple so it cannot be modified between calls.
        alpha: opacity of the sample markers (the filled regions always use 0.3).
        contour: when true, also draw contour lines between the predicted regions.
    """
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    # One row per grid point, so the whole plane is classified in a single call.
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

In [None]:
from sklearn.tree import DecisionTreeClassifier
for i in range(15):
    # A different seed per tree, so each of the 15 trees is grown differently.
    tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)
    # Bootstrap sample: len(X_train) row indices drawn with replacement.
    indices_with_replacement = np.random.randint(0, len(X_train), len(X_train))
    # The indices range over X_train, so they must index X_train / y_train.
    # X and y are the full data set from before the split; indexing them
    # would train on rows that are not the bootstrap of the training set.
    tree_clf.fit(X_train[indices_with_replacement], y_train[indices_with_replacement])
    # Uncomment the next line to draw each tree in its own figure instead of overlaying them.
    #plt.figure()
    plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.02, contour=False)

### Exercise 3:  Bagging 
A "Bagging" classifier uses one type of classifier only.
Create a Bagging classifier formed with decision trees and compare with a decision tree. \
Try changing the DT parameters to get a similar classification (may not be possible).

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Ensemble of 100 decision trees (max_depth=5, min_samples_leaf=3), each fitted
# with max_samples=100 and bootstrap=True; training runs with n_jobs=10.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42, max_depth=5, criterion="gini", min_samples_leaf=3), n_estimators=100,
    max_samples=100, bootstrap=True, random_state=42, n_jobs=10)
bag_clf.fit(X_train, y_train)

In [None]:
# Predict the test split with the bagging ensemble.
y_pred = bag_clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Test accuracy of whatever y_pred currently holds (the bagging predictions when cells run in order).
print(accuracy_score(y_test, y_pred))

In [None]:
# Let us compare with a single Tree classifier.
# It uses the same tree settings (gini, max_depth=5, min_samples_leaf=3) as the
# trees inside the bagging ensemble, but is fitted once on the whole training split.
tree_clf = DecisionTreeClassifier(random_state=42, criterion='gini',max_depth=5, min_samples_leaf=3)
tree_clf.fit(X_train, y_train)

In [None]:
# Test accuracy of the single decision tree.
y_pred_tree = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_tree))

Let us plot the boundaries for these two classifiers (bagging and single tree)

In [None]:
# Two panels sharing the y axis: single tree on the left, bagging ensemble on the right.
fix, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)
plt.sca(axes[0])  # make the left panel current so the helper draws into it
plot_decision_boundary(tree_clf, X, y)
plt.title("Decision Tree", fontsize=14)
plt.sca(axes[1])  # switch to the right panel
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
plt.ylabel("")  # blank out the y label on the right panel
plt.show()

### Exercise 4: RandomForest
Compare the bagging classifier with a Random forest.
What is the difference between a bagging classifier made of trees and a RandomForest?

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with the same number of trees (100) and the same depth / leaf-size
# limits as the bagging ensemble it is compared with.
rnd_clf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=3, random_state=42, n_jobs=10)
rnd_clf.fit(X_train, y_train)

In [None]:
# Predict the test split with the random forest.
y_pred_rf = rnd_clf.predict(X_test)

In [None]:
# Two panels sharing the y axis: random forest on the left, bagging ensemble on the right.
fix, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)
plt.sca(axes[0])  # make the left panel current so the helper draws into it
plot_decision_boundary(rnd_clf, X, y)
plt.title("Random Forest", fontsize=14)
plt.sca(axes[1])  # switch to the right panel
plot_decision_boundary(bag_clf, X, y)
plt.title("Decision Trees with Bagging", fontsize=14)
plt.ylabel("")  # blank out the y label on the right panel
plt.show()

### Exercise 5: RandomForest vs AdaBoost
Compare the Random Forest classifier with AdaBoost.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 500 trees, each limited to depth 10 and at most 16 leaf nodes (the same tree
# limits are given to the AdaBoost base estimator for the comparison).
rnd_clf = RandomForestClassifier(max_depth=10,n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf.fit(X_train, y_train)

In [None]:

from sklearn.ensemble import AdaBoostClassifier
# The boosting algorithm is deliberately left at the library default.
# "SAMME.R" was the default in older scikit-learn releases, but it has been
# deprecated and is rejected by recent releases, so naming it explicitly
# makes this cell fail on current installs.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=10, max_leaf_nodes=16), n_estimators=500,
    learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)

In [None]:
# Two panels sharing the y axis: random forest on the left, AdaBoost on the right.
fix, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)
plt.sca(axes[0])  # make the left panel current so the helper draws into it
plot_decision_boundary(rnd_clf, X, y)
plt.title("Random Forest", fontsize=14)
plt.sca(axes[1])  # switch to the right panel
plot_decision_boundary(ada_clf, X, y)
plt.title("AdaBoost", fontsize=14)
plt.ylabel("")  # blank out the y label on the right panel
plt.show()

### Exercise 6
Illustrate how Gradient Boosting works by implementing an iterative regression in which each update is computed from the current residuals. \
For this part, do not use the scikit-learn gradient boosting class; instead, implement it with Decision Trees fitted to the current residuals. \
At each iteration, plot the residuals and the prediction of the model including all updates up to that iteration.\
Let us fit the following data:

In [None]:
# Noisy quadratic data set: 100 points with x in [-0.5, 0.5) and
# y = 3*x^2 plus Gaussian noise scaled by 0.05. X and y are reassigned here.
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

For each iteration, we will calculate a new tree that fits the residuals.\
As usual, first residuals are the data (since predictions are null for a null model).

In [None]:
# We will only use decision trees and implement the gradient boosting by hand.
from sklearn.tree import DecisionTreeRegressor
# First stage: a shallow tree fitted directly to the targets.
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

In [None]:
# Calculate residuals at each stage by subtracting predictions from data (as usual).
# y2 is what the first tree failed to explain.
y2 = y - tree_reg1.predict(X)
# Now calculate a new tree from the residuals.
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)

In [None]:
# And repeat: y3 is the residual left after the first two trees.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)

Because decision trees are non-linear, we cannot simply add the models together to obtain a single predictor.\
Instead, we have to add their predictions:
the sum of the trees' predictions $\neq$ the prediction of a single "summed" tree.

In [None]:
# Let us predict this value (0.8 lies outside the [-0.5, 0.5) range of the training x):
X_new = np.array([[0.8]])
# One partial prediction per tree; the ensemble prediction is their sum.
y_pred = [tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3)]
print('partial predictions', y_pred,'\n sum of predictions=',sum(y_pred))

Let us now plot the residuals and full data fitting at each iteration.\
We define a function that takes a regressor list, the training data set and plots both data and prediction for a regular axis.

In [None]:
def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    """Plot the data points and the summed prediction of ``regressors``.

    The prediction curve is evaluated at 500 evenly spaced points between
    ``axes[0]`` and ``axes[1]`` and is the sum of every regressor's
    ``predict`` output. ``X[:, 0]`` against ``y`` is drawn with
    ``data_style``, the curve with ``style``. A legend is added only when
    ``label`` or ``data_label`` is given, and the view is set to ``axes``.
    """
    grid = np.linspace(axes[0], axes[1], 500)
    grid_column = grid.reshape(-1, 1)

    # Accumulate the individual predictions, starting from 0 like the built-in sum().
    total = 0
    for model in regressors:
        total = total + model.predict(grid_column)

    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(grid, total, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)

First column represents partial residual fitting, second column represents full fitting.\
For each plot on the first column, we pass the current tree and the residuals.\
For each plot on the second column, we pass a list with all trees, and the data.

In [None]:
# 3x2 grid: each row is one boosting stage. The left panel shows the tree fitted
# at that stage against its targets; the right panel shows the ensemble so far
# against the original data.
plt.figure(figsize=(11,11))

# Stage 1, left: first tree against the training targets.
plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Residuals and tree predictions", fontsize=16)

# Stage 1, right: the ensemble is just the first tree.
plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Ensemble predictions", fontsize=16)

# Stage 2, left: second tree against the residuals y2.
plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=16)

# Stage 2, right: sum of the first two trees against the original data.
plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=16, rotation=0)

# Stage 3, left: third tree against the residuals y3.
plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=16)

# Stage 3, right: sum of all three trees against the original data.
plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)

plt.show()

#### Question to discuss in class: 
What is different between this method and for example steepest descent?\
Think of how we implemented Gradient Descent for Linear regression before.\
How would this work for classification?

### Exercise 7
Let us fit the same data set using scikit-learn's `GradientBoostingRegressor` class.\
Try different numbers of estimators and different maximum depths.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Library gradient boosting on the same data: 15 trees of depth 3 with learning rate 0.5.
gbrt = GradientBoostingRegressor(max_depth=3, n_estimators=15, learning_rate=.5, random_state=42)
gbrt.fit(X, y)

In [None]:
# The fitted model is passed as a one-element list because plot_predictions sums over its regressors.
plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions")
plt.title("learning_rate={}, n_estimators={}".format(gbrt.learning_rate, gbrt.n_estimators))

### Exercise 8
Do the same fitting with xgboost.

In [None]:
# xgboost is optional: start from None and keep it that way if the import
# fails, so later cells can test `xgboost is not None`.
xgboost = None
try:
    import xgboost
except ImportError:
    print("Error: the xgboost library is not installed.")

In [None]:
xgboost?

In [None]:
# Runs only when the optional xgboost import succeeded.
if xgboost is not None:  # you can install with pip if None
    xgb_reg = xgboost.XGBRegressor(n_estimators=11,max_depth=3,random_state=42)
    # fit() is called inside print, so the cell both trains the model and prints what fit() returns.
    print(xgb_reg.fit(X, y))
    plot_predictions([xgb_reg], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions")

### Exercise 9: 
Plot the feature importance for the classification of the iris data set (given code).
Try for the MNIST data set.

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()
# Fit a 500-tree random forest on the full iris data; only its feature importances are used afterwards.
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=10)
rnd_clf.fit(iris["data"], iris["target"])

In [None]:
# Print each iris feature next to its importance, rounded to three decimals.
feature_scores = zip(iris["feature_names"], rnd_clf.feature_importances_)
for name, score in feature_scores:
    print(name, "{:0.3f}".format(score))

In [None]:
# NOTE(review): tensorflow is imported even when useSL is True, and `tf` is not
# used in the visible cells — confirm before changing these imports.
from tensorflow import keras
import tensorflow as tf
# Data source switch: True loads MNIST through scikit-learn's fetch_openml,
# False uses keras.datasets.mnist.
useSL=False
if useSL:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(np.uint8)
    print(type(mnist))
    mnist.keys()  # the result is discarded here (nothing is printed)
else: # use TensorFlow/Keras
    mnist = keras.datasets.mnist

In [None]:
# Build X and y from whichever source was selected; X and y are reassigned here.
if useSL:
    X, y = mnist["data"], mnist["target"]
    print(type(X),X.shape,type(y),y.shape)
else:
    # Xt, yt (the second pair returned by load_data) are not used in the visible cells.
    (X,y),(Xt,yt) = mnist.load_data()
    X=X[:60000].reshape(60000,28*28) # flatten to 784 values per row, to make it compatible with the SL version
    y=y[:60000]

    print(type(X),X.shape,type(y),y.shape)

In [None]:
# Fit a 100-tree random forest on the pixel data; its per-pixel importances are plotted next.
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=10)
rnd_clf.fit(X, y)

In [None]:
def plot_digit(data):
    """Show ``data`` (784 values) as a 28x28 image with the "hot" colour map.

    The image is drawn on the current matplotlib axes without interpolation
    smoothing ("nearest") and with the axis decorations switched off.
    """
    pixels = data.reshape(28, 28)
    plt.imshow(pixels, cmap=mpl.cm.hot, interpolation="nearest")
    plt.axis("off")

In [None]:
# Draw the per-pixel importances as an image, one value per pixel position.
plot_digit(rnd_clf.feature_importances_)
# Colour bar with ticks only at the smallest and largest importance, given text labels.
cbar = plt.colorbar(ticks=[rnd_clf.feature_importances_.min(), rnd_clf.feature_importances_.max()])
cbar.ax.set_yticklabels(['Not important', 'Very important'])

### Exercise 10: Regression (from sklearn-documentation)

In [None]:
# Author: Noel Dawe <noel.dawe@gmail.com>
# importing necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# Data set: 100 points on [0, 6]; target = sin(x) + sin(6x) + Gaussian noise (scale 0.1)
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

# Two regressors: a single depth-6 tree, and 300 boosted depth-6 trees
regr_1 = DecisionTreeRegressor(max_depth=6)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6),
                          n_estimators=300, random_state=rng)

# Fit each model and predict on the training inputs (fit() returns the estimator)
y_1 = regr_1.fit(X, y).predict(X)
y_2 = regr_2.fit(X, y).predict(X)

# Plot the training samples with both fitted curves
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
for fitted, colour, caption in ((y_1, "g", "n_estimators=1"),
                                (y_2, "r", "n_estimators=300")):
    plt.plot(X, fitted, c=colour, label=caption, linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()