___

# Machine Learning in Geosciences
Department of Applied Geoinformatics and Cartography, Charles University

Lukas Brodsky lukas.brodsky@natur.cuni.cz


## Boosting


This notebook covers the following topics in boosting ensemble learning:

* Adaptive Boosting

* Gradient Boosting 

* Extreme Gradient Boosting

* Plotting the model

# Setup

In [None]:
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# Project dir
PROJECT_DIR = "./"
# Tell the reader right away whether the configured directory exists,
# so a wrong path is noticed before any later cell relies on it.
print('Ok continue.' if os.path.isdir(PROJECT_DIR)
      else 'Nok, set correct path to your project directory!')

In [None]:
# Install new packages 
# pip3 install xgboost
import xgboost 
# NOTE: if the import above fails with "OpenMP runtime is not installed",
# install the OpenMP package first -- xgboost depends on it.

# Optional dependency: graphviz==0.16
# pip3 install graphviz

In [None]:
# Prepare simulated data (moons) 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Simulated two-class "moons" data set with noise; both the generator and the
# split are seeded so every run produces the same samples and the same partition.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)
# No test_size is given, so the library's default split ratio applies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
)

# AdaBoost

In [None]:
# Run AdaBoost with DecisionTreeClassifier 

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over 200 decision stumps (depth-1 trees), each contribution shrunk
# by learning_rate=0.5.
# The `algorithm` argument is intentionally not passed: "SAMME.R" was deprecated
# in scikit-learn 1.4 and is rejected by newer releases, so hard-coding it makes
# this cell fail on a current installation. Leaving it out uses whatever the
# installed version supports (SAMME.R on old releases, SAMME on new ones).
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)

In [None]:
# Plot decision boundary 

from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):
    """Shade the classes predicted by ``clf`` and overlay the samples.

    The classifier is evaluated on a 100 x 100 grid spanning ``axes``; the
    predicted labels are drawn as filled regions (and, optionally, as contour
    lines), then the samples labelled 0 and 1 are plotted on top.

    Parameters
    ----------
    clf : fitted estimator whose ``predict`` accepts an (n, 2) array.
    X : array with at least two columns; columns 0 and 1 are plotted.
    y : array of labels aligned with ``X``; only labels 0 and 1 are drawn.
    axes : ``[x1_min, x1_max, x2_min, x2_max]`` plot limits and grid extent.
    alpha : opacity of the sample markers (the filled regions use a fixed 0.3).
    contour : when true, also draw contour lines between predicted regions.
    """
    # Evaluation grid covering the requested window.
    grid_x1, grid_x2 = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    grid_points = np.column_stack((grid_x1.ravel(), grid_x2.ravel()))
    grid_labels = clf.predict(grid_points).reshape(grid_x1.shape)

    # Filled class regions, optionally outlined.
    fill_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(grid_x1, grid_x2, grid_labels, alpha=0.3, cmap=fill_cmap)
    if contour:
        line_cmap = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(grid_x1, grid_x2, grid_labels, cmap=line_cmap, alpha=0.8)

    # Samples: class 0 as yellow circles, class 1 as blue squares.
    for class_label, marker in ((0, "yo"), (1, "bs")):
        members = y == class_label
        plt.plot(X[:, 0][members], X[:, 1][members], marker, alpha=alpha)

    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

In [None]:
plot_decision_boundary(ada_clf, X, y)

# Gradient Boosting Regressor

In [None]:
# prepare simulated non-linear data 

# Re-seed here so this cell yields the same sample however often it is re-run.
np.random.seed(42)
# One feature, uniformly distributed in [-0.5, 0.5).
X = np.random.rand(100, 1) - 0.5
# Noisy quadratic target: 3 * x^2 plus Gaussian noise scaled by 0.05.
y = 0.05 * np.random.randn(100) + 3 * X[:, 0] ** 2

In [None]:
plt.plot(X, y, 'b.')

### GBR 'manually'

In [None]:
from sklearn.tree import DecisionTreeRegressor

# initial model: a shallow regression tree fitted directly to the labels y
tree_reg1 = DecisionTreeRegressor(
    max_depth=2,
    random_state=42,
)
tree_reg1.fit(X, y)

In [None]:
# residuals of the initial model: the part of y that tree_reg1 does not explain
y2 = y - tree_reg1.predict(X)
# second model, configured like the first one
tree_reg2 = DecisionTreeRegressor(
    max_depth=2,
    random_state=42,
)
# fit it to the residuals rather than to the original labels
tree_reg2.fit(X, y2)

In [None]:
# residuals still left after the second model's correction
y3 = y2 - tree_reg2.predict(X)
# third model, configured like the previous two
tree_reg3 = DecisionTreeRegressor(
    max_depth=2,
    random_state=42,
)
# fit it to the remaining residuals
tree_reg3.fit(X, y3)

In [None]:
# test the model on value 0.8 
X_new = np.array([[0.8]])

In [None]:
tree_reg3.predict(X_new)

In [None]:
# create ensemble prediction 
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [None]:
y_pred

In [None]:
# Why sum? 

In [None]:
# Plot the intermediate models and the final ensemble 

In [None]:
def plot_predictions(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    """Plot data points together with the summed prediction of ``regressors``.

    Parameters
    ----------
    regressors : iterable of fitted models; their ``predict`` outputs are added
        together, which is how the boosted ensemble's prediction is formed.
    X : array whose column 0 supplies the x coordinates of the data points.
    y : values plotted against ``X[:, 0]``.
    axes : ``[x_min, x_max, y_min, y_max]``; the first two entries also bound
        the 500-point grid on which the prediction curve is evaluated.
    label, data_label : legend entries for the curve and the points; a legend
        is drawn only when at least one of them is given.
    style, data_style : matplotlib format strings for the curve and the points.
    """
    grid = np.linspace(axes[0], axes[1], 500)
    grid_column = grid.reshape(-1, 1)

    # Accumulate the individual predictions, starting from 0 like sum() does.
    ensemble_pred = 0
    for member in regressors:
        ensemble_pred = ensemble_pred + member.predict(grid_column)

    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(grid, ensemble_pred, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)

# 3 x 2 grid of panels. Left column: each individual tree against the targets it
# was fitted to (labels, then residuals). Right column: the running sum of the
# trees against the original labels.
plt.figure(figsize=(11,11))

# Row 1, left: first tree against the training labels.
plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Residuals and tree predictions", fontsize=16)

# Row 1, right: the ensemble so far consists of the first tree only.
plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Ensemble predictions", fontsize=16)

# Row 2, left: second tree against the residuals y2 it was fitted to.
plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=16)

# Row 2, right: sum of the first two trees against the original labels.
plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=16, rotation=0)

# Row 3, left: third tree against the residuals y3 it was fitted to.
plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=16)

# Row 3, right: sum of all three trees against the original labels.
plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)

# save_fig("gradient_boosting_plot")
plt.show()

### GBR Sklearn 

#### Use different learning rates & number of estimators

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Few trees, full-size steps: each of the 3 trees is added with weight 1.
lr, estim = 1., 3
gbrt = GradientBoostingRegressor(
    max_depth=2,
    learning_rate=lr,
    n_estimators=estim,
    random_state=42,
)
gbrt.fit(X, y)

In [None]:
# Many trees, small steps: 200 trees, each shrunk by a learning rate of 0.1.
lr, estim = .1, 200
gbrt_slow = GradientBoostingRegressor(
    max_depth=2,
    learning_rate=lr,
    n_estimators=estim,
    random_state=42,
)
gbrt_slow.fit(X, y)

#### Compare the two parametrizations

In [None]:
plt.figure(figsize=(11,4))

# One panel per fitted model, side by side; each title reports the model's own
# learning_rate and n_estimators so the two parametrizations can be compared.
for subplot_code, fitted_model in ((121, gbrt), (122, gbrt_slow)):
    plt.subplot(subplot_code)
    plot_predictions([fitted_model], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="Ensemble predictions")
    plt.title("learning_rate={}, n_estimators={}".format(fitted_model.learning_rate, fitted_model.n_estimators), fontsize=14)

# save_fig("gbrt_learning_rate_plot")
plt.show()

In [None]:
# Which one is better? 

In [None]:
# Grid search test lr vs. n_est 

In [None]:
# Try different parameters 
# Which parameters are the best? 

## Gradient Boosting with Early stopping

#### Early stopping v. 1

In [None]:
# Run 120 estimators and find the best!

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out a validation set (fixed seed) on which the number of trees is chosen.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    random_state=49,
)

# Train the full 120 trees once; the best ensemble size is picked afterwards
# from the per-stage validation errors.
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=120,
    random_state=42,
)
gbrt.fit(X_train, y_train)

In [None]:
# Validation MSE after each boosting stage: staged_predict yields the ensemble's
# prediction with 1 tree, then 2 trees, and so on.
errors = [
    mean_squared_error(y_val, staged_pred)
    for staged_pred in gbrt.staged_predict(X_val)
]
# argmin is 0-based while tree counts start at 1, hence the + 1.
bst_n_estimators = 1 + np.argmin(errors)

# Refit from scratch with exactly the best number of trees.
gbrt_best = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=bst_n_estimators,
    random_state=42,
)
gbrt_best.fit(X_train, y_train)
# print(bst_n_estimators)

In [None]:
min_error = np.min(errors)

In [None]:
print(min_error)

In [None]:
gb1_preds = gbrt_best.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, gb1_preds))
print("GBR early stopping 1 RMSE: %f" % (rmse))

In [None]:
# Plot the early stopping scheme

plt.figure(figsize=(11, 4))

# Left panel: validation error as a function of the number of trees.
plt.subplot(121)
# errors[i] is the validation MSE of the ensemble with i + 1 trees, so it must be
# plotted against 1-based tree counts. Plotting the bare list (0-based x values)
# shifts the curve one tree to the left of the "Minimum" marker, which is drawn
# at the 1-based bst_n_estimators.
plt.plot(np.arange(1, len(errors) + 1), errors, "b.-")
# Dashed guide lines from the axes to the minimum, and a dot on the minimum itself.
plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], "k--")
plt.plot([0, 120], [min_error, min_error], "k--")
plt.plot(bst_n_estimators, min_error, "ko")
plt.text(bst_n_estimators, min_error*1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, 120, 0, 0.01])
plt.xlabel("Number of trees")
plt.title("Validation error", fontsize=14)

# Right panel: predictions of the model refitted with the best number of trees.
plt.subplot(122)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("Best model (%d trees)" % bst_n_estimators, fontsize=14)

# save_fig("early_stopping_gbrt_plot")
plt.show()

#### Early stopping v. 2

In [None]:
# Run the early stopping algorithm: grow the ensemble one tree at a time and stop
# once the validation error has failed to improve 5 times in a row.

# warm_start=True keeps the trees already fitted, so each fit() call below only
# adds the trees needed to reach the new n_estimators.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)

min_val_error = float("inf")
# Ensemble size at which min_val_error was observed. After an early stop, gbrt
# itself still contains the extra non-improving trees, so gbrt.n_estimators is
# larger than this value.
bst_n_estimators_v2 = 0
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        # New best: remember it and reset the patience counter.
        min_val_error = val_error
        bst_n_estimators_v2 = n_estimators
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping

In [None]:
print(gbrt.n_estimators)

In [None]:
print("Minimum validation MSE v.2:", min_val_error)

In [None]:
print("Minimum validation MSE: v.1: ", min_error)

In [None]:
gb2_preds = gbrt.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, gb2_preds))
print("GBR early stopping v.2 RMSE: %f" % (rmse))

##  Extreme Gradient Boosting

In [None]:
import xgboost

In [None]:
xgboost.__version__

In [None]:
xgb_reg = xgboost.XGBRegressor()

In [None]:
# XGB early stopping
# early_stopping_rounds (int) – Activates early stopping.
# Validation metric needs to improve at least once in every early_stopping_rounds round(s) to continue training.
# The setting is configured on the estimator: passing early_stopping_rounds to
# fit() has been deprecated since xgboost 1.6 and was removed in later 2.x
# releases, where the original call raises a TypeError.
xgb_reg.set_params(early_stopping_rounds=10)
# eval_set supplies the validation data monitored for early stopping.
xgb_reg.fit(X_train, y_train,
            eval_set=[(X_val, y_val)])
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)
print("Validation MSE:", val_error)

In [None]:
# Performance comparison 

In [None]:
%timeit xgboost.XGBRegressor().fit(X_train, y_train)

In [None]:
%timeit GradientBoostingRegressor().fit(X_train, y_train)

In [None]:
# Compare the regression model with the previous task
# estimators = 61, slow learning rate = .1, shallow trees depth = 2

In [None]:
xg_reg = xgboost.XGBRegressor(n_estimators = 61, learning_rate=.1, max_depth=2)

In [None]:
xg_reg.fit(X_train,y_train)

In [None]:
preds = xg_reg.predict(X_val)

In [None]:
rmse = np.sqrt(mean_squared_error(y_val, preds))
print("RMSE: %f" % (rmse))

In [None]:
def plot_prediction(regressors, X, y, axes, label=None, style="r-", data_style="b.", data_label=None):
    """Plot data points together with the prediction of the first regressor.

    Only ``regressors[0]`` is used; any further entries are ignored.

    Parameters
    ----------
    regressors : sequence whose first element is a fitted model with ``predict``.
    X : array whose column 0 supplies the x coordinates of the data points.
    y : values plotted against ``X[:, 0]``.
    axes : ``[x_min, x_max, y_min, y_max]``; the first two entries also bound
        the 500-point grid on which the prediction curve is evaluated.
    label, data_label : legend entries for the curve and the points; a legend
        is drawn only when at least one of them is given.
    style, data_style : matplotlib format strings for the curve and the points.
    """
    grid = np.linspace(axes[0], axes[1], 500)
    first_regressor = regressors[0]
    fitted_curve = first_regressor.predict(grid.reshape(-1, 1))

    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(grid, fitted_curve, style, linewidth=2, label=label)
    if label or data_label:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)

In [None]:
plot_prediction([xg_reg], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label="XGBoost")

In [None]:
# Homework: grid search parameters

In [None]:
... 

In [None]:
# Better results? 

### XGBoost classification

#### Iris data set

In [None]:
from sklearn import datasets

# Load the iris data set and rebind X / y to its feature matrix and class labels
# (this replaces the regression data used in the earlier sections).
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# XGBoost Data Matrix 
dtrain = xgboost.DMatrix(X_train, label=y_train)
dtest = xgboost.DMatrix(X_test, label=y_test)

In [None]:
# set parameters for the native xgboost training API
param = {
    'max_depth': 5,   # the maximum depth of each tree
    'objective': 'multi:softprob',  # multiclass objective that outputs per-class probabilities
    'eval_metric': 'mlogloss',  # multiclass log loss as the evaluation metric
    'num_class': 3}   # the number of classes that exist in this dataset


### Visualize Feature Importance

In [None]:
# Train a booster with the native API for 10 boosting rounds.
# NOTE(review): despite its name, `xg_reg` now holds a multi-class classifier and
# replaces the XGBRegressor bound to the same name in the regression section.
xg_reg = xgboost.train(params=param, dtrain=dtrain, num_boost_round=10)

In [None]:
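# dump the trees to a human-readable text file (for inspection; use save_model() for a model that can be loaded back)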
xg_reg.dump_model('dump.raw.txt')

In [None]:
xgboost.plot_importance(xg_reg)