# Libraries

In [None]:
# google.colab only exists inside a Colab runtime. Guard the import so the
# notebook can also be run locally, where the data cell falls back to ".".
try:
  from google.colab import drive
except ImportError:
  drive = None
import matplotlib.pyplot as plt
import numpy as np
import os as os
import pandas as pd
import seaborn as sns

import random

# One seed for every global random generator the notebook relies on.
SEED = 0 # pick your seed
random.seed(SEED)
# random.seed() does not touch NumPy's global generator, which is the one
# pandas' DataFrame.sample() and unseeded scikit-learn estimators draw from.
np.random.seed(SEED)

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor

# Data

In [None]:
import sys

# Colab is detected by checking whether the google.colab package is loaded.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
  # Mount Google Drive and build the project folder from the absolute mount
  # point, so the paths resolve whatever the current working directory is.
  mount_point = '/content/gdrive'
  drive.mount(mount_point, force_remount=True)
  dir = os.path.join(mount_point, 'My Drive', 'Eurostat', '02 - Data Science for Structured Data')
else:
  # Outside Colab, data/ and model/ are looked up next to the notebook.
  dir = "."

# Folders used by the following cells for datasets and model files.
data_dir = os.path.join(dir, 'data')
model_dir = os.path.join(dir, 'model')

Mounted at /content/gdrive


In [None]:
# Iris: read with header=None (no row is treated as a header), so the column
# labels are supplied explicitly.
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_path = os.path.join(data_dir, 'iris.csv')
df_iris = pd.read_csv(iris_path, header=None, names=iris_columns)

# Peek at three random rows.
df_iris.sample(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
39,5.1,3.4,1.5,0.2,Iris-setosa
67,5.8,2.7,4.1,1.0,Iris-versicolor
115,6.4,3.2,5.3,2.3,Iris-virginica


In [None]:
# Pima: header=8 makes pandas take row 8 as the header line (earlier rows are
# skipped) and `names` replaces its labels with the short names given here.
# NOTE(review): presumably the leading rows are file commentary - verify
# against the CSV.
pima_columns = ['preg', 'gluc', 'pres', 'skin' ,'insu', 'bmi', 'pedi', 'age', 'class']
pima_path = os.path.join(data_dir, 'pima.csv')
df_pima = pd.read_csv(pima_path, header=8, names=pima_columns)

# Peek at three random rows.
df_pima.sample(3)

Unnamed: 0,preg,gluc,pres,skin,insu,bmi,pedi,age,class
83,0,101,65,28,0,24.6,0.237,22,0
539,3,129,92,49,155,36.4,0.968,32,1
512,9,91,68,0,0,24.2,0.2,58,0


In [None]:
# Wine: fields in this file are separated by semicolons rather than commas.
wine_path = os.path.join(data_dir, 'wine.csv')
df_wine = pd.read_csv(wine_path, sep=';')

# Peek at three random rows.
df_wine.sample(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
819,9.1,0.66,0.15,3.2,0.097,9.0,59.0,0.99976,3.28,0.54,9.6,5
181,8.9,0.61,0.49,2.0,0.27,23.0,110.0,0.9972,3.12,1.02,9.3,5
1172,9.7,0.42,0.46,2.1,0.074,5.0,16.0,0.99649,3.27,0.74,12.3,6


In [None]:
# Housing: read with pandas' defaults (comma-separated, first row as header).
housing_path = os.path.join(data_dir, 'housing.csv')
df_housing = pd.read_csv(housing_path)

# Peek at three random rows.
df_housing.sample(3)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
399,9.91655,0.0,18.1,0,0.693,5.852,77.8,1.5004,24,666,20.2,338.16,29.97,6.3
309,0.3494,0.0,9.9,0,0.544,5.972,76.7,3.1025,4,304,18.4,396.24,9.97,20.3
26,0.67191,0.0,8.14,0,0.538,5.813,90.3,4.682,4,307,21.0,376.88,14.81,16.6


In [None]:
# Happiness: read with pandas' defaults (comma-separated, first row as header).
happiness_path = os.path.join(data_dir, 'happiness.csv')
df_happiness = pd.read_csv(happiness_path)

# Peek at three random rows.
df_happiness.sample(3)

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
1126,ED3_4,M,Y25-34,AL,2018,,,5.1,,5.5,5.8,,,7.2,5.7
1392,ED3_4,M,Y65-74,PL,2018,,,6.4,,7.2,7.7,,,8.0,7.8
857,ED3_4,F,Y35-49,DK,2018,,,6.9,,7.6,7.4,,,8.2,7.2


# Ensembles
You'll learn how to boost performance with:
1. **Bagging ensemble methods**, such as bagged decision trees, random forest and extra trees.
2. **Boosting ensemble methods**, such as AdaBoost, stochastic gradient boosting and XGBoost.
3. **Voting/Stacking ensemble methods**, to combine the predictions from multiple algorithms.

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Baseline: a single shallow decision tree predicting 'medv' from every other
# column of the housing data.
X = df_housing.copy()
y = X.pop('medv')

# 5-fold cross-validation over shuffled rows; the seed fixes the folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
# The tree is seeded as well: scikit-learn permutes the features at every
# split, so an unseeded tree can break ties differently from run to run.
model = DecisionTreeRegressor(max_depth=4, random_state=8)

# The scorer returns the negated MSE (higher is better), hence the sign flip
# on the mean when reporting.
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 22.7198 (7.2371)


## Bagging
- Bagging, or "Bootstrap Aggregation", consists of building multiple models (typically of the same type) from different subsamples of the training dataset.

- Performs best with algorithms that have high variance.
- [How it works](https://en.wikipedia.org/wiki/Bootstrap_aggregating)
- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingRegressor

# Features are every housing column except the target 'medv'.
X = df_housing.copy()
y = X.pop('medv')

# 5-fold cross-validation over shuffled rows; the seed fixes the folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the positional
# form is accepted by both old and new releases.
model = BaggingRegressor(DecisionTreeRegressor(),
                         n_estimators=100,
                         random_state=8)

# The scorer returns the negated MSE, hence the sign flip on the mean.
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 13.6572 (6.4198)


### Random Forest

- extension of bagged decision trees.
- trees are constructed in a way that reduces the correlation
between individual trees. Specifically, rather than greedily choosing the best split point among all features when constructing each tree, only a random subset of features is considered for each split.

- [How it works](https://en.wikipedia.org/wiki/Random_forest)
- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)


In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor

# 'medv' is the target; every remaining housing column is a feature.
X = df_housing.drop(columns='medv')
y = df_housing['medv'].copy()

# Forest of 100 trees with max_features=3, seeded for repeatable fits.
model = RandomForestRegressor(n_estimators=100, max_features=3, random_state=8)

# Evaluate with 5-fold cross-validation over shuffled rows (fixed folds).
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
results = cross_val_score(model, X, y, scoring=scoring, cv=kfold)

# Scores are negated MSEs, so the mean is sign-flipped for display.
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 11.4845 (4.2065)


### Extra Trees
- **Extremely randomized trees** are another modification of bagging where random trees are constructed from samples of the training dataset.
- Split thresholds are drawn at random for each candidate feature, and the best of these random splits is used.
- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import ExtraTreesRegressor

# 'medv' is the target; every remaining housing column is a feature.
X = df_housing.drop(columns='medv')
y = df_housing['medv'].copy()

# 100 extremely randomized trees with max_features=5, seeded for repeatability.
model = ExtraTreesRegressor(n_estimators=100, max_features=5, random_state=8)

# Evaluate with 5-fold cross-validation over shuffled rows (fixed folds).
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
results = cross_val_score(model, X, y, scoring=scoring, cv=kfold)

# Scores are negated MSEs, so the mean is sign-flipped for display.
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 9.6719 (2.6532)


## Boosting
Building multiple models (typically of the same type) each of which learns to
fix the prediction errors of a prior model in the sequence of models.

### AdaBoost
- _Adaptive Boosting_ weights instances in the dataset by how easy or difficult they are to classify, allowing the algorithm to pay more or less attention to them in the construction of subsequent models.
- [How it works](https://en.wikipedia.org/wiki/AdaBoost)
- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import AdaBoostRegressor

# 'medv' is the target; every remaining housing column is a feature.
X = df_housing.drop(columns='medv')
y = df_housing['medv'].copy()

# AdaBoost with 60 boosting rounds and the library's default base learner.
model = AdaBoostRegressor(n_estimators=60, random_state=8)

# Evaluate with 5-fold cross-validation over shuffled rows (fixed folds).
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
results = cross_val_score(model, X, y, scoring=scoring, cv=kfold)

# Scores are negated MSEs, so the mean is sign-flipped for display.
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 15.6009 (5.5179)


### Stochastic Gradient Boosting
- **Gradient Boosting Machines** are one of the most
sophisticated ensemble techniques, proving to be perhaps one of
the best techniques available for improving performance via ensembles.
- Each tree corrects for the residuals left after all previous trees, each one taking a small step (scaled by the learning rate) in the right direction.
- [How it works](https://en.wikipedia.org/wiki/Gradient_boosting)
- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor

# 'medv' is the target; every remaining housing column is a feature.
X = df_housing.drop(columns='medv')
y = df_housing['medv'].copy()

# 100 boosting stages; every leaf must hold at least 4 training samples.
model = GradientBoostingRegressor(n_estimators=100, min_samples_leaf=4, random_state=8)

# Evaluate with 5-fold cross-validation over shuffled rows (fixed folds).
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
results = cross_val_score(model, X, y, scoring=scoring, cv=kfold)

# Scores are negated MSEs, so the mean is sign-flipped for display.
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 10.9898 (4.4357)


### XGBoost
- An integral part of winning strategies for data-science competitions on Structured/Tabular Data
- [Read the docs](https://xgboost.readthedocs.io/en/latest/get_started.html)
- Change the runtime type to GPU

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Split features and target once, up front, instead of popping 'medv' from
# every fold's slice.
X = df_housing.copy()
y = X.pop('medv')

mses = []
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
for idx_train, idx_test in kfold.split(X):
  # Outer split: this fold's test rows are held out for the final score.
  X_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

  # Inner split: part of the training rows becomes a validation set that is
  # only used to monitor early stopping.
  X_train, X_valid, y_train, y_valid = train_test_split(
      X.iloc[idx_train], y.iloc[idx_train], test_size=0.2, random_state=8)

  # Model training. early_stopping_rounds is a constructor argument since
  # xgboost 1.6; passing it to fit() is deprecated and rejected by later
  # releases.
  # NOTE(review): the patience equals the number of boosting rounds, so
  # training is unlikely to halt early - consider a smaller value.
  model = XGBRegressor(n_estimators=100,
                       objective='reg:squarederror',
                       early_stopping_rounds=100)
  model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=0)

  # Performance assessment. mean_squared_error returns the MSE by default;
  # its `squared` keyword is deprecated in recent scikit-learn.
  predictions = model.predict(X_test)
  mses.append(mean_squared_error(y_test, predictions))

# Report the scores collected in this cell, not `results` from an earlier one.
# These are plain (positive) MSEs, so no sign flip is needed.
print(f"MSE: {np.mean(mses):.4f} ({np.std(mses):.4f})")

MSE: 10.9898 (4.4357)


## Voting / Stacking
- Building multiple models (typically of differing types) and combining predictions.
- Another step toward data science awards.

### Averaging
- Simple statistics (like calculating the mean) are used to combine predictions.
- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor

# 'medv' is the target; every remaining housing column is a feature.
X = df_housing.drop(columns='medv')
y = df_housing['medv'].copy()

# Three base learners of different types, combined by the voting ensemble.
linear_regressor = LinearRegression()
random_forest = RandomForestRegressor(n_estimators=100, max_features=3, random_state=8)
gradient_boost = GradientBoostingRegressor(n_estimators=100, min_samples_leaf=4, random_state=8)
estimators = [
    ('linear_regressor', linear_regressor),
    ('random_forest', random_forest),
    ('gradient_boost', gradient_boost),
]
model = VotingRegressor(estimators)

# Evaluate with 5-fold cross-validation over shuffled rows (fixed folds).
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
results = cross_val_score(model, X, y, scoring=scoring, cv=kfold)

# Scores are negated MSEs, so the mean is sign-flipped for display.
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 12.0417 (4.9395)


### Stacking
- Consists of stacking the outputs of individual estimators (typically of differing types) and using a final regressor to compute the prediction.

- Makes it possible to exploit the strengths of each individual estimator by using their outputs as the inputs of a final estimator.

- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingRegressor.html)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

# 'medv' is the target; every remaining housing column is a feature.
X = df_housing.drop(columns='medv')
y = df_housing['medv'].copy()

# Three base learners of different types whose predictions feed the stack.
linear_regressor = LinearRegression()
random_forest = RandomForestRegressor(n_estimators=100, max_features=3, random_state=8)
gradient_boost = GradientBoostingRegressor(n_estimators=100, min_samples_leaf=4, random_state=8)
estimators = [
    ('linear_regressor', linear_regressor),
    ('random_forest', random_forest),
    ('gradient_boost', gradient_boost),
]
# No final_estimator is given, so the library default is used; with
# passthrough=False it is fed the base predictions only.
model = StackingRegressor(estimators=estimators, passthrough=False)

# Evaluate with 5-fold cross-validation over shuffled rows (fixed folds).
scoring = 'neg_mean_squared_error'
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
results = cross_val_score(model, X, y, scoring=scoring, cv=kfold)

# Scores are negated MSEs, so the mean is sign-flipped for display.
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 10.7812 (4.1656)
