# Libraries

In [None]:
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np
import os as os
import pandas as pd
import seaborn as sns

import random
random.seed(0) # pick your seed

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Data

In [None]:
import sys
# True when the google.colab package has been loaded, i.e. the notebook runs on Colab.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
  # Mount Google Drive and build the course folder from the absolute mount
  # point. A relative 'gdrive/...' path only resolves while the working
  # directory is /content, so it breaks as soon as the cwd changes.
  mount_point = '/content/gdrive'
  drive.mount(mount_point, force_remount=True)
  dir = os.path.join(mount_point, 'My Drive', 'Eurostat', '02 - Data Science for Structured Data')
else:
  # Outside Colab the 'data' and 'model' folders are looked up in the current directory.
  dir = "."
# Folders used by the rest of the notebook for datasets and saved models.
data_dir = os.path.join(dir, 'data')
model_dir = os.path.join(dir, 'model')

Mounted at /content/gdrive


In [None]:
# Load the iris data. The file is read without a header row (header=None),
# so the column names are supplied explicitly.
df_iris = pd.read_csv(
    os.path.join(data_dir, 'iris.csv'),
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'],
    header=None,
)
# Display three randomly chosen rows as a quick sanity check.
df_iris.sample(n=3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
135,7.7,3.0,6.1,2.3,Iris-virginica
16,5.4,3.9,1.3,0.4,Iris-setosa
30,4.8,3.1,1.6,0.2,Iris-setosa


In [None]:
# Load the pima data. header=8 marks row 8 as the header line (data starts
# after it); the explicit `names` replace whatever that line contains.
df_pima = pd.read_csv(
    os.path.join(data_dir, 'pima.csv'),
    names=['preg', 'gluc', 'pres', 'skin', 'insu', 'bmi', 'pedi', 'age', 'class'],
    header=8,
)
# Display three randomly chosen rows as a quick sanity check.
df_pima.sample(n=3)

Unnamed: 0,preg,gluc,pres,skin,insu,bmi,pedi,age,class
397,0,131,66,40,0,34.3,0.196,22,1
598,1,173,74,0,0,36.8,0.088,38,1
411,1,112,72,30,176,34.4,0.528,25,0


In [None]:
# Load the wine data; this file is parsed with ';' as the field separator.
df_wine = pd.read_csv(os.path.join(data_dir, 'wine.csv'), sep=';')
# Display three randomly chosen rows as a quick sanity check.
df_wine.sample(n=3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1384,7.1,0.755,0.15,1.8,0.107,20.0,84.0,0.99593,3.19,0.5,9.5,5
494,6.5,0.39,0.23,8.3,0.051,28.0,91.0,0.9952,3.44,0.55,12.1,6
1169,7.6,0.5,0.29,2.3,0.086,5.0,14.0,0.99502,3.32,0.62,11.5,6


In [None]:
# Load the housing data with pandas' default CSV settings.
df_housing = pd.read_csv(filepath_or_buffer=os.path.join(data_dir, 'housing.csv'))
# Display three randomly chosen rows as a quick sanity check.
df_housing.sample(n=3)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
195,0.01381,80.0,0.46,0,0.422,7.875,32.0,5.6484,4,255,14.4,394.23,2.97,50.0
175,0.06664,0.0,4.05,0,0.51,6.546,33.1,3.1323,5,296,16.6,390.96,5.33,29.4
260,0.54011,20.0,3.97,0,0.647,7.203,81.8,2.1121,5,264,13.0,392.8,9.59,33.8


In [None]:
# Load the Eurostat happiness data with pandas' default CSV settings.
df_happiness = pd.read_csv(filepath_or_buffer=os.path.join(data_dir, 'happiness.csv'))
# Display three randomly chosen rows as a quick sanity check.
df_happiness.sample(n=3)

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
1028,ED3_4,F,Y65-74,MK,2013,7.0,,5.2,6.4,,6.0,6.9,7.3,8.1,7.4
1543,ED5-8,F,Y35-49,BG,2013,6.8,6.4,5.0,5.6,6.7,6.0,5.8,7.2,6.3,5.4
255,ED0-2,F,Y50-64,MT,2013,7.9,6.4,5.5,6.7,7.1,6.7,7.2,8.1,8.4,6.7


# Tuning
You'll learn:
1. The importance of algorithm parameter tuning to improve algorithm performance.
2. How to use a grid search algorithm tuning strategy.
3. How to use a random search algorithm tuning strategy.
4. How to use a handy library for tuning (Optuna).

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor

# Predictors are every wine column except the 'quality' target.
X = df_wine.drop(columns='quality')
y = df_wine['quality']

# Untuned baseline: a 10-nearest-neighbours regressor scored with shuffled 5-fold CV.
scoring = 'neg_mean_squared_error'
model = KNeighborsRegressor(n_neighbors=10)
kfold = KFold(n_splits=5, shuffle=True, random_state=8)
results = cross_val_score(estimator=model, X=X, y=y, scoring=scoring, cv=kfold)

# The scorer returns negated MSE, so the sign of the mean is flipped for display.
print(f"MSE: {-results.mean():.4f} ({results.std():.4f})")

MSE: 0.5515 (0.0523)


## Grid Search
- An approach to parameter tuning that methodically builds and evaluates a model for each combination of algorithm parameters specified in a grid.
- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Predictors are every wine column except the 'quality' target.
X = df_wine.copy()
y = X.pop('quality')

# Every combination is evaluated: 19 neighbour counts x 3 search algorithms.
param_grid = {
    'n_neighbors': np.arange(1, 20),
    'algorithm': ['ball_tree', 'kd_tree', 'brute']}

model = KNeighborsRegressor()

# Use the same shuffled 5-fold splitter as the baseline KNN evaluation.
# With the default (unshuffled) CV the scores are computed on different
# splits and cannot be compared with the baseline MSE.
kfold = KFold(n_splits=5, shuffle=True, random_state=8)

grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring='neg_mean_squared_error',
                    cv=kfold)
grid.fit(X, y)

# best_score_ is the negated MSE of the best combination (closer to 0 is better).
print(grid.best_score_)
print(grid.best_estimator_)

-0.5834757378928265
KNeighborsRegressor(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=19, p=2,
                    weights='uniform')


## Random Search Parameter Tuning
- An approach to parameter tuning that samples algorithm parameters from a random distribution (e.g. uniform) for a fixed number of iterations. A model is built and evaluated for each sampled combination of parameters.
- [How to use it](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from scipy.stats import uniform

# Predictors are every wine column except the 'quality' target.
X = df_wine.copy()
y = X.pop('quality')

# Lists/arrays are sampled uniformly; only n_iter combinations are tried.
param_dist= {
    'n_neighbors': np.arange(1, 20),
    'algorithm': ['ball_tree', 'kd_tree', 'brute']}

model = KNeighborsRegressor()

# Use the same shuffled 5-fold splitter as the baseline KNN evaluation.
# With the default (unshuffled) CV the scores are computed on different
# splits and cannot be compared with the baseline MSE.
kfold = KFold(n_splits=5, shuffle=True, random_state=8)

grid = RandomizedSearchCV(estimator=model,
                          param_distributions=param_dist,
                          scoring='neg_mean_squared_error',
                          n_iter=5,
                          cv=kfold,
                          random_state=8)
grid.fit(X, y)

# best_score_ is the negated MSE of the best sampled combination (closer to 0 is better).
print(grid.best_score_)
print(grid.best_estimator_)

-0.5840328331204767
KNeighborsRegressor(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=18, p=2,
                    weights='uniform')


## Optuna
- An open source hyperparameter optimization framework to automate hyperparameter search.
- [How to use it](https://optuna.org/)
- The following example trains XGBoost with `tree_method='gpu_hist'`, so run it on a GPU runtime.

In [None]:
!pip install optuna



In [None]:
import optuna

In [None]:
# Print the XGBRegressor documentation to review the hyperparameters that can be tuned.
help(XGBRegressor)

Help on class XGBRegressor in module xgboost.sklearn:

class XGBRegressor(XGBModel, sklearn.base.RegressorMixin)
 |  XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, silent=None, objective='reg:linear', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None, importance_type='gain', **kwargs)
 |  
 |  Implementation of the scikit-learn API for XGBoost regression.
 |  
 |  Parameters
 |  ----------
 |  max_depth : int
 |      Maximum tree depth for base learners.
 |  learning_rate : float
 |      Boosting learning rate (xgb's "eta")
 |  n_estimators : int
 |      Number of trees to fit.
 |  verbosity : int
 |      The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
 |  silent : boolean
 |      Whether to print messages while running boostin

In [None]:
%%time
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

def housing_objective(trial):
  """Optuna objective: cross-validated MSE of an XGBoost regressor on the housing data.

  One set of hyperparameters is sampled from ``trial``. For each of 5 shuffled
  folds, the training part is split again (80/20) into a fitting set and a
  validation set used for early stopping; the fitted model is then scored on
  the held-out fold.

  Parameters
  ----------
  trial : optuna.trial.Trial
      Trial object used to sample the hyperparameters.

  Returns
  -------
  float
      Mean of the per-fold test MSEs (the study minimises this value).
  """
  # Separate predictors and the 'medv' target once, on a private copy.
  features = df_housing.copy()
  target = features.pop('medv')

  # Suggested values are fixed within a trial, so they are sampled once up
  # front instead of once per fold.
  param = {
      'objective': 'reg:squarederror',
      'tree_method': 'gpu_hist',       # GPU training: needs a GPU runtime
      'predictor': 'gpu_predictor',
      'n_estimators': 100,
      'subsample': trial.suggest_float('subsample', 0.8, 1.0),
      'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.5, log=True),
      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
      'max_depth': trial.suggest_int('max_depth', 2, 10),
      'random_state': 8,
      }

  mses = []
  kfold = KFold(n_splits=5, shuffle=True, random_state=8)
  for idx_train, idx_test in kfold.split(features):
    # Held-out fold: only used for the final score of this fold.
    X_test = features.iloc[idx_test]
    y_test = target.iloc[idx_test]

    # Remaining rows: split into a fitting set and an early-stopping validation set.
    X_train, X_valid, y_train, y_valid = train_test_split(
        features.iloc[idx_train], target.iloc[idx_train],
        test_size=0.2, random_state=8)

    # Model training
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds (and the
    # GPU settings above) to be configured differently; confirm against the
    # installed version.
    model = XGBRegressor(**param)
    model.fit(
      X_train,
      y_train,
      eval_set = [(X_valid, y_valid)],
      verbose = 0,
      early_stopping_rounds = 100)

    # Performance assessment (mean_squared_error returns the MSE by default).
    predictions = model.predict(X_test)
    mses.append(mean_squared_error(y_test, predictions))

  return np.mean(mses)

# The objective is an MSE, so it is minimised. The sampler seed is fixed so
# that reruns propose the same hyperparameters for the same objective values.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=8))
study.optimize(housing_objective, n_trials=50)

[32m[I 2021-09-29 11:04:38,040][0m A new study created in memory with name: no-name-63883eba-7780-4819-9006-4dfb1458ebd6[0m
[32m[I 2021-09-29 11:04:39,783][0m Trial 0 finished with value: 12.671041788590555 and parameters: {'subsample': 0.9586721213609594, 'learning_rate': 0.1986091011612125, 'colsample_bytree': 0.9095735848930506, 'max_depth': 2}. Best is trial 0 with value: 12.671041788590555.[0m
[32m[I 2021-09-29 11:04:41,700][0m Trial 1 finished with value: 14.509978569744487 and parameters: {'subsample': 0.8298201485605186, 'learning_rate': 0.27705556097025824, 'colsample_bytree': 0.7603205286979343, 'max_depth': 5}. Best is trial 0 with value: 12.671041788590555.[0m
[32m[I 2021-09-29 11:04:43,595][0m Trial 2 finished with value: 11.86943203394585 and parameters: {'subsample': 0.9619687970815208, 'learning_rate': 0.10818994106314013, 'colsample_bytree': 0.6586478466504719, 'max_depth': 5}. Best is trial 2 with value: 11.86943203394585.[0m
[32m[I 2021-09-29 11:04:44,99

CPU times: user 1min 29s, sys: 9.44 s, total: 1min 39s
Wall time: 1min 38s


In [None]:
# Hyperparameters of the best trial found by the study.
study.best_params

{'colsample_bytree': 0.42332776421284046,
 'learning_rate': 0.14460062501784812,
 'max_depth': 4,
 'subsample': 0.9857153261561256}

In [None]:
# Objective value (mean cross-validated MSE) reached by the best trial.
study.best_value

11.099971739372467

In [None]:
# Objective value per trial, to see how the search progressed.
fig = optuna.visualization.plot_optimization_history(study=study)
fig.show()

In [None]:
# Estimated importance of each tuned hyperparameter for the objective value.
fig = optuna.visualization.plot_param_importances(study=study)
fig.show()

In [None]:
# Contour plots of the objective value over pairs of hyperparameters.
fig = optuna.visualization.plot_contour(study=study)
fig.show()

# ❓ Exercise

How well can you predict life satisfaction (`LIFESAT`) with the Eurostat happiness data?
- Prepare the data
- Engineer new features
- Create baseline solutions (e.g. LinearRegression)
- Improve with Ensembles
- Tune your favorite model