# Hyperparameter Optimization – More Penguins!

In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [10]:
# load the cleaned penguin data; the first CSV column is used as the row index
df = pd.read_csv('../week_01/all_penguins_clean.csv', index_col=0)

In [11]:
# drop every row that contains at least one missing value
df = df.dropna()

In [None]:
# target variable: the species label used as y for the split and the models below
y = df['Species']

In [91]:
# train test split
# - drop the target from the feature frame so it can never leak into a model
# - fix random_state so the split (and every score computed from it) is reproducible on re-run
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df.drop(columns=['Species']), y, random_state=42
)

In [38]:
# preview the first three rows
df.head(3)

Unnamed: 0_level_0,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Real ID,Sex
studyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PAL0708,1,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,A_0,MALE
PAL0708,2,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,A_1,FEMALE
PAL0708,3,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,A_2,FEMALE


**Caution: difficult**

In [78]:
def extract_second_position_from_id(df):  # input is a DataFrame (with 1 column)
    """Return the character at index 1 of each ID string, converted to an integer.

    Only the first column of ``df`` is read. The result is a 2D array with
    one row per input row and exactly one column.
    """
    id_column = df.iloc[:, 0]
    second_char = id_column.str[1]
    as_numbers = second_char.astype(int)
    # output has to be a 2D matrix, so turn the flat values into a single column
    return as_numbers.values.reshape(-1, 1)

### ColumnTransformer

In [92]:
# take columns a,b,c apply feature engineering M to it, and put the result back into the data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# Each entry is (name, transformer, columns); columns that are not listed are dropped.
# sparse_threshold=0 makes the combined output always a dense array. It replaces
# OneHotEncoder's `sparse=False` flag, which newer scikit-learn releases no longer accept.
trans = ColumnTransformer(
    [
        ('my_id', FunctionTransformer(extract_second_position_from_id), ['Individual ID']),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Island', 'Sex']),
        ('my_binning', KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile'), ['Culmen Depth (mm)']),   # like pd.qcut()
        ('do_nothing', 'passthrough', ['Culmen Length (mm)', 'Body Mass (g)']),
    ],
    sparse_threshold=0,
)

In [93]:
# learn the transformations from the training data and apply them in one call
X = trans.fit_transform(Xtrain)
X.shape

(250, 14)

In [81]:
# first row of the transformed matrix
X[0]

array([1.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00,
       1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00,
       3.91e+01, 3.75e+03])

### Modeling Pipeline

1. Apply column transformations (OneHot, Binning, Custom Functions)
2. Scale everything
3. Train a model

In [87]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

In [88]:
# chain the three stages: column transformations -> min-max scaling -> classifier
p = make_pipeline(trans, MinMaxScaler(), LogisticRegression())

In [94]:
# fit every pipeline step on the training data only
p.fit(Xtrain, ytrain)    # on training data
# score on the same rows the model was fitted on (an optimistic estimate)
p.score(Xtrain, ytrain)

0.992

In [95]:
# score on the held-out rows that were not used for fitting
p.score(Xtest, ytest)  # on test data

0.9761904761904762

In [None]:
# we can use p in cross_val_score
# ... or in GridSearchCV

##  Hyperparameter Optimization

We want to:
* try different values for a hyperparameter
* train the model for each
* see which has the best validation score

In [None]:
# NOTE: X is reassigned here; from now on it holds two raw numeric columns of df,
# not the output of the ColumnTransformer
X = df[['Culmen Length (mm)', 'Body Mass (g)']]
y = df['Species']

In [26]:
# manual grid search: cross-validate one tree per (max_depth, min_samples_split) pair
for depth in range(1, 11, 3):
    for min_split in range(2, 20, 7):
        tree = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_split)
        fold_scores = cross_val_score(tree, X, y, cv=5, scoring='accuracy')
        # columns: max_depth, min_samples_split, mean accuracy over the 5 folds
        print(f"{depth:3}    {min_split:3}    {fold_scores.mean():8.3f}")

  1      2       0.740
  1      9       0.740
  1     16       0.740
  4      2       0.916
  4      9       0.910
  4     16       0.901
  7      2       0.904
  7      9       0.916
  7     16       0.901
 10      2       0.910
 10      9       0.913
 10     16       0.901


In [27]:
from sklearn.model_selection import GridSearchCV

In [29]:
# create an estimator
m = DecisionTreeClassifier()

# define our hyperparameters to combine
hyperparams = dict(
    max_depth=[*range(1, 11)],
    min_samples_split=[*range(2, 20, 2)],
)

# exhaustive search: every combination is scored with 5-fold cross-validation
g = GridSearchCV(m, hyperparams, cv=5)
g.fit(X, y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18]})

#### How many models do we train?

with 2 hyperparameters and 10 values each: $5 * 10^2 + 1$ (5 cross-validation fits per combination, plus 1 final refit of the best model)

with 5 hyperparameters and 10 values each: $5 * 10^5 + 1$

In [30]:
# parameter combination with the best mean cross-validation score
g.best_params_  # <-- underscore indicates that this was created by .fit()

{'max_depth': 5, 'min_samples_split': 6}

In [35]:
# mean cross-validated score achieved by the best parameter combination
g.best_score_

0.9252374491180462

In [32]:
# the estimator refitted with the best parameters (GridSearchCV refits by default)
g.best_estimator_  # use this for making predictions

DecisionTreeClassifier(max_depth=5, min_samples_split=6)

In [34]:
# per-combination results; pd.DataFrame(details) gives a table view
details = g.cv_results_  # huge dictionary with all the details

Alternative if your hyperparameter space is very big: `RandomizedSearchCV`, which samples a fixed number of random combinations instead of trying all of them.