# Module 03

## Session 06 Model Performance, Evaluation Method and Hyperparameter Tuning

# Model Evaluation 2

Analyze tips data from seaborn:
* total bill, sex, smoker, day and time as feature
* tips as Target

Preprocess
* one hot encoding: smoker, day, time, sex
* no treatment: numerical

Random state 2020, splitting 80:20

model
* model linear regression and tree (criterion mse, max_depth=5) compute R2 using 5-fold CV
* model decision tree(criterion mse, max_depth 5) compute mse in test set
* hyperparameter tuning (randomized search) for decision tree (optimize criterion, min sample leaf, max depth) optimized by mse and using cv-5 fold
* compare the result before and after tuning in test set

# Library

In [48]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, RandomizedSearchCV

# Data

In [2]:
tips = sns.load_dataset('tips')

In [3]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


# Data Splitting

In [5]:
X = tips[['total_bill','sex', 'smoker', 'day', 'time']]
X = pd.get_dummies(X, drop_first=True)
y = tips['tip']

In [6]:
X

Unnamed: 0,total_bill,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1,1,0,0,1,1
1,10.34,0,1,0,0,1,1
2,21.01,0,1,0,0,1,1
3,23.68,0,1,0,0,1,1
4,24.59,1,1,0,0,1,1
...,...,...,...,...,...,...,...
239,29.03,0,1,0,1,0,1
240,27.18,1,0,0,1,0,1
241,22.67,0,0,0,1,0,1
242,17.82,0,1,0,1,0,1


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020
)

# Modeling

In [13]:
from sklearn.metrics import SCORERS

In [14]:
SCORERS

{'explained_variance': make_scorer(explained_variance_score),
 'r2': make_scorer(r2_score),
 'max_error': make_scorer(max_error, greater_is_better=False),
 'neg_median_absolute_error': make_scorer(median_absolute_error, greater_is_better=False),
 'neg_mean_absolute_error': make_scorer(mean_absolute_error, greater_is_better=False),
 'neg_mean_absolute_percentage_error': make_scorer(mean_absolute_percentage_error, greater_is_better=False),
 'neg_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False),
 'neg_mean_squared_log_error': make_scorer(mean_squared_log_error, greater_is_better=False),
 'neg_root_mean_squared_error': make_scorer(mean_squared_error, greater_is_better=False, squared=False),
 'neg_mean_poisson_deviance': make_scorer(mean_poisson_deviance, greater_is_better=False),
 'neg_mean_gamma_deviance': make_scorer(mean_gamma_deviance, greater_is_better=False),
 'accuracy': make_scorer(accuracy_score),
 'top_k_accuracy': make_scorer(top_k_accuracy_score, ne

In [9]:
tree = DecisionTreeRegressor(criterion='mse', max_depth=5, random_state=2020)
linreg = LinearRegression()

In [21]:
tree_cv = cross_val_score(tree, X_train, y_train, scoring='r2')

In [23]:
print('score cv: ', tree_cv)
print('mean score cv: ', tree_cv.mean())
print('std score cv: ', tree_cv.std())

score cv:  [0.11865437 0.09663697 0.13388799 0.60844872 0.41722998]
mean score cv:  0.27497160666626597
std score cv:  0.2037583555823742


In [22]:
linreg_cv = cross_val_score(linreg, X_train, y_train, scoring='r2')

In [24]:
print('score cv: ', linreg_cv)
print('mean score cv: ', linreg_cv.mean())
print('std score cv: ', linreg_cv.std())

score cv:  [0.27243371 0.18984475 0.31816682 0.46456813 0.44769515]
mean score cv:  0.33854171153933316
std score cv:  0.10458807015801658


# Modeling Tree in Test Set

In [25]:
tree = DecisionTreeRegressor(criterion='mse', max_depth=5, random_state=2020)

In [26]:
tree.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=5, random_state=2020)

In [27]:
y_pred = tree.predict(X_test)
print('mse: ', mean_squared_error(y_test, y_pred))

mse:  1.1723927020084088


# Hyperparameter Tuning for Tree

In [37]:
hyperparam = {
    'min_samples_leaf': [1,5,10,15,20,50,100],
    'max_depth':[2,3,4,5,6,7],
    'criterion':['mse','mae']
}

tree = DecisionTreeRegressor(criterion='mse', max_depth=5, random_state=2020)

random_search = RandomizedSearchCV(
    tree,
    param_distributions=hyperparam,
    n_iter=72,
    cv=5,
    scoring='r2',
    random_state=2020,
    n_jobs=-1
)

In [38]:
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=DecisionTreeRegressor(max_depth=5,
                                                   random_state=2020),
                   n_iter=72, n_jobs=-1,
                   param_distributions={'criterion': ['mse', 'mae'],
                                        'max_depth': [2, 3, 4, 5, 6, 7],
                                        'min_samples_leaf': [1, 5, 10, 15, 20,
                                                             50, 100]},
                   random_state=2020, scoring='r2')

In [41]:
random_search.best_score_

0.4335277785825589

In [42]:
random_search.best_params_

{'min_samples_leaf': 1, 'max_depth': 5, 'criterion': 'mae'}

In [43]:
cv_result = pd.DataFrame(random_search.cv_results_)

In [44]:
cv_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003209,0.000563,0.001369,0.000704,50,5,mae,"{'min_samples_leaf': 50, 'max_depth': 5, 'crit...",0.249158,0.169275,0.134417,0.209424,0.199847,0.192424,0.038636,55
1,0.001937,0.000615,0.001373,0.000478,1,3,mse,"{'min_samples_leaf': 1, 'max_depth': 3, 'crite...",0.401605,0.344567,0.172655,0.660094,0.557267,0.427238,0.169371,2
2,0.002190,0.000473,0.001103,0.000591,20,6,mae,"{'min_samples_leaf': 20, 'max_depth': 6, 'crit...",0.290600,-0.038291,0.238016,0.347737,0.343542,0.236321,0.143018,38
3,0.001968,0.000755,0.000983,0.000544,20,3,mae,"{'min_samples_leaf': 20, 'max_depth': 3, 'crit...",0.290600,-0.038291,0.238016,0.347737,0.343542,0.236321,0.143018,38
4,0.002133,0.000994,0.000883,0.000593,1,7,mse,"{'min_samples_leaf': 1, 'max_depth': 7, 'crite...",0.033806,-0.028286,-0.134957,0.527352,0.487819,0.177147,0.275434,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.000984,0.000047,0.000512,0.000012,100,6,mae,"{'min_samples_leaf': 100, 'max_depth': 6, 'cri...",-0.008075,-0.000119,-0.009128,-0.077183,-0.106279,-0.040157,0.043216,68
68,0.001421,0.000110,0.000556,0.000060,20,7,mae,"{'min_samples_leaf': 20, 'max_depth': 7, 'crit...",0.290600,-0.038291,0.238016,0.347737,0.343542,0.236321,0.143018,38
69,0.000887,0.000074,0.000493,0.000018,100,4,mae,"{'min_samples_leaf': 100, 'max_depth': 4, 'cri...",-0.008075,-0.000119,-0.009128,-0.077183,-0.106279,-0.040157,0.043216,68
70,0.000829,0.000084,0.000490,0.000022,100,3,mae,"{'min_samples_leaf': 100, 'max_depth': 3, 'cri...",-0.008075,-0.000119,-0.009128,-0.077183,-0.106279,-0.040157,0.043216,68


# Comparison Before & After Tuning

In [45]:
tree_before = DecisionTreeRegressor(criterion='mse', max_depth=5, random_state=2020)
tree_after = DecisionTreeRegressor(criterion='mae', max_depth=5, random_state=2020)
logreg = LinearRegression()

In [47]:
tree_before.fit(X_train, y_train)
y_pred = tree_before.predict(X_test)
print(mean_squared_error(y_test, y_pred))

1.1723927020084088


In [49]:
def final_eval(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(mean_squared_error(y_test, y_pred))
    print(r2_score(y_test, y_pred))

In [50]:
final_eval(tree_before)

1.1723927020084088
0.456139969823803


In [51]:
final_eval(tree_after)

1.2579158163265307
0.4164667413448675


In [52]:
final_eval(linreg)

1.3375316267761348
0.379533845908511
