In [26]:
import matplotlib.pyplot as plt
import pandas as pd
import pydataset

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, precision_score

In [9]:
tips = pydataset.data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [10]:
X = tips[['tip', 'total_bill', 'size']]
y = tips.time

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=.2)

In [11]:
dtree = DecisionTreeClassifier(max_depth=4)

cross_val_score(dtree, X_train, y_train, cv=4).mean()

0.6973852040816326

### using precision 'traditionally'

In [15]:

dtree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [14]:
predicted = dtree.predict(X_train)
actual = y_train

precision_score(actual, predicted, pos_label='Dinner')

0.8571428571428571

### Using precision as a metric for cross validation

In [16]:
precision_scorer = make_scorer(precision_score, pos_label="Dinner")

# decision tree with max depth of 4
cross_val_score(dtree, X_train, y_train, cv=4, scoring=precision_scorer).mean()

0.7387297937569676

In [17]:
cross_val_score(DecisionTreeClassifier(max_depth=3), X_train, y_train, scoring=precision_scorer).mean()

0.7396049896049897

## **Grid Search**

In [19]:
# keys are names of hyperparams, values are a list of values to try for that hyper parameter
params = {
    'max_depth': range(1, 11),
    'criterion': ['gini', 'entropy']
}

# cv=4 means 4-fold cross-validation, i.e. k = 4
grid = GridSearchCV(dtree, params, cv=4)
grid.fit(X_train, y_train)

grid.best_params_

{'criterion': 'entropy', 'max_depth': 3}

In [21]:
# .best_estimator_  creates. model that is prefit w/ best hyperparams
model = grid.best_estimator_
model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [22]:
model.score(X_test, y_test)

0.6530612244897959

In [23]:
grid.best_score_

0.7387329931972789

In [25]:
results = grid.cv_results_
results

{'mean_fit_time': array([0.00214207, 0.00210577, 0.00184423, 0.00172091, 0.00170124,
        0.0016523 , 0.00140792, 0.0013901 , 0.00138927, 0.00148499,
        0.00134373, 0.00136459, 0.00136095, 0.00143248, 0.00142854,
        0.00150377, 0.00149578, 0.00145835, 0.00146401, 0.00144434]),
 'std_fit_time': array([1.39909712e-04, 1.46476192e-04, 4.75976211e-05, 6.41598528e-05,
        2.44570866e-05, 3.41548137e-05, 6.25603208e-05, 1.22785568e-05,
        3.35484678e-06, 6.06948569e-05, 1.86972465e-06, 1.52042091e-05,
        2.34436517e-06, 4.54965420e-05, 6.80511740e-06, 5.73455851e-05,
        5.21208794e-05, 4.45203208e-06, 3.92667510e-06, 2.88511110e-05]),
 'mean_score_time': array([0.00105429, 0.00102848, 0.0008629 , 0.00077516, 0.00078177,
        0.00077403, 0.00062174, 0.00063562, 0.00062543, 0.00064325,
        0.00063747, 0.00067699, 0.00062507, 0.00063664, 0.00064427,
        0.00064433, 0.00063801, 0.00063044, 0.00063109, 0.00061572]),
 'std_score_time': array([1.35553596e-

In [27]:
plt.ion()
plt.rc('figure', figsize=(13,7))
plt.style.use('seaborn-darkgrid')

## Code can be reused as long as we use the name grid

In [28]:
results = grid.cv_results_
# Modify each parameter dictionary such that it also contains the model's average performance after cross validation
for score, p in zip(results['mean_test_score'], results['params']):
    # adding a new key to the dictionary
    p['score'] = score

# a list of dictionaries where each dictionary has the same keys can be turned into a dataframe
results['params']

pd.DataFrame(results['params'])

Unnamed: 0,criterion,max_depth,score
0,gini,1,0.733418
1,gini,2,0.733418
2,gini,3,0.718112
3,gini,4,0.692283
4,gini,5,0.625744
5,gini,6,0.671662
6,gini,7,0.651042
7,gini,8,0.66656
8,gini,9,0.630634
9,gini,10,0.671662


In [29]:
# Everything put together as one
# keys are names of hyperparams, values are a list of values to try for that hyper parameter
params = {
    'max_depth': range(1, 11),
    'criterion': ['gini', 'entropy']
}

# cv=4 means 4-fold cross-validation, i.e. k = 4
grid = GridSearchCV(dtree, params, cv=15)
grid.fit(X_train, y_train)

results = grid.cv_results_

for score, p in zip(results['mean_test_score'], results['params']):
    p['score'] = score
pd.DataFrame(results['params'])

Unnamed: 0,criterion,max_depth,score
0,gini,1,0.733333
1,gini,2,0.723077
2,gini,3,0.712821
3,gini,4,0.702564
4,gini,5,0.702564
5,gini,6,0.666667
6,gini,7,0.692308
7,gini,8,0.666667
8,gini,9,0.682051
9,gini,10,0.671795
