In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

df = pd.read_csv('pima-indians-diabetes.csv')
df.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [2]:
df.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class']
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


- Separate the dataset on features (X) and target (y)
- Make the train/test split

In [3]:
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

<br>

## Baseline model
- Model with default hyperparameters

In [4]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)

print(f'Accuracy = {round(accuracy_score(y_test, preds), 2)}')
print()
print(confusion_matrix(y_test, preds))

Accuracy = 0.77

[[111  20]
 [ 24  37]]


<br>

## Manual hyperparameter optimization - Method #1
- Declare parameter dictionaries beforehand
- Train and evaluate multiple models
- Can become really tedious really fast
- Not scalable

In [5]:
# 3 sets of hyperparameters
params_1 = {'criterion': 'gini', 'splitter': 'best', 'max_depth': 10}
params_2 = {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 1000}
params_3 = {'criterion': 'gini', 'splitter': 'random', 'max_depth': 100}

# 3 separate models
model_1 = DecisionTreeClassifier(**params_1)
model_2 = DecisionTreeClassifier(**params_2)
model_3 = DecisionTreeClassifier(**params_3)

model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)

# 3 separate prediction sets
preds_1 = model_1.predict(X_test)
preds_2 = model_3.predict(X_test)
preds_3 = model_2.predict(X_test)

print(f'Accuracy on Model 1 = {round(accuracy_score(y_test, preds_1), 5)}')
print(f'Accuracy on Model 2 = {round(accuracy_score(y_test, preds_2), 5)}')
print(f'Accuracy on Model 3 = {round(accuracy_score(y_test, preds_3), 5)}')

Accuracy on Model 1 = 0.78125
Accuracy on Model 2 = 0.73958
Accuracy on Model 3 = 0.72396


<br>

## Manual hyperparameter optimization - Method #2
- Better than the first method
- Still way too manual 
- Nested `for` loops don't look nice

In [6]:
# Define parameter possibilities as lists
p_criterion = ['gini', 'entropy']
p_splitter = ['best', 'random']
p_max_depth = [1, 10, 100, 1000]
# The scores will go here
results = []

# Nested loops - we need to test for all combinations
for criterion in p_criterion:
    for splitter in p_splitter:
        for max_depth in p_max_depth:
            # Train the model
            model = DecisionTreeClassifier(
                criterion=criterion,
                splitter=splitter,
                max_depth=max_depth
            )
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            # Append current results
            results.append({
                'Accuracy': round(accuracy_score(y_test, preds), 5),
                'P_Criterion': criterion,
                'P_Splitter': splitter,
                'P_MaxDepth': max_depth
            })
            
# Convert to Pandas DataFrame and sort descendingly by accuracy
results = pd.DataFrame(results)
results = results.sort_values(by='Accuracy', ascending=False)
results

Unnamed: 0,Accuracy,P_Criterion,P_Splitter,P_MaxDepth
10,0.78125,entropy,best,100
2,0.77083,gini,best,100
3,0.77083,gini,best,1000
11,0.77083,entropy,best,1000
1,0.76562,gini,best,10
9,0.76042,entropy,best,10
14,0.75521,entropy,random,100
0,0.74479,gini,best,1
8,0.74479,entropy,best,1
13,0.73958,entropy,random,10


<br>

## Go-to approach: `GridSearchCV`
- Define model and hyperparameter space beforehand
- Use `GridSearchCV` for optimization
- Also does the cross validation for you

In [7]:
model = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 10, 100, 1000]
}

clf = GridSearchCV(
    estimator=model, 
    param_grid=params, 
    cv=10,  # 10-fold cross validation
    n_jobs=-1  # run in parallel
)
clf.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 10, 100, 1000],
                         'splitter': ['best', 'random']})

- Convert best parameters array to a Pandas DataFrame:

In [8]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005304,0.002049,0.003155,0.001299,gini,1,best,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.706897,0.706897,...,0.793103,0.741379,0.789474,0.701754,0.701754,0.701754,0.736842,0.732123,0.033527,1
1,0.004606,0.001153,0.002953,0.00051,gini,1,random,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.62069,0.603448,...,0.655172,0.672414,0.666667,0.649123,0.649123,0.631579,0.736842,0.667816,0.05387,14
2,0.006952,0.001679,0.003963,0.001879,gini,10,best,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",0.603448,0.637931,...,0.758621,0.706897,0.719298,0.807018,0.701754,0.684211,0.701754,0.69761,0.0554,5
3,0.005152,0.001724,0.00264,0.000133,gini,10,random,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",0.655172,0.689655,...,0.793103,0.724138,0.754386,0.701754,0.719298,0.649123,0.649123,0.695644,0.050969,9
4,0.006462,0.000468,0.002888,0.000207,gini,100,best,"{'criterion': 'gini', 'max_depth': 100, 'split...",0.62069,0.706897,...,0.706897,0.689655,0.719298,0.789474,0.684211,0.684211,0.719298,0.69758,0.042062,6


- Keep only what matters
- Sort descendingly by average test score

In [9]:
cv_results = cv_results[['mean_test_score', 'param_criterion', 'param_splitter', 'param_max_depth']]
cv_results.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,param_criterion,param_splitter,param_max_depth
0,0.732123,gini,best,1
8,0.732123,entropy,best,1
11,0.721476,entropy,random,10
6,0.697671,gini,best,1000
2,0.69761,gini,best,10
4,0.69758,gini,best,100
14,0.69755,entropy,best,1000
10,0.695735,entropy,best,10
3,0.695644,gini,random,10
12,0.681942,entropy,best,100


- Get the best parameters

In [10]:
clf.best_params_

{'criterion': 'gini', 'max_depth': 1, 'splitter': 'best'}