In [100]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [101]:
seed = 42

In [102]:
df = pd.read_csv('../data/diabetes.csv')
print(df.sample(5))

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
432            1       80             74             11       60  30.0   
633            1      128             82             17      183  27.5   
216            5      109             62             41      129  35.8   
763           10      101             76             48      180  32.9   
717           10       94             72             18        0  23.1   

     DiabetesPedigreeFunction  Age  Outcome  
432                     0.527   22        0  
633                     0.115   22        0  
216                     0.514   25        1  
763                     0.171   63        0  
717                     0.595   56        0  


In [103]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [104]:
X = df.drop("Outcome", axis=1).values
y = df["Outcome"].values

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [106]:
rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    random_state=seed
    )

rf_clf.fit(X_train, y_train)

In [107]:
y_pred = rf_clf.predict(X_test)

In [108]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[83 16]
 [20 35]]
0.7662337662337663
              precision    recall  f1-score   support

           0       0.81      0.84      0.82        99
           1       0.69      0.64      0.66        55

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.76      0.77      0.76       154



In [109]:
n_estimators = [300, 500, 700, 900, 1000, 1500, 2000, 2500, 3000]
max_depth = [i for i in range(1, 11)]
min_samples_split = [i for i in range(2, 8)]
min_samples_leaf = [i for i in range(2, 8)]
criterion = ['entropy', 'gini']

params = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'criterion': criterion
}

In [110]:
rf_clf = RandomForestClassifier()
rf_clf_rnd_cv = RandomizedSearchCV(estimator=rf_clf, param_distributions=params, n_iter=100, cv=3, n_jobs=-1, verbose=2, random_state=seed, error_score='raise')
rf_clf_rnd_cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [111]:
rf_clf_rnd_cv.best_params_

{'n_estimators': 2500,
 'min_samples_split': 4,
 'min_samples_leaf': 5,
 'max_depth': 7,
 'criterion': 'entropy'}

In [112]:
rf_clf_rnd_cv.best_estimator_

In [113]:
y_pred = rf_clf_rnd_cv.best_estimator_.predict(X_test)

In [114]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[82 17]
 [20 35]]
0.7597402597402597
              precision    recall  f1-score   support

           0       0.80      0.83      0.82        99
           1       0.67      0.64      0.65        55

    accuracy                           0.76       154
   macro avg       0.74      0.73      0.74       154
weighted avg       0.76      0.76      0.76       154



In [115]:
best_n_estimators = rf_clf_rnd_cv.best_params_['n_estimators']
best_max_depth = rf_clf_rnd_cv.best_params_['max_depth']
best_min_samples_split = rf_clf_rnd_cv.best_params_['min_samples_split']
best_min_samples_leaf = rf_clf_rnd_cv.best_params_['min_samples_leaf']
best_criterion = rf_clf_rnd_cv.best_params_['criterion']

In [118]:
params = {
    'n_estimators': [best_n_estimators-val for val in range(-500, 501, 500) if best_n_estimators-val > 0],
    'max_depth': [best_max_depth-val for val in range(-2, 3) if best_max_depth-val > 0],
    'min_samples_split': [best_min_samples_split-val for val in range(-2, 3) if best_min_samples_split-val > 1],
    'min_samples_leaf': [best_min_samples_leaf-val for val in range(-2, 3) if best_min_samples_leaf-val > 1],
    'criterion': [best_criterion]
}

In [119]:
rf_clf = RandomForestClassifier()
rf_clf_grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, cv=3, n_jobs=-1, verbose=2, error_score='raise')
rf_clf_grid_cv.fit(X_train, y_train)

Fitting 3 folds for each of 375 candidates, totalling 1125 fits


In [120]:
rf_clf_grid_cv.best_params_

{'criterion': 'entropy',
 'max_depth': 9,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 2000}

In [121]:
rf_clf_grid_cv.best_estimator_

In [122]:
y_pred = rf_clf_grid_cv.best_estimator_.predict(X_test)

In [123]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[81 18]
 [19 36]]
0.7597402597402597
              precision    recall  f1-score   support

           0       0.81      0.82      0.81        99
           1       0.67      0.65      0.66        55

    accuracy                           0.76       154
   macro avg       0.74      0.74      0.74       154
weighted avg       0.76      0.76      0.76       154



In [124]:
n_estimators = [300, 500, 700, 900, 1000, 1500, 2000, 2500, 3000]
max_depth = [val for val in range(2, 11)]
criterion = ['entropy', 'gini']

In [125]:
space = {
    'n_estimators': hp.choice('n_estimators', n_estimators),
    'max_depth': hp.choice('max_depth', max_depth),
    'min_samples_split': hp.uniform('min_samples_split', 0, 0.5),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 1),
    'criterion': hp.choice('criterion', criterion)
}

In [126]:
def objective(space):
    rf_clf_bayesian = RandomForestClassifier(
        n_estimators=space['n_estimators'],
        max_depth=space['max_depth'],
        min_samples_split=space['min_samples_split'],
        min_samples_leaf=space['min_samples_leaf'],
        criterion=space['criterion']
        )

    accuracy = cross_val_score(rf_clf_bayesian, X_train, y_train, cv=5).mean()

    return {'loss': -accuracy, 'status': STATUS_OK}

In [127]:
trials = Trials()
best_model = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

100%|██████████| 100/100 [15:44<00:00,  9.45s/trial, best loss: -0.775249900039984]


In [128]:
best_model

{'criterion': 1,
 'max_depth': 7,
 'min_samples_leaf': 0.0007513173142764621,
 'min_samples_split': 0.012168767812794953,
 'n_estimators': 6}

In [129]:
rf_clf_bayesian = RandomForestClassifier(
    n_estimators=n_estimators[best_model['n_estimators']],
    max_depth=max_depth[best_model['max_depth']],
    min_samples_split=best_model['min_samples_split'],
    min_samples_leaf=best_model['min_samples_leaf'],
    criterion=criterion[best_model['criterion']]
    )
rf_clf_bayesian.fit(X_train, y_train)

In [130]:
y_pred = rf_clf_bayesian.predict(X_test)

In [131]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[80 19]
 [19 36]]
0.7532467532467533
              precision    recall  f1-score   support

           0       0.81      0.81      0.81        99
           1       0.65      0.65      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154

