In [9]:
import os
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score , accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , MinMaxScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import pickle

import warnings
warnings.filterwarnings("ignore")

In [4]:
def preprocess(fname ='diabetes_012_health_indicators_BRFSS2015.csv', target_col='Diabetes_012',
               drop_one_hot_source=False):
    """Load a health-indicators CSV and return scaled/encoded features and the target.

    Column handling (columns absent from the file are simply skipped):
      * ``Income``, ``Education``, ``Age``  -> min-max scaled with ``MinMaxScaler``
      * ``BMI``, ``PhysHlth``, ``MentHlth`` -> standardized with ``StandardScaler``
      * ``GenHlth``                         -> dummy columns ``GenHlth_<level>`` are
        appended after the original columns
      * every other column                  -> passed through unchanged

    Parameters
    ----------
    fname : str or file-like
        Anything accepted by ``pandas.read_csv``.
    target_col : str
        Name of the label column in the CSV; it is renamed to ``"target"``.
    drop_one_hot_source : bool, default False
        When False (historical behaviour) the raw one-hot source column
        (``GenHlth``) is kept next to its dummy columns. Set to True to drop it
        and avoid feeding the same information to the model twice.

    Returns
    -------
    (X, y) : tuple of (pandas.DataFrame, pandas.Series)
        ``X`` holds the processed features, ``y`` the target named ``"target"``.

    Raises
    ------
    KeyError
        If neither ``target_col`` nor a column named ``"target"`` is present.

    Notes
    -----
    The class balance of ``y`` is printed as a side effect.
    The scalers are fitted on every row of the file, i.e. before any
    train/test split the caller performs afterwards, so test-set statistics
    influence the scaling of the training data.
    """
    data = pd.read_csv(fname).rename(columns={target_col: 'target'})
    if 'target' not in data.columns:
        # Same exception type as before, but it now names the real culprit.
        raise KeyError("target column {!r} not found in {!r}".format(target_col, fname))

    y = data["target"]
    X = data.drop(columns=['target'])
    print(y.value_counts())

    continuous = ["BMI", "PhysHlth", "MentHlth"]   # continuous
    ordinal = ['Income', "Education", "Age"]       # ordinal with range
    one_hot_cols = ['GenHlth']                     # ordinal, categorical

    standard, minmax = StandardScaler(), MinMaxScaler()

    def _scaled(scaler, column):
        # The scalers expect a 2-D (n_samples, 1) array; flatten the result
        # again so it can be stored as a regular column.
        as_2d = X[column].to_numpy().reshape(-1, 1)
        return np.asarray(scaler.fit_transform(as_2d)).ravel()

    dummy_frames = []
    for col in list(X.columns):
        if col in ordinal:
            X[col] = _scaled(minmax, col)
        elif col in continuous:
            X[col] = _scaled(standard, col)
        elif col in one_hot_cols:
            # Use a separate name for the dummies: rebinding the list of
            # one-hot column names here would stop any later one-hot column
            # from being recognised.
            dummies = pd.get_dummies(X[col])
            dummies.columns = [col + "_" + str(level) for level in dummies.columns]
            dummy_frames.append(dummies)

    if dummy_frames:
        X = pd.concat([X] + dummy_frames, axis=1)
    if drop_one_hot_source:
        X = X.drop(columns=[c for c in one_hot_cols if c in X.columns])
    return X, y

### Grid Search
* Update `score`, `dataset`, `out_folder`, and the `target_col` passed to `preprocess` for the corresponding binary or multiclass case
* The user can choose between a randomized search over a larger search space and a full 5-fold grid search over a refined set of parameters
* For the project, I ran the randomized search first to identify the important parameters, and then ran a full grid search over them

In [20]:
# ===================== Configuration ========================================
# dataset = "Dataset/diabetes_012_health_indicators_BRFSS2015.csv"
dataset = "data/Dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv"

score = 'f1'
#score = 'f1_weighted'
# default to current working directory, suggest making "results" folder
out_folder = os.getcwd()
# One seed for the splits and the forest so a re-run reproduces the results.
SEED = 42

# Create the output folder up front so a bad path fails before the (long)
# search instead of after it.
os.makedirs(out_folder, exist_ok=True)

# ===================== Load and split =======================================
X, y = preprocess(dataset, target_col='Diabetes_binary')

# 70% train, then the remaining 30% is split 67/33 into test (~20% of all
# rows) and validation (~10% of all rows). Every split is stratified.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.33, stratify=y_holdout, random_state=SEED)

# ===================== Search spaces ========================================
# Large space, intended for the randomized search below.
random_grid = {
    'bootstrap'         : [True, False],
    "n_estimators"      : [int(i) for i in np.linspace(10, 300, 30)],
    "criterion"         : ['gini', 'entropy', 'log_loss'],
    "max_depth"         : [2, 4, 6, 8, 10, 12, 14, 16, 20, 28, 32, 40],
    "max_features"      : ['sqrt', 'log2'],
    "min_samples_split" : [2, 3, 4, 8, 10, 12, 18],
    "min_samples_leaf"  : [2, 4, 6, 8, 10, 14, 18, 20],
}

# Refined space for the exhaustive grid search.
grid = {
    "n_estimators"      : [20, 60, 100, 120, 140, 180],
    "max_depth"         : [8, 16, 20, 40, 60, 80, 100],
    "min_samples_split" : [2, 4, 6],
    "min_samples_leaf"  : [1, 2],
}

# Seeded so the search and the final model are reproducible.
rf = RandomForestClassifier(random_state=SEED)

#========== Choose to Do Randomized Search for bigger grid / faster search ========================
# search = RandomizedSearchCV(estimator = rf,
#                                     param_distributions = random_grid,
#                                     n_iter = 70,
#                                     cv = 5, verbose = 3,
#                                     scoring = score,
#                                     random_state = SEED , n_jobs = -1)
# search.fit(X_val, y_val)
# test = pd.DataFrame(search.cv_results_)
# best_clf = search.best_estimator_

# ===================== Choose Full Grid Search ============================================
# The hyper-parameter search runs on the small validation split only, to keep
# it affordable; n_jobs=-1 spreads the fits over all available cores.
print("searching...")
search = GridSearchCV(rf, param_grid=grid, scoring=score,
                      cv=KFold(n_splits=5), n_jobs=-1)
search.fit(X_val, y_val)
best_clf = search.best_estimator_
print("done")

# ######## Fit on the entire training set #################
# Without this step the evaluated and saved model would only ever have seen
# the validation split, and X_train would go unused.
best_clf.fit(X_train, y_train)

#================== Evaluate ================================================
# NOTE: the metrics below assume a binary target (default f1 averaging and
# the probability of class index 1 for AUC); adapt them for the multiclass file.
y_pred = best_clf.predict(X_test)
y_score = best_clf.predict_proba(X_test)[:, 1]
print("Predictions | Ground Truth")
print(np.unique(y_pred, return_counts=True))
print(np.unique(y_test, return_counts=True))
print("Bin Accuracy: " , accuracy_score(y_test, y_pred))
print("Bin F1: ", f1_score(y_test, y_pred) )
print("Bin AUC: ", roc_auc_score(y_test, y_score))
print("Bin Recall: ", sklearn.metrics.recall_score(y_test, y_pred))
print("Bin Precision: ", sklearn.metrics.precision_score(y_test, y_pred))
print(sklearn.metrics.confusion_matrix(y_test, y_pred))

#================= Save Feature Importances and Model =======================
importances = pd.DataFrame(
    zip(X_train.columns, list(best_clf.feature_importances_)),
    columns=["feature", "rf_importance"],
).sort_values("rf_importance", ascending=False)
importances.to_csv(os.path.join(out_folder, 'feature-importances.csv'))
pd.DataFrame(search.cv_results_).to_csv(os.path.join(out_folder, 'grid_search_data.csv'))

name = os.path.join(out_folder, 'best_model.pkl')
with open(name, 'wb') as f:
    pickle.dump(best_clf, f)

# Last expression of the cell so the notebook actually renders the table.
importances.head(20)

0.0    35346
1.0    35346
Name: target, dtype: int64
searching...


KeyboardInterrupt: 