# Models for Diabetes Classification

In this notebook, we will use the k Nearest Neighbors, Support Vector Machine, and Random Forest classifiers for predicting whether patients with the given health indicators have diabetes. This analysis will use only the balanced binary data set.

In [3]:
import pandas as pd
import numpy as np

# Read data
data = pd.read_csv("data/diabetes_binary_health_indicators_BRFSS2015.csv")

X = data.drop("Diabetes_binary", axis=1)
Y = data["Diabetes_binary"]

In [None]:
from sklearn.model_selection import train_test_split

# Make a 80-20 train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2 , random_state=51235)

In [21]:
from sklearn.preprocessing import StandardScaler

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## K Nearest Neighbors

We use an exhaustive grid search to find the best combination of hyperparameters `n_neighbors` and `weights` to use.

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn_grid_cv = GridSearchCV(KNeighborsClassifier(),
                           param_grid = {'n_neighbors': range(1, 50, 2),
                                         'weights': ['uniform', 'distance']},
                           scoring = 'accuracy',
                           cv = 5)
knn_grid_cv.fit(X_train, Y_train)

In [10]:
knn_grid_cv.cv_results_

{'mean_fit_time': array([0.01270494, 0.01427999, 0.01383448, 0.0156054 , 0.01343889,
        0.01862645, 0.01397028, 0.0153511 , 0.01584587, 0.01514778,
        0.01524615, 0.01644907, 0.01587605, 0.01478181, 0.01488776,
        0.01517806, 0.01585984, 0.01457162, 0.01356649, 0.01457829,
        0.01472855, 0.01502275, 0.01613698, 0.01398354, 0.01455226,
        0.01250644, 0.01629992, 0.01377053, 0.01565299, 0.0135675 ,
        0.01538434, 0.01423731, 0.01628256, 0.0145308 , 0.01451459,
        0.01456151, 0.01944871, 0.02078223, 0.02078414, 0.02365546,
        0.021101  , 0.01824765, 0.01731648, 0.01649613, 0.02019477,
        0.0178721 , 0.01787653, 0.01718793, 0.01929522, 0.01911621]),
 'std_fit_time': array([0.00267441, 0.00114319, 0.00112891, 0.00281463, 0.00185574,
        0.00391679, 0.00079225, 0.00244847, 0.00129738, 0.00128475,
        0.00178811, 0.00507904, 0.00170453, 0.00162554, 0.00145019,
        0.00187666, 0.00219947, 0.00135929, 0.00105646, 0.00272622,
        0.000

In [11]:
knn_grid_cv.best_params_

{'n_neighbors': 41, 'weights': 'uniform'}

In [22]:
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score

knn = KNeighborsClassifier(n_neighbors = 41)
knn.fit(X_train, Y_train)
y_pred = knn.predict(X_test)

print('KNN accuracy: {:.4f}'.format(knn.score(X_test, Y_test)))
print('KNN roc_auc: {:.4f}'.format(roc_auc_score(Y_test, y_pred)))
print('KNN recall: {:.4f}'.format(recall_score(Y_test, y_pred)))

KNN accuracy: 0.7426
KNN roc_auc: 0.7428
KNN recall: 0.7881


## Support Vector Machine

We use an exhaustive grid search to find the best `C` to use for the linear SVM.

In [33]:
from sklearn.svm import LinearSVC

svc_grid_cv = GridSearchCV(LinearSVC(),
                           param_grid = {'C': np.logspace(-4, 1, 50),
                                         'dual': ['auto']},
                           scoring = 'accuracy')
svc_grid_cv.fit(X_train, Y_train)

In [34]:
svc_grid_cv.cv_results_

{'mean_fit_time': array([0.11289558, 0.11351786, 0.11503992, 0.12362618, 0.1278245 ,
        0.12048035, 0.12340956, 0.1422286 , 0.15346522, 0.16411738,
        0.1605463 , 0.16078663, 0.15651588, 0.15637865, 0.15635314,
        0.16062241, 0.15900269, 0.16309171, 0.18579741, 0.17468104,
        0.17855792, 0.18301287, 0.16069503, 0.15976119, 0.15686483,
        0.1553947 , 0.15312881, 0.16592278, 0.15870309, 0.15907388,
        0.15582438, 0.15427251, 0.15412068, 0.15126076, 0.1526053 ,
        0.15852671, 0.16347852, 0.15623317, 0.17866135, 0.18013778,
        0.17925963, 0.16458063, 0.16080799, 0.15671778, 0.17043729,
        0.15953765, 0.15156455, 0.15793767, 0.16061015, 0.16088119]),
 'std_fit_time': array([0.00314592, 0.00681564, 0.00194125, 0.00452142, 0.00782685,
        0.00210131, 0.00409995, 0.0060487 , 0.00516471, 0.00649437,
        0.00261794, 0.00184752, 0.0033565 , 0.00273291, 0.00276261,
        0.00460271, 0.00397498, 0.00439954, 0.04289085, 0.00283017,
        0.003

In [35]:
svc_grid_cv.best_params_

{'C': 0.010985411419875584, 'dual': 'auto'}

In [37]:
from sklearn.svm import LinearSVC

svc = LinearSVC(C = 0.011,
                dual = 'auto')
svc.fit(X_train, Y_train)
y_pred = svc.predict(X_test)

print('SVC accuracy: {:.4f}'.format(svc.score(X_test, Y_test)))
print('SVC roc_auc: {:.4f}'.format(roc_auc_score(Y_test, y_pred)))
print('SVC recall: {:.4f}'.format(recall_score(Y_test, y_pred)))

SVC accuracy: 0.7510
SVC roc_auc: 0.7510
SVC recall: 0.7743


## Random Forest

We use an exhaustive grid search to find the best combination of hyperparameters `n_estimators` and `max_depth` to use.

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
rf_grid_cv = GridSearchCV(RandomForestClassifier(),
                           param_grid = {'n_estimators': range(1,500, 50),
                                         'max_depth': range(1, 13)},
                           scoring = 'accuracy')
rf_grid_cv.fit(X_train, Y_train)

In [38]:
rf_grid_cv.cv_results_

{'mean_fit_time': array([ 0.01791191,  0.33689537,  0.71299434,  1.08477631,  1.33126354,
         1.63607531,  1.99698057,  2.3532733 ,  2.74187183,  3.1780283 ,
         0.0176682 ,  0.45015731,  0.93094893,  1.42378178,  1.99100366,
         2.36288867,  2.88747139,  3.381078  ,  3.685814  ,  4.13270631,
         0.02074451,  0.53349638,  1.34796858,  1.8582952 ,  2.13907881,
         2.74375415,  3.48347211,  3.7556735 ,  4.05378022,  4.42877522,
         0.02019672,  0.66439266,  1.13922114,  1.80292001,  2.31866961,
         3.28706412,  3.88599625,  4.82469854,  5.62210712,  6.01216688,
         0.02539592,  0.8625607 ,  1.53402338,  2.51817322,  3.24313822,
         4.2854527 ,  4.97696228,  6.05981164,  6.99924908,  6.82450986,
         0.02728391,  0.9756536 ,  1.75696015,  2.61729774,  3.42145705,
         4.44047832,  5.46894531,  5.96578889,  6.8998332 ,  7.8144424 ,
         0.0276895 ,  0.97467008,  1.98426514,  3.08685336,  3.8150403 ,
         4.78044028,  5.83853307, 

In [7]:
rf_grid_cv.best_params_

{'max_depth': 12, 'n_estimators': 201}

We refine the previous grid search by narrowing the search ranges around the best parameters of the previous search and decreasing the step size.

In [39]:
rf_grid_cv2 = GridSearchCV(RandomForestClassifier(),
                           param_grid = {'n_estimators': range(150,250,10),
                                         'max_depth': range(11,14)},
                           scoring = 'accuracy')
rf_grid_cv2.fit(X_train, Y_train)

In [40]:
rf_grid_cv2.cv_results_

{'mean_fit_time': array([ 5.33153896,  5.23052182,  5.61882157,  5.93952546,  6.70060539,
         7.7572536 ,  9.20897765,  9.31464238,  9.66820498,  9.5589036 ,
         7.4210114 ,  7.9195693 ,  8.04572487,  8.90580316,  7.95333104,
         7.61542673,  9.2796586 , 10.31109743, 11.18063469, 11.20688915,
         7.8328042 ,  8.48582702,  8.77008371, 10.21845384, 11.48199544,
         8.76331196,  9.04494667,  9.56833243, 11.48829265, 12.00900297]),
 'std_fit_time': array([0.53767753, 0.15057987, 0.32989068, 0.18510792, 0.52907788,
        1.21476363, 1.30430392, 0.65549264, 1.05481672, 0.44961071,
        0.6928795 , 1.32480571, 0.45443258, 0.84291093, 1.08129713,
        0.35854441, 1.54996964, 0.79805379, 0.49743461, 0.72356912,
        0.73704865, 0.70917328, 0.47374132, 1.03476369, 2.79988338,
        0.32645436, 0.45864776, 0.59628278, 1.09032032, 0.32641575]),
 'mean_score_time': array([0.31509266, 0.35334558, 0.33360672, 0.37622662, 0.40264831,
        0.5261035 , 0.55543151

In [41]:
rf_grid_cv2.best_params_

{'max_depth': 11, 'n_estimators': 160}

In [42]:
rf = rf_grid_cv2.best_estimator_
rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)

print('Random Forest accuracy: {:.4f}'.format(rf.score(X_test, Y_test)))
print('Random Forest roc_auc: {:.4f}'.format(roc_auc_score(Y_test, y_pred)))
print('Random Forest recall: {:.4f}'.format(recall_score(Y_test, y_pred)))

Random Forest accuracy: 0.7562
Random Forest roc_auc: 0.7563
Random Forest recall: 0.7961


Compared to KNN and SVC, the Random Forest model performs better with respect to all three metrics. It also allows us to compare the relative importance of the features based on how much each feature is used to reduce impurity in the decision trees.

In [46]:
score_df = pd.DataFrame({'feature': X.columns,
                         'importance_score': rf.feature_importances_})
score_df.sort_values('importance_score', ascending = False)

Unnamed: 0,feature,importance_score
13,GenHlth,0.215309
0,HighBP,0.203856
3,BMI,0.127957
18,Age,0.101406
1,HighChol,0.093793
16,DiffWalk,0.045405
20,Income,0.039147
15,PhysHlth,0.033127
6,HeartDiseaseorAttack,0.02627
19,Education,0.018822


## Voter Model

We build a voter model using the above three classifiers.

In [47]:
from sklearn.ensemble import VotingClassifier

vm = VotingClassifier([('knn', knn), ('svc', svc), ('rf', rf)])
vm.fit(X_train, Y_train)
y_pred = vm.predict(X_test)

print('Voter Model accuracy: {:.4f}'.format(vm.score(X_test, Y_test)))
print('Voter Model roc_auc: {:.4f}'.format(roc_auc_score(Y_test, y_pred)))
print('Voter Model recall: {:.4f}'.format(recall_score(Y_test, y_pred)))

Voter Model accuracy: 0.7527
Voter Model roc_auc: 0.7528
Voter Model recall: 0.7893
