In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import requests
import json
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from collections import Counter
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from collections import Counter
from imblearn.over_sampling import RandomOverSampler 

In [2]:
data = Path('./results/DataProcessingExtractFile-RawData.csv')
df = pd.read_csv(data)
df.head(10)

Unnamed: 0,Diabetes_Status,HighBP,HighChol,CholCheck,BMI_Range,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,Mental_Health_Range,Physical_Health_Range,DiffWalk,Sex,Age,Education,Income
0,1,1.0,1.0,1.0,4,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,4,3,1.0,0.0,9.0,4.0,3.0
1,1,0.0,0.0,0.0,3,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,1,1,0.0,0.0,7.0,6.0,1.0
2,1,1.0,1.0,1.0,3,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,6,6,1.0,0.0,9.0,4.0,8.0
3,1,1.0,0.0,1.0,3,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,3.0,6.0
4,1,1.0,1.0,1.0,2,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,5.0,4.0
5,1,1.0,1.0,1.0,3,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,1.0,10.0,6.0,8.0
6,1,1.0,0.0,1.0,4,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,1,3,0.0,0.0,9.0,6.0,7.0
7,1,1.0,1.0,1.0,3,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,1,1,1.0,0.0,11.0,4.0,4.0
8,2,1.0,1.0,1.0,4,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,5.0,6,6,1.0,0.0,9.0,5.0,1.0
9,1,0.0,0.0,1.0,2,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,1,1,0.0,1.0,8.0,4.0,3.0


In [3]:
# K Nearest Neighbors Model

X = df.copy()
X = df.drop(df.columns[[0]], axis=1)
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI_Range,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,Mental_Health_Range,Physical_Health_Range,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,4,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,4,3,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,3,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,3.0,1,1,0.0,0.0,7.0,6.0,1.0
2,1.0,1.0,1.0,3,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,5.0,6,6,1.0,0.0,9.0,4.0,8.0
3,1.0,0.0,1.0,3,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,3.0,6.0
4,1.0,1.0,1.0,2,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,11.0,5.0,4.0


In [4]:
y = df["Diabetes_Status"].ravel()
y[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 2, 1])

In [5]:
# MLP networks are sensitive to unscaled data
# this data is all categorical, scaled converges faster, but gives worse results
#sc = StandardScaler()
#scaler = sc.fit(X)

In [6]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    random_state=78,
                                                    stratify=y)
#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test)
#X_train[:3]
X_train.head(3)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI_Range,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,Mental_Health_Range,Physical_Health_Range,DiffWalk,Sex,Age,Education,Income
173928,1.0,0.0,1.0,4,0.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,3.0,1,1,0.0,0.0,11.0,3.0,7.0
239730,0.0,1.0,1.0,3,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,5.0,2,1,1.0,0.0,7.0,3.0,4.0
81199,0.0,1.0,1.0,2,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,2.0,1,1,0.0,0.0,6.0,6.0,8.0


In [7]:
print('y_train dataset shape {}'.format(Counter(y_train)))

y_train dataset shape Counter({1: 160277, 2: 29983})


In [8]:
# make a balanced dataset before training
ros = RandomOverSampler(random_state=78)
X_res, y_res = ros.fit_resample(X_train, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res)))

Resampled dataset shape Counter({1: 160277, 2: 160277})


In [9]:
# uncomment to revert oversampling as a test
#X_res=X_train
#y_res=y_train

In [10]:
# Create a Neural Net classifier.
mlp_model = MLPClassifier(random_state=78, verbose=True, max_iter=300,
                          hidden_layer_sizes=(64,8), 
                          solver="adam", activation="relu")

In [11]:
# Fitting the model
mlp_model = mlp_model.fit(X_res, y_res)

Iteration 1, loss = 0.53321674
Iteration 2, loss = 0.51993002
Iteration 3, loss = 0.51741493
Iteration 4, loss = 0.51625158
Iteration 5, loss = 0.51476033
Iteration 6, loss = 0.51394716
Iteration 7, loss = 0.51349436
Iteration 8, loss = 0.51305059
Iteration 9, loss = 0.51255999
Iteration 10, loss = 0.51207980
Iteration 11, loss = 0.51156245
Iteration 12, loss = 0.51131691
Iteration 13, loss = 0.51108776
Iteration 14, loss = 0.51065163
Iteration 15, loss = 0.51027765
Iteration 16, loss = 0.50993151
Iteration 17, loss = 0.50970734
Iteration 18, loss = 0.50941197
Iteration 19, loss = 0.50918640
Iteration 20, loss = 0.50865686
Iteration 21, loss = 0.50861662
Iteration 22, loss = 0.50782393
Iteration 23, loss = 0.50775194
Iteration 24, loss = 0.50754639
Iteration 25, loss = 0.50700819
Iteration 26, loss = 0.50693921
Iteration 27, loss = 0.50685300
Iteration 28, loss = 0.50631461
Iteration 29, loss = 0.50629142
Iteration 30, loss = 0.50598226
Iteration 31, loss = 0.50588222
Iteration 32, los

In [12]:
# evaluate the training accuracy first, based on the data before oversampling
pred_train=mlp_model.predict(X_train)
cm_train = confusion_matrix(y_train, pred_train)
cmt_df = pd.DataFrame(
    cm_train, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cmt_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,116621,43656
Actual 1,6589,23394


In [13]:
# Calculate the accuracy score on trained set.
acc_score = accuracy_score(y_train, pred_train)
acc_score

0.7359140124040786

In [14]:
# Making predictions using the testing data.
predictions = mlp_model.predict(X_test)
predictions

array([1, 2, 1, ..., 2, 2, 1])

In [15]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,38613,14813
Actual 1,2429,7565


In [16]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.7281299274676758

In [17]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,38613,14813
Actual 1,2429,7565


Accuracy Score : 0.7281299274676758
Classification Report
              precision    recall  f1-score   support

           1       0.94      0.72      0.82     53426
           2       0.34      0.76      0.47      9994

    accuracy                           0.73     63420
   macro avg       0.64      0.74      0.64     63420
weighted avg       0.85      0.73      0.76     63420



In [18]:
pickle.dump(mlp_model, open('mlp_model.pkl','wb'))