<a href="https://colab.research.google.com/github/flaviorv/ai_model_lifecycle/blob/main/diabetes_health_indicators.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing the UCI dataset client and a library to generate synthetic data

In [2]:
!pip install ucimlrepo
!pip install imbalanced-learn

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


### Imports

In [11]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from ucimlrepo import fetch_ucirepo
import pandas as pd

### Get data

In [2]:
# Show full column text so the long variable descriptions are not truncated
pd.set_option('display.max_colwidth', None)

# Fetch the Diabetes Health Indicators dataset (UCI repository id 891)
dhi = fetch_ucirepo(id=891)

print('Variable Information:')
display(dhi.variables)

Variable Information:


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Integer,,Patient ID,,no
1,Diabetes_binary,Target,Binary,,0 = no diabetes 1 = prediabetes or diabetes,,no
2,HighBP,Feature,Binary,,0 = no high BP 1 = high BP,,no
3,HighChol,Feature,Binary,,0 = no high cholesterol 1 = high cholesterol,,no
4,CholCheck,Feature,Binary,,0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years,,no
5,BMI,Feature,Integer,,Body Mass Index,,no
6,Smoker,Feature,Binary,,Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes,,no
7,Stroke,Feature,Binary,,(Ever told) you had a stroke. 0 = no 1 = yes,,no
8,HeartDiseaseorAttack,Feature,Binary,,coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes,,no
9,PhysActivity,Feature,Binary,,physical activity in past 30 days - not including job 0 = no 1 = yes,,no


### Feature selection and separation of features and target

In [3]:
# Columns left out of the model by manual feature selection
dropped_columns = ['Education', 'Income', 'NoDocbcCost', 'AnyHealthcare']

# Features (minus the dropped columns) and target come from the fetched dataset
x = dhi.data.features.drop(columns=dropped_columns)
y = dhi.data.targets

# Keep the column names around for reporting
target_name = y.columns[0]
feature_names = x.columns.tolist()

print('Target name:', target_name)
print('Feature names:', feature_names)
print('Target count:\n', y.value_counts())

Target name: Diabetes_binary
Feature names: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age']
Target count:
 Diabetes_binary
0                  218334
1                   35346
Name: count, dtype: int64


### Train and test split

In [4]:
# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

# Showing stratification
for split_name, split_target in (('train', ytrain), ('test', ytest)):
    print(f'Diabetic target proportion on the {split_name}:',
          split_target.value_counts(normalize=True))

Diabetic target proportion on the train: Diabetes_binary
0                  0.860666
1                  0.139334
Name: proportion, dtype: float64
Diabetic target proportion on the test: Diabetes_binary
0                  0.860671
1                  0.139329
Name: proportion, dtype: float64


### Normalization

In [5]:
# Min-max scale the non-binary columns. The scaler is fitted on the training
# split only and then applied to the test split.
# Select the features to scale
numerics = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age']
scaler = MinMaxScaler()
# Values before scaling, for comparison with the print at the end of the cell
print(xtrain[numerics])
# Scaling only the numerics
# NOTE(review): the two assignments below bind new names to the SAME DataFrame
# objects (no copy is made), so the column assignments that follow also
# overwrite xtrain / xtest in place. Later cells that read xtrain / xtest
# therefore see the scaled values, and re-running this cell re-scales
# already-scaled data. Switching to .copy() here would require every later
# cell that expects scaled data to read x_train_scaled / x_test_scaled instead.
x_train_scaled = xtrain
x_test_scaled = xtest
x_train_scaled[numerics] = scaler.fit_transform(xtrain[numerics])
# transform only: the test split reuses the min/max learned from the train split
x_test_scaled[numerics] = scaler.transform(xtest[numerics])
print(x_train_scaled[numerics])

        BMI  GenHlth  MentHlth  PhysHlth  Age
153147   28        2         0         0    2
176137   23        2         0         0   13
175578   29        1         0         0    9
177887   39        4         0         0    7
182143   16        5        30        30    7
...     ...      ...       ...       ...  ...
208398   25        4         0         0   10
106221   25        2         0         1    9
169035   18        4        30        30    7
49036    26        3         0         0    8
206060   31        4         0         5    8

[202944 rows x 5 columns]
             BMI  GenHlth  MentHlth  PhysHlth       Age
153147  0.186047     0.25       0.0  0.000000  0.083333
176137  0.127907     0.25       0.0  0.000000  1.000000
175578  0.197674     0.00       0.0  0.000000  0.666667
177887  0.313953     0.75       0.0  0.000000  0.500000
182143  0.046512     1.00       1.0  1.000000  0.500000
...          ...      ...       ...       ...       ...
208398  0.151163     0.75    

### First training and evaluation - KNN

In [6]:
# Fit one KNN classifier per neighbourhood size on the scaled training data
# and report precision/recall/F1 on the scaled test data.
n_neighbors = [2, 5, 80]
for n in n_neighbors:
    # fit() returns the estimator itself, so knn is the fitted classifier
    knn = KNeighborsClassifier(n_neighbors=n).fit(
        x_train_scaled, ytrain['Diabetes_binary']
    )
    ypredicted = knn.predict(x_test_scaled)
    print(f'Neighbors: {n}', classification_report(ytest, ypredicted), sep='\n')

Neighbors: 2
              precision    recall  f1-score   support

           0       0.87      0.97      0.92     43667
           1       0.38      0.11      0.17      7069

    accuracy                           0.85     50736
   macro avg       0.62      0.54      0.54     50736
weighted avg       0.80      0.85      0.81     50736

Neighbors: 5
              precision    recall  f1-score   support

           0       0.88      0.95      0.91     43667
           1       0.39      0.21      0.27      7069

    accuracy                           0.84     50736
   macro avg       0.64      0.58      0.59     50736
weighted avg       0.81      0.84      0.82     50736

Neighbors: 80
              precision    recall  f1-score   support

           0       0.87      0.99      0.93     43667
           1       0.55      0.07      0.13      7069

    accuracy                           0.86     50736
   macro avg       0.71      0.53      0.53     50736
weighted avg       0.82      0.86 

### Balancing the train data

In [7]:
# Oversample the minority class of the scaled training split with SMOTE
# (k_neighbors=2, fixed seed), then show the resulting class counts.
smote = SMOTE(random_state=42, k_neighbors=2)
xbalanced, ybalanced = smote.fit_resample(
    x_train_scaled,
    ytrain,
)
print(ybalanced.value_counts())

Diabetes_binary
0                  174667
1                  174667
Name: count, dtype: int64


### Retraining with the balanced train set

In [8]:
# Retrain KNN on the SMOTE-balanced training set for several neighbourhood
# sizes and evaluate each model on the (real, unbalanced) test split.
n_neighbors = [5, 50, 100, 250]
for n in n_neighbors:
  knn = KNeighborsClassifier(n_neighbors=n)
  knn.fit(xbalanced, ybalanced['Diabetes_binary'])
  # The model was fitted on scaled features, so predict on the scaled test
  # frame, as the first KNN cell does. `xtest` only worked here because the
  # normalization cell scales it in place through an alias.
  ypredicted = knn.predict(x_test_scaled)
  print(f'Neighbors: {n}')
  print(classification_report(ytest, ypredicted))

Neighbors: 5
              precision    recall  f1-score   support

           0       0.91      0.77      0.84     43667
           1       0.28      0.56      0.37      7069

    accuracy                           0.74     50736
   macro avg       0.60      0.66      0.60     50736
weighted avg       0.83      0.74      0.77     50736

Neighbors: 50
              precision    recall  f1-score   support

           0       0.95      0.67      0.79     43667
           1       0.28      0.77      0.41      7069

    accuracy                           0.69     50736
   macro avg       0.61      0.72      0.60     50736
weighted avg       0.85      0.69      0.73     50736

Neighbors: 100
              precision    recall  f1-score   support

           0       0.95      0.67      0.78     43667
           1       0.28      0.79      0.41      7069

    accuracy                           0.68     50736
   macro avg       0.62      0.73      0.60     50736
weighted avg       0.86      0.6

### Cross validation

In [13]:
# Scaling, SMOTE and KNN are wrapped in an imblearn Pipeline so that, inside
# cross_validate, the scaler and SMOTE are fitted on the training folds only:
# the validation fold contains no synthetic rows and does not take part in
# fitting the scaler, which avoids data leakage between folds.
# NOTE(review): the normalization cell scales xtrain in place (x_train_scaled
# is an alias of xtrain), so the frame passed here may already be scaled with
# statistics from the whole training split - confirm.
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('smote', SMOTE(k_neighbors=2, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=250)),
])
score = cross_validate(
    pipeline,
    xtrain,
    ytrain['Diabetes_binary'],
    cv=5,
    scoring=['recall', 'precision', 'accuracy', 'f1'],
    return_train_score=False,
)

# One line per metric, each holding the per-fold validation scores
for label, key in (
    ('Precision', 'test_precision'),
    ('Recall', 'test_recall'),
    ('F1-score', 'test_f1'),
    ('Accuracy', 'test_accuracy'),
):
    print(label, score[key])

Precision [0.28009117 0.27695031 0.27697352 0.27867638 0.28300243]
Recall [0.8040672  0.80229885 0.7915488  0.79809052 0.8040672 ]
F1-score [0.41545982 0.41176204 0.41035747 0.41310515 0.4186539 ]
Accuracy [0.68476681 0.68062776 0.68301757 0.68400306 0.68887356]
