In [2]:
%pip install --user numpy scikit-learn pandas

Note: you may need to restart the kernel to use updated packages.


In [71]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import CategoricalNB
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

In [60]:
def print_model_evaluation(y_test, y_pred):
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))

In [111]:
y_column = 'Has Diabetes'
columns = ['Pregnancy', 'Glucose', 'Blood', 'Skin thickness', 'Insuline', 'BMI', 'Inheritance score', 'Age', y_column]
diabetes_data = pd.read_csv('diabetes.csv', names=columns)
diabetes_data

Unnamed: 0,Pregnancy,Glucose,Blood,Skin thickness,Insuline,BMI,Inheritance score,Age,Has Diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## KNN
Splitting into test and training data

In [76]:
X, y = diabetes_data.iloc[:, :-1], diabetes_data.iloc[:, -1]
X_scaled = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.20, stratify=diabetes_data[[y_column]])
X_scaled

Unnamed: 0,Pregnancy,Glucose,Blood,Skin thickness,Insuline,BMI,Inheritance score,Age
0,0.352941,0.743719,0.590164,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.000000,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...,...
763,0.588235,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.117647,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.294118,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.058824,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


In [108]:
model = KNeighborsClassifier(n_neighbors = 8)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print_model_evaluation(y_test, y_pred)

0.7402597402597403
[[93  7]
 [33 21]]
              precision    recall  f1-score   support

           0       0.74      0.93      0.82       100
           1       0.75      0.39      0.51        54

    accuracy                           0.74       154
   macro avg       0.74      0.66      0.67       154
weighted avg       0.74      0.74      0.71       154



In [102]:
def transform_single(x):
    if x <= 0.25:
        return 0
    elif x <= 0.50:
        return 1
    elif x <= 0.75:
        return 2
    else:
        return 3
X_train_transformed = X_train.transform([transform_single])
X_test_transformed = X_test.transform([transform_single])
X_test_transformed

Unnamed: 0_level_0,Pregnancy,Glucose,Blood,Skin thickness,Insuline,BMI,Inheritance score,Age
Unnamed: 0_level_1,transform_single,transform_single,transform_single,transform_single,transform_single,transform_single,transform_single,transform_single
562,0,1,2,1,0,2,0,0
54,1,3,2,1,1,2,1,1
155,1,3,2,1,0,2,0,0
210,0,1,1,0,0,1,0,0
473,1,2,2,0,0,1,0,1
...,...,...,...,...,...,...,...,...
599,0,2,1,0,0,1,0,0
103,0,1,2,0,0,1,0,0
688,0,2,2,1,0,1,1,0
253,0,1,2,1,0,2,0,0


In [109]:
clf = CategoricalNB(min_categories=4)
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)
print_model_evaluation(y_test, y_pred)

0.7077922077922078
[[79 21]
 [24 30]]
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       100
           1       0.59      0.56      0.57        54

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.70      0.71      0.71       154





In [110]:
logreg = LogisticRegression(random_state=16)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print_model_evaluation(y_test, y_pred)

0.7597402597402597
[[90 10]
 [27 27]]
              precision    recall  f1-score   support

           0       0.77      0.90      0.83       100
           1       0.73      0.50      0.59        54

    accuracy                           0.76       154
   macro avg       0.75      0.70      0.71       154
weighted avg       0.76      0.76      0.75       154

