Using the K-Nearest Neighbors (KNN) algorithm to predict whether or not a person will have diabetes

In [1]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
# Load the diabetes dataset from the CSV file in the working directory.
data = pd.read_csv('diabetes.csv')

# Report the row count, then preview the first five records.
print(data.shape[0])
print(data.head(5))

768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [3]:
# Summary statistics for every column of the raw data.
# Note the 0.0 minimums for Glucose, BloodPressure, SkinThickness, Insulin and BMI
# in the output: the next cell treats those zeros as missing values.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
# Replacing zero values: in the columns listed below a 0 is treated as a missing
# reading and is replaced by that column's mean (computed without the zeros).
zero_not = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for column in zero_not:
    # Mark zeros as missing. `np.nan` is the canonical spelling; the `np.NaN`
    # alias was removed in NumPy 2.0 and raises AttributeError there.
    data[column] = data[column].replace(0, np.nan)

    # Mean of the remaining (non-missing) values, truncated to an integer exactly
    # as before so downstream results are unchanged.
    # NOTE(review): truncation discards the fractional part even for float columns
    # such as BMI, and the mean is taken over the full dataset before the
    # train/test split below, so test rows influence the imputed value.
    mean = int(data[column].mean(skipna=True))

    # Fill the missing entries with that mean.
    data[column] = data[column].fillna(mean)

In [5]:
# Split dataset: the first eight columns are the features, the ninth is the label.
X = data.iloc[:, :8]
Y = data.iloc[:, 8]

# Hold out 20% of the rows for testing; random_state is fixed at 0.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Feature scaling: fit the scaler on the training rows only, then apply that
# same fitted scaler to both the training and the test features.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [7]:
# Square root of the number of training rows.
# NOTE(review): presumably a rule-of-thumb starting point for choosing k — confirm.
import math
math.sqrt(len(Y_train))

24.779023386727733

In [8]:
# Square root of the number of test rows (about 12.4 in the output).
# NOTE(review): `math` is already imported by the previous cell, so this repeat is
# redundant. n_neighbors=11 in the model cell looks like the nearest odd number
# below this value — confirm that the test-set size is the intended basis for k.
import math
math.sqrt(len(Y_test))

12.409673645990857

Define the model using KNeighborsClassifier and fit it to the training data

In [9]:
# Define the model: a K-NN classifier that votes over the 11 nearest neighbours
# using Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)

In [10]:
# Fit the model on the standardized training features (X_train) and their labels (Y_train).
classifier.fit(X_train,Y_train)

In [11]:
# Predict the test set results: one class label per row of X_test.
y_pred=classifier.predict(X_test)
# Bare last expression so the notebook displays the predicted labels.
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [12]:
# Evaluate the model: tabulate the true test labels against the predicted labels.
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [13]:
# F1 score of the test-set predictions against the true labels (f1_score defaults).
print(f1_score(Y_test,y_pred))

0.6956521739130436


In [14]:
# Accuracy of the test-set predictions against the true labels.
print(accuracy_score(Y_test,y_pred))

0.8181818181818182
