In [17]:
import pandas as pd
import numpy as np

1. Loading dataset

In [24]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


2. Pre-Processing

2.1 Removing null values

In [23]:
# BloodPressure and Glucose and SkinThickness and BMI and Insulin cannot have zero values:
cols = ['BloodPressure', 'Glucose', 'BMI', 'SkinThickness', 'Insulin']
for item in cols:
  df[item] = df[item].replace(0, np.nan)
  mean = df[item].mean(skipna=True)
  df[item] = df[item].replace(np.nan, mean)

2.2 Feature Scaling

In [33]:
from sklearn.preprocessing import StandardScaler

In [34]:
sc = StandardScaler()
df.iloc[:, :8] = sc.fit_transform(df.iloc[:, :8])

 -0.25095213  1.82781311 -0.54791859  1.23388019  0.04601433  1.82781311
  1.82781311 -0.84488505  0.3429808   0.93691372 -1.14185152  0.93691372
 -0.84488505 -0.84488505 -0.25095213  1.23388019  0.93691372  1.53084665
  2.12477957  1.82781311  0.93691372 -0.84488505  2.7187125   0.3429808
  0.3429808  -0.25095213 -0.25095213  0.63994726  1.82781311  0.04601433
  2.12477957  1.53084665 -0.54791859  0.04601433 -0.25095213  0.93691372
  0.93691372  1.53084665  0.93691372 -1.14185152 -0.84488505 -0.54791859
  0.93691372  0.93691372 -0.84488505 -0.84488505  0.3429808   1.23388019
  0.93691372 -0.84488505  0.93691372 -1.14185152 -1.14185152 -1.14185152
 -0.54791859  1.23388019  0.3429808  -0.54791859  0.93691372  0.3429808
 -1.14185152 -0.54791859 -0.84488505  0.04601433 -0.54791859  0.3429808
  2.7187125   0.04601433 -0.84488505 -0.84488505  0.93691372  0.3429808
 -1.14185152 -0.54791859 -0.25095213 -0.54791859  0.93691372 -1.14185152
  0.3429808  -0.54791859  2.7187125  -0.54791859  3.312

3. Training

3.1 Split dataset

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# set features and target:
X = df.iloc[:, :8]
y = df.iloc[:, 8]

In [37]:
# split dataset into train and test set:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

3.2 Create KNN model

In [40]:
from sklearn.neighbors import KNeighborsClassifier

In [46]:
# find best value for k:
import math
k = int(math.sqrt(len(y_test)))
k

12

In [47]:
# create knn model
model = KNeighborsClassifier(n_neighbors=k, p=2, metric='euclidean')

3.3 Fit the model

In [48]:
model.fit(X_train, y_train)

4. Predicting

In [49]:
pred = model.predict(X_test)
pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

5. Evaluating the model

In [50]:
from sklearn.metrics import confusion_matrix

In [51]:
c_matrix = confusion_matrix(y_test, pred)
c_matrix

array([[95, 12],
       [22, 25]], dtype=int64)

In [53]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [54]:
f1_score(y_test, pred)

0.5952380952380952

In [55]:
accuracy_score(y_test, pred)

0.7792207792207793