### KNN - Objective: Predict whether a person will be diagnosed with diabetes or not

In [2]:
import math

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
# StandardScaler standardises each feature to zero mean and unit variance, so that
# features with very large values do not dominate the distance calculation and skew the result.
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [3]:
# Read the diabetes records from the CSV file (path is relative to the working directory).
dataset = pd.read_csv(filepath_or_buffer='KNN_Dataset.csv')

In [4]:
# Preview the first five rows to check the columns and typical values.
dataset.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Number of rows (records) in the dataset.
dataset.shape[0]

768

In [6]:
# Columns in which a recorded 0 is treated as a missing measurement
# and replaced during the zero-replacement step.
zero_not_accepted = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'BMI',
    'Insulin',
]

In [8]:
# Impute zero readings: in each listed column, treat 0 as missing and fill it
# with the mean of that column's remaining (non-zero) values.
# NOTE(review): int() truncates the mean, dropping the fractional part for
# float-valued columns such as BMI. Kept as-is so the recorded results stay
# reproducible; confirm whether the truncation is intended.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split, so test rows influence the imputed values.
for column in zero_not_accepted:
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    dataset[column] = dataset[column].replace(0, np.nan)  # mark zeros as missing
    mean = int(dataset[column].mean(skipna=True))  # mean of the non-missing values
    dataset[column] = dataset[column].fillna(mean)  # fill the gaps with that mean

In [9]:
# Inspect the Glucose column after the zero-replacement step.
dataset.loc[:, 'Glucose']

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
5      116.0
6       78.0
7      115.0
8      197.0
9      125.0
10     110.0
11     168.0
12     139.0
13     189.0
14     166.0
15     100.0
16     118.0
17     107.0
18     103.0
19     115.0
20     126.0
21      99.0
22     196.0
23     119.0
24     143.0
25     125.0
26     147.0
27      97.0
28     145.0
29     117.0
       ...  
738     99.0
739    102.0
740    120.0
741    102.0
742    109.0
743    140.0
744    153.0
745    100.0
746    147.0
747     81.0
748    187.0
749    162.0
750    136.0
751    121.0
752    108.0
753    181.0
754    154.0
755    128.0
756    137.0
757    123.0
758    106.0
759    190.0
760     88.0
761    170.0
762     89.0
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64

In [10]:
# Separate the features from the target, then hold out 20% of the rows for testing.
# The first eight columns are the predictors; the ninth column is the label.
X = dataset.iloc[:, :8]
y = dataset.iloc[:, 8]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Rule of thumb:
### For any algorithm that computes distances or assumes normality, scale your features!

In [11]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then reused unchanged on the test rows.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [13]:
# Build the KNN classifier.
# n_neighbors is the 'K' in KNN; the distance metric is Euclidean
# (p=2 is the corresponding power parameter).
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)

In [14]:
# Train the classifier on the training features and labels.
classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=11, p=2,
           weights='uniform')

In [15]:
# Why n_neighbors=11: K has to be an odd number, one step below the square root
# of len(y_test). Total number of labelled rows, for reference:
y.shape[0]

768

In [16]:
# Square root of the test-set size, the guide used for choosing K.
# (`math` is imported with the other imports at the top of the notebook.)
math.sqrt(len(y_test))

12.409673645990857

In [18]:
# Classify every row of the test set and display the predicted labels.
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [19]:
# Evaluate the model: confusion matrix with true labels as rows and
# predicted labels as columns.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [20]:
# F1 score of the predictions on the test set.
print(f1_score(y_true=y_test, y_pred=y_pred))

0.6956521739130436


In [21]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8181818181818182
