## KNN Algorithm

In [20]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [3]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [4]:
df.describe()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,121.117188,69.076823,20.536458,79.799479,31.992578,0.471876,33.24349,0.348958
std,31.805091,19.367794,15.952218,115.244002,7.88416,0.331329,11.758182,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,142.0,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Feature Scaling

In [None]:
fit >> train
transform >> train n test
fit_transform >> train n test

In [5]:
x_df = df.drop('Outcome', axis=1)


In [6]:
x_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33


In [13]:
normal_scaler = MinMaxScaler()
arr = normal_scaler.fit_transform(x_df)
x_norm_df = pd.DataFrame(arr, columns=x_df.columns)
x_norm_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.743719,0.409836,0.353535,0.0,0.500745,0.234415,0.483333
1,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667
2,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.516667
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.0
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.2


In [14]:
std_scaler = StandardScaler()
arr = std_scaler.fit_transform(x_df)
x_std_df = pd.DataFrame(arr, columns=x_df.columns)
x_std_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.845787,-0.985618,0.90727,-0.692891,0.204013,0.468492,1.426022
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.190927
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,1.596227
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.041953
4,0.908711,-1.502276,0.90727,0.765836,1.409746,5.484909,-0.020722


## Train_test_split

In [10]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 11)

In [15]:
# Normalization
x = x_norm_df.copy()
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 11)

In [16]:
# Standardization
x = x_std_df.copy()
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 11)

## Model Training

In [17]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

## Evaluation

In [18]:
# Testing Data Evaluation

y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('\nConfusion Matrix is: \n', cnf_matrix)
print()
acc = accuracy_score(y_test,y_pred)
print('Accuracy is: ', acc)
print()
clf_report = classification_report(y_test,y_pred)
print('\nClassification Report is: \n',clf_report)


Confusion Matrix is: 
 [[85 15]
 [28 26]]

Accuracy is:  0.7207792207792207


Classification Report is: 
               precision    recall  f1-score   support

           0       0.75      0.85      0.80       100
           1       0.63      0.48      0.55        54

    accuracy                           0.72       154
   macro avg       0.69      0.67      0.67       154
weighted avg       0.71      0.72      0.71       154



In [19]:
# Training Data Evaluation

y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print('\nConfusion Matrix is: \n', cnf_matrix)
print()
acc = accuracy_score(y_train,y_pred_train)
print('Accuracy is: ', acc)
print()
clf_report = classification_report(y_train,y_pred_train)
print('\nClassification Report is: \n',clf_report)


Confusion Matrix is: 
 [[355  45]
 [ 66 148]]

Accuracy is:  0.8192182410423453


Classification Report is: 
               precision    recall  f1-score   support

           0       0.84      0.89      0.86       400
           1       0.77      0.69      0.73       214

    accuracy                           0.82       614
   macro avg       0.81      0.79      0.80       614
weighted avg       0.82      0.82      0.82       614



## Hyper Parameter Tunning

In [21]:
knn_clf = KNeighborsClassifier()

hyp = {'n_neighbors':np.arange(4,35),
       'p' : [1,2]}
gscv_knn = GridSearchCV(knn_clf, hyp, cv=5)
gscv_knn.fit(x_train,y_train)
gscv_knn.best_estimator_

KNeighborsClassifier(n_neighbors=19)

In [22]:
# Testing Data Evaluation
knn_clf = gscv_knn.best_estimator_
y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print('\nConfusion Matrix is: \n', cnf_matrix)
print()
acc = accuracy_score(y_test,y_pred)
print('Accuracy is: ', acc)
print()
clf_report = classification_report(y_test,y_pred)
print('\nClassification Report is: \n',clf_report)


Confusion Matrix is: 
 [[86 14]
 [29 25]]

Accuracy is:  0.7207792207792207


Classification Report is: 
               precision    recall  f1-score   support

           0       0.75      0.86      0.80       100
           1       0.64      0.46      0.54        54

    accuracy                           0.72       154
   macro avg       0.69      0.66      0.67       154
weighted avg       0.71      0.72      0.71       154



In [23]:
# Training Data Evaluation

y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print('\nConfusion Matrix is: \n', cnf_matrix)
print()
acc = accuracy_score(y_train,y_pred_train)
print('Accuracy is: ', acc)
print()
clf_report = classification_report(y_train,y_pred_train)
print('\nClassification Report is: \n',clf_report)


Confusion Matrix is: 
 [[365  35]
 [ 93 121]]

Accuracy is:  0.7915309446254072


Classification Report is: 
               precision    recall  f1-score   support

           0       0.80      0.91      0.85       400
           1       0.78      0.57      0.65       214

    accuracy                           0.79       614
   macro avg       0.79      0.74      0.75       614
weighted avg       0.79      0.79      0.78       614

