# KNN (Lazy Learner)

# Import Libraries

In [21]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, roc_curve

# Load the dataset

In [2]:
# Read the diabetes CSV from the working directory and preview the first five rows.
Diabetes_db = pd.read_csv(filepath_or_buffer='diabetes.csv')
Diabetes_db.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Check the dataframe

In [3]:
# Display the (rows, columns) dimensions of the loaded frame.
Diabetes_db.shape

(768, 9)

In [4]:
# Count missing (NaN) entries in each column.
Diabetes_db.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
# Print the column names, non-null counts and dtypes of the frame.
Diabetes_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
# Report, for each listed column, how many rows hold a literal 0.
for zero_check_column in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                          'BMI', 'DiabetesPedigreeFunction', 'Age'):
    zero_row_count = int((Diabetes_db[zero_check_column] == 0).sum())
    print(zero_check_column + ' :', zero_row_count)

Glucose : 5
BloodPressure : 35
SkinThickness : 227
Insulin : 374
BMI : 11
DiabetesPedigreeFunction : 0
Age : 0


In [7]:
# Columns whose 0 entries are replaced by the zero-to-mean loop that follows.
columns_replace = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
]

In [8]:
# Replace every 0 in the selected columns with that column's mean.
# NOTE(review): the mean is taken over the whole frame, i.e. before the
# train/test split performed later in this notebook -- confirm this is intended.
for column in columns_replace:
    # Mark 0 readings as missing so they are excluded from the mean below.
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    Diabetes_db[column] = Diabetes_db[column].replace(0, np.nan)
    # int() truncates the mean to a whole number before it is used as the fill
    # value. NOTE(review): for a float column this drops the fractional part
    # (a mean below 1 becomes 0); kept as-is to preserve the recorded results.
    mean = int(Diabetes_db[column].mean(skipna=True))
    Diabetes_db[column] = Diabetes_db[column].fillna(mean)

In [9]:
# Re-check, for each listed column, how many rows still hold a literal 0.
for zero_check_column in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                          'BMI', 'DiabetesPedigreeFunction', 'Age'):
    zero_row_count = int((Diabetes_db[zero_check_column] == 0).sum())
    print(zero_check_column + ' :', zero_row_count)

Glucose : 0
BloodPressure : 0
SkinThickness : 0
Insulin : 0
BMI : 0
DiabetesPedigreeFunction : 0
Age : 0


# Split the dataset into train and test

In [10]:
# Features are the first eight columns; the target is the last column.
X = Diabetes_db.iloc[:, :8].to_numpy()
y = Diabetes_db.iloc[:, -1].to_numpy()

In [11]:
# Hold out 20% of the rows as the test split; random_state is fixed at 30.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)

# Fit the Standard Scaler on the training data and transform both splits

In [12]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# KNN Model

Distance metrics:
1. Manhattan distance (Minkowski with p = 1)
2. Euclidean distance (Minkowski with p = 2)
3. Minkowski distance (general form, parameterised by p)

In [13]:
# Build a 47-neighbour classifier using Euclidean distance and fit it on the
# scaled training split; the fitted estimator is the cell's displayed value.
knn_model = KNeighborsClassifier(
    metric='euclidean',
    n_neighbors=47,
    p=2,
)
knn_model.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=47)

In [14]:
# Predict a class label for every row of the test split and print the labels.
y_pred = knn_model.predict(X=X_test)
print(y_pred)

[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0
 1 1 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 1 0 0
 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 1 0]


# Calculate Accuracy, F1 score, Confusion Matrix, Classification Report

In [17]:
# Print the heading, then the classification report for the test predictions.
print("Classification Report :", classification_report(y_test, y_pred), sep="\n")

Classification Report :
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       104
           1       0.74      0.56      0.64        50

    accuracy                           0.79       154
   macro avg       0.77      0.73      0.75       154
weighted avg       0.79      0.79      0.78       154



In [18]:
# Print the heading, then the confusion matrix for the test predictions.
print("Confusion Matrix :", confusion_matrix(y_test, y_pred), sep="\n")

Confusion Matrix :
[[94 10]
 [22 28]]


In [19]:
# Print the heading, then the F1 score for the test predictions.
print("F1-Score :", f1_score(y_test, y_pred), sep="\n")

F1-Score :
0.6363636363636364


In [20]:
# Print the heading, then the accuracy for the test predictions.
print("Accuracy Score :", accuracy_score(y_test, y_pred), sep="\n")

Accuracy Score :
0.7922077922077922


In [26]:
# Disabled ROC experiment, kept for reference.
# NOTE(review): roc_curve expects continuous scores for the positive class
# (e.g. knn_model.predict_proba(X_test)[:, 1]) rather than the hard labels in
# y_pred -- switch the second argument before re-enabling this cell.
# print("ROC Curve :")
# print(roc_curve(y_test, y_pred))