In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
diabetes = pd.read_csv("diabetesdata.csv",header = None)
diabetes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
diabetes.columns = ["times_pregnant","glucose_concentration","Diastolic_blood_pressure",
                   "Triceps_skin_fold_thickness","serum_insulin","BMI","Diabetes_pedigree_function","Age","Diabetes"]

In [6]:
diabetes.shape

(768, 9)

In [7]:
diabetes.describe()

Unnamed: 0,times_pregnant,glucose_concentration,Diastolic_blood_pressure,Triceps_skin_fold_thickness,serum_insulin,BMI,Diabetes_pedigree_function,Age,Diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [12]:
# finding missig values
print((diabetes[["glucose_concentration","Diastolic_blood_pressure","Triceps_skin_fold_thickness","serum_insulin","BMI"]] == 0).sum())

glucose_concentration            5
Diastolic_blood_pressure        35
Triceps_skin_fold_thickness    227
serum_insulin                  374
BMI                             11
dtype: int64


In [13]:
# Replacing 0 with Nan
diabetes[["glucose_concentration","Diastolic_blood_pressure","Triceps_skin_fold_thickness","serum_insulin","BMI"]] = diabetes[["glucose_concentration","Diastolic_blood_pressure","Triceps_skin_fold_thickness","serum_insulin","BMI"]].replace(0,np.NaN)

In [14]:
print(diabetes.isnull().sum())

times_pregnant                   0
glucose_concentration            5
Diastolic_blood_pressure        35
Triceps_skin_fold_thickness    227
serum_insulin                  374
BMI                             11
Diabetes_pedigree_function       0
Age                              0
Diabetes                         0
dtype: int64


In [15]:
X = diabetes.iloc[:,:8]
y  = diabetes.iloc[:,8]

In [30]:
# create pipeline for performing multiple transformations
pipeline = Pipeline([
('imputer', SimpleImputer(missing_values=np.nan, strategy="mean")),
('scaler', StandardScaler())])

In [31]:
X_transformed = pipeline.fit_transform(X)
X_transformed

array([[ 0.63994726,  0.86510807, -0.03351824, ...,  0.16629174,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.20616153, -0.52985903, ..., -0.85253118,
        -0.36506078, -0.19067191],
       [ 1.23388019,  2.0158134 , -0.69530596, ..., -1.33283341,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 , -0.0225789 , -0.03351824, ..., -0.91074963,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.14180757, -1.02619983, ..., -0.34311972,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.94314317, -0.19896517, ..., -0.29945588,
        -0.47378505, -0.87137393]])

In [33]:
new_dataset = pd.DataFrame(X_transformed, columns= ["times_pregnant","glucose_concentration","Diastolic_blood_pressure",
                   "Triceps_skin_fold_thickness","serum_insulin","BMI","Diabetes_pedigree_function","Age"])
new_dataset

Unnamed: 0,times_pregnant,glucose_concentration,Diastolic_blood_pressure,Triceps_skin_fold_thickness,serum_insulin,BMI,Diabetes_pedigree_function,Age
0,0.639947,0.865108,-0.033518,6.655021e-01,-3.345079e-16,0.166292,0.468492,1.425995
1,-0.844885,-1.206162,-0.529859,-1.746338e-02,-3.345079e-16,-0.852531,-0.365061,-0.190672
2,1.233880,2.015813,-0.695306,8.087936e-16,-3.345079e-16,-1.332833,0.604397,-0.105584
3,-0.844885,-1.074652,-0.529859,-7.004289e-01,-7.243887e-01,-0.634212,-0.920763,-1.041549
4,-1.141852,0.503458,-2.680669,6.655021e-01,1.465506e-01,1.548980,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.680125,0.297376,2.145261e+00,2.877840e-01,0.064409,-0.908682,2.532136
764,-0.547919,0.010298,-0.198965,-2.451185e-01,-3.345079e-16,0.632039,-0.398282,-0.531023
765,0.342981,-0.022579,-0.033518,-7.004289e-01,-5.125386e-01,-0.910750,-0.685193,-0.275760
766,-0.844885,0.141808,-1.026200,8.087936e-16,-3.345079e-16,-0.343120,-0.371101,1.170732


In [34]:
new_dataset['Diabetes']=y
new_dataset.head()

Unnamed: 0,times_pregnant,glucose_concentration,Diastolic_blood_pressure,Triceps_skin_fold_thickness,serum_insulin,BMI,Diabetes_pedigree_function,Age,Diabetes
0,0.639947,0.865108,-0.033518,0.6655021,-3.345079e-16,0.166292,0.468492,1.425995,1
1,-0.844885,-1.206162,-0.529859,-0.01746338,-3.345079e-16,-0.852531,-0.365061,-0.190672,0
2,1.23388,2.015813,-0.695306,8.087936e-16,-3.345079e-16,-1.332833,0.604397,-0.105584,1
3,-0.844885,-1.074652,-0.529859,-0.7004289,-0.7243887,-0.634212,-0.920763,-1.041549,0
4,-1.141852,0.503458,-2.680669,0.6655021,0.1465506,1.54898,5.484909,-0.020496,1


In [35]:
# perform train test split
X_train,X_test,y_train,y_test= train_test_split(new_dataset,y,test_size=0.2,stratify=y,random_state = 88)

In [36]:
X_train.shape,y_train.shape, X_test.shape,y_test.shape

((614, 9), (614,), (154, 9), (154,))

In [37]:
knn = KNeighborsClassifier()

In [38]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [39]:
knn.score(X_train,y_train)

0.9706840390879479

In [40]:
y_pred = knn.predict(X_test)
y_pred

array([0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
      dtype=int64)

In [41]:
confusion_matrix(y_test, y_pred)

array([[91,  9],
       [ 7, 47]], dtype=int64)

In [42]:
accuracy_score(y_test, y_pred)

0.8961038961038961

In [50]:
# Use cross validation for selecting optimum value of 'k'
parameter = {'n_neighbors':[3,5,7,9,11]}

In [51]:
knn_tune = GridSearchCV(knn, parameter, cv=5)

In [52]:
knn_tune.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [3, 5, 7, 9, 11]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [53]:
knn_tune.best_params_

{'n_neighbors': 9}

In [54]:
knn_tune.best_score_

0.9413967746234839

In [55]:
y_pred_tune = knn_tune.predict(X_test)
y_pred_tune

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
      dtype=int64)

In [56]:
confusion_matrix(y_test, y_pred_tune)

array([[93,  7],
       [ 6, 48]], dtype=int64)

In [57]:
accuracy_score(y_test, y_pred_tune)

0.9155844155844156