### Diabetes Classification
OSEMN Pipeline

O - Obtaining our data

S - Scrubbing / Cleaning our data

E - Exploring / Visualizing our data will allow us to find patterns and trends

M - Modeling our data will give us our predictive power as a wizard

N - Interpreting our data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#### Importing the Dataset

In [2]:
# Read the training features, the training labels and the test features
# (in that order) from CSV files in the working directory.
dfx, dfy, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('Diabetes_XTrain.csv', 'Diabetes_YTrain.csv', 'Diabetes_XTest.csv')
)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,7,168,88,42,321,38.2,0.787,40
1,8,110,76,0,0,27.8,0.237,58
2,7,147,76,0,0,39.4,0.257,43
3,2,100,66,20,90,32.9,0.867,28
4,4,129,86,20,270,35.1,0.231,23


In [3]:
# Preview the first rows of the training feature frame.
dfx.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,7,168,88,42,321,38.2,0.787,40
1,8,110,76,0,0,27.8,0.237,58
2,7,147,76,0,0,39.4,0.257,43
3,2,100,66,20,90,32.9,0.867,28
4,4,129,86,20,270,35.1,0.231,23


In [4]:
# Preview the first rows of the training labels (single 'Outcome' column).
dfy.head()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,1
4,0


In [5]:
# Preview the first rows of the test feature frame.
df_test.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,180,90,26,90,36.5,0.314,35
1,2,93,64,32,160,38.0,0.674,23
2,2,114,68,22,0,28.7,0.092,25
3,13,76,60,0,0,32.8,0.18,41
4,1,80,74,11,60,30.0,0.527,22


In [6]:
# (rows, columns) of the training feature frame.
dfx.shape

(576, 8)

In [7]:
# Column dtypes, non-null counts and memory footprint of the training features.
dfx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               576 non-null    int64  
 1   Glucose                   576 non-null    int64  
 2   BloodPressure             576 non-null    int64  
 3   SkinThickness             576 non-null    int64  
 4   Insulin                   576 non-null    int64  
 5   BMI                       576 non-null    float64
 6   DiabetesPedigreeFunction  576 non-null    float64
 7   Age                       576 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 36.1 KB


In [8]:
# Count NaN entries per column.
# NOTE(review): this only detects NaN. The describe() output in this notebook
# shows a minimum of 0 for Glucose, BloodPressure, SkinThickness, Insulin and
# BMI; if those zeros stand for missing readings they are not counted here —
# confirm against the data source.
dfx.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64

In [9]:
# Summary statistics, transposed so that each feature is one row.
# NOTE(review): several features (Glucose, BloodPressure, SkinThickness,
# Insulin, BMI) have a minimum of 0 in the output — presumably placeholders for
# missing measurements; confirm before relying on them unmodified.
dfx.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,576.0,3.758681,3.328783,0.0,1.0,3.0,6.0,17.0
Glucose,576.0,119.975694,31.512914,0.0,99.0,116.0,139.0,199.0
BloodPressure,576.0,68.826389,20.285812,0.0,62.0,72.0,80.0,122.0
SkinThickness,576.0,20.364583,15.893856,0.0,0.0,22.0,32.0,63.0
Insulin,576.0,76.166667,109.19355,0.0,0.0,36.0,120.0,744.0
BMI,576.0,31.857292,8.134926,0.0,26.975,32.0,36.725,67.1
DiabetesPedigreeFunction,576.0,0.481519,0.33691,0.078,0.24775,0.3815,0.64125,2.42
Age,576.0,32.954861,11.878137,21.0,24.0,29.0,40.0,81.0


#### Converting data into numpy array

In [10]:
# Pull the underlying numpy arrays out of the frames; the one-column label
# frame is flattened to a 1-D vector.
x = dfx.to_numpy()
x_test = df_test.to_numpy()
y = dfy.to_numpy().ravel()

In [11]:
# Sanity check: training features, training labels and test features shapes.
print(x.shape , y.shape , x_test.shape)

(576, 8) (576,) (192, 8)


#### Scaling the Data - Min-Max Normalization

In [12]:
from sklearn.preprocessing import MinMaxScaler

In [13]:
# Learn the per-feature min/max from the training set only, then apply that
# same mapping to both the training and the test features.
scaler = MinMaxScaler().fit(x)
x_scaled = scaler.transform(x)
x_test_scaled = scaler.transform(x_test)

#### KNN Classifier from Scratch

In [14]:
# Euclidean Distance - shortest distance

def dist(x1,x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinates of the two points (numpy arrays, lists or tuples) with
        matching shapes.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences, reduced
        over the first axis (a scalar for 1-D points).
    """
    # np.asarray lets plain lists/tuples work; np.sum does the reduction in
    # compiled code instead of iterating element-by-element with builtin sum.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2, axis=0))

In [15]:
# KNN Classifier

def knn(X, Y , queryPoint , k = 13):
    """Classify ``queryPoint`` by majority vote among its k nearest training points.

    Parameters
    ----------
    X : array-like, shape (m, n_features)
        Training points, one per row.
    Y : array-like, shape (m,)
        Label of each training point.
    queryPoint : array-like, shape (n_features,)
        Point to classify.
    k : int, default 13
        Number of nearest neighbours that vote. If ``k`` exceeds the number
        of training points, every point votes.

    Returns
    -------
    int
        The most frequent label among the k nearest neighbours. When several
        labels share the highest count, the smallest label wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, ``X`` has no rows, or ``X`` and ``Y``
        have a different number of entries.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y)

    if k < 1:
        raise ValueError("k must be a positive integer")
    if X.shape[0] == 0:
        raise ValueError("X must contain at least one training point")
    if X.shape[0] != Y.shape[0]:
        raise ValueError("X and Y must have the same number of entries")

    # Euclidean distance from the query to every training row in one
    # vectorized pass (replaces the per-row Python loop).
    deltas = X - np.asarray(queryPoint, dtype=float)
    distances = np.sqrt(np.sum(deltas ** 2, axis=1))

    # Order by distance and break distance ties by label, i.e. the same order
    # as sorting (distance, label) tuples. lexsort uses its LAST key as the
    # primary key.
    order = np.lexsort((Y, distances))
    nearest_labels = Y[order[:k]]

    # Mode of the neighbour labels; np.unique returns labels sorted, so
    # argmax picks the smallest label on a count tie.
    labels, counts = np.unique(nearest_labels, return_counts = True)
    return int(labels[counts.argmax()])

#### Making Predictions

In [16]:
# Label every scaled test row with the from-scratch classifier.
# Note: k=19 is used here, while knn() defaults to 13 and the scikit-learn
# cell further down uses n_neighbors=13.
m = x_test_scaled.shape[0]

y_pred_list = np.array([
    knn(x_scaled , y , x_test_scaled[row] , k=19)
    for row in range(m)
])
print(y_pred_list.shape)

(192,)


In [18]:
# Show the predicted label for each test row.
print(y_pred_list)

[1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 1 1 0 0 0 0 0 1 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1
 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 1
 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 0 0]


In [19]:
# Put the from-scratch predictions in a one-column frame and write it out as
# CSV text without the index column.
# NOTE(review): the target file name has no '.csv' extension — confirm this is
# what downstream consumers expect.
output = pd.DataFrame({'Outcome': y_pred_list})
output.to_csv('knn-prediction' , index = False)

#### Using Sklearn

In [21]:
from sklearn.neighbors import KNeighborsClassifier as KNN

In [22]:
# Fit scikit-learn's k-nearest-neighbours classifier (13 neighbours) on the
# scaled training data, then label the scaled test rows.
clf = KNN(n_neighbors = 13).fit(x_scaled , y)

test_predict = clf.predict(x_test_scaled)

In [23]:
# Confirm the container type and length of the scikit-learn predictions,
# then display the predicted labels themselves.
print(type(test_predict) , test_predict.shape)
test_predict

<class 'numpy.ndarray'> (192,)


array([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [24]:
# Wrap the scikit-learn predictions in a one-column frame (this rebinds
# `output`) and preview the first rows.
output = pd.DataFrame({'Outcome': test_predict})
output.head()

Unnamed: 0,Outcome
0,1
1,0
2,0
3,0
4,0


In [25]:
# (rows, columns) of the prediction frame.
output.shape

(192, 1)