In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
#importing diabetes dataset
diab = pd.read_csv('diabetes.csv')


In [3]:
#profiling data
profile = ProfileReport(diab, explorative=True, dark_mode=True)

#output in HTML
profile.to_file('output.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
#replacing zeros
non_zero = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for coloumn in non_zero:
    diab[coloumn] = diab[coloumn].replace(0,np.NaN)
    mean = int(diab[coloumn].mean(skipna = True))
    diab[coloumn] = diab[coloumn].replace(np.NaN, mean)
    print(diab[coloumn])

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64
0      72.0
1      66.0
2      64.0
3      66.0
4      40.0
       ... 
763    76.0
764    70.0
765    72.0
766    60.0
767    70.0
Name: BloodPressure, Length: 768, dtype: float64
0      35.0
1      29.0
2      29.0
3      23.0
4      35.0
       ... 
763    48.0
764    27.0
765    23.0
766    29.0
767    31.0
Name: SkinThickness, Length: 768, dtype: float64
0      155.0
1      155.0
2      155.0
3       94.0
4      168.0
       ...  
763    180.0
764    155.0
765    112.0
766    155.0
767    155.0
Name: Insulin, Length: 768, dtype: float64
0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: BMI, Length: 768, dtype: float64


In [6]:
#showing head
diab.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
#checking dataframe shape
diab.shape

(768, 9)

In [8]:
#cheching for null values
diab.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
#filling zeros with mean value
non_zero = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for coloumn in non_zero:
    diab[coloumn] = diab[coloumn].replace(0,np.NaN)
    mean = int(diab[coloumn].mean(skipna = True))
    diab[coloumn] = diab[coloumn].replace(np.NaN, mean)
    print(diab[coloumn])

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64
0      72.0
1      66.0
2      64.0
3      66.0
4      40.0
       ... 
763    76.0
764    70.0
765    72.0
766    60.0
767    70.0
Name: BloodPressure, Length: 768, dtype: float64
0      35.0
1      29.0
2      29.0
3      23.0
4      35.0
       ... 
763    48.0
764    27.0
765    23.0
766    29.0
767    31.0
Name: SkinThickness, Length: 768, dtype: float64
0      155.0
1      155.0
2      155.0
3       94.0
4      168.0
       ...  
763    180.0
764    155.0
765    112.0
766    155.0
767    155.0
Name: Insulin, Length: 768, dtype: float64
0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: BMI, Length: 768, dtype: float64


In [24]:
# ML train

In [10]:
#all the column except the last one (features)
X =diab.iloc[:,0:8]

#the outcome column (label)
y =diab.iloc[:,8]

#feature scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

#testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0, stratify=y)

In [11]:
#fitting model
model = KNN(n_neighbors=5).fit(X_train, y_train)

#predictions and model accuracy
y_pred = model.predict(X_test)
print(f'Model accuracy on test set: {accuracy_score(y_test, y_pred):.2f}')

Model accuracy on test set: 0.75


In [12]:
#best k
for k in [1,2,3,5,10]:
    model = KNN(n_neighbors=k)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    validation_accuracy = accuracy_score(y_test, predictions)
    print('Validation accuracy with k {}: {:.2f}'.format(k, validation_accuracy))  

Validation accuracy with k 1: 0.71
Validation accuracy with k 2: 0.72
Validation accuracy with k 3: 0.75
Validation accuracy with k 5: 0.75
Validation accuracy with k 10: 0.78


In [13]:
#training model with best k
model = KNN(n_neighbors=10).fit(X_train, y_train)

#predictions and model accuracy
y_pred = model.predict(X_test)

#accuracy
print(f'Model accuracy on test set: {accuracy_score(y_test, y_pred):.2f}')

Model accuracy on test set: 0.78


In [14]:
#confusion matrix
cm= confusion_matrix(y_test, y_pred)
cm

array([[89, 11],
       [23, 31]])

In [15]:
#precision score
precision = precision_score(y_test, y_pred)

print(f'{precision: .2f}')

 0.74


In [23]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 10))

#diabetes outcome (Kernel Density Estimation)
ax = sns.kdeplot(y, color="r", label="Actual Value") #y = diab['Outcome']

#predictions (Kernel Densiti Estimation), in the same plot (ax=ax)
sns.kdeplot(y_pred, color="b", label="Predicted Values", ax=ax)

#labeling title
plt.title('Actual vs Precited values')

#showing legend
plt.legend()

#showing plot
plt.show()

In [21]:
# CAN WE FIND THE BEST ML ALGORITHM?

In [20]:
from pycaret.classification import *

#defining the features
numeric_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

#setting up the data
clf = setup(data=diab, target='Outcome')

#comparing themodels
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7654,0.8275,0.5705,0.7021,0.6238,0.4577,0.4663,0.24
gbc,Gradient Boosting Classifier,0.7597,0.8364,0.6129,0.672,0.6346,0.4576,0.4636,0.102
et,Extra Trees Classifier,0.7579,0.8112,0.5743,0.6792,0.6147,0.4422,0.4507,0.325
lda,Linear Discriminant Analysis,0.7561,0.8266,0.5541,0.6843,0.6069,0.435,0.443,0.028
ridge,Ridge Classifier,0.7541,0.0,0.5374,0.6856,0.5965,0.4258,0.4354,0.041
rf,Random Forest Classifier,0.7523,0.8371,0.5801,0.6645,0.6126,0.4342,0.4403,0.36
dt,Decision Tree Classifier,0.7448,0.7151,0.6181,0.6444,0.6261,0.4335,0.4375,0.024
ada,Ada Boost Classifier,0.7429,0.8211,0.602,0.6412,0.6151,0.4239,0.4286,0.093
lightgbm,Light Gradient Boosting Machine,0.741,0.8315,0.6281,0.6297,0.6241,0.4278,0.4314,0.07
knn,K Neighbors Classifier,0.728,0.7682,0.5532,0.6283,0.5829,0.3835,0.389,0.084
