## 41134
# Pushkar Jain

# Diabetes prediction system with KNN algorithm

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, f1_score, recall_score, precision_score

### Load

In [None]:
# Load the dataset from 'diabetes.csv' (path is relative to the working directory).
data = pd.read_csv('diabetes.csv')

### Preprocessing

In [None]:
# Remove the three features that are not used by the model; the frame is
# modified in place.
data.drop(
    columns=['Pregnancies', 'BloodPressure', 'SkinThickness'],
    inplace=True,
)

In [None]:
# Print column dtypes and non-null counts for the remaining columns.
data.info()

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().T

In [None]:
# Draw one histogram per column on a 20x16 figure; the return value is kept in
# `hist` so the notebook does not echo it.
hist = data.hist(figsize=(20,16))

In [None]:
# Name of the label column; every other column is treated as a numeric feature.
target_feature = 'Outcome'
# Keep the DataFrame's own column order. A set difference yields an arbitrary
# order, and this list is later used as positional column labels for the scaled
# arrays, so an arbitrary order would attach the wrong name to each column.
num_features = [column for column in data.columns if column != target_feature]

Train test split

In [None]:
# Separate the label from the predictors, then hold out 20% of the rows for
# testing. The split is seeded for reproducibility and stratified on the label
# so both parts keep the same class proportions.
y = data[target_feature]
X = data.drop(columns=target_feature)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

Outliers (zero values in Glucose and BMI, treated as missing)

In [None]:
# Show the rows whose Glucose value is exactly 0; the imputation cell below
# treats zeros in this column as missing values.
data[data['Glucose'] == 0]

In [None]:
# Show the rows whose BMI value is exactly 0; the imputation cell below treats
# zeros in this column as missing values.
data[data['BMI'] == 0]

In [None]:
# Replace zero readings in Glucose and BMI with the column median. The medians
# are learned from the training split only and then reused for the test split.
zero_coded_columns = ['Glucose', 'BMI']
imputer = SimpleImputer(missing_values=0.0, strategy='median')
X_train[zero_coded_columns] = imputer.fit_transform(X_train[zero_coded_columns])
X_test[zero_coded_columns] = imputer.transform(X_test[zero_coded_columns])

Class balance

In [None]:
# Bar chart of the share of each class in the training labels.
class_share = y_train.value_counts(normalize=True)
ax = class_share.plot.bar(color=['tab:blue', 'tab:red'])
ax.grid(axis='y')
ax.set_title(target_feature)
ax.set_xlabel('Class')
ax.set_ylabel('Proportion')

In [None]:
# Bar chart of the share of each class in the training labels.
# NOTE(review): this draws the same chart as the preceding cell; it may have
# been meant for y_test or for a re-balanced training set - confirm.
train_class_share = y_train.value_counts(normalize=True)
train_class_share.plot.bar(color=['tab:blue', 'tab:red'])
plt.grid(axis='y')
plt.title(target_feature)
plt.xlabel('Class')
plt.ylabel('Proportion')

Correlation matrix

In [None]:
# Pairwise correlation between the training features, rendered as a table whose
# cells are shaded with the 'coolwarm' colour map.
X_train[num_features].corr().style.background_gradient(cmap='coolwarm')

Scaling

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same transformation is then applied to the test split.
scaler = StandardScaler()
# The scaler returns plain arrays, so wrap them back into DataFrames. Column
# labels and the row index are taken from the frames being transformed: this
# guarantees each label matches its column position and keeps the rows aligned
# by index with y_train / y_test.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

### Training

In [None]:
def print_metrics(y_true, y_pred):
    """Print F1, recall and precision, each rounded to three decimals.

    The scores are computed with scikit-learn's default settings.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print('Metrics:')
    # Built-in round() accepts both NumPy scalars and plain Python floats; the
    # ``.round()`` method exists only on NumPy scalars, so calling it breaks if
    # the metric function returns a plain float.
    print(f'f1_score = {round(f1_score(y_true=y_true, y_pred=y_pred), 3)}')
    print(f'recall_score = {round(recall_score(y_true=y_true, y_pred=y_pred), 3)}')
    print(f'precision_score = {round(precision_score(y_true=y_true, y_pred=y_pred), 3)}')
    
def print_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of the predictions as an annotated heatmap.

    Args:
        y_true: Ground-truth labels (heatmap rows, labelled "Actual").
        y_pred: Predicted labels (heatmap columns, labelled "Predict").
    """
    matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)
    axes = sns.heatmap(matrix, annot=True, cmap='coolwarm', cbar=False)
    axes.set_title('Confusion matrix')
    axes.set_xlabel('Predict')
    axes.set_ylabel('Actual')
    plt.show()
    
def print_roc_auc(y_true, y_pred_prob):
    """Plot the ROC curve and show the ROC AUC (three decimals) in the title.

    Args:
        y_true: Ground-truth labels.
        y_pred_prob: Scores or probabilities for the positive class, aligned
            with ``y_true``.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
    auc = roc_auc_score(y_true, y_pred_prob)

    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Built-in round() works whether the score is a NumPy scalar or a plain
    # Python float; the ``.round()`` method exists only on NumPy scalars.
    plt.title(f'ROC AUC: {round(auc, 3)}')
    plt.show()
    
def print_params(model):
    """Print the neighbour count and Minkowski power of a KNN-style model.

    Args:
        model: Object exposing ``n_neighbors`` and ``p`` attributes.
    """
    report_lines = (
        'Model parameters:',
        f'K neighbors = {model.n_neighbors}',
        f'Power = {model.p}',
    )
    for line in report_lines:
        print(line)
    
def start_train(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and report its quality on the test split.

    Prints the model parameters and classification metrics, then shows the
    confusion matrix and the ROC curve.

    Args:
        model: Classifier providing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
    """
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score for ROC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    print_params(model=model)
    print_metrics(y_true=y_test, y_pred=predicted_labels)
    print_confusion_matrix(y_true=y_test, y_pred=predicted_labels)
    print_roc_auc(y_true=y_test, y_pred_prob=positive_scores)
    
    

In [None]:
# Train and evaluate a KNN classifier created with its default hyperparameters.
start_train(model=KNeighborsClassifier(), X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)