# K-Nearest Neighbor Model for Diabetes Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

## 1. Data Acquisition

In [2]:
df = pd.read_csv("../datasets/Processed_data.csv",sep=",",index_col="PatientID")
df.shape

(15298, 18)

## 2. Data Preparation

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Split data into training and testing sets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Display training data shape
df_train.shape

(12238, 18)

In [6]:
# Define feature columns for clarity
feature_columns = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
                   'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']

In [7]:
# Extract features and target for training data
X_train = df_train.loc[:, feature_columns].values
y_train = df_train.Diabetic.values

In [8]:
# Extract features and target for test data
X_test = df_test.loc[:, feature_columns].values
y_test = df_test.Diabetic.values

## 3. KNN Model Implementation

In [9]:
# Import the KNeighborsClassifier model from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# Initialize the K-Nearest Neighbors classifier with 3 neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=3)

# Train the model using the training data (features: X_train, target: y_train)
knn_classifier.fit(X=X_train, y=y_train)

# Make predictions on the test dataset
y_test_predicted = knn_classifier.predict(X_test)

# Convert the predicted values to integers (useful if the target values are discrete)  
y_test_predicted = y_test_predicted.astype(int) 


In [10]:
# Calculate basic accuracy
accuracy = (y_test_predicted == y_test).sum()/len(y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8898692810457516


## 4. Model Evaluation

### 4.1 Confusion Matrix

In [11]:
from sklearn.metrics import confusion_matrix

In [12]:
# Create confusion matrix
cf = pd.DataFrame(
    columns=["y_test_0", "y_test_1"], index=["y_pred_0", "y_pred_1"]
)

cf.loc[:, :] = confusion_matrix(y_true=y_test, y_pred=y_test_predicted)
cf

Unnamed: 0,y_test_0,y_test_1
y_pred_0,1314,243
y_pred_1,94,1409


### 4.2 Classification Metrics

In [13]:
from sklearn.metrics import recall_score, precision_score, classification_report

In [14]:
# Calculate recall and precision
recall = recall_score(y_true=y_test, y_pred=y_test_predicted)
precision = precision_score(y_true=y_test, y_pred=y_test_predicted)

print(f"Recall: {recall},\nPrecision: {precision}")

Recall: 0.9374584165003327,
Precision: 0.8529055690072639


In [15]:
# Generate classification report
report = classification_report(y_true=y_test, y_pred=y_test_predicted)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.84      0.89      1557
           1       0.85      0.94      0.89      1503

    accuracy                           0.89      3060
   macro avg       0.89      0.89      0.89      3060
weighted avg       0.89      0.89      0.89      3060



## 7. Hyperparameter Tuning

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Setup grid search for KNN Classifier
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Show best parameters and best cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")


Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best Cross-Validation Score: 0.9145286612124759


## 8. Final Model and Evaluation

In [17]:
from sklearn.metrics import classification_report

# Create final model with best parameters
final_model = KNeighborsClassifier(
    n_neighbors=grid_search.best_params_['n_neighbors'],
    weights=grid_search.best_params_['weights'],
    metric=grid_search.best_params_['metric']
)

# Fit the final model on the training data
final_model.fit(X_train, y_train)

# Evaluate on test set
y_pred_final = final_model.predict(X_test)
final_accuracy = (y_pred_final == y_test).sum() / len(y_test)
final_report = classification_report(y_test, y_pred_final)

print(f"Final Model Accuracy: {final_accuracy}")
print(f"\nFinal Classification Report:\n{final_report}")


Final Model Accuracy: 0.923202614379085

Final Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.88      0.92      1557
           1       0.89      0.97      0.93      1503

    accuracy                           0.92      3060
   macro avg       0.93      0.92      0.92      3060
weighted avg       0.93      0.92      0.92      3060



In [18]:
# Saving the model
with open('../models/knn_model.pkl', 'wb') as file:
    pickle.dump(final_model, file)