# K-Nearest Neighbor Model for Diabetes Prediction

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

## 1. Data Acquisition

In [6]:
df = pd.read_csv("./TAIPEI_diabetes.csv",sep=",",index_col="PatientID")
df.shape

(15000, 9)

## 2. Data Preparation

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Split data into training and testing sets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
# Display training data shape
df_train.shape

(12000, 9)

In [18]:
# Define feature columns for clarity
feature_columns = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
                   'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']

In [19]:
# Extract features and target for training data
X_train = df_train.loc[:, feature_columns].values
y_train = df_train.Diabetic.values

In [20]:
# Extract features and target for test data
X_test = df_test.loc[:, feature_columns].values
y_test = df_test.Diabetic.values

## 3. KNN Model Implementation

In [21]:
# Import the KNeighborsRegressor model from scikit-learn  
from sklearn.neighbors import KNeighborsRegressor  

# Initialize the K-Nearest Neighbors regressor with 3 neighbors  
knn_regressor = KNeighborsRegressor(n_neighbors=3)  

# Train the model using the training data (features: X_train, target: y_train)  
knn_regressor.fit(X=X_train, y=y_train)  

# Make predictions on the test dataset  
y_test_predicted = knn_regressor.predict(X_test)  

# Convert the predicted values to integers (useful if the target values are discrete)  
y_test_predicted = y_test_predicted.astype(int)  


In [22]:
# Calculate basic accuracy
accuracy = (y_test_predicted == y_test).sum()/len(y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7986666666666666


## 4. Model Evaluation

### 4.1 Confusion Matrix

In [23]:
from sklearn.metrics import confusion_matrix

In [24]:
# Create confusion matrix
cf = pd.DataFrame(
    columns=["y_test_0", "y_test_1"], index=["y_pred_0", "y_pred_1"]
)

cf.loc[:, :] = confusion_matrix(y_true=y_test, y_pred=y_test_predicted)
cf

Unnamed: 0,y_test_0,y_test_1
y_pred_0,1946,68
y_pred_1,536,450


### 4.2 Classification Metrics

In [25]:
from sklearn.metrics import recall_score, precision_score, classification_report

In [26]:
# Calculate recall and precision
recall = recall_score(y_true=y_test, y_pred=y_test_predicted)
precision = precision_score(y_true=y_test, y_pred=y_test_predicted)

print(f"Recall: {recall},\nPrecision: {precision}")

Recall: 0.4563894523326572,
Precision: 0.8687258687258688


In [27]:
# Generate classification report
report = classification_report(y_true=y_test, y_pred=y_test_predicted)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.97      0.87      2014
           1       0.87      0.46      0.60       986

    accuracy                           0.80      3000
   macro avg       0.83      0.71      0.73      3000
weighted avg       0.81      0.80      0.78      3000



## 7. Hyperparameter Tuning

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

param_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Setup grid search for KNN Regressor
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Show best parameters and best cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")


Best Parameters: {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'distance'}
Best Cross-Validation Score: -0.09342309875783758


## 8. Final Model and Evaluation

In [31]:
# Create final model with best parameters
final_model = KNeighborsRegressor(
    n_neighbors=grid_search.best_params_['n_neighbors'],
    weights=grid_search.best_params_['weights'],
    metric=grid_search.best_params_['metric']
)

# Fit the final model on the training data
final_model.fit(X_train, y_train)

# Evaluate on test set
y_pred_final = final_model.predict(X_test)
y_pred_final = y_pred_final.astype(int)
final_accuracy = (y_pred_final == y_test).sum() / len(y_test)
final_report = classification_report(y_test, y_pred_final)

print(f"Final Model Accuracy: {final_accuracy}")
print(f"\nFinal Classification Report:\n{final_report}")

Final Model Accuracy: 0.738

Final Classification Report:
              precision    recall  f1-score   support

           0       0.72      1.00      0.84      2014
           1       0.98      0.21      0.34       986

    accuracy                           0.74      3000
   macro avg       0.85      0.60      0.59      3000
weighted avg       0.81      0.74      0.67      3000



In [32]:
# Saving the model
with open('knn_model.pkl', 'wb') as file:
    pickle.dump(final_model, file)