In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import joblib

# Seed NumPy's global RNG so any NumPy-driven randomness repeats between runs
np.random.seed(42)

# Output locations for figures and serialized estimators; created on demand
graphs_dir = 'graphs_knn_diabetes'
models_dir = 'models_knn_diabetes'
for output_dir in (graphs_dir, models_dir):
    os.makedirs(output_dir, exist_ok=True)

# Read the dataset from a CSV file in the working directory
data_path = 'diabetes_binary_5050split.csv'
data = pd.read_csv(data_path)

# Target is the 'Diabetes_binary' column; every other column is a feature
# (adjust the column name if the CSV uses a different one)
y = data['Diabetes_binary']
X = data.drop(columns='Diabetes_binary')

# Hold out 30% of the rows as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply them to
# both partitions so no test-set information leaks into the scaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler next to the models
scaler_path = os.path.join(models_dir, 'scaler.joblib')
joblib.dump(scaler, scaler_path)

# Hyperparameters: the neighbor counts to compare, and a shuffled 5-fold
# splitter with a fixed seed
n_neighbors = [8, 15]
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Train the models with cross-validation.
#
# For every candidate n_neighbors value this:
#   1. fits one KNN per fold and scores it on that fold's validation rows,
#      remembering the fold model with the highest validation accuracy;
#   2. computes the learning curve once (learning_curve runs its own CV over
#      kfold, so repeating it inside the fold loop only repeated identical work);
#   3. appends (n, best_val_accuracy) to history_logs and saves the best fold
#      model as model_knn_{n}_best.joblib.
history_logs = []
global_suffix_count = 1

# Fractions of the training set at which the learning curve is evaluated
learning_curve_fractions = [0.2, 0.5, 0.8, 1.0]

train_accuracies_dict = {n: [] for n in n_neighbors}
val_accuracies_dict = {n: [] for n in n_neighbors}

for n in n_neighbors:
    # Start below any reachable accuracy so the first fold is always recorded
    best_val_accuracy = -np.inf
    best_clf = None

    for train_index, val_index in kfold.split(X_train):
        # X_train is a NumPy array after scaling; y_train is still a pandas
        # Series, hence positional .iloc indexing for the labels
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        clf = KNeighborsClassifier(n_neighbors=n)
        clf.fit(X_train_fold, y_train_fold)

        # Score on the held-out fold and keep the strongest fold model
        fold_val_accuracy = clf.score(X_val_fold, y_val_fold)
        if fold_val_accuracy > best_val_accuracy:
            best_val_accuracy = fold_val_accuracy
            best_clf = clf

    # Learning curve for this n, computed once. The returned train_sizes are
    # absolute sample counts (not the fractions passed in).
    train_sizes, train_scores, val_scores = learning_curve(
        KNeighborsClassifier(n_neighbors=n),
        X_train,
        y_train,
        cv=kfold,
        train_sizes=learning_curve_fractions,
    )
    train_accuracies_dict[n] = np.mean(train_scores, axis=1)
    val_accuracies_dict[n] = np.mean(val_scores, axis=1)

    # Collect history logs
    history_logs.append((n, best_val_accuracy))

    # Save the best fold model (it was fitted on that fold's training rows only)
    model_path = os.path.join(models_dir, f'model_knn_{n}_best.joblib')
    joblib.dump(best_clf, model_path)

    global_suffix_count += 1

# Plot combined learning curves: mean training and mean cross-validation
# accuracy per n_neighbors value, all on one set of axes.
fig, ax = plt.subplots()
for n in n_neighbors:
    ax.plot(train_sizes, train_accuracies_dict[n], label=f'Training accuracy (n_neighbors={n})')
    ax.plot(train_sizes, val_accuracies_dict[n], label=f'Validation accuracy (n_neighbors={n})', linestyle='--')
ax.set_title('Learning Curves for KNN')
# train_sizes holds the values returned by learning_curve, which are absolute
# numbers of training samples rather than fractions, so label the axis as counts.
ax.set_xlabel('Number of training examples')
ax.set_ylabel('Accuracy')
ax.legend(loc='best')
ax.grid()
learning_curve_path = os.path.join(graphs_dir, 'combined_learning_curve_knn.png')
fig.savefig(learning_curve_path)
# Close explicitly so the figure is not kept alive by the pyplot registry
plt.close(fig)

# Evaluate each saved model on the held-out test set and plot its accuracy
# against the number of neighbors.
suffix_count = global_suffix_count
# NOTE: the list name is kept as-is, but the values stored here are test-set
# accuracies (scored on X_test / y_test), not cross-validation accuracies.
best_val_accuracies = []

for n in n_neighbors:
    model_path = os.path.join(models_dir, f'model_knn_{n}_best.joblib')
    clf = joblib.load(model_path)

    # Accuracy of the reloaded model on the test partition
    test_accuracy = clf.score(X_test, y_test)
    best_val_accuracies.append(test_accuracy)

# Plot test accuracy per n_neighbors. The labels say "Test" because that is the
# data the scores were computed on; the output file name is unchanged.
fig, ax = plt.subplots()
ax.plot(n_neighbors, best_val_accuracies, label='Test accuracy', marker='o')
ax.set_title('Test Accuracy for Different Number of Neighbors')
ax.set_xlabel('Number of Neighbors')
ax.set_ylabel('Test Accuracy')
ax.legend(loc='best')
ax.grid()
validation_curve_path = os.path.join(graphs_dir, 'combined_validation_curve_knn.png')
fig.savefig(validation_curve_path)
plt.close(fig)

# Pick the history_logs entry whose second element (the logged accuracy) is
# largest; max() returns the first such entry when several tie.
best_model_info = max(history_logs, key=lambda entry: entry[1])
best_n, best_val_accuracy = best_model_info

# Reload the model that was saved for that n_neighbors value
best_model_path = os.path.join(models_dir, f'model_knn_{best_n}_best.joblib')
best_model = joblib.load(best_model_path)

# Class predictions for the test partition
y_pred = best_model.predict(X_test)

# Raw confusion matrix: rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, y_pred)

# Convert each row to percentages of that row's total, i.e. of the number of
# test samples whose true label is that class
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_normalized = conf_matrix.astype(float) / row_totals * 100

# Draw the percentage matrix as an annotated heatmap and write it to disk
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix_normalized, annot=True, fmt='.2f', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix for Best Model (Percentage)')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
conf_matrix_path = os.path.join(graphs_dir, 'confusion_matrix_best_model_percentage.png')
fig.savefig(conf_matrix_path)
plt.close(fig)

print("All tasks completed successfully. Models and graphs are saved in their respective directories.")


All tasks completed successfully. Models and graphs are saved in their respective directories.
