# In [None]:
# The following (commented-out) code was used to find the best k for our KNN model. Copy and paste it into the Multiple-subjects file to make it work properly.

# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score
# import matplotlib.pyplot as plt
# import numpy as np

# # === Extract Features and Labels ===
# X_train = train_df.drop(columns=['label'])
# y_train = train_df['label']

# X_test = test_df.drop(columns=['label'])
# y_test = test_df['label']

# # === Optional: split a validation set from training (e.g., 20% validation) ===
# from sklearn.model_selection import train_test_split
# X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# # === Scale the features ===
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_part)
# X_val_scaled = scaler.transform(X_val)
# X_test_scaled = scaler.transform(X_test)

# # === KNN evaluation across different k values ===
# x = list(range(1, 6))  # k from 1 to 5
# yValidation = []
# yValidationScaler = []

# for k in x:
#     model = KNeighborsClassifier(n_neighbors=k)
#     model.fit(X_train_part, y_train_part)

#     acc_val = accuracy_score(y_val, model.predict(X_val))
#     yValidation.append(acc_val)

#     model_scaled = KNeighborsClassifier(n_neighbors=k)
#     model_scaled.fit(X_train_scaled, y_train_part)
#     acc_val_scaled = accuracy_score(y_val, model_scaled.predict(X_val_scaled))
#     yValidationScaler.append(acc_val_scaled)

# # === Plotting ===
# plt.figure(figsize=(12, 8), dpi=100)
# plt.plot(x, yValidation, label='Validation')
# plt.plot(x, yValidationScaler, label='Validation (Scaled)')
# plt.xlabel('n-neighbors')
# plt.ylabel('Accuracy')
# plt.title('n-neighbors vs Accuracy')
# plt.legend()
# plt.xticks(x)
# plt.grid(True)
# plt.savefig('KNN-Algorithm.pdf', dpi=300)
# plt.show()

# print('The best score with validation: ', max(yValidation), 'with Neighbors:', x[yValidation.index(max(yValidation))])
# print('The best score with validation (scaled): ', max(yValidationScaler), 'with Neighbors:', x[yValidationScaler.index(max(yValidationScaler))])

# # === Train final KNN model using best k on full train data ===
# best_k = x[yValidationScaler.index(max(yValidationScaler))]
# KNN = KNeighborsClassifier(n_neighbors=best_k)

# # Concatenate full training set (part + val)
# X_train_full = pd.concat([X_train_part, X_val], ignore_index=True)
# y_train_full = pd.concat([y_train_part, y_val], ignore_index=True)

# # Fit with scaling
# X_train_full_scaled = scaler.fit_transform(X_train_full)
# KNN.fit(X_train_full_scaled, y_train_full)

# # === Predict and evaluate on test set ===
# y_pred = KNN.predict(X_test_scaled)
# accuracy = accuracy_score(y_test, y_pred)

# print(f"✅ Final Test Accuracy with K = {best_k}: {accuracy:.4f}")
# metrics(y_test, y_pred)



import os
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# === CONFIGURATION ===
DATA_DIR = "./subjects_csv"      # directory scanned for *.csv files; each file is treated as one subject
K_RANGE = range(1, 11)           # candidate n_neighbors values tried by the elbow search
OUTPUT_FILE = "knn-results.csv"  # per-subject metrics for the selected k are written here
LABEL_COLUMN = "label"           # target column; every other column is used as a feature


def load_subject_frames(data_dir, file_names):
    """Read each CSV in *file_names* from *data_dir* exactly once.

    Returns a list of DataFrames in the same order as *file_names*, so the
    cross-validation loops can reuse them instead of re-reading from disk
    for every fold and every k.
    """
    return [pd.read_csv(os.path.join(data_dir, name)) for name in file_names]


def leave_one_subject_out(frames, held_out):
    """Build the scaled train/test split for one leave-one-subject-out fold.

    Args:
        frames: list of per-subject DataFrames, each with a ``label`` column.
        held_out: index into *frames* of the subject used as the test set.

    Returns:
        ``(X_train_scaled, y_train, X_test_scaled, y_test)``. The scaler is
        fit on the training subjects only and then applied to the held-out
        subject, so no test statistics leak into training.
    """
    test_df = frames[held_out]
    # pd.concat copies its inputs, so the cached frames are never mutated.
    train_df = pd.concat(frames[:held_out] + frames[held_out + 1:], ignore_index=True)

    X_train = train_df.drop(columns=[LABEL_COLUMN])
    y_train = train_df[LABEL_COLUMN]
    X_test = test_df.drop(columns=[LABEL_COLUMN])
    y_test = test_df[LABEL_COLUMN]

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, y_train, X_test_scaled, y_test


def predict_knn(k, X_train, y_train, X_test):
    """Fit a KNN classifier with ``n_neighbors=k`` and return its predictions for *X_test*."""
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    return model.predict(X_test)


# === Load subject files ===
subject_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith(".csv"))

# Leave-one-subject-out needs one subject to test on and at least one to train on.
if len(subject_files) < 2:
    raise ValueError(
        f"Need at least 2 subject CSV files in {DATA_DIR!r} for leave-one-subject-out "
        f"evaluation, found {len(subject_files)}"
    )

subject_frames = load_subject_frames(DATA_DIR, subject_files)
k_values = list(K_RANGE)

# === Store accuracy scores for elbow method ===
print("🔁 Starting Elbow Method Evaluation (Accuracy)...")

# The split and the scaling do not depend on k, so build each fold once and
# try every k on it. Accuracies are kept per k in fold order, which makes the
# averages identical to evaluating one k at a time.
fold_accuracies = {k: [] for k in k_values}
for fold_index in range(len(subject_frames)):
    X_train_scaled, y_train, X_test_scaled, y_test = leave_one_subject_out(
        subject_frames, fold_index
    )
    for k in k_values:
        y_pred = predict_knn(k, X_train_scaled, y_train, X_test_scaled)
        fold_accuracies[k].append(accuracy_score(y_test, y_pred))

elbow_scores = []
for k in k_values:
    avg_acc = np.mean(fold_accuracies[k])
    elbow_scores.append(avg_acc)
    print(f"✅ k = {k}: Avg. Accuracy = {avg_acc:.4f}")

# === Elbow Method Plot (Accuracy) ===
plt.figure(figsize=(10, 6))
plt.plot(k_values, elbow_scores, marker='o')
plt.title("Elbow Method: k vs Avg. Accuracy")
plt.xlabel("k (n_neighbors)")
plt.ylabel("Average Accuracy")
plt.grid(True)
plt.xticks(k_values)
plt.savefig("knn-elbow-plot-accuracy.png", dpi=300)
plt.show()

# === Best k based on max Accuracy ===
# np.argmax returns the first maximum, so ties resolve to the smallest k.
best_k = k_values[np.argmax(elbow_scores)]
print(f"\n🏆 Best k selected based on max accuracy: {best_k}")

# === Final Evaluation Using Best k ===
results = []

print(f"\n🔁 Running Final Evaluation with k = {best_k}")

for fold_index, test_file in enumerate(subject_files):
    X_train_scaled, y_train, X_test_scaled, y_test = leave_one_subject_out(
        subject_frames, fold_index
    )
    y_pred = predict_knn(best_k, X_train_scaled, y_train, X_test_scaled)

    # Weighted averaging accounts for class imbalance within a subject;
    # zero_division=0 scores classes that were never predicted as 0
    # instead of emitting a warning.
    results.append({
        'Subject': test_file,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
    })

# === Save and Print Final Results ===
results_df = pd.DataFrame(results)
results_df.to_csv(OUTPUT_FILE, index=False)
print(results_df.to_string(index=False))

# Report the k that was actually used for the saved results rather than a
# fixed value, so the message can never disagree with the computed best_k.
print(f"Based on the elbow plot (max average accuracy), k = {best_k} was used; results saved to {OUTPUT_FILE}")
