In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time

# Euclidean Distance Function
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# k-NN Classifier
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X.to_numpy()
        self.y_train = y.to_numpy()

    def predict(self, X):
        predictions = [self._predict(x) for x in X.to_numpy()]
        return np.array(predictions)

    def _predict(self, x):
        # Compute distances to all training samples
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Get indices of k nearest neighbors
        k_indices = np.argsort(distances)[:self.k]
        # Get the labels of k nearest neighbors
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Majority vote
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

# Cross-validation to find optimal k
def cross_validate_k(X, y, k_values):
    kf = KFold(n_splits=5, shuffle=True, random_state=44)
    avg_accuracies = []

    for k in k_values:
        knn = KNN(k=k)
        accuracies = []

        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_val)
            accuracies.append(accuracy_score(y_val, y_pred))

        avg_accuracies.append(f"{np.mean(accuracies):.3f}")

    best_k = k_values[np.argmax(avg_accuracies)]
    return best_k, avg_accuracies

# Evaluation Function
def evaluate_model(knn, X_train, X_test, y_train, y_test):
    start_time = time.time()
    y_test_pred = knn.predict(X_test)
    runtime = time.time() - start_time

    metrics = {
        "Train Accuracy": accuracy_score(y_train, knn.predict(X_train)),
        "Test Accuracy": accuracy_score(y_test, y_test_pred),
        "Precision": precision_score(y_test, y_test_pred, average='weighted', zero_division=0),
        "Recall": recall_score(y_test, y_test_pred, average='weighted', zero_division=0),
        "F1 Score": f1_score(y_test, y_test_pred, average='weighted', zero_division=0),
        "AUROC": roc_auc_score(y_test, y_test_pred, average='weighted'),
        "Runtime (s)": runtime
    }
    return metrics

# Main
if __name__ == "__main__":
    # Load dataset
    X = pd.read_excel("../coffeeDataSynthesized.xlsx", "dataset")
    y = np.where(X["type"] == "robusta", 0, 1)
    y = pd.Series(y)

    X = X[['width', 'height', 'depth', 'weight']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

    # Determine the best k
    k_values = list(range(1, 21))
    best_k, accuracies = cross_validate_k(X_train, y_train, k_values)

    print(f"Best k: {best_k}")
    print(f"Cross-Validation Accuracies: {accuracies}")

    # Train final model
    knn = KNN(k=best_k)
    knn.fit(X_train, y_train)

    # Evaluate final model
    results = evaluate_model(knn, X_train, X_test, y_train, y_test)
    print("Performance Metrics:")
    for metric, value in results.items():
        print(f"{metric}: {value:.4f}")


Best k: 7
Cross-Validation Accuracies: ['0.824', '0.824', '0.847', '0.847', '0.849', '0.849', '0.854', '0.851', '0.851', '0.845', '0.839', '0.840', '0.845', '0.846', '0.845', '0.841', '0.847', '0.849', '0.852', '0.853']
Performance Metrics:
Train Accuracy: 0.8791
Test Accuracy: 0.8347
Precision: 0.8347
Recall: 0.8347
F1 Score: 0.8347
AUROC: 0.8347
Runtime (s): 1.1980
