## As an alternative to k-fold cross-validation, we also performed something which was suggested in the reference paper: 

"To reduce as much as possible the bias introduced by the randomness in selecting the samples of the training and test set, we performed twenty runs. In each run, the dataset was randomly shuffled and split into a training and a test set."

### Cf. Section 2.1 of our report (we proved there is no significant statistical difference between the results we get with this methodology and k-fold)

## KNN (20 iterations)

In [1]:
import pandas as pd
from ucimlrepo import fetch_ucirepo
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Fetch dataset 
darwin = fetch_ucirepo(id=732)

# Data (as pandas dataframes)
X = darwin.data.features
y = darwin.data.targets

X = X.drop(columns=['ID'])  # Remove ID column as it's not needed

# Number of runs
n_runs = 20

# Best parameters for KNN
params = {
    'algorithm': 'auto',
    'leaf_size': 10,
    'metric': 'minkowski',
    'metric_params': None,
    'n_neighbors': 1,
    'p': 1,
    'weights': 'uniform'
}

# Lists to store performance metrics
all_accuracies = []
all_precisions = []
all_recalls = []
all_f1_scores = []
all_sensitivities = []
all_specificities = []

# Run the model multiple times for robustness
for run in range(n_runs):
    # Shuffle and split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=None, stratify=y
    )

    # Ensure y_train and y_test are 1D
    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()

    # Create KNN classifier with the specified parameters
    clf_knn = KNeighborsClassifier(**params)

    # Train the model
    clf_knn.fit(X_train, y_train)

    # Predict on test data
    y_pred = clf_knn.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary', pos_label='P')  # Explicitly define pos_label
    recall = recall_score(y_test, y_pred, average='binary', pos_label='P')        # Explicitly define pos_label
    f1 = f1_score(y_test, y_pred, average='binary', pos_label='P')                # Explicitly define pos_label

    all_accuracies.append(accuracy)
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_f1_scores.append(f1)

    # Calculate confusion matrix components
    cm = confusion_matrix(y_test, y_pred, labels=['H', 'P'])
    # Correctly identify the elements of the confusion matrix
    if cm.shape == (2, 2):  # Ensure it's a 2x2 matrix for binary classification
        tn, fp, fn, tp = cm.ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    else:
        # If it's not a 2x2 matrix, something went wrong
        print(f"Unexpected confusion matrix shape: {cm.shape}")
        sensitivity = specificity = 0

    all_sensitivities.append(sensitivity)
    all_specificities.append(specificity)

# Calculate overall mean and standard deviation for metrics
final_metrics = {
    'mean_accuracy': np.mean(all_accuracies),
    'std_accuracy': np.std(all_accuracies),
    'mean_precision': np.mean(all_precisions),
    'std_precision': np.std(all_precisions),
    'mean_recall': np.mean(all_recalls),
    'std_recall': np.std(all_recalls),
    'mean_f1_score': np.mean(all_f1_scores),
    'std_f1_score': np.std(all_f1_scores),
    'mean_sensitivity': np.mean(all_sensitivities),
    'std_sensitivity': np.std(all_sensitivities),
    'mean_specificity': np.mean(all_specificities),
    'std_specificity': np.std(all_specificities),
}

# Print the final aggregated metrics
print("Final Aggregated Performance Metrics:")
print(f"Mean Accuracy: {final_metrics['mean_accuracy']:.4f} ± {final_metrics['std_accuracy']:.4f}")
print(f"Mean Precision: {final_metrics['mean_precision']:.4f} ± {final_metrics['std_precision']:.4f}")
print(f"Mean Recall: {final_metrics['mean_recall']:.4f} ± {final_metrics['std_recall']:.4f}")
print(f"Mean F1 Score: {final_metrics['mean_f1_score']:.4f} ± {final_metrics['std_f1_score']:.4f}")
print(f"Mean Sensitivity: {final_metrics['mean_sensitivity']:.4f} ± {final_metrics['std_sensitivity']:.4f}")
print(f"Mean Specificity: {final_metrics['mean_specificity']:.4f} ± {final_metrics['std_specificity']:.4f}")


Final Aggregated Performance Metrics:
Mean Accuracy: 0.7443 ± 0.0609
Mean Precision: 0.9388 ± 0.0682
Mean Recall: 0.5417 ± 0.1190
Mean F1 Score: 0.6778 ± 0.1024
Mean Sensitivity: 0.5417 ± 0.1190
Mean Specificity: 0.9588 ± 0.0496


In [2]:
all_accuracies

[0.8285714285714286,
 0.8285714285714286,
 0.6857142857142857,
 0.8285714285714286,
 0.7428571428571429,
 0.6285714285714286,
 0.7142857142857143,
 0.7428571428571429,
 0.6285714285714286,
 0.6857142857142857,
 0.7428571428571429,
 0.7428571428571429,
 0.6857142857142857,
 0.7142857142857143,
 0.8285714285714286,
 0.7714285714285715,
 0.7428571428571429,
 0.8,
 0.7428571428571429,
 0.8]

In [3]:
all_precisions

[np.float64(1.0),
 np.float64(0.9285714285714286),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.9),
 np.float64(0.8),
 np.float64(0.8571428571428571),
 np.float64(0.8888888888888888),
 np.float64(0.8461538461538461),
 np.float64(1.0),
 np.float64(0.8181818181818182),
 np.float64(0.9),
 np.float64(0.9285714285714286),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.9090909090909091),
 np.float64(1.0)]

In [4]:
all_recalls

[np.float64(0.6666666666666666),
 np.float64(0.7222222222222222),
 np.float64(0.3888888888888889),
 np.float64(0.6666666666666666),
 np.float64(0.5),
 np.float64(0.2777777777777778),
 np.float64(0.5),
 np.float64(0.6666666666666666),
 np.float64(0.3333333333333333),
 np.float64(0.4444444444444444),
 np.float64(0.6111111111111112),
 np.float64(0.5),
 np.float64(0.5),
 np.float64(0.5),
 np.float64(0.7222222222222222),
 np.float64(0.5555555555555556),
 np.float64(0.5),
 np.float64(0.6111111111111112),
 np.float64(0.5555555555555556),
 np.float64(0.6111111111111112)]

In [5]:
all_f1_scores

[np.float64(0.8),
 np.float64(0.8125),
 np.float64(0.56),
 np.float64(0.8),
 np.float64(0.6666666666666666),
 np.float64(0.43478260869565216),
 np.float64(0.6428571428571429),
 np.float64(0.7272727272727273),
 np.float64(0.48),
 np.float64(0.5925925925925926),
 np.float64(0.7096774193548387),
 np.float64(0.6666666666666666),
 np.float64(0.6206896551724138),
 np.float64(0.6428571428571429),
 np.float64(0.8125),
 np.float64(0.7142857142857143),
 np.float64(0.6666666666666666),
 np.float64(0.7586206896551724),
 np.float64(0.6896551724137931),
 np.float64(0.7586206896551724)]

In [6]:
all_sensitivities

[np.float64(0.6666666666666666),
 np.float64(0.7222222222222222),
 np.float64(0.3888888888888889),
 np.float64(0.6666666666666666),
 np.float64(0.5),
 np.float64(0.2777777777777778),
 np.float64(0.5),
 np.float64(0.6666666666666666),
 np.float64(0.3333333333333333),
 np.float64(0.4444444444444444),
 np.float64(0.6111111111111112),
 np.float64(0.5),
 np.float64(0.5),
 np.float64(0.5),
 np.float64(0.7222222222222222),
 np.float64(0.5555555555555556),
 np.float64(0.5),
 np.float64(0.6111111111111112),
 np.float64(0.5555555555555556),
 np.float64(0.6111111111111112)]

In [7]:
all_specificities

[np.float64(1.0),
 np.float64(0.9411764705882353),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.9411764705882353),
 np.float64(0.8235294117647058),
 np.float64(0.9411764705882353),
 np.float64(0.9411764705882353),
 np.float64(0.8823529411764706),
 np.float64(1.0),
 np.float64(0.8823529411764706),
 np.float64(0.9411764705882353),
 np.float64(0.9411764705882353),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(1.0),
 np.float64(0.9411764705882353),
 np.float64(1.0)]