In [15]:
# Import necessary libraries
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical operations
import pickle # For serializing and deserializing Python objects
from modelling_utils import train_knn_classifier, evaluate_model, plot_model_comparison

In [16]:
# Read the pre-extracted feature dictionary from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists.
feature_dict_path = "/home/suraj/Repositories/FM-extractors-radiomics/evaluation/features/luna.pkl"

# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
with open(feature_dict_path, mode="rb") as pickle_handle:
    data = pickle.load(pickle_handle)  # Iterated later with .items(), one entry per model name

In [17]:
# Result container: filled by the training loop with one entry per model name
test_accuracies_dict = dict()

In [18]:
# Fit a KNN classifier on each model's features and record its test-set result
_SPLIT_NAMES = ("train", "val", "test")  # Keys of each model's split dictionary, in processing order

for model_name, values in data.items():

    # MedImageInsightExtractor is excluded from this comparison
    if model_name == "MedImageInsightExtractor":
        continue

    # Malignancy label of every sample, one list per split
    train_labels, val_labels, test_labels = (
        [sample["row"]["malignancy"] for sample in values[split]]
        for split in _SPLIT_NAMES
    )

    # Per-sample feature arrays stacked row-wise into a single array per split
    train_items, val_items, test_items = (
        np.vstack([sample["feature"] for sample in values[split]])
        for split in _SPLIT_NAMES
    )

    # Train with hyperparameter optimization; the logged cell output shows trials over `k`.
    # NOTE(review): presumably the validation split drives the selection - confirm in modelling_utils.
    best_model, study = train_knn_classifier(train_items, train_labels, val_items, val_labels)

    # Store whatever evaluate_model returns for the test split under this model's name
    test_accuracies_dict[model_name] = evaluate_model(best_model, test_items, test_labels)

[I 2025-01-22 16:22:02,093] A new study created in memory with name: no-name-e0ecfa42-ff39-4600-b6d0-e440afeec09d
[I 2025-01-22 16:22:02,122] Trial 0 finished with value: 0.8395710681244744 and parameters: {'k': 3}. Best is trial 0 with value: 0.8395710681244744.
[I 2025-01-22 16:22:02,145] Trial 1 finished with value: 0.8810625175217268 and parameters: {'k': 29}. Best is trial 1 with value: 0.8810625175217268.
[I 2025-01-22 16:22:02,162] Trial 2 finished with value: 0.8882814690215867 and parameters: {'k': 14}. Best is trial 2 with value: 0.8882814690215867.
[I 2025-01-22 16:22:02,179] Trial 3 finished with value: 0.8832352116624614 and parameters: {'k': 11}. Best is trial 2 with value: 0.8882814690215867.
[I 2025-01-22 16:22:02,197] Trial 4 finished with value: 0.8782590412111017 and parameters: {'k': 27}. Best is trial 2 with value: 0.8882814690215867.
[I 2025-01-22 16:22:02,214] Trial 5 finished with value: 0.883515559293524 and parameters: {'k': 25}. Best is trial 2 with value: 0.

In [20]:
# Plot test accuracies
# plot_model_comparison (imported from modelling_utils) receives the per-model results
# collected in test_accuracies_dict and returns the figure object bound to `fig`.
fig = plot_model_comparison(test_accuracies_dict)
fig.show() # Display the figure returned by plot_model_comparison