In [6]:
# Import necessary libraries
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical operations
import pickle # For serializing and deserializing Python objects
from modelling_utils import train_knn_classifier, evaluate_model, plot_model_comparison

In [7]:
# Read the pre-extracted feature dictionary from disk.
# `data` is later iterated as {model_name: {split: [records with "row" and "feature"]}}.
feature_dict_path = "/home/suraj/Repositories/FM-extractors-radiomics/evaluation/features/lndb.pkl"

# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(feature_dict_path, mode="rb") as pickle_handle:
    data = pickle.load(pickle_handle)

In [10]:
# Evaluate each feature extractor with a KNN classifier on the "texture" label
# and collect one test score per model in `test_accuracies_dict`.

# Tunable settings for this experiment.
SEED = 42  # Fixed seed so the train/val reshuffle is identical on every run
TRAIN_FRACTION = 0.6  # Share of the pooled train+val samples used for training (60/40 split)
N_TRIALS = 300  # Number of hyperparameter-search trials per model
SKIPPED_MODELS = {"MedImageInsightExtractor"}  # Models excluded from the comparison

# Maps model name -> value returned by evaluate_model on the untouched test split.
test_accuracies_dict = {}

for model_name, values in data.items():
    if model_name in SKIPPED_MODELS:
        continue

    # Gather the feature matrix and label vector of every dataset split.
    splits = ['train', 'val', 'test']
    items, labels = {}, {}

    for split in splits:
        labels[split] = np.array([v["row"]["texture"] for v in values[split]])
        items[split] = np.vstack([v["feature"] for v in values[split]])

        # Drop samples whose label is NaN; they cannot be used for training or scoring.
        mask = ~np.isnan(labels[split])
        items[split] = items[split][mask]
        labels[split] = labels[split][mask]

    # Pool the original train and validation splits before re-splitting them.
    combined_items = np.vstack([items['train'], items['val']])
    combined_labels = np.concatenate([labels['train'], labels['val']])

    # Shuffle with a generator re-seeded per model: results are reproducible, and
    # models with the same number of samples receive the same permutation, so the
    # comparison is not affected by split-to-split randomness.
    rng = np.random.default_rng(SEED)
    shuffle_idx = rng.permutation(len(combined_items))
    combined_items = combined_items[shuffle_idx]
    combined_labels = combined_labels[shuffle_idx]

    # Re-split into new train and validation sets (TRAIN_FRACTION / remainder).
    split_idx = int(TRAIN_FRACTION * len(combined_items))
    train_items, train_labels = combined_items[:split_idx], combined_labels[:split_idx]
    val_items, val_labels = combined_items[split_idx:], combined_labels[split_idx:]

    # The held-out test split is used exactly as provided.
    test_items, test_labels = items['test'], labels['test']

    # Fit the KNN classifier, tuning hyperparameters against the validation set.
    best_model, study = train_knn_classifier(
        train_items, train_labels, val_items, val_labels, n_trials=N_TRIALS
    )

    # Score the selected model on the test split.
    test_accuracies_dict[model_name] = evaluate_model(best_model, test_items, test_labels)

[I 2025-01-22 17:02:14,039] A new study created in memory with name: no-name-5724e3a5-5a6e-4f1a-bad4-7c517d6d5a4c


[I 2025-01-22 17:02:14,058] Trial 0 finished with value: 0.5443209969780041 and parameters: {'k': 29}. Best is trial 0 with value: 0.5443209969780041.
[I 2025-01-22 17:02:14,081] Trial 1 finished with value: 0.5638025639158633 and parameters: {'k': 12}. Best is trial 1 with value: 0.5638025639158633.
[I 2025-01-22 17:02:14,102] Trial 2 finished with value: 0.5633504352867482 and parameters: {'k': 11}. Best is trial 1 with value: 0.5638025639158633.
[I 2025-01-22 17:02:14,124] Trial 3 finished with value: 0.5697650794742779 and parameters: {'k': 42}. Best is trial 3 with value: 0.5697650794742779.
[I 2025-01-22 17:02:14,145] Trial 4 finished with value: 0.5608328014556713 and parameters: {'k': 3}. Best is trial 3 with value: 0.5697650794742779.
[I 2025-01-22 17:02:14,171] Trial 5 finished with value: 0.5575009213524532 and parameters: {'k': 28}. Best is trial 3 with value: 0.5697650794742779.
[I 2025-01-22 17:02:14,196] Trial 6 finished with value: 0.5663784658524991 and parameters: {'k

In [9]:
# Plot the per-model test scores collected in test_accuracies_dict.
# NOTE(review): this cell's execution count (9) is lower than the evaluation
# cell's (10), so the saved figure may predate the latest results -- re-run
# this cell after the evaluation loop.
# plot_model_comparison comes from modelling_utils; its return value is assumed
# to be a figure object exposing .show() -- confirm against that module.
fig = plot_model_comparison(test_accuracies_dict)
fig.show() # Render the comparison figure