In [1]:
# Import necessary libraries
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical operations
import pickle # For serializing and deserializing Python objects
from modelling_utils import train_knn_classifier, evaluate_model, plot_model_comparison, extract_model_features, compute_knn_indices, compute_overlap_matrix, plot_overlap_matrix, split_shuffle_data

In [2]:
# Directory holding the READII evaluation CSVs, given relative to the
# notebook's working directory so the cell does not depend on one user's
# home directory or checkout location.
EVAL_DIR = "../evaluation/readii_eval"

# Names of the feature sets to load; each one maps to "<EVAL_DIR>/<name>.csv".
# The order here is the insertion order of `data`.
FEATURE_SETS = [
    "negative_control_randomized_sampled_non_roi",
    "negative_control_randomized_sampled_roi",
    "negative_control_shuffled_non_roi",
    "negative_control_shuffled_roi",
    "original",
]

# One DataFrame per feature set, keyed by the feature-set name.
data = {name: pd.read_csv(f"{EVAL_DIR}/{name}.csv") for name in FEATURE_SETS}

In [None]:
# Averaged test score per feature set, keyed by the names used in `data`
test_accuracies_dict = {}

for model_name, df in data.items():
    # Guard kept from the original cell: a feature set with this name is never evaluated
    if model_name == "MedImageInsightExtractor":
        continue

    # Feature matrix: every column whose name contains "pred"
    all_items = df.filter(like="pred").values

    # Labels come from "survival" when that column exists, otherwise from "survival_x"
    label_column = "survival" if "survival" in df else "survival_x"
    all_labels = df[label_column]

    # Repeat split -> train -> evaluate several times and keep each split's score
    n_splits = 5
    split_scores = []

    for split in range(n_splits):
        # A different seed per repetition (10, 11, ...) yields a different partition
        # NOTE(review): presumably the remaining 30% of rows form the test set, and the
        # original comment calls the split stratified - confirm in split_shuffle_data
        train_items, train_labels, val_items, val_labels, test_items, test_labels = split_shuffle_data(
            all_items,
            all_labels,
            train_ratio=0.5,
            val_ratio=0.2,
            random_seed=10 + split,
        )

        # Fit the KNN classifier; the train and validation partitions are both passed in
        # NOTE(review): the logged trials suggest `k` is tuned here - confirm in train_knn_classifier
        best_model, study = train_knn_classifier(train_items, train_labels, val_items, val_labels)

        # Score the fitted model on the held-out test partition of this split
        split_score = evaluate_model(best_model, test_items, test_labels)
        split_scores.append(split_score)

    # Record the mean over all repetitions for this feature set
    avg_score = np.mean(split_scores)
    test_accuracies_dict[model_name] = avg_score

[I 2025-02-24 12:51:22,121] A new study created in memory with name: no-name-664afc3b-7494-4c37-9878-d3bae07588fb
[I 2025-02-24 12:51:22,133] Trial 0 finished with value: 0.5760840290869916 and parameters: {'k': 29}. Best is trial 0 with value: 0.5760840290869916.
[I 2025-02-24 12:51:22,145] Trial 1 finished with value: 0.5569620253164557 and parameters: {'k': 12}. Best is trial 0 with value: 0.5760840290869916.


[I 2025-02-24 12:51:22,157] Trial 2 finished with value: 0.5340694855911662 and parameters: {'k': 11}. Best is trial 0 with value: 0.5760840290869916.
[I 2025-02-24 12:51:22,170] Trial 3 finished with value: 0.5645030972259628 and parameters: {'k': 42}. Best is trial 0 with value: 0.5760840290869916.
[I 2025-02-24 12:51:22,182] Trial 4 finished with value: 0.5378400215459197 and parameters: {'k': 3}. Best is trial 0 with value: 0.5760840290869916.
[I 2025-02-24 12:51:22,195] Trial 5 finished with value: 0.570024239159709 and parameters: {'k': 28}. Best is trial 0 with value: 0.5760840290869916.
[I 2025-02-24 12:51:22,208] Trial 6 finished with value: 0.5891462429302451 and parameters: {'k': 39}. Best is trial 6 with value: 0.5891462429302451.
[I 2025-02-24 12:51:22,220] Trial 7 finished with value: 0.5830864530029626 and parameters: {'k': 32}. Best is trial 6 with value: 0.5891462429302451.
[I 2025-02-24 12:51:22,234] Trial 8 finished with value: 0.5434958254780501 and parameters: {'k'

In [8]:
# Draw the averaged per-feature-set scores in a single comparison figure
comparison_height = 500
fig = plot_model_comparison(test_accuracies_dict, height=comparison_height)
fig.show()  # render the figure in the notebook

In [9]:
# "pred" feature matrix per feature set; the "negative_control_" prefix is
# stripped from the keys, so "original" keeps its name unchanged
model_features = {
    name.replace("negative_control_", ""): frame.filter(like="pred").values
    for name, frame in data.items()
}

# Nearest-neighbour indices per feature set (10 neighbours, cosine metric)
model_neighbors = compute_knn_indices(model_features, num_neighbors=10, metric="cosine")

# NOTE(review): presumably a pairwise neighbour-overlap matrix plus the matching
# ordered list of feature-set names - confirm in compute_overlap_matrix
overlap_matrix, model_list = compute_overlap_matrix(model_neighbors)

fig = plot_overlap_matrix(overlap_matrix, model_list, width=500, tickangle=90)
fig.show()
