In [1]:
# Import necessary libraries
import os # For reading environment-variable overrides (e.g. the feature file location)
import pickle # For serializing and deserializing Python objects

import numpy as np # For numerical operations
import pandas as pd # For data manipulation and analysis

from modelling_utils import train_knn_classifier, evaluate_model, plot_model_comparison, extract_model_features, compute_knn_indices, compute_overlap_matrix, plot_overlap_matrix, split_shuffle_data

In [2]:
# Load the pre-extracted features from a pickle file.
# The location can be overridden with the NSCLC_RADIOGENOMICS_FEATURES environment
# variable so the notebook also runs outside the original author's machine; the
# original absolute path is kept as the default.
feature_dict_path = os.environ.get(
    "NSCLC_RADIOGENOMICS_FEATURES",
    "/home/suraj/Repositories/FM-extractors-radiomics/evaluation/features/nsclc_radiogenomics.pkl",
)

# SECURITY: pickle.load can execute arbitrary code while deserializing. Only point
# this at feature files you produced yourself or otherwise trust.
with open(feature_dict_path, 'rb') as file: # Open the file in read binary mode
    # Later cells read this as: model name -> {"train" / "val" / "test": list of
    # entries carrying a "feature" array and a "row" record}.
    data = pickle.load(file)

In [None]:
# Score every feature extractor with a KNN classifier on the "survival" label,
# averaging the test score over several reshuffled train/val/test splits.

# Evaluation settings, defined once up front instead of being re-assigned on
# every pass through the model loop.
skipped_models = {"MedImageInsightExtractor"}  # extractors left out of this comparison
n_splits = 5       # number of reshuffled splits averaged per model
train_ratio = 0.5  # fraction handed to split_shuffle_data as the training share
val_ratio = 0.1    # fraction handed over as the validation share
                   # (presumably the remainder becomes the test set; TODO confirm)
base_seed = 10     # split i uses random_seed = base_seed + i
stored_partitions = ("train", "val", "test")

# Maps model name -> mean test score across the reshuffled splits
test_accuracies_dict = {}

for model_name, values in data.items():
    if model_name in skipped_models:
        continue

    # Pool ALL stored partitions (train, val and test) in that order; the stored
    # split is discarded and fresh splits are drawn below. Stacking everything in
    # one call also tolerates a model whose individual stored partition is empty.
    all_items = np.vstack(
        [entry["feature"] for part in stored_partitions for entry in values[part]]
    )
    all_labels = [
        entry["row"]["survival"] for part in stored_partitions for entry in values[part]
    ]

    split_scores = []
    for split in range(n_splits):
        # Draw a new train/val/test split with a seed that differs per split.
        # Dedicated split_* names are used so the pooled arrays above are never
        # shadowed by the outputs of the re-split.
        # NOTE(review): the original comment described these splits as stratified;
        # that depends on split_shuffle_data, so confirm against modelling_utils.
        (
            split_train_items, split_train_labels,
            split_val_items, split_val_labels,
            split_test_items, split_test_labels,
        ) = split_shuffle_data(
            all_items,
            all_labels,
            train_ratio=train_ratio,
            val_ratio=val_ratio,
            random_seed=base_seed + split,
        )

        # Train with hyperparameter optimization against the validation split.
        # `study` is returned alongside the model but is not used in this cell.
        best_model, study = train_knn_classifier(
            split_train_items, split_train_labels, split_val_items, split_val_labels
        )

        # Score the tuned model on this split's held-out test portion
        split_scores.append(evaluate_model(best_model, split_test_items, split_test_labels))

    # Record the mean test score over all splits for this model
    test_accuracies_dict[model_name] = np.mean(split_scores)

[I 2025-02-06 18:29:41,037] A new study created in memory with name: no-name-34e41c22-6cc4-43bd-a6a3-9c72a3ac3c72


[I 2025-02-06 18:29:41,061] Trial 0 finished with value: 0.4625 and parameters: {'k': 29}. Best is trial 0 with value: 0.4625.
[I 2025-02-06 18:29:41,078] Trial 1 finished with value: 0.47500000000000003 and parameters: {'k': 12}. Best is trial 1 with value: 0.47500000000000003.
[I 2025-02-06 18:29:41,095] Trial 2 finished with value: 0.38750000000000007 and parameters: {'k': 11}. Best is trial 1 with value: 0.47500000000000003.
[I 2025-02-06 18:29:41,111] Trial 3 finished with value: 0.8125 and parameters: {'k': 42}. Best is trial 3 with value: 0.8125.
[I 2025-02-06 18:29:41,126] Trial 4 finished with value: 0.27499999999999997 and parameters: {'k': 3}. Best is trial 3 with value: 0.8125.
[I 2025-02-06 18:29:41,145] Trial 5 finished with value: 0.5375 and parameters: {'k': 28}. Best is trial 3 with value: 0.8125.
[I 2025-02-06 18:29:41,165] Trial 6 finished with value: 0.8 and parameters: {'k': 39}. Best is trial 3 with value: 0.8125.
[I 2025-02-06 18:29:41,183] Trial 7 finished with 

In [9]:
# Visualize the per-model averaged test scores collected in test_accuracies_dict
comparison_fig = plot_model_comparison(test_accuracies_dict)
comparison_fig.show()  # Render the comparison figure

In [10]:
# Compare the extractors by how much their nearest-neighbour sets overlap.
# NOTE(review): `data` is passed in unfiltered here, whereas the KNN evaluation
# cell skips "MedImageInsightExtractor"; confirm that difference is intended.
neighbor_settings = {"num_neighbors": 10, "metric": "cosine"}

model_features = extract_model_features(data)
model_neighbors = compute_knn_indices(model_features, **neighbor_settings)

# Overlap between models' neighbour indices, plus the model order used for the axes
overlap_matrix, model_list = compute_overlap_matrix(model_neighbors)

fig = plot_overlap_matrix(overlap_matrix, model_list)
fig.show()
