In [6]:
# Import necessary libraries
import os # For reading environment-variable overrides (e.g. the feature file location)
import pickle # For serializing and deserializing Python objects

import numpy as np # For numerical operations
import optuna # For hyperparameter optimization
import pandas as pd # For data manipulation and analysis
import plotly.express as px # For creating interactive plots
from sklearn.linear_model import LogisticRegression # For logistic regression
from sklearn.metrics import accuracy_score, roc_auc_score # For evaluating model performance
from sklearn.neighbors import KNeighborsClassifier # For k-nearest neighbors classification
from sklearn.pipeline import Pipeline # For creating a pipeline of transformers and estimators
from sklearn.preprocessing import StandardScaler # For standardizing features

In [2]:
# Load features from a pickle file
# The default location below can be overridden with the NSCLC_FEATURES_PKL
# environment variable, so the notebook is not tied to one machine's directory layout.
feature_dict_path = os.environ.get(
    "NSCLC_FEATURES_PKL",
    "/home/suraj/Repositories/FM-extractors-radiomics/evaluation/features/nsclc_radiomics.pkl",
) # Path to the pickle file containing features

# SECURITY: pickle.load can execute arbitrary code during deserialization; only point
# this at feature files produced by a trusted pipeline.
with open(feature_dict_path, 'rb') as file: # Open the file in read binary mode
    # NOTE(review): the tuning loop iterates `data.items()` and indexes each value with
    # "train" / "val" / "test" — presumably a dict keyed by extractor name; confirm
    # against the script that wrote the pickle.
    data = pickle.load(file) # Load the data from the pickle file

In [3]:
# Per-model test scores, filled in by the tuning loop (keyed by model name)
test_accuracies_dict = dict() # Starts empty; one entry is added per evaluated model

In [14]:
# Tune and evaluate a k-NN classifier on each model's features
KNN_MAX_NEIGHBORS = 50 # Upper bound of the k search range
KNN_N_TRIALS = 500 # Number of Optuna trials run per model
KNN_SEARCH_SEED = 42 # Seed for the Optuna sampler so reruns explore the same trials


def build_knn_pipeline(k, metric, use_scaler):
    """Build an unfitted k-NN pipeline, optionally preceded by standardization.

    Args:
        k: Number of neighbors passed to KNeighborsClassifier.
        metric: Distance metric name passed to KNeighborsClassifier.
        use_scaler: When True, a StandardScaler step is placed before the classifier.

    Returns:
        An unfitted sklearn Pipeline whose final step is named 'knn'.
    """
    steps = []
    if use_scaler:
        steps.append(('scaler', StandardScaler()))
    steps.append(('knn', KNeighborsClassifier(n_neighbors=k, metric=metric)))
    return Pipeline(steps)


# Iterate through each model's features
for model_name, values in data.items(): # Loop through each model and its corresponding features
    # Skip MedImageInsightExtractor before any of its features are touched
    if model_name == "MedImageInsightExtractor":
        continue

    # Extract survival labels for the train, val, and test sets
    train_labels = [v["row"]["survival"] for v in values["train"]]
    val_labels = [v["row"]["survival"] for v in values["val"]]
    test_labels = [v["row"]["survival"] for v in values["test"]]

    # Stack per-sample feature vectors into one 2-D array per split
    train_items = np.vstack([v["feature"] for v in values["train"]])
    val_items = np.vstack([v["feature"] for v in values["val"]])
    test_items = np.vstack([v["feature"] for v in values["test"]])

    # k-NN cannot query more neighbors than there are training samples, so cap the
    # search range for small training sets instead of letting trials raise.
    max_k = min(KNN_MAX_NEIGHBORS, len(train_labels))

    # Define the objective function for Optuna
    def objective(trial):
        """Return the validation ROC AUC of a k-NN pipeline for one Optuna trial.

        Args:
            trial: Optuna trial used to sample 'k', 'metric' and 'use_scaler'.

        Returns:
            ROC AUC on the validation split for a pipeline fitted on the training split.
        """
        k = trial.suggest_int('k', 1, max_k) # Number of neighbors
        metric = trial.suggest_categorical('metric', ['cosine']) # Distance metric (single option)
        use_scaler = trial.suggest_categorical('use_scaler', [True, False]) # Standardize features or not

        pipeline = build_knn_pipeline(k, metric, use_scaler)
        pipeline.fit(train_items, train_labels)
        val_predictions = pipeline.predict_proba(val_items)
        # Column 1 is used as the positive-class score — assumes binary labels; TODO confirm
        val_auc = roc_auc_score(val_labels, val_predictions[:, 1])
        return val_auc

    # Create an Optuna study that maximizes validation ROC AUC; the sampler is seeded
    # so that Restart & Run All reproduces the same search.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=KNN_SEARCH_SEED),
    )
    study.optimize(objective, n_trials=KNN_N_TRIALS) # Run KNN_N_TRIALS trials

    # Get the best parameters from the Optuna study
    best_params = study.best_params
    best_k = best_params['k']
    best_metric = best_params['metric']
    best_use_scaler = best_params['use_scaler']

    # Refit with the best parameters and score on the test set
    pipeline = build_knn_pipeline(best_k, best_metric, best_use_scaler)
    pipeline.fit(train_items, train_labels)
    test_predictions = pipeline.predict_proba(test_items)
    test_accuracy = roc_auc_score(test_labels, test_predictions[:, 1]) # ROC AUC, despite the variable name

    # Store the test ROC AUC for the current model
    test_accuracies_dict[model_name] = test_accuracy

[I 2024-12-31 15:13:11,871] A new study created in memory with name: no-name-6b5a6ab6-5a2c-4252-85a7-c2674d8fca53
[I 2024-12-31 15:13:11,879] Trial 0 finished with value: 0.5431818181818182 and parameters: {'k': 18, 'metric': 'cosine', 'use_scaler': False}. Best is trial 0 with value: 0.5431818181818182.
[I 2024-12-31 15:13:11,886] Trial 1 finished with value: 0.7079545454545454 and parameters: {'k': 41, 'metric': 'cosine', 'use_scaler': False}. Best is trial 1 with value: 0.7079545454545454.
[I 2024-12-31 15:13:11,929] Trial 2 finished with value: 0.6090909090909091 and parameters: {'k': 16, 'metric': 'manhattan', 'use_scaler': True}. Best is trial 1 with value: 0.7079545454545454.
[I 2024-12-31 15:13:12,055] Trial 3 finished with value: 0.6977272727272726 and parameters: {'k': 45, 'metric': 'euclidean', 'use_scaler': False}. Best is trial 1 with value: 0.7079545454545454.
[I 2024-12-31 15:13:12,185] Trial 4 finished with value: 0.515909090909091 and parameters: {'k': 1, 'metric': 'eu

In [16]:
# Plot the per-model test ROC AUC as a bar chart using plotly
model_names = list(test_accuracies_dict.keys()) # One bar per evaluated model
test_auc_values = list(test_accuracies_dict.values()) # ROC AUC scores stored by the tuning loop

fig = px.bar( # Create a bar plot using plotly
    x=model_names, # Model names on the x-axis
    y=test_auc_values, # Test ROC AUC on the y-axis
    labels={'x': 'Model Name', 'y': 'Test AUC'}, # Axis labels; the plotted metric is ROC AUC, matching the title
    title='Test AUC Comparison Between Models', # Set the title of the plot
    template='plotly_white', # Set the plot template to plotly_white
    text=[f'{val:.3f}' for val in test_auc_values], # Print each score (3 decimals) on its bar
    width=500, # Set the width of the plot
    height=500, # Set the height of the plot
)

fig.update_traces( # Update the traces of the plot
    textposition='outside', # Set the text position to outside the bars
    marker_color='#1f77b4' # Set the marker color to #1f77b4
)

fig.update_layout( # Update the layout of the plot
    showlegend=False, # Hide the legend
    plot_bgcolor='rgba(0,0,0,0)', # Set the plot background color to transparent
    yaxis_range=[0,1], # Fix the y-axis to 0-1
    xaxis_linecolor='black', # Set the x-axis line color to black
    yaxis_linecolor='black', # Set the y-axis line color to black
    xaxis_ticks='', # Hide the x-axis ticks
    yaxis_ticks='', # Hide the y-axis ticks
    title_x=0.5, # Set the title position to the center
    font=dict(size=10) # Set the font size to 10
)

fig.show() # Show the plot