In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import math

In [None]:
# Column-name tag contributed by each supported prompt_id.
_PROMPT_COLUMN_TAGS = {1: '', 2: ' (AB)', 3: ' (AB2)'}


def _column_names(prompt_id, target_ratio, temperature, model_name):
    """Return the (label, decision, external, internal) CSV column names for one model."""
    prompt_tag = _PROMPT_COLUMN_TAGS[prompt_id]
    temp_tag = '' if temperature is None else f' (temp {temperature})'
    ratio_tag = '' if target_ratio is None else f' ({target_ratio*100:.0f}%)'
    # prompt_id 1 reads its labels from a different column family than 2 and 3.
    label_base = 'propose_correct_answer' if prompt_id == 1 else 'correct_choice_id'
    suffix = f'{prompt_tag}{temp_tag}{ratio_tag} ({model_name})'
    return (
        f'{label_base}{ratio_tag}',
        f'Decision{suffix}',
        f'External Confidence{suffix}',
        f'Internal Confidence{suffix}',
    )


def _decision_signed_score(decisions, confidence):
    """Negate confidence where the decision is 0, then rescale with x / 2 + 0.5."""
    # presumably confidences lie in [0, 1], which makes the result a score in [0, 1] - confirm
    signed = np.where(decisions == 0, confidence * -1, confidence)
    return (signed / 2) + 0.5


def _curve_with_area(curve_kind, labels, scores):
    """Return (x, y, area) of the ROC curve or of the precision-recall curve."""
    if curve_kind == "ROC":
        x_vals, y_vals, _ = roc_curve(labels, scores)
    else:
        # precision_recall_curve returns precision first; recall goes on the x axis.
        y_vals, x_vals, _ = precision_recall_curve(labels, scores)
    return x_vals, y_vals, auc(x_vals, y_vals)


def calculate_and_plot(type, csv_file, model_names, prompt_id, target_ratio, temperature):
    """Plot per-model ROC or precision-recall curves and return the curve areas.

    For every model, the label, decision, external-confidence and
    internal-confidence columns are read from the CSV, rows with a NaN in any
    of those four columns are reported on stdout and dropped, and one subplot
    with both curves is drawn. The figure is saved as a PNG in the current
    working directory and then shown.

    Args:
        type: "ROC" or "PRC" (case-insensitive). The value is used verbatim in
            the legend labels and in the output file name.
        csv_file: Path of the CSV file to load. Its stem (file name without
            directory and extension) prefixes the output file name.
        model_names: Model names used to build the per-model column names; one
            subplot is drawn per entry, two per row.
        prompt_id: 1, 2 or 3. Selects the column family: plain columns with the
            'propose_correct_answer' labels for 1, '(AB)' or '(AB2)' columns
            with the 'correct_choice_id' labels for 2 and 3.
        target_ratio: None, or a fraction that appears as a percentage in the
            column names, subplot titles and output file name.
        temperature: None, or a value that appears as '(temp <value>)' in the
            column names.

    Returns:
        A tuple (external_list, internal_list) holding, in model order, the
        area under the curve obtained from the external and from the internal
        confidence.

    Raises:
        ValueError: If type is not ROC/PRC, prompt_id is not 1, 2 or 3, or
            model_names is empty.
        KeyError: If an expected column is missing from the CSV.
    """
    from pathlib import Path  # local import: keeps this cell self-contained

    # Validate everything before touching the file system or creating a figure.
    curve_kind = type.upper()
    if curve_kind not in ("ROC", "PRC"):
        raise ValueError("The type must be either ROC or PRC")
    if prompt_id not in _PROMPT_COLUMN_TAGS:
        raise ValueError("The prompt_id must be either 1 or 2 or 3")
    num_models = len(model_names)
    if num_models == 0:
        raise ValueError("model_names must contain at least one model")

    df = pd.read_csv(csv_file)

    external_list = []
    internal_list = []

    # Two subplots per row; flatten so the axes can be paired with the models.
    num_rows = math.ceil(num_models / 2)
    _, axes = plt.subplots(num_rows, 2, figsize=(12, num_rows * 5))
    axes = axes.flatten()

    label_fontsize = 14
    title_fontsize = 12
    tick_fontsize = 12

    for ax, model_name in zip(axes, model_names):
        column_names = _column_names(prompt_id, target_ratio, temperature, model_name)
        # Look up all four columns first so a missing one fails before any report is printed.
        columns_to_check = {name: df[name] for name in column_names}

        # Report NaNs per column and collect the affected row indices.
        nan_indices = set()
        for col_name, col_data in columns_to_check.items():
            nans = col_data.isna()
            total_nans = nans.sum()
            if total_nans > 0:
                col_nan_indices = df[nans].index.tolist()
                nan_indices.update(col_nan_indices)
                print(f"Column '{col_name}' has {total_nans} NaN(s).")
                print(f"Row indices with NaNs: {col_nan_indices}")

        # Drop every row that has a NaN in any of this model's columns.
        df_cleaned = df.drop(index=list(nan_indices))
        label_col, decision_col, external_col, internal_col = column_names
        true_labels_cleaned = df_cleaned[label_col]
        decisions_cleaned = df_cleaned[decision_col]

        external_scores = _decision_signed_score(decisions_cleaned, df_cleaned[external_col])
        internal_scores = _decision_signed_score(decisions_cleaned, df_cleaned[internal_col])

        ext_x, ext_y, ext_area = _curve_with_area(curve_kind, true_labels_cleaned, external_scores)
        external_list.append(ext_area)
        int_x, int_y, int_area = _curve_with_area(curve_kind, true_labels_cleaned, internal_scores)
        internal_list.append(int_area)

        ax.plot(ext_x, ext_y, color='darkorange', lw=2, label=f'Explicit Probability (AU{type} = {ext_area:.3f})')
        ax.plot(int_x, int_y, color='blue', lw=2, label=f'Implicit Probability (AU{type} = {int_area:.3f})')

        if curve_kind == "ROC":
            # Diagonal reference line from (0, 0) to (1, 1).
            ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            ax.set_xlabel('False Positive Rate', fontsize=label_fontsize)
            ax.set_ylabel('True Positive Rate', fontsize=label_fontsize)
            ax.legend(loc="lower right", fontsize=tick_fontsize)
        else:
            ax.set_xlabel('Recall', fontsize=label_fontsize)
            ax.set_ylabel('Precision', fontsize=label_fontsize)
            ax.legend(loc="lower left", fontsize=tick_fontsize)

        if target_ratio is None:
            ax.set_title(f'{model_name}', fontsize=title_fontsize)
        else:
            ax.set_title(f'{model_name} ({target_ratio*100:.0f}% imbalance)', fontsize=title_fontsize)

        ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    # With an odd number of models the last grid cell has no data; hide it.
    for unused_ax in axes[num_models:]:
        unused_ax.set_visible(False)

    plt.tight_layout()

    # Use the file stem: splitting the raw path on "." yields an empty prefix
    # for relative paths such as '../data/SDoH.csv'.
    dataset_name = Path(csv_file).stem
    imbalance_tag = '' if target_ratio is None else f" ({target_ratio*100:.0f} imbalance)"
    # NOTE(review): temperature is not part of the file name, so runs that
    # differ only in temperature write to the same PNG.
    output_file_name = f"{dataset_name}_{type}_All_Models{_PROMPT_COLUMN_TAGS[prompt_id]}{imbalance_tag}.png"
    plt.savefig(output_file_name)
    plt.show()

    return external_list, internal_list

In [None]:
# Master list of evaluated models. Each family contributes its large model
# immediately followed by its small one; the subset lists below rely on
# that ordering.
all_model_names = [
    'Qwen2-72B-Instruct',
    'Qwen2-7B-Instruct',
    'Meta-Llama-3.1-70B-Instruct',
    'Meta-Llama-3.1-8B-Instruct',
    'gemma-2-27b-it',
    'gemma-2-9b-it',
    'Mistral-Large-Instruct-2407',
    'Mistral-7B-Instruct-v0.3',
    'Yi-1.5-34B-Chat',
    'Yi-1.5-9B-Chat',
    'Phi-3-medium-128k-instruct',
    'Phi-3-mini-128k-instruct',
]

# Subsets of the master list to analyze.
large_model_names = all_model_names[0::2]   # first entry of every family pair
small_model_names = all_model_names[1::2]   # second entry of every family pair
first_half_model_names = all_model_names[:6]
second_half_model_names = all_model_names[6:]
llama_model_names = [name for name in all_model_names if name.startswith('Meta-Llama')]
mistral_model_names = [name for name in all_model_names if name.startswith('Mistral')]

# Base model followed by its LoRA variants.
fine_tuned_models = ['Meta-Llama-3.1-8B-Instruct'] + [
    f'Meta-Llama-3.1-8B-Instruct (LoRA {variant})' for variant in (8, 9, 10)
]

# Run configuration.
# NOTE: `type` shadows the builtin of the same name in this namespace.
type = "PRC"
csv_file = '../data/SDoH.csv'  # Path to your CSV file
prompt_id = 2
target_ratio = None
temperature = None

external_list, internal_list = calculate_and_plot(
    type=type,
    csv_file=csv_file,
    model_names=large_model_names,
    prompt_id=prompt_id,
    target_ratio=target_ratio,
    temperature=temperature,
)