## LLM Win Rate

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read one spreadsheet of raw metric scores per model.
# The tuple order below must match the order of the names being unpacked.
score_files = (
    "/content/raw_scores-AZval-LlamaFT-1-25-24.xlsx",
    "/content/raw_scores-ZS-Llama-AZval-1-25-24.xlsx",
    "/content/MedAlp-ZS-AZval-1-25-24.xlsx",
    "/content/raw_scores-AZval-ZeFT-1-25-24.xlsx",
    "/content/ZeZS-raw_scores-AZval-1-25-24.xlsx",
    "/content/raw_scores-ZS-T5-AZval-1-25-24.xlsx",
)
LLAMA2_FT, LLAMA2_ZS, MedAlp_ZS, Zephyr_FT, Zephyr_ZS, T5_ZS = (
    pd.read_excel(path) for path in score_files
)

In [None]:
# Clean every model's score table in place:
#   * a metric that failed to compute is stored as NaN -> score it as 0
#   * the 'F1RadGraph Score' column gets the shorter name 'RadGraph F1'
for scores in (Zephyr_FT, MedAlp_ZS, LLAMA2_FT, LLAMA2_ZS, Zephyr_ZS, T5_ZS):
    scores.fillna(0, inplace=True)
    scores.rename(columns={'F1RadGraph Score': 'RadGraph F1'}, inplace=True)

# Show one cleaned table as a sanity check
print(Zephyr_FT)


In [None]:
# Build one long table with two columns: the chosen metric and the model label.

# Metric to use
Score = "RadGraph F1"

# Each score table paired with the label it should carry in the 'Model' column
labelled_frames = [
    (LLAMA2_FT, 'Llama2_FT'),
    (LLAMA2_ZS, 'Llama2_ZS'),
    (MedAlp_ZS, 'MedAlp_ZS'),
    (Zephyr_FT, 'Zephyr_FT'),
    (Zephyr_ZS, 'Zephyr_ZS'),
    (T5_ZS, 'T5_ZS'),
]

# Tag every row of each table with its model name (adds a column in place)
for frame, label in labelled_frames:
    frame['Model'] = label

# Stack the metric/model columns of all tables; ignore_index gives a fresh 0..n-1 index
stacked_df = pd.concat(
    [frame[[Score, 'Model']] for frame, _ in labelled_frames],
    ignore_index=True,
)

# Display the resulting DataFrame
print(stacked_df)


In [None]:
# Pairwise win rates between models on one metric.
#
# For every ordered pair (Model1, Model2) the win rate is the percentage of
# samples on which Model1's score is strictly greater than Model2's. Ties count
# as a win for neither side, so the two directions need not add up to 100.

# Metric to compare (any column present in stacked_df, e.g. 'RadGraph F1')
selected_metric = Score

# Long table with one row per (sample, model)
df = stacked_df

# Pull each model's scores out once, keeping the models in order of first appearance
scores_by_model = {
    model: group[selected_metric].to_numpy()
    for model, group in df.groupby('Model', sort=False)
}

# The comparison below is element-wise, so row i must refer to the same sample
# for every model; at minimum every model needs the same number of rows.
sample_counts = {model: len(scores) for model, scores in scores_by_model.items()}
if len(set(sample_counts.values())) > 1:
    raise ValueError(
        f"Models must have the same number of samples to compare; got {sample_counts}"
    )

# Win rate (in percent) for every ordered pair of distinct models
win_rate_col = f'{selected_metric} Win Rate'
combinations = []
for model1, metric1 in scores_by_model.items():
    for model2, metric2 in scores_by_model.items():
        if model1 != model2:
            metric_win = (metric1 > metric2).mean() * 100
            combinations.append([model1, model2, metric_win])

combinations_df = pd.DataFrame(combinations, columns=['Model1', 'Model2', win_rate_col])

# Reshape to a Model1 x Model2 grid. Keyword arguments are required here:
# pandas >= 2.0 no longer accepts index/columns/values positionally.
pivot_table = combinations_df.pivot(index='Model1', columns='Model2', values=win_rate_col)

# Hide the diagonal and the lower triangle (True = hidden), so each pair is
# shown once, in the upper triangle, as "row model vs. column model".
mask = np.tril(np.ones_like(pivot_table, dtype=bool))

# Create a color map for the color bar
cmap = sns.color_palette("coolwarm", as_cmap=True)

# Heatmap of the win rates (upper triangle only)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(pivot_table, annot=True, fmt='.1f', cmap=cmap, cbar=True, mask=mask, ax=ax)

# NOTE(review): this writes the literal text "Box Font Size" at a point outside
# the plotted cells, where matplotlib's default annotation clipping presumably
# hides it. It looks like a leftover from trying to set the cell font size
# (that would be annot_kws in sns.heatmap) -- confirm and remove.
ax.annotate("Box Font Size", xy=(-0.5, -0.5), fontsize=25)

# Title and axis labels: rows are the model whose win rate is shown, columns the opponent
ax.set_title(f'{selected_metric} Win Rate Comparisons')
ax.set_xlabel('...against this model', fontweight='bold')
ax.set_ylabel('Win rate of this model...',  fontweight='bold')

# Tilt the y-axis labels
plt.yticks(rotation=45)

plt.show()


## Statistical Comparison of Selected Model Pairs

In [None]:
import pandas as pd
from scipy.stats import shapiro, ttest_rel, wilcoxon

# Paired significance test between two models, one metric at a time.
#
# Both spreadsheets must score the same samples in the same row order, because
# the tests below pair row i of the first file with row i of the second.

file1 = "file_path1"  # Replace with the actual file path
file2 = "file_path2"  # Replace with the actual file path

# Apply the same cleaning as the win-rate section: failed metrics (NaN) count
# as 0 and the raw 'F1RadGraph Score' column is renamed to 'RadGraph F1'.
# Without the rename, looking up 'RadGraph F1' below raises a KeyError on raw files.
radgraph_rename = {'F1RadGraph Score': 'RadGraph F1'}
df1 = pd.read_excel(file1).fillna(0).rename(columns=radgraph_rename)
df2 = pd.read_excel(file2).fillna(0).rename(columns=radgraph_rename)

# List of parameters you want to test
parameters = ['BLEU Score', 'METEOR Score', 'ROUGE-L Score', 'BERT Score', 'RadGraph F1']

# (parameter, p-value) for each tested metric
test_results = []

for param in parameters:
    # Per-sample scores of the two models for the current metric
    data1 = df1[param].to_numpy()
    data2 = df2[param].to_numpy()

    if len(data1) != len(data2):
        raise ValueError(
            f"Paired tests need equally long samples for '{param}': "
            f"{len(data1)} vs {len(data2)} rows"
        )

    # The paired t-test assumes the paired differences are normally
    # distributed, so normality is checked on the differences rather than on
    # each sample separately.
    differences = data1 - data2
    _, normality_p = shapiro(differences)

    if normality_p > 0.05:
        # No evidence against normality of the differences: paired t-test
        _, p_value = ttest_rel(data1, data2)
    else:
        # Otherwise fall back to the non-parametric Wilcoxon signed-rank test
        _, p_value = wilcoxon(data1, data2, alternative='two-sided')

    test_results.append((param, p_value))

# Print the test results
for param, p_value in test_results:
    print(f"Parameter: {param}")
    print(f"p-value: {p_value:.4f}")
    print()
