In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import krippendorff

from scipy.stats import spearmanr
from sklearn.utils import resample

# Project root for this notebook. Set the ELLMRPCTFVIS_DEV_DIR environment variable to run
# on another machine without editing this cell; otherwise the original local checkout is used.
cur_dir = os.environ.get(
    "ELLMRPCTFVIS_DEV_DIR",
    "C:/Users/Shavius/Documents/Uni/Year 4/Project/ELLMRPCTFVIS/dev",
)
# Directory the CSV inputs used below are read from.
data_dir = os.path.join(cur_dir, "test_data")

In [None]:
# Calculate Inter Rater Agreement

def calculate_human_krippendorff(rater_dataset, level_of_measurement="interval", rounding=3):
    """Compute Krippendorff's alpha across the human rater columns of a DataFrame.

    Every column whose name starts with ``"human_score_"`` is treated as one
    rater; each DataFrame row is one rated item.

    Args:
        rater_dataset: DataFrame containing one ``human_score_*`` column per rater.
        level_of_measurement: Passed through to ``krippendorff.alpha``.
            Defaults to ``"interval"``.
        rounding: Number of decimal places in the returned value. Defaults to 3.

    Returns:
        Krippendorff's alpha rounded to ``rounding`` decimal places.

    Raises:
        ValueError: If ``rater_dataset`` has no ``human_score_*`` columns.
    """
    # str() keeps the prefix test from failing on non-string column labels.
    score_cols = [col for col in rater_dataset.columns if str(col).startswith("human_score_")]
    if not score_cols:
        raise ValueError('rater_dataset has no columns starting with "human_score_".')

    # Transpose so each row holds one rater's scores and each column one item.
    score_matrix = rater_dataset[score_cols].to_numpy().T

    alpha = krippendorff.alpha(
        reliability_data=score_matrix,
        level_of_measurement=level_of_measurement,
    )

    return round(alpha, rounding)

In [9]:
# Load the per-rater annotation file and report the agreement between the human raters.
raters_csv_path = os.path.join(data_dir, "hanna_annotations_raters.csv")
rater_dataset = pd.read_csv(raters_csv_path)

human_krippendorff = calculate_human_krippendorff(rater_dataset)
print(f"Human Krippendorff's Alpha: {human_krippendorff}")

Human Krippendorff's Alpha: 0.538


In [10]:

def calculate_metric_correlations(human_scores, candidate_base, candidate_refined, n_bootsraps=1000, significance=0.05, rounding=3, random_state=None):
    """Spearman correlation of two candidate score series against human scores, with bootstrap CIs.

    The point estimates are computed on the full data. The confidence intervals
    are percentile intervals over ``n_bootsraps`` resamples of the rows (drawn
    with replacement, same size as the input); each resample keeps the three
    series paired, so both candidates are evaluated on the same rows.

    Args:
        human_scores: Sequence of human reference scores.
        candidate_base: Sequence of baseline candidate scores, same length.
        candidate_refined: Sequence of refined candidate scores, same length.
        n_bootsraps: Number of bootstrap resamples (parameter name kept as-is
            for backward compatibility). Must be at least 1.
        significance: Two-sided significance level; 0.05 yields a 95% interval.
            Must be strictly between 0 and 1.
        rounding: Decimal places applied to every returned number.
        random_state: ``None`` to draw from NumPy's global random state (so a
            prior ``np.random.seed`` call is honoured), an int seed, or an
            existing ``np.random.RandomState`` instance.

    Returns:
        ``(base_corr, (base_ci_lower, base_ci_upper),
        refined_corr, (refined_ci_lower, refined_ci_upper))``.

    Raises:
        ValueError: If any input is empty, the lengths differ, ``n_bootsraps``
            is below 1, or ``significance`` is outside (0, 1).
    """
    if len(human_scores) == 0 or len(candidate_base) == 0 or len(candidate_refined) == 0:
        raise ValueError("Input series must not be empty.")

    if len(human_scores) != len(candidate_base) or len(human_scores) != len(candidate_refined):
        raise ValueError(
            "Input series must have the same length. "
            f"human_scores={len(human_scores)}, candidate_base={len(candidate_base)}, "
            f"candidate_refined={len(candidate_refined)}"
        )

    if n_bootsraps < 1:
        raise ValueError("n_bootsraps must be at least 1.")

    if not 0 < significance < 1:
        raise ValueError("significance must be strictly between 0 and 1.")

    # One random source for the whole bootstrap, so a single seed fixes every resample.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    base_corr, _ = spearmanr(human_scores, candidate_base)
    refined_corr, _ = spearmanr(human_scores, candidate_refined)

    n_observations = len(human_scores)
    # Stack column-wise so that indexing rows resamples all three series together.
    data_pairs = np.column_stack((human_scores, candidate_base, candidate_refined))

    bootstrap_base_corrs = np.empty(n_bootsraps)
    bootstrap_refined_corrs = np.empty(n_bootsraps)

    for i in range(n_bootsraps):
        row_idx = rng.randint(0, n_observations, size=n_observations)
        sample = data_pairs[row_idx]

        bootstrap_base_corrs[i] = spearmanr(sample[:, 0], sample[:, 1])[0]
        bootstrap_refined_corrs[i] = spearmanr(sample[:, 0], sample[:, 2])[0]

    lower_q = 100 * significance / 2
    upper_q = 100 * (1 - significance / 2)

    # A resample in which one column is constant has an undefined (NaN) correlation;
    # nanpercentile skips those replicates instead of turning the whole interval into NaN.
    ci_lower_base, ci_upper_base = np.nanpercentile(bootstrap_base_corrs, [lower_q, upper_q])
    ci_lower_refined, ci_upper_refined = np.nanpercentile(bootstrap_refined_corrs, [lower_q, upper_q])

    base_corr = round(base_corr, rounding)
    ci_lower_base = round(ci_lower_base, rounding)
    ci_upper_base = round(ci_upper_base, rounding)
    refined_corr = round(refined_corr, rounding)
    ci_lower_refined = round(ci_lower_refined, rounding)
    ci_upper_refined = round(ci_upper_refined, rounding)

    return base_corr, (ci_lower_base, ci_upper_base), refined_corr, (ci_lower_refined, ci_upper_refined)
    
def evaluate_model_result(model_name, prefix, anno_type="average", score_column="consistency"):
    """Load human, baseline and refined scores for one model and report their correlations.

    Reads three CSV files from ``data_dir``:
    ``{prefix}_annotations_{anno_type}.csv``,
    ``{prefix}_baseline_output_{model_name}.csv`` and
    ``{prefix}_combined_output_{model_name}.csv``, takes ``score_column`` from
    each, and passes the three series to ``calculate_metric_correlations``
    with ``significance=0.05``. Both results are printed.

    The three files are paired row-by-row by position (the index column is
    read but not used for alignment), so they must list items in the same order.

    Args:
        model_name: Model identifier used in the two model-output file names.
        prefix: Dataset prefix used in all three file names.
        anno_type: Annotation variant used in the human annotation file name.
        score_column: Column to read from every file. Defaults to ``"consistency"``.

    Returns:
        ``(base_corr, refined_corr, (base_ci_lower, base_ci_upper),
        (refined_ci_lower, refined_ci_upper))``.
    """
    def load_scores(file_stem):
        # The first CSV column is read as the index; only the score values are kept.
        csv_path = os.path.join(data_dir, f"{file_stem}.csv")
        return pd.read_csv(csv_path, index_col=0)[score_column].values

    human_scores = load_scores(f"{prefix}_annotations_{anno_type}")
    base_scores = load_scores(f"{prefix}_baseline_output_{model_name}")
    refined_scores = load_scores(f"{prefix}_combined_output_{model_name}")

    base_corr, base_ci, refined_corr, refined_ci = calculate_metric_correlations(
        human_scores, base_scores, refined_scores, significance=0.05
    )
    base_lower_ci, base_upper_ci = base_ci
    refined_lower_ci, refined_upper_ci = refined_ci

    print(f"Base model correlation: {base_corr} ({base_lower_ci}-{base_upper_ci})")
    print(f"Refined model correlation: {refined_corr} ({refined_lower_ci}-{refined_upper_ci})")
    return base_corr, refined_corr, (base_lower_ci, base_upper_ci), (refined_lower_ci, refined_upper_ci)

In [11]:
# The bootstrap resampling is called without an explicit random_state, so it draws from
# NumPy's global random state. Seed it here so the confidence intervals printed below
# are repeatable on a fresh Restart & Run All.
np.random.seed(42)

# Models whose baseline and refined ("combined") outputs are compared against the
# averaged human annotations.
model_names = [
    "gemini-structured",
    "gemini-15-structured",
    "gemini-lite-structured",
    "deepseek-structured",
]
for model_name in model_names:
    print(f"Evaluating model: {model_name}")
    evaluate_model_result(model_name, "hanna", anno_type="average")
    print("\n")

Evaluating model: gemini-structured
Base model correlation: 0.293 (0.228-0.354)
Refined model correlation: 0.304 (0.243-0.36)


Evaluating model: gemini-15-structured
Base model correlation: 0.43 (0.371-0.488)
Refined model correlation: 0.358 (0.294-0.414)


Evaluating model: gemini-lite-structured
Base model correlation: 0.428 (0.371-0.48)
Refined model correlation: 0.359 (0.306-0.414)


Evaluating model: deepseek-structured
Base model correlation: 0.423 (0.366-0.478)
Refined model correlation: 0.318 (0.264-0.377)


