In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import krippendorff

from scipy.stats import kendalltau
from sklearn.utils import resample

# Project root. Defaults to the original author's checkout, but can be pointed
# elsewhere by setting the ELLMRPCTFVIS_DEV_DIR environment variable, so the
# notebook can run on another machine without editing this cell.
cur_dir = os.environ.get(
    "ELLMRPCTFVIS_DEV_DIR",
    "C:/Users/Shavius/Documents/Uni/Year 4/Project/ELLMRPCTFVIS/dev",
)
# Folder holding the annotation and model-output CSV files read by this notebook.
data_dir = os.path.join(cur_dir, "test_data")

In [2]:
# Calculate Inter Rater Agreement

def calculate_human_krippendorff(rater_dataset):
    """Return Krippendorff's alpha (interval level) across the human raters.

    Every column of ``rater_dataset`` whose name begins with ``human_score_``
    is selected, and the selected block is transposed before being handed to
    ``krippendorff.alpha`` as ``reliability_data``.

    Args:
        rater_dataset: DataFrame containing the ``human_score_*`` columns.

    Returns:
        The alpha coefficient rounded to three decimal places.
    """
    rater_columns = []
    for column_name in rater_dataset.columns:
        if column_name.startswith("human_score_"):
            rater_columns.append(column_name)

    # Transpose so each score column becomes one row of the reliability matrix.
    ratings = rater_dataset[rater_columns].values
    ratings_by_rater = ratings.T

    alpha = krippendorff.alpha(
        reliability_data=ratings_by_rater,
        level_of_measurement="interval",
    )
    return round(alpha, 3)

In [3]:
# Load the per-rater HANNA annotation file and report the agreement between the
# human raters (every human_score_* column is passed to the alpha computation).
# NOTE(review): the recorded output of this cell is negative (-0.055). A negative
# alpha means the raters disagree more than chance would predict, so it is worth
# confirming that the human_score_* columns really line up per rated item.
rater_dataset = pd.read_csv(os.path.join(data_dir, "hanna_annotations_raters.csv"))
human_krippendorff = calculate_human_krippendorff(rater_dataset)
print(f"Human Krippendorff's Alpha: {human_krippendorff}")

Human Krippendorff's Alpha: -0.055


In [4]:

def calculate_metric_correlations(human_scores, candidate_base, candidate_refined, n_bootsraps=1000, significance=0.05, rounding=3, random_state=None):
    """Correlate two candidate metrics with human scores and bootstrap their spread.

    Kendall's tau is computed between ``human_scores`` and each candidate on the
    full data. The three inputs are then resampled row-wise with replacement
    ``n_bootsraps`` times; the percentile interval of the resampled taus gives
    the reported spread.

    Args:
        human_scores: Sequence of reference scores.
        candidate_base: Sequence of baseline scores, same length as ``human_scores``.
        candidate_refined: Sequence of refined scores, same length as ``human_scores``.
        n_bootsraps: Number of bootstrap resamples (spelling kept for
            compatibility with existing callers).
        significance: Two-sided significance level; the interval spans the
            ``significance / 2`` and ``1 - significance / 2`` percentiles.
        rounding: Number of decimals applied to every returned value.
        random_state: ``None`` (draw from NumPy's global legacy random state, so
            a prior ``np.random.seed`` still takes effect), an ``int`` seed, or
            an ``np.random.RandomState`` instance. Pass a seed to make the
            interval reproducible.

    Returns:
        Tuple ``(base_corr, ci_range_base, refined_corr, ci_range_refined)``
        where each ``ci_range_*`` is half the width of the percentile interval.

    Raises:
        ValueError: If any input is empty or the lengths differ.
    """
    n_observations = len(human_scores)
    if n_observations == 0 or len(candidate_base) == 0 or len(candidate_refined) == 0:
        raise ValueError("Input series must not be empty.")

    if n_observations != len(candidate_base) or n_observations != len(candidate_refined):
        raise ValueError(f"Input series must have the same length.{len(human_scores)} != {len(candidate_base)}, {len(human_scores)} != {len(candidate_refined)}")

    base_corr, _ = kendalltau(human_scores, candidate_base)
    refined_corr, _ = kendalltau(human_scores, candidate_refined)

    # Keep the three series side by side so that one row index resamples all of
    # them together and the pairing between scores is preserved.
    data_pairs = np.column_stack((human_scores, candidate_base, candidate_refined))

    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    bootstrap_base_corrs = []
    bootstrap_refined_corrs = []

    for _ in range(n_bootsraps):
        rows = rng.randint(0, n_observations, size=n_observations)
        sample = data_pairs[rows]

        boot_base_corr, _ = kendalltau(sample[:, 0], sample[:, 1])
        bootstrap_base_corrs.append(boot_base_corr)

        boot_refined_corr, _ = kendalltau(sample[:, 0], sample[:, 2])
        bootstrap_refined_corrs.append(boot_refined_corr)

    lower_q = 100 * significance / 2
    upper_q = 100 * (1 - significance / 2)

    # A resample in which one column is constant has an undefined tau (NaN).
    # nanpercentile ignores those draws instead of letting a single NaN turn the
    # whole interval into NaN, which is likely when there are few observations.
    ci_lower_base, ci_upper_base = np.nanpercentile(bootstrap_base_corrs, [lower_q, upper_q])
    ci_lower_refined, ci_upper_refined = np.nanpercentile(bootstrap_refined_corrs, [lower_q, upper_q])

    base_corr = round(base_corr, rounding)
    ci_range_base = round((ci_upper_base - ci_lower_base) / 2, rounding)
    refined_corr = round(refined_corr, rounding)
    ci_range_refined = round((ci_upper_refined - ci_lower_refined) / 2, rounding)

    return base_corr, ci_range_base, refined_corr, ci_range_refined
    
def evaluate_model_result(model_name, prefix, anno_type="average"):
    """Print and return the correlations for one model's scores.

    Reads the ``consistency`` column from the annotation file, the baseline
    output file and the combined output file under ``data_dir`` and passes the
    three arrays to ``calculate_metric_correlations``.

    NOTE(review): the three arrays are paired by row position, not by index, so
    this presumably relies on all three CSV files listing the same items in the
    same order - confirm against the data files.

    Args:
        model_name: Model identifier used in the output file names.
        prefix: Dataset prefix used in all three file names.
        anno_type: Annotation variant used in the annotation file name.

    Returns:
        Tuple ``(base_corr, refined_corr)``.
    """
    def _consistency_values(file_name):
        # First CSV column is the index; only the "consistency" scores are kept.
        table = pd.read_csv(os.path.join(data_dir, file_name), index_col=0)
        return table["consistency"].values

    human_scores = _consistency_values(f"{prefix}_annotations_{anno_type}.csv")
    base_scores = _consistency_values(f"{prefix}_baseline_output_{model_name}.csv")
    refined_scores = _consistency_values(f"{prefix}_combined_output_{model_name}.csv")

    correlations = calculate_metric_correlations(human_scores, base_scores, refined_scores, significance=0.05)
    base_corr, base_ci, refined_corr, refined_ci = correlations

    print(f"Base model correlation: {base_corr}±{base_ci}")
    print(f"Refined model correlation: {refined_corr}±{refined_ci}")
    return base_corr, refined_corr

def evaluate_model_results_system_level(model_name, prefix, anno_type="average"):
    """Print and return correlations computed on per-system mean scores.

    The human, baseline and refined ``consistency`` scores are joined on the CSV
    index together with the ``model`` label from the story-models file, averaged
    per ``model``, and the per-model means are passed to
    ``calculate_metric_correlations``.

    Only the columns needed for this computation are kept from each file, so
    unrelated columns (including non-numeric ones, which ``groupby(...).mean()``
    cannot average) no longer take part in the merge or the aggregation.

    Args:
        model_name: Model identifier used in the output file names.
        prefix: Dataset prefix used in all file names.
        anno_type: Annotation variant used in the annotation file name.

    Returns:
        Tuple ``(base_corr, refined_corr)`` at system level.
    """
    def _load_column(file_name, source_column, target_column):
        # First CSV column is the index the frames are joined on.
        frame = pd.read_csv(os.path.join(data_dir, file_name), index_col=0)
        return frame[[source_column]].rename(columns={source_column: target_column})

    annotation = _load_column(f"{prefix}_annotations_{anno_type}.csv", "consistency", "human_score")
    base_scores = _load_column(f"{prefix}_baseline_output_{model_name}.csv", "consistency", "base_score")
    refined_scores = _load_column(f"{prefix}_combined_output_{model_name}.csv", "consistency", "refined_score")
    models = _load_column(f"{prefix}_story_models.csv", "model", "model")

    # Inner joins on the index: only items present in every file are kept.
    per_item = annotation
    for frame in (base_scores, refined_scores, models):
        per_item = per_item.merge(frame, left_index=True, right_index=True)

    score_columns = ["human_score", "base_score", "refined_score"]
    per_system = per_item.groupby("model")[score_columns].mean().reset_index()

    human_scores = per_system["human_score"].values
    base_means = per_system["base_score"].values
    refined_means = per_system["refined_score"].values

    base_corr, base_ci, refined_corr, refined_ci = calculate_metric_correlations(human_scores, base_means, refined_means, significance=0.05)

    print(f"Base model correlation: {base_corr}±{base_ci}")
    print(f"Refined model correlation: {refined_corr}±{refined_ci}")
    return base_corr, refined_corr

def evaluate_model_results_different_raters(model_name, prefix, rater_ids=range(1, 5)):
    """Print and return per-rater correlations for the baseline and refined scores.

    For each rater id, the ``human_score_<id>`` column of the raters file is
    joined on the CSV index with the baseline and refined ``consistency``
    scores, rows without a score from that rater are dropped, and the result is
    passed to ``calculate_metric_correlations``. An undefined (NaN) correlation
    is recorded as ``0.0``.

    Args:
        model_name: Model identifier used in the output file names.
        prefix: Dataset prefix used in all file names.
        rater_ids: Ids appended to ``human_score_`` to select rater columns.
            Defaults to raters 1-4.

    Returns:
        Tuple ``(base_corrs, mean_base_corr, refined_corrs, mean_refined_corr)``.
        The lists hold one value per rater that had more than one scored item;
        if no rater qualifies, the lists are empty and both means are NaN.
    """
    human_scores = pd.read_csv(os.path.join(data_dir, f"{prefix}_annotations_raters.csv"), index_col=0)
    # Keep only the score column from each output file so unrelated columns do
    # not take part in the joins below.
    base_scores = (
        pd.read_csv(os.path.join(data_dir, f"{prefix}_baseline_output_{model_name}.csv"), index_col=0)[["consistency"]]
        .rename(columns={"consistency": "consistency_x"})
    )
    refined_scores = (
        pd.read_csv(os.path.join(data_dir, f"{prefix}_combined_output_{model_name}.csv"), index_col=0)[["consistency"]]
        .rename(columns={"consistency": "consistency_y"})
    )

    base_corrs = []
    refined_corrs = []

    for i in rater_ids:
        rater_column = f"human_score_{i}"
        rater_scores = (
            human_scores[[rater_column]]
            .merge(base_scores, left_index=True, right_index=True)
            .merge(refined_scores, left_index=True, right_index=True)
            .dropna(subset=[rater_column])
        )
        if len(rater_scores) <= 1:
            print(f"Not enough data for rater {i}. Skipping...")
            continue

        # Only the point estimates are used here; the bootstrap ranges are discarded.
        base_corr, _, refined_corr, _ = calculate_metric_correlations(
            rater_scores[rater_column].values,
            rater_scores["consistency_x"].values,
            rater_scores["consistency_y"].values,
            significance=0.05
        )

        # float() accepts both NumPy scalars and plain Python floats.
        base_corrs.append(0.0 if np.isnan(base_corr) else float(base_corr))
        refined_corrs.append(0.0 if np.isnan(refined_corr) else float(refined_corr))

    if not base_corrs:
        # max/min of an empty list would raise, so report the situation instead.
        print("No rater had enough data to compute correlations.")
        return base_corrs, float("nan"), refined_corrs, float("nan")

    mean_base_corr = round(np.mean(base_corrs), 3)
    mean_refined_corr = round(np.mean(refined_corrs), 3)
    print(f"Base model rater correlations: mean: {mean_base_corr} max: {np.max(base_corrs)} min: {np.min(base_corrs)}")
    print(f"Refined model rater correlations: mean: {mean_refined_corr} max: {np.max(refined_corrs)} min: {np.min(refined_corrs)}")
    return base_corrs, mean_base_corr, refined_corrs, mean_refined_corr
    
    

In [5]:
# Run the three evaluations for every model on the "hanna" files.
# Each iteration prints, in this order:
#   1. two lines from evaluate_model_result (per-item correlations),
#   2. two lines from evaluate_model_results_system_level (per-system means) -
#      these use the same "Base/Refined model correlation" wording as (1),
#   3. two lines from evaluate_model_results_different_raters (per-rater summary).
# The ± values come from an unseeded bootstrap, so they can differ between runs.
model_names = ["gemini-structured", "gemini-15-structured", "gemini-lite-structured", "deepseek-structured"]
for model_name in model_names:
    print(f"Evaluating model: {model_name}")
    evaluate_model_result(model_name, "hanna", anno_type="average")
    evaluate_model_results_system_level(model_name, "hanna", anno_type="average")
    evaluate_model_results_different_raters(model_name, "hanna")
    print("\n")

Evaluating model: gemini-structured
Base model correlation: 0.224±0.054
Refined model correlation: 0.232±0.047
Base model correlation: 0.2±0.626
Refined model correlation: 0.164±0.608
Base model rater correlations: mean: 0.231 max: 0.309 min: 0.059
Refined model rater correlations: mean: 0.229 max: 0.3 min: 0.077


Evaluating model: gemini-15-structured
Base model correlation: 0.378±0.049
Refined model correlation: 0.271±0.051
Base model correlation: 0.6±0.303
Refined model correlation: 0.514±0.446
Base model rater correlations: mean: 0.321 max: 0.39 min: 0.214
Refined model rater correlations: mean: 0.266 max: 0.334 min: 0.124


Evaluating model: gemini-lite-structured
Base model correlation: 0.337±0.051
Refined model correlation: 0.3±0.05
Base model correlation: 0.537±0.429
Refined model correlation: 0.564±0.308
Base model rater correlations: mean: 0.299 max: 0.391 min: 0.124
Refined model rater correlations: mean: 0.283 max: 0.34 min: 0.14


Evaluating model: deepseek-structured
Bas