# Agreement Analysis

## Label Distribution Analysis for Summarisation Relevance Judgements

In [1]:
from helpers import *

In [2]:
# Shared imports, plus helpers for reporting agreement scores and plotting label distributions
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


def report_agreement_scores(dataset_name, model_name, scenario_name):
    """Score one model's judgements against the human qrels, full vs summarised.

    The LLM qrels paths follow the
    ``{model_name}_umbrella_zeroshot_qrels_{scenario}.txt`` naming used by the
    other cells of this notebook, so ``model_name`` and ``scenario_name``
    select which files are scored (previously both were ignored and fixed
    GPT-4o files were always read).

    Args:
        dataset_name: Dataset slug used in the data directory name,
            e.g. ``"dl-2019"``.
        model_name: Model prefix of the qrels files, e.g. ``"gpt-4o"``.
        scenario_name: Summarisation scenario compared against ``fulldocs``,
            e.g. ``"summarisation_80tokens"``.

    Returns:
        A DataFrame indexed by ``type`` with the rows ``full`` and
        ``summarised``; the columns are whatever ``compute_agreement_metrics``
        returns for each file pair.
    """
    qrels_dir = f"../data/msmarco-passage-trec-{dataset_name}-judged/qrels"
    human_qrels = f"{qrels_dir}/trec.qrels.txt"
    llm_summ_qrels = f"{qrels_dir}/{model_name}_umbrella_zeroshot_qrels_{scenario_name}.txt"
    llm_full_qrels = f"{qrels_dir}/{model_name}_umbrella_zeroshot_qrels_fulldocs.txt"

    # NOTE(review): assumes compute_agreement_metrics returns a mutable mapping
    # (one record per call) -- confirm against helpers.
    results_full = compute_agreement_metrics(human_qrels, llm_full_qrels)
    results_full['type'] = 'full'
    results_summ = compute_agreement_metrics(human_qrels, llm_summ_qrels)
    results_summ['type'] = 'summarised'

    return pd.DataFrame([results_full, results_summ]).set_index('type').sort_index()

def plot_label_distribution(dataset_name, scenario_name):
    """Plot relevance-label counts for human, full-document and summary judgements.

    Reads the human qrels, the GPT-4o ``fulldocs`` qrels and the GPT-4o qrels
    for ``scenario_name`` with ``read_qrels`` and draws one group of three
    bars per relevance label found in any of the three files.

    Args:
        dataset_name: Dataset slug used in the data directory name,
            e.g. ``"dl-2019"``. It is also shown in the figure title.
        scenario_name: Scenario suffix of the summary qrels file,
            e.g. ``"summarisation_80tokens"``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    qrels_dir = f"../data/msmarco-passage-trec-{dataset_name}-judged/qrels"
    human_qrels = f"{qrels_dir}/trec.qrels.txt"
    gpt4o_summ_qrels = f"{qrels_dir}/gpt-4o_umbrella_zeroshot_qrels_{scenario_name}.txt"
    gpt4o_full_qrels = f"{qrels_dir}/gpt-4o_umbrella_zeroshot_qrels_fulldocs.txt"

    # Read qrels using the existing helper (each frame exposes a "rel" column).
    h_df = read_qrels(human_qrels)
    f_df = read_qrels(gpt4o_full_qrels)
    s_df = read_qrels(gpt4o_summ_qrels)

    # Every label appearing in any source gets a slot on the x-axis.
    all_labels = sorted(
        set(h_df["rel"].unique()) | set(f_df["rel"].unique()) | set(s_df["rel"].unique())
    )

    series = [
        ("Human", h_df, "tab:blue"),
        ("LLM Full", f_df, "tab:orange"),
        ("LLM Summary", s_df, "tab:green"),
    ]

    x = list(range(len(all_labels)))
    width = 0.25

    fig, ax = plt.subplots(figsize=(9, 5))
    for slot, (name, frame, colour) in enumerate(series):
        counts = frame["rel"].value_counts().to_dict()
        # Labels a source never used are drawn as zero-height bars.
        heights = [counts.get(lbl, 0) for lbl in all_labels]
        # Shift the three series to the left / centre / right of each tick.
        positions = [i + (slot - 1) * width for i in x]
        ax.bar(positions, heights, width=width, label=name, color=colour)

    ax.set_xticks(x)
    ax.set_xticklabels([str(l) for l in all_labels])
    ax.set_xlabel("Relevance label")
    ax.set_ylabel("Count")
    # dataset_name is a parameter, so it is always available here; the old
    # `'dataset_name' in globals()` test made the suffix depend on unrelated
    # notebook state.
    ax.set_title(f"Label distribution: Human vs LLM Full vs LLM Summary ({dataset_name})")
    ax.legend()
    ax.grid(axis="y", linestyle="--", alpha=0.3)
    plt.tight_layout()
    plt.show()

In [3]:

def plot_dl_grouped_bars(df: pd.DataFrame,
                         full_label: str = "full",
                         summarised_label: str = "summarised",
                         title: str | None = None,
                         ylabel: str = "Score") -> None:
    """Plot grouped bars where x-axis are score columns and groups are full vs summarised.
    Expects df indexed by [full_label, summarised_label] with numeric score columns.
    """
    if df is None or not isinstance(df, pd.DataFrame):
        raise ValueError("df must be a pandas DataFrame")
    # Ensure the required rows exist
    idx_lower = df.index.astype(str).str.lower()
    have_full = any(idx_lower == full_label.lower())
    have_sum = any(idx_lower == summarised_label.lower())
    if not (have_full and have_sum):
        raise ValueError(f"DataFrame index must contain rows '{full_label}' and '{summarised_label}'.")
    # Reindex to enforce order and filter numeric score columns
    df2 = df.copy()
    # Align exact casing
    def _find_label(label: str) -> str:
        m = [i for i in df2.index if str(i).lower() == label.lower()]
        return m[0] if m else label
    full_row = _find_label(full_label)
    sum_row = _find_label(summarised_label)
    # Keep only numeric columns (scores)
    score_cols = [c for c in df2.columns if pd.api.types.is_numeric_dtype(df2[c])]
    if len(score_cols) == 0:
        raise ValueError("No numeric score columns found to plot.")
    sub = df2.loc[[full_row, sum_row], score_cols]
    # Plot
    n = len(score_cols)
    x = np.arange(n)
    width = 0.42
    full_vals = sub.loc[full_row].to_numpy(dtype=float)
    sum_vals = sub.loc[sum_row].to_numpy(dtype=float)
    fig, ax = plt.subplots(figsize=(max(6, n * 0.9), 4.8))
    bars_full = ax.bar(x - width/2, full_vals, width, label=full_label)
    bars_sum = ax.bar(x + width/2, sum_vals, width, label=summarised_label)

    # Annotate values on top of bars
    def _annotate(bars):
        for rect in bars:
            height = rect.get_height()
            ax.annotate(f"{height:.3f}",
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha="center", va="bottom", fontsize=9)
    _annotate(bars_full)
    _annotate(bars_sum)

    ax.set_xticks(x)
    ax.set_xticklabels(score_cols, rotation=30, ha="right")
    ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    ax.legend()
    ax.grid(axis="y", linestyle=":", alpha=0.4)
    fig.tight_layout()
    plt.show()



### Distributions


In [4]:
from itertools import product

datasets = ['dl-2019', 'dl-2020','rag-2024']
scenarios = ['summarisation_80tokens', 'summarisation_120tokens', 'fulldocs']
models = ['gpt-4o', 'llama3.18b']

# Accumulators are created before the loop so the cell gives the same result
# on every run, instead of relying on a "first iteration" check inside it.
_ratio_rows = []
_ratio_labels = set()

for dataset_name, scenario_name, model_name in product(datasets, scenarios, models):
    filename = f'../data/msmarco-passage-trec-{dataset_name}-judged/qrels/{model_name}_umbrella_zeroshot_qrels_{scenario_name}.txt'
    # Whitespace-separated qrels: query id, unused "Q0" column, doc id, label.
    qrel_df = pd.read_csv(filename,
        sep=r"\s+",
        engine="python",
        header=None,
        names=["query_id", "_q0", "doc_id", "rel"],
        dtype={"query_id": str, "doc_id": str},
    )

    # Share of judgements per relevance label, keyed by integer label.
    dist = qrel_df["rel"].value_counts(normalize=True)
    dist.index = dist.index.astype(int)

    row = {"dataset_name": dataset_name, "scenario_name": scenario_name, "model_name": model_name}
    for lbl, prop in dist.sort_index().items():
        row[f"{lbl}"] = f"{prop:.3f}"
    _ratio_rows.append(row)
    _ratio_labels.update(dist.index.tolist())

# One column per label seen in any file. A label that a particular file never
# used is reported as "0.000" for that row; previously such cells were left as
# NaN because the default was only applied when the whole column was missing.
id_cols = ["dataset_name", "scenario_name", "model_name"]
label_cols = [f"{lbl}" for lbl in sorted(_ratio_labels)]
label_ratio_df = (
    pd.DataFrame(_ratio_rows)
    .reindex(columns=id_cols + label_cols)
    .fillna({col: "0.000" for col in label_cols})
)
display(label_ratio_df)


Unnamed: 0,dataset_name,scenario_name,model_name,0,1,2,3
0,dl-2019,summarisation_80tokens,gpt-4o,0.5,0.314,0.099,0.087
1,dl-2019,summarisation_80tokens,llama3.18b,0.554,0.018,0.307,0.121
2,dl-2019,summarisation_120tokens,gpt-4o,0.497,0.316,0.1,0.087
3,dl-2019,summarisation_120tokens,llama3.18b,0.552,0.022,0.302,0.124
4,dl-2019,fulldocs,gpt-4o,0.466,0.332,0.111,0.091
5,dl-2019,fulldocs,llama3.18b,0.465,0.042,0.311,0.182
6,dl-2020,summarisation_80tokens,gpt-4o,0.606,0.253,0.076,0.065
7,dl-2020,summarisation_80tokens,llama3.18b,0.3,0.121,0.477,0.102
8,dl-2020,summarisation_120tokens,gpt-4o,0.598,0.263,0.074,0.065
9,dl-2020,summarisation_120tokens,llama3.18b,0.627,0.025,0.264,0.084


In [5]:
# Label ratios for llama3.18b judging 80-token summaries, one row per dataset.
label_ratio_df.loc[
    lambda frame: frame['model_name'].eq('llama3.18b')
    & frame['scenario_name'].eq('summarisation_80tokens')
]

Unnamed: 0,dataset_name,scenario_name,model_name,0,1,2,3
1,dl-2019,summarisation_80tokens,llama3.18b,0.554,0.018,0.307,0.121
7,dl-2020,summarisation_80tokens,llama3.18b,0.3,0.121,0.477,0.102
13,rag-2024,summarisation_80tokens,llama3.18b,0.133,0.231,0.45,0.187


### Official Ratios

In [6]:
# Label proportions of the human (trec.qrels.txt) judgements, shown as one
# single-row table per dataset.
datasets = ['dl-2019', 'dl-2020','rag-2024']
for dataset_name in datasets:
    filename = f'../data/msmarco-passage-trec-{dataset_name}-judged/qrels/trec.qrels.txt'
    qrel_df = pd.read_csv(
        filename,
        header=None,
        names=["query_id", "_q0", "doc_id", "rel"],
        dtype={"query_id": str, "doc_id": str},
        sep=r"\s+",
        engine="python",
    )
    # Share of judgements per relevance label, formatted to three decimals
    # and ordered by label.
    dist = qrel_df["rel"].value_counts(normalize=True)
    row = {
        "dataset_name": dataset_name,
        **{f"{lbl}": f"{prop:.3f}" for lbl, prop in dist.sort_index().items()},
    }
    display(pd.DataFrame(row, index=[0]))

Unnamed: 0,dataset_name,0,1,2,3
0,dl-2019,0.557,0.173,0.195,0.075


Unnamed: 0,dataset_name,0,1,2,3
0,dl-2020,0.683,0.17,0.09,0.057


Unnamed: 0,dataset_name,0,1,2,3
0,rag-2024,0.373,0.311,0.23,0.086


## Agreement Score Analysis

In [7]:
from itertools import product

pd.set_option('display.float_format', lambda x: f"{x:.5f}")
datasets = ['dl-2019', 'dl-2020','rag-2024']
scenarios = ['summarisation_80tokens', 'summarisation_120tokens', 'fulldocs']
models = ['gpt-4o', 'llama3.18b']

_ID_COLS = ["query_id", "doc_id"]


def _kripp(cac_obj):
    """Return Krippendorff's alpha from a CAC object, or NaN if unavailable.

    The method name is probed because it may vary between irrCAC versions;
    a candidate that raises is skipped and the next one is tried.
    """
    for meth in ("krippendorff", "krippendorff_alpha", "alpha"):
        if hasattr(cac_obj, meth):
            try:
                res = getattr(cac_obj, meth)()
                return float(res["est"]["coefficient_value"])
            except Exception:
                continue
    return float("nan")


def _judgements(qrel_df):
    """Return one row per (query_id, doc_id) with string ids and an integer label."""
    return (
        qrel_df[_ID_COLS + ["rel"]]
        .astype({"query_id": str, "doc_id": str, "rel": int})
        .drop_duplicates(subset=_ID_COLS, keep="first")
    )


agreement_rows = []
for dataset_name in datasets:
    qrels_dir = f'../data/msmarco-passage-trec-{dataset_name}-judged/qrels'
    # The human qrels only depend on the dataset, so read them once per dataset.
    human_qrel_df = read_qrels(f"{qrels_dir}/trec.qrels.txt")
    human_judgements = _judgements(human_qrel_df)

    for scenario_name, model_name in product(scenarios, models):
        llm_qrel_df = pd.read_csv(f'{qrels_dir}/{model_name}_umbrella_zeroshot_qrels_{scenario_name}.txt',
            sep=r"\s+",
            engine="python",
            header=None,
            names=["query_id", "_q0", "doc_id", "rel"],
            dtype={"query_id": str, "doc_id": str},
        )

        # Pair judgements by (query_id, doc_id) instead of by row position, so
        # a reordered, shorter or longer LLM file cannot compare labels that
        # belong to different query/document pairs.
        paired = human_judgements.merge(
            _judgements(llm_qrel_df),
            on=_ID_COLS,
            how="inner",
            suffixes=("_human", "_llm"),
        )
        if len(paired) != len(human_qrel_df) or len(paired) != len(llm_qrel_df):
            print(
                f"[warn] ID mismatch for {dataset_name} | {model_name} | {scenario_name} "
                f"(scored {len(paired)} shared pairs; human={len(human_qrel_df)}, llm={len(llm_qrel_df)})"
            )
        if paired.empty:
            raise ValueError(
                f"No shared (query_id, doc_id) pairs for {dataset_name} | {model_name} | {scenario_name}"
            )

        h_rel = paired["rel_human"].to_numpy()
        m_rel = paired["rel_llm"].to_numpy()

        # Two-rater tables for irrCAC: graded labels, and binary with threshold >= 1.
        df_graded = pd.DataFrame({"r1": h_rel, "r2": m_rel})
        df_binary = pd.DataFrame({"r1": (h_rel >= 1).astype(int), "r2": (m_rel >= 1).astype(int)})

        # Binary metrics
        CAC_bin = CAC(df_binary)
        binary_cohen_kappa = sk_cohen_kappa(df_binary["r1"], df_binary["r2"])
        binary_gwet = float(CAC_bin.gwet()["est"]["coefficient_value"])
        binary_kripp = _kripp(CAC_bin)

        # Graded metrics
        CAC_grad = CAC(df_graded)
        graded_fleiss_kappa = float(CAC_grad.fleiss()["est"]["coefficient_value"])
        graded_gwet = float(CAC_grad.gwet()["est"]["coefficient_value"])
        graded_kripp = _kripp(CAC_grad)
        graded_weighted_kappa = sk_cohen_kappa(df_graded["r1"], df_graded["r2"], weights="quadratic")

        agreement_rows.append({
            "dataset_name": dataset_name,
            "scenario_name": scenario_name,
            "model_name": model_name,
            "binary_cohen_kappa": float(binary_cohen_kappa),
            "binary_gwet": float(binary_gwet),
            "binary_krippendorff_alpha": float(binary_kripp),
            "graded_fleiss_kappa": float(graded_fleiss_kappa),
            "graded_gwet": float(graded_gwet),
            "graded_krippendorff_alpha": float(graded_kripp),
            "graded_weighted_kappa": float(graded_weighted_kappa)
        })

# Build the table once, after every combination has been scored.
agreement_df = pd.DataFrame(agreement_rows)
display(agreement_df)
        

Unnamed: 0,dataset_name,scenario_name,model_name,binary_cohen_kappa,binary_gwet,binary_krippendorff_alpha,graded_fleiss_kappa,graded_gwet,graded_krippendorff_alpha,graded_weighted_kappa
0,dl-2019,summarisation_80tokens,gpt-4o,0.52352,0.5251,0.522,0.33106,0.46316,0.3311,0.58258
1,dl-2019,summarisation_80tokens,llama3.18b,0.31972,0.33624,0.31975,0.19336,0.3826,0.1934,0.36232
2,dl-2019,summarisation_120tokens,gpt-4o,0.51801,0.51913,0.51631,0.32591,0.45802,0.32595,0.57403
3,dl-2019,summarisation_120tokens,llama3.18b,0.31101,0.32726,0.31103,0.18783,0.37653,0.18787,0.34956
4,dl-2019,fulldocs,gpt-4o,0.51666,0.51323,0.51272,0.33954,0.45816,0.33957,0.5819
5,dl-2019,fulldocs,llama3.18b,0.30081,0.29559,0.29494,0.15533,0.30351,0.15537,0.33507
6,dl-2020,summarisation_80tokens,gpt-4o,0.49003,0.56589,0.48671,0.34752,0.58082,0.34755,0.56092
7,dl-2020,summarisation_80tokens,llama3.18b,0.21513,0.1002,0.09973,0.00261,0.17132,0.00265,0.24531
8,dl-2020,summarisation_120tokens,gpt-4o,0.48777,0.5596,0.48377,0.3499,0.57958,0.34993,0.56185
9,dl-2020,summarisation_120tokens,llama3.18b,0.29129,0.41373,0.28884,0.14074,0.45311,0.14078,0.31323


In [12]:
# Imported here as well so this cell does not depend on an earlier cell
# having brought `product` into scope.
from itertools import product

pd.set_option('display.float_format', lambda x: f"{x:.3f}")
scenarios = ['summarisation_80tokens', 'summarisation_120tokens', 'fulldocs']
models = ['gpt-4o', 'llama3.18b']

# Metric columns are named explicitly (same selection and order as before)
# rather than picked by position, so a change in the column order of
# agreement_df cannot silently swap which metric is shown.
id_cols = ['model_name', 'scenario_name', 'dataset_name']
binary_cols = id_cols + ['binary_cohen_kappa', 'binary_krippendorff_alpha', 'binary_gwet']
graded_cols = id_cols + ['graded_krippendorff_alpha', 'graded_weighted_kappa']


def _show_agreement_tables(heading, columns):
    """Print `heading`, then one transposed table per (model, scenario) pair.

    Each table holds the rows of `agreement_df` for that pair (one per
    dataset) restricted to `columns`.
    """
    print(heading)
    for m, s in product(models, scenarios):
        print(f"Model: {m}, Scenario: {s}")
        subset = agreement_df[(agreement_df['model_name']==m) & (agreement_df['scenario_name']==s)]
        display(subset[columns].reset_index(drop=True).T)


_show_agreement_tables("Binary agreement metrics:", binary_cols)


_show_agreement_tables("Graded agreement metrics:", graded_cols)

Binary agreement metrics:
Model: gpt-4o, Scenario: summarisation_80tokens


Unnamed: 0,0,1,2
model_name,gpt-4o,gpt-4o,gpt-4o
scenario_name,summarisation_80tokens,summarisation_80tokens,summarisation_80tokens
dataset_name,dl-2019,dl-2020,rag-2024
binary_cohen_kappa,0.524,0.490,0.397
binary_krippendorff_alpha,0.522,0.487,0.389
binary_gwet,0.525,0.566,0.533


Model: gpt-4o, Scenario: summarisation_120tokens


Unnamed: 0,0,1,2
model_name,gpt-4o,gpt-4o,gpt-4o
scenario_name,summarisation_120tokens,summarisation_120tokens,summarisation_120tokens
dataset_name,dl-2019,dl-2020,rag-2024
binary_cohen_kappa,0.518,0.488,0.394
binary_krippendorff_alpha,0.516,0.484,0.386
binary_gwet,0.519,0.560,0.530


Model: gpt-4o, Scenario: fulldocs


Unnamed: 0,0,1,2
model_name,gpt-4o,gpt-4o,gpt-4o
scenario_name,fulldocs,fulldocs,fulldocs
dataset_name,dl-2019,dl-2020,rag-2024
binary_cohen_kappa,0.517,0.481,0.396
binary_krippendorff_alpha,0.513,0.474,0.377
binary_gwet,0.513,0.536,0.561


Model: llama3.18b, Scenario: summarisation_80tokens


Unnamed: 0,0,1,2
model_name,llama3.18b,llama3.18b,llama3.18b
scenario_name,summarisation_80tokens,summarisation_80tokens,summarisation_80tokens
dataset_name,dl-2019,dl-2020,rag-2024
binary_cohen_kappa,0.320,0.215,0.101
binary_krippendorff_alpha,0.320,0.100,0.032
binary_gwet,0.336,0.100,0.413


Model: llama3.18b, Scenario: summarisation_120tokens


Unnamed: 0,0,1,2
model_name,llama3.18b,llama3.18b,llama3.18b
scenario_name,summarisation_120tokens,summarisation_120tokens,summarisation_120tokens
dataset_name,dl-2019,dl-2020,rag-2024
binary_cohen_kappa,0.311,0.291,0.181
binary_krippendorff_alpha,0.311,0.289,0.181
binary_gwet,0.327,0.414,0.291


Model: llama3.18b, Scenario: fulldocs


Unnamed: 0,0,1,2
model_name,llama3.18b,llama3.18b,llama3.18b
scenario_name,fulldocs,fulldocs,fulldocs
dataset_name,dl-2019,dl-2020,rag-2024
binary_cohen_kappa,0.301,0.270,0.147
binary_krippendorff_alpha,0.295,0.255,0.120
binary_gwet,0.296,0.326,0.381


Graded agreement metrics:
Model: gpt-4o, Scenario: summarisation_80tokens


Unnamed: 0,0,1,2
model_name,gpt-4o,gpt-4o,gpt-4o
scenario_name,summarisation_80tokens,summarisation_80tokens,summarisation_80tokens
dataset_name,dl-2019,dl-2020,rag-2024
graded_krippendorff_alpha,0.331,0.348,0.235
graded_weighted_kappa,0.583,0.561,0.493


Model: gpt-4o, Scenario: summarisation_120tokens


Unnamed: 0,0,1,2
model_name,gpt-4o,gpt-4o,gpt-4o
scenario_name,summarisation_120tokens,summarisation_120tokens,summarisation_120tokens
dataset_name,dl-2019,dl-2020,rag-2024
graded_krippendorff_alpha,0.326,0.350,0.240
graded_weighted_kappa,0.574,0.562,0.497


Model: gpt-4o, Scenario: fulldocs


Unnamed: 0,0,1,2
model_name,gpt-4o,gpt-4o,gpt-4o
scenario_name,fulldocs,fulldocs,fulldocs
dataset_name,dl-2019,dl-2020,rag-2024
graded_krippendorff_alpha,0.340,0.341,0.225
graded_weighted_kappa,0.582,0.560,0.493


Model: llama3.18b, Scenario: summarisation_80tokens


Unnamed: 0,0,1,2
model_name,llama3.18b,llama3.18b,llama3.18b
scenario_name,summarisation_80tokens,summarisation_80tokens,summarisation_80tokens
dataset_name,dl-2019,dl-2020,rag-2024
graded_krippendorff_alpha,0.193,0.003,0.004
graded_weighted_kappa,0.362,0.245,0.165


Model: llama3.18b, Scenario: summarisation_120tokens


Unnamed: 0,0,1,2
model_name,llama3.18b,llama3.18b,llama3.18b
scenario_name,summarisation_120tokens,summarisation_120tokens,summarisation_120tokens
dataset_name,dl-2019,dl-2020,rag-2024
graded_krippendorff_alpha,0.188,0.141,0.057
graded_weighted_kappa,0.350,0.313,0.247


Model: llama3.18b, Scenario: fulldocs


Unnamed: 0,0,1,2
model_name,llama3.18b,llama3.18b,llama3.18b
scenario_name,fulldocs,fulldocs,fulldocs
dataset_name,dl-2019,dl-2020,rag-2024
graded_krippendorff_alpha,0.155,0.114,0.010
graded_weighted_kappa,0.335,0.277,0.198
