## Annotation analysis

Let's check the quality of the annotations by measuring how well the three annotators agree with each other on each annotated variable.

In [2]:
from datasets import load_dataset
# Identifier of the annotated dataset, passed to `datasets.load_dataset`.
DATASET_NAME = "spanish-ir/ir-relevance-annotated"

# Only the "train" split is used in this notebook.
ds = load_dataset(DATASET_NAME, split="train")

In [3]:
import pandas as pd

# Raise pandas' display limits so long cell contents and up to 300 rows
# are shown without truncation.
pd.set_option("display.max_colwidth", 300)
pd.set_option("display.max_rows", 300)

# Map each annotator's user id (as found in a response's "user_id") to a
# compact 0-based index, used to name the `annotator_<n>` columns.
id_to_annotator = {
    user_id: index
    for index, user_id in enumerate(
        (
            'b02f903f-482b-4b28-a83d-354acbb54a1e',
            '08814b5d-440f-45bf-849b-1bd802888bb6',
            'f8cbe0de-17e3-487a-9f5d-32eb9694f4d5',
        )
    )
}


def _yes_no_to_int(answer):
    """Convert a "sí"/"no" answer to 1/0; raise ValueError for anything else."""
    if answer == "sí":
        return 1
    if answer == "no":
        return 0
    raise ValueError(f"Invalid value {answer}")


def flatten_record(record, variable):
    """Flatten one annotated record into a single flat dict (one table row).

    Args:
        record: Mapping with an "id", a "fields" mapping holding "query" and
            "text", and a "responses" mapping keyed by variable name.
        variable: Name of the annotated variable whose responses are read.

    Returns:
        A dict with "id", "query" and "text", plus one `annotator_<n>` entry
        (1 for "sí", 0 for "no") per response found for `variable`. If the
        same annotator appears more than once, the last response wins.

    Raises:
        KeyError: If a response's "user_id" is not in `id_to_annotator`.
        ValueError: If a response's value is neither "sí" nor "no".
    """
    flat = {"id": record["id"]}
    flat["query"] = record["fields"]["query"]
    flat["text"] = record["fields"]["text"]

    for answer in record["responses"][variable]:
        # Resolve the annotator first so an unknown user id is reported
        # before the answer value is validated.
        column = "annotator_{}".format(id_to_annotator[answer["user_id"]])
        flat[column] = _yes_no_to_int(answer["value"])

    return flat

def get_df(ds, variable):
    """Build a DataFrame with one row per record for a single variable.

    Args:
        ds: Iterable of records accepted by `flatten_record`.
        variable: Name of the annotated variable to extract.

    Returns:
        A DataFrame holding the flattened records plus a "sum" column: the
        row-wise total of `annotator_0`, `annotator_1` and `annotator_2`,
        i.e. the number of annotators who answered 1.

    Raises:
        KeyError: If any of the three annotator columns is absent from the
            flattened records.
    """
    rows = []
    for rec in ds:
        rows.append(flatten_record(rec, variable))
    frame = pd.DataFrame(rows)

    vote_columns = ["annotator_0", "annotator_1", "annotator_2"]
    frame["sum"] = frame.loc[:, vote_columns].sum(axis=1)

    return frame

# One flattened DataFrame per annotated variable, keyed by variable name.
dfs = {
    name: get_df(ds, name)
    for name in ("clarity", "non_ambiguity", "relevance")
}

In [12]:

from sklearn.metrics import f1_score, cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

variables = ["clarity", "non_ambiguity", "relevance"]

# Rating columns, one per annotator; built once and reused for every variable.
annotator_cols = [f"annotator_{i}" for i in range(3)]
n_annotators = len(annotator_cols)

# Print an inter-annotator agreement report for each annotated variable.
for var, df in dfs.items():
    print("="*80)
    print(f"Variable {var}")

    # One row per snippet, one column per annotator.
    ratings = df[annotator_cols].values

    # Pairwise agreement for every unordered pair of annotators. For F1,
    # annotator i is passed as the reference and annotator j as the prediction.
    for i in range(n_annotators):
        for j in range(i+1, n_annotators):
            kappa = cohen_kappa_score(ratings[:, i], ratings[:, j])
            f1 = f1_score(ratings[:, i], ratings[:, j])

            print(f"Annotator {i} vs {j}: κ = {kappa:.4f}, F1 = {f1:.4f}")

    # Fleiss' kappa over all annotators at once; aggregate_raters converts the
    # ratings matrix into the per-category count table fleiss_kappa expects.
    table, _ = aggregate_raters(ratings)
    print(f"Ⲕ = {fleiss_kappa(table):.4f}")

    # "sum" is the number of annotators who answered 1 for each snippet.
    votes = df["sum"]
    # Unanimous snippets are those where all voted 0 or all voted 1.
    unanimous = votes.isin([0, n_annotators])

    print(f"Freq snippets 3 voters say 1   : {(votes == 3).mean():.3f}")
    print(f"Freq snippets majority is 1    : {(votes >= 2).mean():.3f}")
    # Same 3-decimal formatting as the two frequencies above.
    print(f"Freq snippets 3 voters agree   : {unanimous.mean():.3f}")
    print(f"Distribution                   : {votes.value_counts().sort_index(ascending=False).to_dict()}")

Variable clarity
Annotator 0 vs 1: κ = 0.3396, F1 = 0.9205
Annotator 0 vs 2: κ = 0.1129, F1 = 0.9412
Annotator 1 vs 2: κ = 0.0718, F1 = 0.9180
Ⲕ = 0.1860
Freq snippets 3 voters say 1   : 0.800
Freq snippets majority is 1    : 0.930
Freq snippets 3 voters agree   : 0.8
Distribution                   : {3: 80, 2: 13, 1: 7}
Variable non_ambiguity
Annotator 0 vs 1: κ = 0.0449, F1 = 0.9061
Annotator 0 vs 2: κ = 0.3611, F1 = 0.8516
Annotator 1 vs 2: κ = 0.0506, F1 = 0.8193
Ⲕ = 0.1463
Freq snippets 3 voters say 1   : 0.640
Freq snippets majority is 1    : 0.880
Freq snippets 3 voters agree   : 0.65
Distribution                   : {3: 64, 2: 24, 1: 11, 0: 1}
Variable relevance
Annotator 0 vs 1: κ = -0.0417, F1 = 0.9583
Annotator 0 vs 2: κ = 0.0691, F1 = 0.9239
Annotator 1 vs 2: κ = 0.2021, F1 = 0.9348
Ⲕ = 0.0893
Freq snippets 3 voters say 1   : 0.830
Freq snippets majority is 1    : 0.970
Freq snippets 3 voters agree   : 0.83
Distribution                   : {3: 83, 2: 14, 1: 3}


In [4]:
# For each variable, print how many snippets each annotator labelled 1.
annotator_columns = ["annotator_0", "annotator_1", "annotator_2"]
for var, df in dfs.items():
    positives_per_annotator = df[annotator_columns].sum(axis=0)
    print(positives_per_annotator)

annotator_0    90
annotator_1    86
annotator_2    97
dtype: int64
annotator_0    85
annotator_1    96
annotator_2    70
dtype: int64
annotator_0    96
annotator_1    96
annotator_2    88
dtype: int64
