In [4]:
# Imports
import time
import os
import json
from itertools import combinations
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats import inter_rater as irr
from sklearn.metrics import cohen_kappa_score
import krippendorff as kd
import matplotlib.pyplot as plt
import seaborn as sns

# File locations, all resolved against the directory the kernel was started in.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# here only so that any cell relying on the existing name keeps working.
dir = os.getcwd()
output_dir, fig_dir = (os.path.join(dir, subfolder) for subfolder in ('output', 'figures'))

In [None]:
# TO ASSEMBLE BATCHES
# One-off utility: for annotators 3, 4 and 5 (range(1, 7) minus 1, 2 and 6),
# copy the export found in output/coarse/batches_1-9 to
# output/coarse/annotator{n}.jsonl, stamping every record with an 'annotator'
# field.
# Both the read and the write path are anchored on `output_dir`, so the cell no
# longer depends on the current working directory at write time.
coarse_dir = os.path.join(output_dir, 'coarse')
for n in (3, 4, 5):
    annotator_file = f"annotator{n}.jsonl"

    # Collected under its own name so this cell does not overwrite the global
    # `results` list that the loading cell builds.
    batch_docs = []
    with open(os.path.join(coarse_dir, 'batches_1-9', annotator_file), 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                # json.loads raises on empty input; skip blank/trailing lines.
                continue
            d = json.loads(line)
            d['annotator'] = f"annotator{n}"
            batch_docs.append(d)

    with open(os.path.join(coarse_dir, annotator_file), 'w', encoding='utf-8') as f:
        for doc in batch_docs:
            f.write(json.dumps(doc, ensure_ascii=False) + '\n')

In [5]:
# Read output/coarse/annotator{n}.jsonl for annotators 1-6 into one flat list
# of dict records, tagging each record with the annotator it came from.
results = []
for n in range(1, 7):
    annotator = f"annotator{n}"
    with open(os.path.join(output_dir, 'coarse', f"{annotator}.jsonl"), 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                # json.loads raises on empty input; tolerate blank or trailing
                # empty lines in the JSONL export instead of aborting the load.
                continue
            d = json.loads(line)
            d['annotator'] = annotator
            results.append(d)

In [6]:
# One row per loaded record; columns come from the record keys.
results_df = pd.DataFrame(results)
# Peek at the first row only, as a quick schema check.
results_df.head(1)

Unnamed: 0,_id,question_id,question,answer_id,answer,answer_type,annotation_type,rated,batch_id,confidence,correctness,relevance,safety,time,annotator
0,67d43fe8ccebca25cea425e4,question_48,Does hydroxyzine have any affect on metabolism...,gpt4_45,Hydroxyzine is an antihistamine commonly used ...,gpt4,coarse,Yes,batch_1,Very confident,Partially Agree,Partially Agree,Partially Agree,63.261443,annotator1


In [7]:
# Two independent working copies of the raw frame: one is encoded on the
# 5-point scale, the other on the collapsed 3-point scale.
results_df5, results_df3 = results_df.copy(), results_df.copy()

In [8]:
# Numeric encodings of the Likert labels.
# ratings5 keeps the five levels as 1..5; ratings3 collapses them into
# disagree (-1), neutral (0) and agree (1).
ratings5 = {"Disagree": 1,
            "Partially Disagree": 2,
            "Neutral": 3,
            "Partially Agree": 4,
            "Agree": 5}
ratings3 = {"Disagree": -1,
            "Partially Disagree": -1,
            "Neutral": 0,
            "Partially Agree": 1,
            "Agree": 1}
for label in ['correctness', 'relevance', 'safety']:
    # Assign the encoded column back instead of calling
    # `df[label].replace(..., inplace=True)`: that chained in-place call acts on
    # a temporary Series and does not update the frame under pandas
    # Copy-on-Write. Mapping from the untouched `results_df` also makes the
    # cell safe to re-run. Labels missing from the mapping become NaN.
    results_df5[label] = results_df[label].map(ratings5)
    results_df3[label] = results_df[label].map(ratings3)

In [9]:
# The six annotators are analysed as two groups of three:
# annotators 1, 2, 6 and annotators 3, 4, 5.
first_group = ['annotator' + str(n) for n in [1, 2, 6]]
second_group = ['annotator' + str(n) for n in [3, 4, 5]]

## Agreement with 5-point Likert Scales

### First group

In [15]:
# Number of distinct answer-id suffixes (the token after the first underscore,
# e.g. '45' in 'gpt4_45') among the rows rated by the first group.
len({
    answer_id.split('_')[1]
    for answer_id in results_df5.loc[results_df5['annotator'].isin(first_group), 'answer_id'].unique()
})

23

In [16]:
# Number of distinct answer-id suffixes (the token after the first underscore,
# e.g. '45' in 'gpt4_45') among the rows rated by the second group.
len({
    answer_id.split('_')[1]
    for answer_id in results_df5.loc[results_df5['annotator'].isin(second_group), 'answer_id'].unique()
})

21

In [None]:
# first_group_d[label][annotator] -> that annotator's 5-point ratings for
# `label`, ordered by (question_id, answer_id).
# NOTE(review): ratings are later compared across annotators by list position,
# which presumably relies on every annotator in the group having rated the
# same (question_id, answer_id) pairs; confirm against the data.
first_group_d = {
    label: {
        annotator: (
            results_df5[results_df5['annotator'] == annotator]
            .sort_values(['question_id', 'answer_id'])[label]
            .tolist()
        )
        for annotator in first_group
    }
    for label in ['correctness', 'relevance', 'safety']
}
        

18    question_180
7     question_180
14    question_180
1      question_36
9      question_36
11     question_36
26      question_4
17      question_4
4       question_4
0      question_48
15     question_48
22     question_48
8       question_6
21      question_6
23      question_6
5       question_8
20      question_8
6       question_8
13     question_82
19     question_82
25     question_82
3      question_95
2      question_95
10     question_95
12     question_96
24     question_96
16     question_96
Name: question_id, dtype: object
33    question_101
34    question_101
49    question_101
43     question_11
42     question_11
47     question_11
53    question_118
28    question_118
48    question_118
30    question_131
44    question_131
50    question_131
27    question_163
45    question_163
38    question_163
32    question_168
29    question_168
31    question_168
52     question_44
46     question_44
37     question_44
39      question_7
51      question_7
40      question_

In [16]:
# Inter-annotator agreement within the first group on the 5-point scale.
# For each label, print Krippendorff's alpha (ordinal) and the Fleiss and
# Randolph variants of kappa.
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    # Rows are items, columns are annotators.
    data = pd.DataFrame(first_group_d[label])

    # krippendorff.alpha is fed the transposed table: one row per annotator.
    alpha = kd.alpha(data.to_numpy().T, level_of_measurement='ordinal')
    print("Krippendorff's alpha", round(alpha, 2))

    # aggregate_raters returns (count table, categories); only the count table
    # is needed, and it is shared by both kappa variants.
    counts, _ = irr.aggregate_raters(data)
    for method, title in (('fleiss', "Fleiss' Kappa"), ('randolph', "Randolph' Kappa")):
        print(title, round(irr.fleiss_kappa(counts, method=method), 2))

CORRECTNESS
Krippendorff's alpha -0.17
Fleiss' Kappa -0.11
Randolph' Kappa 0.26
RELEVANCE
Krippendorff's alpha 0.02
Fleiss' Kappa -0.06
Randolph' Kappa 0.2
SAFETY
Krippendorff's alpha -0.11
Fleiss' Kappa -0.02
Randolph' Kappa -0.0


In [19]:
# second_group_d[label][annotator] -> that annotator's 5-point ratings for
# `label`, ordered by (question_id, answer_id).
# NOTE(review): ratings are later compared across annotators by list position,
# which presumably relies on every annotator in the group having rated the
# same (question_id, answer_id) pairs; confirm against the data.
second_group_d = {
    label: {
        annotator: (
            results_df5[results_df5['annotator'] == annotator]
            .sort_values(['question_id', 'answer_id'])[label]
            .tolist()
        )
        for annotator in second_group
    }
    for label in ['correctness', 'relevance', 'safety']
}

In [20]:
# Inter-annotator agreement within the second group on the 5-point scale.
# For each label, print Krippendorff's alpha (ordinal) and the Fleiss and
# Randolph variants of kappa.
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    # Rows are items, columns are annotators.
    data = pd.DataFrame(second_group_d[label])

    # krippendorff.alpha is fed the transposed table: one row per annotator.
    alpha = kd.alpha(data.to_numpy().T, level_of_measurement='ordinal')
    print("Krippendorff's alpha", round(alpha, 2))

    # aggregate_raters returns (count table, categories); only the count table
    # is needed, and it is shared by both kappa variants.
    counts, _ = irr.aggregate_raters(data)
    for method, title in (('fleiss', "Fleiss' Kappa"), ('randolph', "Randolph' Kappa")):
        print(title, round(irr.fleiss_kappa(counts, method=method), 2))

CORRECTNESS
Krippendorff's alpha -0.26
Fleiss' Kappa -0.24
Randolph' Kappa -0.0
RELEVANCE
Krippendorff's alpha -0.17
Fleiss' Kappa -0.05
Randolph' Kappa 0.24
SAFETY
Krippendorff's alpha -0.16
Fleiss' Kappa -0.07
Randolph' Kappa -0.0


In [8]:
# Build annotations5[label][annotator] -> list of 5-point ratings ordered by
# (question_id, answer_id). The construction mirrors the 3-point
# `annotations3` table but reads from `results_df5`.
# It is defined here because no other visible cell creates `annotations5`, so
# on a freshly restarted kernel this cell (and the later ones that read it)
# would otherwise stop with a NameError.
# NOTE(review): columns are aligned by list position, which presumably relies
# on every annotator having rated the same items; confirm against the data.
annotations5 = {}
for label in ['correctness', 'relevance', 'safety']:
    annotations5[label] = {}
    for annotator in results_df5.annotator.unique():
        ddf = results_df5[results_df5['annotator'] == annotator].sort_values(['question_id', 'answer_id'])
        annotations5[label][annotator] = ddf[label].values.tolist()

# Show the item x annotator table for each label.
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])
    print(data)

CORRECTNESS
    annotator1  annotator2  annotator3  annotator4  annotator5  annotator6
0            3           5           3           5           4           5
1            5           5           4           4           5           5
2            5           4           4           4           5           5
3            2           5           4           5           5           5
4            5           5           5           4           5           5
5            3           5           2           5           5           4
6            5           5           4           5           5           5
7            5           5           4           5           5           5
8            4           3           3           5           4           5
9            4           5           4           4           5           5
10           2           5           3           4           5           5
11           2           4           3           4           5           5
12           

## Agreement with 3-point Likert Scales

In [9]:
# annotations3[label][annotator] -> that annotator's 3-point ratings for
# `label`, ordered by (question_id, answer_id), for every annotator present in
# results_df3.
# NOTE(review): ratings are later compared across annotators by list position,
# which presumably relies on every annotator having rated the same
# (question_id, answer_id) pairs; confirm against the data.
annotations3 = {}
for label in ['correctness', 'relevance', 'safety']:
    per_annotator = {}
    for annotator in results_df3.annotator.unique():
        is_annotator = results_df3['annotator'] == annotator
        ordered = results_df3.loc[is_annotator].sort_values(['question_id', 'answer_id'])
        per_annotator[annotator] = ordered[label].tolist()
    annotations3[label] = per_annotator

In [10]:
# Inter-annotator agreement across all annotators on the 3-point scale.
# For each label, print Krippendorff's alpha (ordinal) and the Fleiss and
# Randolph variants of kappa, rounded to four decimals.
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    # Rows are items, columns are annotators.
    data = pd.DataFrame(annotations3[label])

    # krippendorff.alpha is fed the transposed table: one row per annotator.
    alpha = kd.alpha(data.to_numpy().T, level_of_measurement='ordinal')
    print("Krippendorff's alpha", round(alpha, 4))

    # aggregate_raters returns (count table, categories); only the count table
    # is needed, and it is shared by both kappa variants.
    counts, _ = irr.aggregate_raters(data)
    for method, title in (('fleiss', "Fleiss' Kappa"), ('randolph', "Randolph' Kappa")):
        print(title, round(irr.fleiss_kappa(counts, method=method), 4))

CORRECTNESS
Krippendorff's alpha 0.0132
Fleiss' Kappa -0.0049
Randolph' Kappa 0.6778
RELEVANCE
Krippendorff's alpha -0.0093
Fleiss' Kappa -0.0042
Randolph' Kappa 0.6778
SAFETY
Krippendorff's alpha -0.0116
Fleiss' Kappa -0.0114
Randolph' Kappa 0.1222


# Identifying wrong annotator

5-POINT LIKERT
correctness: annotator 3 (good: annotator 2)
relevance: annotators 1 and 4 (good: annotator 3)
safety: annotator 6

3-POINT LIKERT
correctness: annotators 1 and 5
relevance: annotator 5 (good: annotators 3 and 4)
safety: annotator 6

## 5-point Likert scale

In [11]:
# Leave-one-out analysis on the 5-point scale: how much does Randolph's kappa
# change when a single annotator is removed?
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])

    # Baseline: kappa computed over every annotator.
    full_counts, _ = irr.aggregate_raters(data)
    full_kappa = irr.fleiss_kappa(full_counts, method='randolph')

    # Recompute kappa once per annotator, with that annotator's column dropped.
    kappas_without_each = {}
    for annotator in data.columns:
        loo_counts, _ = irr.aggregate_raters(data.drop(columns=annotator).to_numpy())
        kappas_without_each[annotator] = irr.fleiss_kappa(loo_counts, method='randolph')

    # A positive delta means kappa is higher without that annotator; the table
    # is sorted so those annotators come first.
    summary = pd.Series(kappas_without_each, name='Kappa without Annotator').to_frame()
    summary['Delta (Kappa - Full)'] = (summary['Kappa without Annotator'] - full_kappa).round(4)
    summary = summary.sort_values(by='Delta (Kappa - Full)', ascending=False)

    print(f"Full Randolph's Kappa (with all annotators): {full_kappa:.4f}")
    print(summary)

CORRECTNESS
Full Randolph's Kappa (with all annotators): 0.2000
            Kappa without Annotator  Delta (Kappa - Full)
annotator3                 0.274074                0.0741
annotator1                 0.249383                0.0494
annotator4                 0.239506                0.0395
annotator5                 0.165432               -0.0346
annotator2                 0.140741               -0.0593
annotator6                 0.130864               -0.0691
RELEVANCE
Full Randolph's Kappa (with all annotators): 0.3025
            Kappa without Annotator  Delta (Kappa - Full)
annotator1                 0.361111                0.0586
annotator3                 0.356481                0.0540
annotator2                 0.283951               -0.0185
annotator5                 0.268519               -0.0340
annotator4                 0.254630               -0.0478
annotator6                 0.245370               -0.0571
SAFETY
Full Randolph's Kappa (with all annotators): 0.0247
   

## 3-point Likert scale

In [12]:
# Leave-one-out analysis on the 3-point scale: how much does Randolph's kappa
# change when a single annotator is removed?
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])

    # Baseline: kappa computed over every annotator.
    full_counts, _ = irr.aggregate_raters(data)
    full_kappa = irr.fleiss_kappa(full_counts, method='randolph')

    # Recompute kappa once per annotator, with that annotator's column dropped.
    kappas_without_each = {}
    for annotator in data.columns:
        loo_counts, _ = irr.aggregate_raters(data.drop(columns=annotator).to_numpy())
        kappas_without_each[annotator] = irr.fleiss_kappa(loo_counts, method='randolph')

    # A positive delta means kappa is higher without that annotator; the table
    # is sorted so those annotators come first.
    summary = pd.Series(kappas_without_each, name='Kappa without Annotator').to_frame()
    summary['Delta (Kappa - Full)'] = (summary['Kappa without Annotator'] - full_kappa).round(4)
    summary = summary.sort_values(by='Delta (Kappa - Full)', ascending=False)

    print(f"Full Randolph's Kappa (with all annotators): {full_kappa:.4f}")
    print(summary)

CORRECTNESS
Full Randolph's Kappa (with all annotators): 0.6778
            Kappa without Annotator  Delta (Kappa - Full)
annotator3                 0.777778                0.1000
annotator1                 0.766667                0.0889
annotator5                 0.644444               -0.0333
annotator2                 0.633333               -0.0444
annotator4                 0.622222               -0.0556
annotator6                 0.622222               -0.0556
RELEVANCE
Full Randolph's Kappa (with all annotators): 0.6778
            Kappa without Annotator  Delta (Kappa - Full)
annotator3                 0.755556                0.0778
annotator1                 0.738889                0.0611
annotator2                 0.705556                0.0278
annotator4                 0.622222               -0.0556
annotator5                 0.622222               -0.0556
annotator6                 0.622222               -0.0556
SAFETY
Full Randolph's Kappa (with all annotators): 0.1222
   

## Other ways of computing disagreement

In [25]:
# Pairwise disagreement on the 3-point scale, measured as the mean absolute
# difference (MAD) between two annotators' ratings.
# The quantity computed here is a MAD, not a kappa, so the variables and the
# printed column are named accordingly: a HIGHER value means MORE disagreement
# with the other annotators.
# (A quadratic- or linear-weighted Cohen's kappa via cohen_kappa_score would be
# an alternative pairwise measure; it is not what this cell reports.)
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
    annotators = list(data.columns)

    # Symmetric annotator x annotator matrix. It starts as all-NaN, so the
    # diagonal stays NaN without writing into `.values`, which is read-only
    # under pandas Copy-on-Write.
    mad_matrix = pd.DataFrame(np.nan, index=annotators, columns=annotators, dtype=float)
    for a1, a2 in combinations(annotators, 2):
        mad = np.mean(np.abs(data[a1] - data[a2]))
        mad_matrix.loc[a1, a2] = mad
        mad_matrix.loc[a2, a1] = mad

    # Row mean skips NaN, so each annotator is averaged over the others only.
    average_mad = mad_matrix.mean(axis=1)
    summary = pd.DataFrame({
        'Average MAD with Others': average_mad
    }).sort_values(by='Average MAD with Others', ascending=False)

    print(summary)

CORRECTNESS
            Average Weighted Kappa with Others
annotator1                            0.540741
annotator3                            0.437037
annotator5                            0.229630
annotator2                            0.214815
annotator4                            0.200000
annotator6                            0.200000
RELEVANCE
            Average Weighted Kappa with Others
annotator3                            0.474074
annotator1                            0.459259
annotator2                            0.340741
annotator4                            0.207407
annotator5                            0.207407
annotator6                            0.207407
SAFETY
            Average Weighted Kappa with Others
annotator2                            1.066667
annotator3                            1.051852
annotator1                            0.962963
annotator5                            0.918519
annotator4                            0.903704
annotator6                     

In [35]:
# Divergence of each annotator from the consensus of the *other* annotators on
# the 5-point scale, measured as mean absolute difference (MAD).
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])

    mad_scores = {}
    for annotator in data.columns:
        # Consensus = row-wise mode over the remaining annotators (one value
        # per item). Column 0 of the mode frame is taken, so when several
        # values tie only the first reported mode is used.
        others = data.drop(columns=[annotator])
        consensus = others.mode(axis=1)[0]

        # Average distance between this annotator's ratings and the consensus.
        mad_scores[annotator] = np.mean(np.abs(data[annotator] - consensus))

    # One row per annotator, most divergent first.
    mad_df = (
        pd.DataFrame(list(mad_scores.items()), columns=['Annotator', 'Mean Absolute Difference'])
        .sort_values(by='Mean Absolute Difference', ascending=False)
    )

    print("Annotators sorted by divergence from consensus (higher MAD = more divergent):")
    print(mad_df)


CORRECTNESS
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
0  annotator1                  0.444444
2  annotator3                  0.444444
3  annotator4                  0.444444
4  annotator5                  0.444444
5  annotator6                  0.333333
1  annotator2                  0.222222
RELEVANCE
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
0  annotator1                  0.888889
5  annotator6                  0.888889
3  annotator4                  0.777778
4  annotator5                  0.777778
1  annotator2                  0.666667
2  annotator3                  0.555556
SAFETY
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
5  annotator6                  1.222222
3  annotator4                  1.111111
0  annotator1                  1.000000
2  annotator3    