In [1]:
# Imports
import time
import os
import json
from itertools import combinations
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats import inter_rater as irr
from sklearn.metrics import cohen_kappa_score
import krippendorff as kd
import matplotlib.pyplot as plt
import seaborn as sns

# File locations
dir = os.getcwd()
output_dir = os.path.join(dir, 'output')
fig_dir = os.path.join(dir, 'figures')



In [None]:
results = []
for n in range(1,7):
    with open(os.path.join(output_dir, 'coarse', 'pilot1', f"annotator{n}.jsonl"), 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            d = json.loads(line)
            d['annotator'] = f"annotator{n}"
            results.append(d)
        

In [3]:
results_df = pd.DataFrame(results)
results_df[:1]

Unnamed: 0,_id,question_id,question,answer_id,answer_type,annotation_type,rated,answer,batch_id,confidence,correctness,relevance,safety,time,annotator
0,67ce11004b0825eeb3c78ead,question_34,Could esophagitis could like muscle stiffness ...,gpt4_1,gpt4,coarse,Yes,"Esophagitis, which is inflammation of the esop...",batch_0,Very confident,Partially Agree,Partially Agree,Partially Disagree,57.303854,annotator1


In [4]:
results_df5 = results_df.copy()
results_df3 = results_df.copy()

In [5]:
ratings5 = {"Disagree": 1,
            "Partially Disagree": 2,
            "Neutral": 3,
            "Partially Agree": 4,
            "Agree": 5}
ratings3 = {"Disagree": -1,
            "Partially Disagree": -1,
            "Neutral": 0,
            "Partially Agree": 1,
            "Agree": 1}
for label in ['correctness', 'relevance', 'safety']:
    results_df5[label].replace(ratings5, inplace=True)
    results_df3[label].replace(ratings3, inplace=True)

## Agreement with 5-point Likert Scales

In [6]:
annotations5 = {}
for label in ['correctness', 'relevance', 'safety']:
    annotations5[label] = {}
    for annotator in results_df5.annotator.unique():
        ddf = results_df5[results_df5['annotator'] == annotator].sort_values(['question_id', 'answer_id']).copy()
        ann = ddf[label].values.tolist()
        annotations5[label][annotator] = ann

In [7]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])
    a = kd.alpha(data.T.values, level_of_measurement='ordinal')
    print("Krippendorff's alpha", round(a, 2))
    fk = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='fleiss')
    print("Fleiss' Kappa", round(fk, 2))
    k = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='randolph')
    print("Randolph' Kappa", round(k, 2))

CORRECTNESS
Krippendorff's alpha 0.11
Fleiss' Kappa 0.08
Randolph' Kappa 0.42
RELEVANCE
Krippendorff's alpha 0.18
Fleiss' Kappa 0.06
Randolph' Kappa 0.14
SAFETY
Krippendorff's alpha 0.35
Fleiss' Kappa 0.09
Randolph' Kappa 0.12


In [20]:
results_df5[results_df5['annotator'] == 'annotator5'].sort_values(['question_id', 'answer_id']).copy()[['answer_id','relevance','answer']]

Unnamed: 0,answer_id,relevance,answer
37,gpt4_2,3,"No, the stomach flu and the flu are not the sa..."
43,llama_2,5,"No, the stomach flu and the flu are not the sa..."
44,physician_2,3,"No. Flu refers to influenza, a respiratory ill..."
39,gpt4_1,5,"Esophagitis, which is inflammation of the esop..."
42,llama_1,5,Esophagitis is an inflammation of the esophagu...
38,physician_1,4,Esophagitis is inflammation of the esophagus c...
40,gpt4_0,3,Creatinine is a waste product produced by musc...
41,llama_0,5,Creatinine is a waste product that comes from ...
36,physician_0,5,The body produces creatinine as a byproduct of...


In [8]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])
    print(data)

CORRECTNESS
   annotator1  annotator2  annotator3  annotator4  annotator5  annotator6
0           3           5           4           5           5           5
1           5           5           5           5           5           5
2           4           4           4           5           5           4
3           4           5           4           5           4           5
4           5           5           5           5           5           4
5           5           5           4           5           5           5
6           4           5           4           5           3           5
7           5           5           5           4           5           5
8           5           5           5           5           5           5
RELEVANCE
   annotator1  annotator2  annotator3  annotator4  annotator5  annotator6
0           4           5           4           5           3           5
1           4           5           5           5           5           5
2           5   

## Agreement with 3-point Likert Scales

In [9]:
annotations3 = {}
for label in ['correctness', 'relevance', 'safety']:
    annotations3[label] = {}
    for annotator in results_df3.annotator.unique():
        ddf = results_df3[results_df3['annotator'] == annotator].sort_values(['question_id', 'answer_id']).copy()
        ann = ddf[label].values.tolist()
        annotations3[label][annotator] = ann

In [10]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
    a = kd.alpha(data.T.values, level_of_measurement='ordinal')
    print("Krippendorff's alpha", round(a, 2))
    fk = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='fleiss')
    print("Fleiss' Kappa", round(fk, 2))
    k = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='randolph')
    print("Randolph' Kappa", round(k, 2))

CORRECTNESS
Krippendorff's alpha -0.02
Fleiss' Kappa -0.04
Randolph' Kappa 0.85
RELEVANCE
Krippendorff's alpha 0.23
Fleiss' Kappa 0.21
Randolph' Kappa 0.53
SAFETY
Krippendorff's alpha 0.3
Fleiss' Kappa 0.13
Randolph' Kappa 0.2


# Identifying wrong annotator

5-POINT LIKERT
correctness: annotator 3 (good: annotator 2)
relevance: annotator 1 and 4 (good: annotator 3)
safety: annotator 6

3-POINT LIKERT
correctness: bad: annotator 1 and 5
relevance: annotator 5 (good: annotator 3 and 4)
safety: annotator 6

## 5-point Likert scale

In [11]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])
    
    full_kappa = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='randolph')
    
    kappas_without_each = {}

    for annotator in data.T.index:
        reduced_data = data.drop(columns=[annotator]).values
        kappa_loo = irr.fleiss_kappa(irr.aggregate_raters(reduced_data)[0], method='randolph')
        kappas_without_each[annotator] = kappa_loo

    # Display results
    summary = pd.DataFrame({
        'Kappa without Annotator': (kappas_without_each)
    })
    summary['Delta (Kappa - Full)'] = round(summary['Kappa without Annotator'] - full_kappa, 4)
    summary = summary.sort_values(by='Delta (Kappa - Full)', ascending=False)

    print(f"Full Randolph's Kappa (with all annotators): {full_kappa:.4f}")
    print(summary)

CORRECTNESS
Full Randolph's Kappa (with all annotators): 0.4222
            Kappa without Annotator  Delta (Kappa - Full)
annotator3                 0.483333                0.0611
annotator4                 0.450000                0.0278
annotator1                 0.416667               -0.0056
annotator5                 0.416667               -0.0056
annotator6                 0.416667               -0.0056
annotator2                 0.350000               -0.0722
RELEVANCE
Full Randolph's Kappa (with all annotators): 0.1444
            Kappa without Annotator  Delta (Kappa - Full)
annotator1                 0.200000                0.0556
annotator4                 0.200000                0.0556
annotator6                 0.150000                0.0056
annotator5                 0.133333               -0.0111
annotator2                 0.100000               -0.0444
annotator3                 0.083333               -0.0611
SAFETY
Full Randolph's Kappa (with all annotators): 0.1204
   

## 3-point likert scale

In [12]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
    
    full_kappa = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='randolph')
    
    kappas_without_each = {}

    for annotator in data.T.index:
        reduced_data = data.drop(columns=[annotator]).values
        kappa_loo = irr.fleiss_kappa(irr.aggregate_raters(reduced_data)[0], method='randolph')
        kappas_without_each[annotator] = kappa_loo

    # Display results
    summary = pd.DataFrame({
        'Kappa without Annotator': (kappas_without_each)
    })
    summary['Delta (Kappa - Full)'] = round(summary['Kappa without Annotator'] - full_kappa, 4)
    summary = summary.sort_values(by='Delta (Kappa - Full)', ascending=False)

    print(f"Full Randolph's Kappa (with all annotators): {full_kappa:.4f}")
    print(summary)

CORRECTNESS
Full Randolph's Kappa (with all annotators): 0.8519
            Kappa without Annotator  Delta (Kappa - Full)
annotator1                 0.911111                0.0593
annotator5                 0.911111                0.0593
annotator2                 0.822222               -0.0296
annotator3                 0.822222               -0.0296
annotator4                 0.822222               -0.0296
annotator6                 0.822222               -0.0296
RELEVANCE
Full Randolph's Kappa (with all annotators): 0.5259
            Kappa without Annotator  Delta (Kappa - Full)
annotator5                 0.644444                0.1185
annotator2                 0.555556                0.0296
annotator1                 0.511111               -0.0148
annotator6                 0.511111               -0.0148
annotator3                 0.466667               -0.0593
annotator4                 0.466667               -0.0593
SAFETY
Full Randolph's Kappa (with all annotators): 0.2000
   

## Other ways of computing disagreement

In [36]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
    kappa_matrix = pd.DataFrame(index=results_df.annotator.unique(), columns=results_df.annotator.unique(), dtype=float)

    # Compute pairwise **weighted Cohen’s Kappa**
    for a1, a2 in combinations(results_df.annotator.unique(), 2):
        #kappa = cohen_kappa_score(data[a1], data[a2], weights='quadratic')  # or 'linear'
        mad = np.mean(np.abs(data[a1] - data[a2]))
        kappa_matrix.loc[a1, a2] = mad
        kappa_matrix.loc[a2, a1] = mad

    np.fill_diagonal(kappa_matrix.values, np.nan)

    # Compute average agreement per annotator
    average_kappa = kappa_matrix.mean(axis=1)
    summary = pd.DataFrame({
        'Average Weighted Kappa with Others': average_kappa
    }).sort_values(by='Average Weighted Kappa with Others', ascending=False)

    print(summary)

CORRECTNESS
            Average Weighted Kappa with Others
annotator1                            0.133333
annotator5                            0.133333
annotator2                            0.044444
annotator3                            0.044444
annotator4                            0.044444
annotator6                            0.044444
RELEVANCE
            Average Weighted Kappa with Others
annotator5                            0.355556
annotator2                            0.266667
annotator1                            0.222222
annotator6                            0.222222
annotator3                            0.177778
annotator4                            0.177778
SAFETY
            Average Weighted Kappa with Others
annotator1                            0.711111
annotator2                            0.711111
annotator4                            0.711111
annotator6                            0.711111
annotator5                            0.622222
annotator3                     

In [35]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])

    # Dictionary to store MAD scores
    mad_scores = {}

    # Loop through each annotator
    for annotator in data.columns:
        # Step 1: Remove current annotator from the data
        reduced_data = data.drop(columns=[annotator])
        
        # Step 2: Compute consensus (mode/majority vote per column)
        consensus = reduced_data.mode(axis=1)[0]
        
        # Step 3: Get annotator's own ratings
        annotator_labels = data[annotator]
        
        # Step 4: Compute mean absolute difference
        mad = np.mean(np.abs(annotator_labels - consensus))
        mad_scores[annotator] = mad

    # Convert to DataFrame
    mad_df = pd.DataFrame(list(mad_scores.items()), columns=['Annotator', 'Mean Absolute Difference'])
    mad_df = mad_df.sort_values(by='Mean Absolute Difference', ascending=False)

    # Show results
    print("Annotators sorted by divergence from consensus (higher MAD = more divergent):")
    print(mad_df)


CORRECTNESS
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
0  annotator1                  0.444444
2  annotator3                  0.444444
3  annotator4                  0.444444
4  annotator5                  0.444444
5  annotator6                  0.333333
1  annotator2                  0.222222
RELEVANCE
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
0  annotator1                  0.888889
5  annotator6                  0.888889
3  annotator4                  0.777778
4  annotator5                  0.777778
1  annotator2                  0.666667
2  annotator3                  0.555556
SAFETY
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
5  annotator6                  1.222222
3  annotator4                  1.111111
0  annotator1                  1.000000
2  annotator3    