In [1]:
# Imports
import time
import os
import json
from itertools import combinations
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats import inter_rater as irr
from sklearn.metrics import cohen_kappa_score
from scipy.stats import spearmanr

import krippendorff as kd
import matplotlib.pyplot as plt
import seaborn as sns

# File locations
dir = os.getcwd()
output_dir = os.path.join(dir, 'output')
fig_dir = os.path.join(dir, 'figures')

In [2]:
results = []
for n in range(1,7):
    for pilot_n in [1,2]:
        with open(os.path.join(output_dir, 'coarse', f'pilot{pilot_n}', f"annotator{n}.jsonl"), 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                d = json.loads(line)
                if 'annotator' not in d:
                    d['annotator'] = f"annotator{n}"
                results.append(d)
        

In [3]:
18*6

108

In [3]:
results_df = pd.DataFrame(results)
print(len(results_df))
results_df[:1]

108


Unnamed: 0,_id,question_id,question,answer_id,answer_type,annotation_type,rated,answer,batch_id,confidence,correctness,relevance,safety,time,annotator
0,67ce11004b0825eeb3c78ead,question_34,Could esophagitis could like muscle stiffness ...,gpt4_1,gpt4,coarse,Yes,"Esophagitis, which is inflammation of the esop...",batch_0,Very confident,Partially Agree,Partially Agree,Partially Disagree,57.303854,annotator1


In [4]:
results_df5 = results_df.copy()
results_df3 = results_df.copy()

In [5]:
ratings5 = {"Disagree": 1,
            "Partially Disagree": 2,
            "Neutral": 3,
            "Partially Agree": 4,
            "Agree": 5}
ratings3 = {"Disagree": -1,
            "Partially Disagree": -1,
            "Neutral": 0,
            "Partially Agree": 1,
            "Agree": 1}
for label in ['correctness', 'relevance', 'safety']:
    results_df5[label].replace(ratings5, inplace=True)
    results_df3[label].replace(ratings3, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  results_df5[label].replace(ratings5, inplace=True)
  results_df5[label].replace(ratings5, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  results_df3[label].replace(ratings3, inplace=True)
  results_df3[label].replace(ratings3, inplace=True)


## Agreement with 5-point Likert Scales

In [9]:
annotations5 = {}
for label in ['correctness', 'relevance', 'safety']:
    annotations5[label] = {}
    for annotator in results_df5.annotator.unique():
        ddf = results_df5[results_df5['annotator'] == annotator].sort_values(['question_id', 'answer_id']).copy()
        # print(ddf['question_id'].to_list())
        ann = ddf[label].values.tolist()
        annotations5[label][annotator] = ann

In [10]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])
    a = kd.alpha(data.T.values, level_of_measurement='ordinal')
    print("Krippendorff's alpha", round(a, 2))
    fk = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='fleiss')
    print("Fleiss' Kappa", round(fk, 2))
    k = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='randolph')
    print("Randolph' Kappa", round(k, 2))

CORRECTNESS
Krippendorff's alpha 0.05
Fleiss' Kappa 0.04
Randolph' Kappa 0.42
RELEVANCE
Krippendorff's alpha 0.13
Fleiss' Kappa 0.07
Randolph' Kappa 0.33
SAFETY
Krippendorff's alpha 0.21
Fleiss' Kappa 0.04
Randolph' Kappa 0.05


In [9]:
results_df5[results_df5['annotator'] == 'annotator5'].sort_values(['question_id', 'answer_id']).copy()[['answer_id','relevance','answer']]

Unnamed: 0,answer_id,relevance,answer
73,gpt4_2,3,"No, the stomach flu and the flu are not the sa..."
79,llama_2,5,"No, the stomach flu and the flu are not the sa..."
80,physician_2,3,"No. Flu refers to influenza, a respiratory ill..."
85,gpt4_5,5,"Yes, several treatment options are available f..."
82,llama_5,5,Anal fissures can be effectively managed with ...
86,physician_5,5,The treatment goal for anal fissures is to rel...
87,gpt4_3,5,A neurological issue refers to any disorder or...
83,llama_3,5,"A neurological issue, also known as a neurolog..."
89,physician_3,5,A neurological issue is a problem that affects...
75,gpt4_1,5,"Esophagitis, which is inflammation of the esop..."


In [10]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])
    print(data)

CORRECTNESS
    annotator1  annotator2  annotator3  annotator4  annotator5  annotator6
0            3           5           4           5           5           5
1            5           5           5           5           5           5
2            4           4           4           5           5           4
3            4           4           3           4           5           5
4            5           5           5           4           5           5
5            5           5           3           5           5           5
6            4           5           3           5           5           5
7            2           5           3           5           5           5
8            4           4           2           5           5           5
9            4           5           4           5           4           5
10           5           5           5           5           5           4
11           5           5           4           5           5           5
12           

In [11]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])
        
    total_pairs = 0
    total_agreements = 0

    # For each item
    for index, row in data.iterrows():
        for i, j in combinations(range(len(row)), 2):
            total_pairs += 1
            if row[i] == row[j]:
                total_agreements += 1

    pairwise_percentage = total_agreements / total_pairs * 100
    print(f"Average pairwise percentage agreement: {pairwise_percentage:.2f}%")

CORRECTNESS
Average pairwise percentage agreement: 53.70%
RELEVANCE
Average pairwise percentage agreement: 46.67%
SAFETY
Average pairwise percentage agreement: 23.70%


  if row[i] == row[j]:


In [11]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    df = pd.DataFrame(annotations5[label])
    
    pairwise_corrs = []
    correlation_matrix = []

    for ann1, ann2 in combinations(df.columns, 2):
        rho, _ = spearmanr(df[ann1], df[ann2])
        # print(df[ann1].to_list())
        # print(df[ann2].to_list())
        # print(rho)
        correlation_matrix.append((ann1, ann2, rho))
        pairwise_corrs.append(rho)

    average_corr = sum(pairwise_corrs) / len(pairwise_corrs)
    print(f"Average pairwise Spearman correlation: {average_corr:.2f}")
    corr_matrix = pd.DataFrame(index=df.columns, columns=df.columns, data=0.0)

    for ann1, ann2, rho in correlation_matrix:
        corr_matrix.loc[ann1, ann2] = rho
        corr_matrix.loc[ann2, ann1] = rho  # Symmetric matrix

    # plt.figure(figsize=(8, 6))
    # sns.heatmap(corr_matrix.astype(float), annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt=".2f")
    # plt.title("Pairwise Spearman Correlation Heatmap")
    # plt.show()

CORRECTNESS
Average pairwise Spearman correlation: 0.05
RELEVANCE
Average pairwise Spearman correlation: 0.15
SAFETY
Average pairwise Spearman correlation: 0.30


## Agreement with 3-point Likert Scales

In [6]:
annotations3 = {}
for label in ['correctness', 'relevance', 'safety']:
    annotations3[label] = {}
    for annotator in results_df3.annotator.unique():
        ddf = results_df3[results_df3['annotator'] == annotator].sort_values(['question_id', 'answer_id']).copy()
        ann = ddf[label].values.tolist()
        annotations3[label][annotator] = ann

In [13]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
    a = kd.alpha(data.T.values, level_of_measurement='ordinal')
    print("Krippendorff's alpha", round(a, 2))
    fk = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='fleiss')
    print("Fleiss' Kappa", round(fk, 2))
    k = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='randolph')
    print("Randolph' Kappa", round(k, 2))

CORRECTNESS
Krippendorff's alpha -0.02
Fleiss' Kappa -0.05
Randolph' Kappa 0.71
RELEVANCE
Krippendorff's alpha 0.16
Fleiss' Kappa 0.14
Randolph' Kappa 0.58
SAFETY
Krippendorff's alpha 0.15
Fleiss' Kappa 0.1
Randolph' Kappa 0.14


In [8]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
        
    total_pairs = 0
    total_agreements = 0

    # For each item
    for index, row in data.iterrows():
        for i, j in combinations(range(len(row)), 2):
            total_pairs += 1
            if row[i] == row[j]:
                total_agreements += 1

    pairwise_percentage = total_agreements / total_pairs * 100
    print(f"Average pairwise percentage agreement: {pairwise_percentage:.2f}%")

CORRECTNESS
Average pairwise percentage agreement: 80.37%
RELEVANCE
Average pairwise percentage agreement: 71.85%
SAFETY
Average pairwise percentage agreement: 42.96%


  if row[i] == row[j]:


In [14]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    df = pd.DataFrame(annotations3[label])
    
    pairwise_corrs = []
    correlation_matrix = []

    for ann1, ann2 in combinations(df.columns, 2):
        rho, _ = spearmanr(df[ann1], df[ann2])
        correlation_matrix.append((ann1, ann2, rho))
        pairwise_corrs.append(rho)

    average_corr = sum(pairwise_corrs) / len(pairwise_corrs)
    print(f"Average pairwise Spearman correlation: {average_corr:.2f}")
    corr_matrix = pd.DataFrame(index=df.columns, columns=df.columns, data=0.0)

    for ann1, ann2, rho in correlation_matrix:
        corr_matrix.loc[ann1, ann2] = rho
        corr_matrix.loc[ann2, ann1] = rho  # Symmetric matrix

    # plt.figure(figsize=(8, 6))
    # sns.heatmap(corr_matrix.astype(float), annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt=".2f")
    # plt.title("Pairwise Spearman Correlation Heatmap")
    # plt.show()

CORRECTNESS
Average pairwise Spearman correlation: nan
RELEVANCE
Average pairwise Spearman correlation: 0.18
SAFETY
Average pairwise Spearman correlation: 0.19


  rho, _ = spearmanr(df[ann1], df[ann2])


In [19]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
    print(data)

CORRECTNESS
    annotator1  annotator2  annotator3  annotator4  annotator5  annotator6
0            0           1           1           1           1           1
1            1           1           1           1           1           1
2            1           1           1           1           1           1
3            1           1           0           1           1           1
4            1           1           1           1           1           1
5            1           1           0           1           1           1
6            1           1           0           1           1           1
7           -1           1           0           1           1           1
8            1           1          -1           1           1           1
9            1           1           1           1           1           1
10           1           1           1           1           1           1
11           1           1           1           1           1           1
12           

# Identifying wrong annotator

5-POINT LIKERT
correctness: annotator 3 (good: annotator 2)
relevance: annotator 1 and 4 (good: annotator 3)
safety: annotator 6

3-POINT LIKERT
correctness: bad: annotator 1 and 5
relevance: annotator 5 (good: annotator 3 and 4)
safety: annotator 6

## 5-point Likert scale

In [15]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])
    
    full_kappa = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='randolph')
    
    kappas_without_each = {}

    for annotator in data.T.index:
        reduced_data = data.drop(columns=[annotator]).values
        kappa_loo = irr.fleiss_kappa(irr.aggregate_raters(reduced_data)[0], method='randolph')
        kappas_without_each[annotator] = kappa_loo

    # Display results
    summary = pd.DataFrame({
        'Kappa without Annotator': (kappas_without_each)
    })
    summary['Delta (Kappa - Full)'] = round(summary['Kappa without Annotator'] - full_kappa, 4)
    summary = summary.sort_values(by='Delta (Kappa - Full)', ascending=False)

    print(f"Full Randolph's Kappa (with all annotators): {full_kappa:.4f}")
    print(summary)

CORRECTNESS
Full Randolph's Kappa (with all annotators): 0.4213
            Kappa without Annotator  Delta (Kappa - Full)
annotator3                 0.520833                0.0995
annotator4                 0.430556                0.0093
annotator1                 0.400000               -0.0213
annotator5                 0.381944               -0.0394
annotator6                 0.381944               -0.0394
annotator2                 0.375000               -0.0463
RELEVANCE
Full Randolph's Kappa (with all annotators): 0.3333
            Kappa without Annotator  Delta (Kappa - Full)
annotator3                 0.395833                0.0625
annotator1                 0.348148                0.0148
annotator6                 0.326389               -0.0069
annotator4                 0.319444               -0.0139
annotator5                 0.291667               -0.0417
annotator2                 0.277778               -0.0556
SAFETY
Full Randolph's Kappa (with all annotators): 0.0463
   

## 3-point likert scale

In [16]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
    
    full_kappa = irr.fleiss_kappa(irr.aggregate_raters(data)[0], method='randolph')
    
    kappas_without_each = {}

    for annotator in data.T.index:
        reduced_data = data.drop(columns=[annotator]).values
        kappa_loo = irr.fleiss_kappa(irr.aggregate_raters(reduced_data)[0], method='randolph')
        kappas_without_each[annotator] = kappa_loo

    # Display results
    summary = pd.DataFrame({
        'Kappa without Annotator': (kappas_without_each)
    })
    summary['Delta (Kappa - Full)'] = round(summary['Kappa without Annotator'] - full_kappa, 4)
    summary = summary.sort_values(by='Delta (Kappa - Full)', ascending=False)

    print(f"Full Randolph's Kappa (with all annotators): {full_kappa:.4f}")
    print(summary)

CORRECTNESS
Full Randolph's Kappa (with all annotators): 0.7056
            Kappa without Annotator  Delta (Kappa - Full)
annotator3                 0.833333                0.1278
annotator1                 0.741667                0.0361
annotator5                 0.683333               -0.0222
annotator2                 0.675000               -0.0306
annotator4                 0.650000               -0.0556
annotator6                 0.650000               -0.0556
RELEVANCE
Full Randolph's Kappa (with all annotators): 0.5778
            Kappa without Annotator  Delta (Kappa - Full)
annotator3                 0.633333                0.0556
annotator1                 0.600000                0.0222
annotator5                 0.591667                0.0139
annotator2                 0.575000               -0.0028
annotator6                 0.541667               -0.0361
annotator4                 0.525000               -0.0528
SAFETY
Full Randolph's Kappa (with all annotators): 0.1444
   

## Other ways of computing disagreement

In [17]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations3[label])
    kappa_matrix = pd.DataFrame(index=results_df.annotator.unique(), columns=results_df.annotator.unique(), dtype=float)

    # Compute pairwise **weighted Cohen’s Kappa**
    for a1, a2 in combinations(results_df.annotator.unique(), 2):
        #kappa = cohen_kappa_score(data[a1], data[a2], weights='quadratic')  # or 'linear'
        mad = np.mean(np.abs(data[a1] - data[a2]))
        kappa_matrix.loc[a1, a2] = mad
        kappa_matrix.loc[a2, a1] = mad

    np.fill_diagonal(kappa_matrix.values, np.nan)

    # Compute average agreement per annotator
    average_kappa = kappa_matrix.mean(axis=1)
    summary = pd.DataFrame({
        'Average Weighted Kappa with Others': average_kappa
    }).sort_values(by='Average Weighted Kappa with Others', ascending=False)

    print(summary)

CORRECTNESS
            Average Weighted Kappa with Others
annotator3                            0.477778
annotator1                            0.366667
annotator5                            0.211111
annotator2                            0.188889
annotator4                            0.166667
annotator6                            0.166667
RELEVANCE
            Average Weighted Kappa with Others
annotator1                            0.444444
annotator3                            0.444444
annotator5                            0.355556
annotator2                            0.311111
annotator6                            0.266667
annotator4                            0.266667
SAFETY
            Average Weighted Kappa with Others
annotator5                            0.966667
annotator4                            0.900000
annotator6                            0.855556
annotator1                            0.833333
annotator2                            0.766667
annotator3                     

In [18]:
for label in ['correctness', 'relevance', 'safety']:
    print(label.upper())
    data = pd.DataFrame(annotations5[label])

    # Dictionary to store MAD scores
    mad_scores = {}

    # Loop through each annotator
    for annotator in data.columns:
        # Step 1: Remove current annotator from the data
        reduced_data = data.drop(columns=[annotator])
        
        # Step 2: Compute consensus (mode/majority vote per column)
        consensus = reduced_data.mode(axis=1)[0]
        
        # Step 3: Get annotator's own ratings
        annotator_labels = data[annotator]
        
        # Step 4: Compute mean absolute difference
        mad = np.mean(np.abs(annotator_labels - consensus))
        mad_scores[annotator] = mad

    # Convert to DataFrame
    mad_df = pd.DataFrame(list(mad_scores.items()), columns=['Annotator', 'Mean Absolute Difference'])
    mad_df = mad_df.sort_values(by='Mean Absolute Difference', ascending=False)

    # Show results
    print("Annotators sorted by divergence from consensus (higher MAD = more divergent):")
    print(mad_df)


CORRECTNESS
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
2  annotator3                  1.000000
0  annotator1                  0.722222
3  annotator4                  0.444444
4  annotator5                  0.388889
1  annotator2                  0.333333
5  annotator6                  0.333333
RELEVANCE
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
0  annotator1                  1.055556
2  annotator3                  0.888889
5  annotator6                  0.666667
3  annotator4                  0.611111
4  annotator5                  0.611111
1  annotator2                  0.500000
SAFETY
Annotators sorted by divergence from consensus (higher MAD = more divergent):
    Annotator  Mean Absolute Difference
4  annotator5                  1.722222
3  annotator4                  1.333333
5  annotator6                  1.277778
0  annotator1    