In [1]:
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa

# 1. Read the annotation table from disk
df = pd.read_csv("annotations.csv")

# Remove leading/trailing whitespace from the labels in each annotator
# column so that otherwise-equal labels compare equal.
for column in ("Annotator_1", "Annotator_2", "Annotator_3"):
    df[column] = df[column].str.strip()

# Pairwise Cohen's Kappa for every unordered pair of annotators
def calculate_cohen_kappa(df, annotators):
    """Compute Cohen's kappa for each pair of annotator columns.

    Args:
        df: Table indexable by the names in ``annotators``; each selected
            column is passed unchanged to ``cohen_kappa_score``.
        annotators: Iterable of column names. Pairs are formed in the order
            given, so ``[a, b, c]`` produces a-b, a-c, b-c.

    Returns:
        dict mapping ``"<first> vs <second>"`` to the value returned by
        ``cohen_kappa_score`` for that pair, in pair order.
    """
    # combinations() yields each unordered pair exactly once, in the same
    # order the nested index loops (i < j) would have produced.
    return {
        f"{first} vs {second}": cohen_kappa_score(df[first], df[second])
        for first, second in combinations(annotators, 2)
    }

# Fleiss' Kappa across all annotators at once
def calculate_fleiss_kappa(df, annotators):
    """Compute Fleiss' kappa over the given annotator columns.

    Builds a count table with one row per row of ``df`` and one column per
    distinct label (labels in sorted order), where each cell is the number
    of annotators who assigned that label to that row. The table is then
    passed to ``fleiss_kappa`` with ``method='fleiss'``.

    Args:
        df: Table indexable by the names in ``annotators``.
        annotators: Column names whose labels are compared.

    Returns:
        The value returned by ``fleiss_kappa`` for the count table.

    NOTE(review): no missing-value handling is done here; presumably every
    row carries a label from every annotator. Confirm against the input.
    """
    # Every distinct label used by any annotator, in sorted order.
    categories = sorted(
        set().union(*(df[name].unique() for name in annotators))
    )

    # table[row, col] = how many annotators gave label `categories[col]`
    # to item `row`.
    table = np.zeros((len(df), len(categories)))
    for col, category in enumerate(categories):
        for name in annotators:
            table[:, col] += (df[name] == category).astype(int)

    return fleiss_kappa(table, method='fleiss')

# Columns that hold each annotator's labels
annotators = ['Annotator_1', 'Annotator_2', 'Annotator_3']

# Pairwise agreement (Cohen) followed by overall agreement (Fleiss)
cohen_kappa_results = calculate_cohen_kappa(df, annotators)
fleiss_kappa_result = calculate_fleiss_kappa(df, annotators)

# How often each annotator used each label
label_counts = {}
for annotator in annotators:
    label_counts[annotator] = df[annotator].value_counts().to_dict()

# --- Report ---
print("\nCohen's Kappa Results:")
for pair, kappa in cohen_kappa_results.items():
    print(f"{pair}: {kappa:.2f}")

# Header and value are emitted on separate lines via sep="\n".
print("\nFleiss' Kappa Result:", f"{fleiss_kappa_result:.2f}", sep="\n")

print("\nLabel Counts:")
for annotator, counts in label_counts.items():
    print(f"{annotator}:")
    for label, count in counts.items():
        print(f"  {label}: {count}")



Cohen's Kappa Results:
Annotator_1 vs Annotator_2: 0.82
Annotator_1 vs Annotator_3: 0.60
Annotator_2 vs Annotator_3: 0.71

Fleiss' Kappa Result:
0.71

Label Counts:
Annotator_1:
  U: 40
  C: 36
  D: 24
Annotator_2:
  U: 48
  C: 26
  D: 26
Annotator_3:
  U: 51
  C: 30
  D: 19
