In [10]:
import pandas as pd

from sklearn.metrics import cohen_kappa_score, classification_report
import krippendorff
import numpy as np

In [2]:
data = pd.read_parquet("data/publicsphere/publicsphere.cardiff_prompt_classify_anon.parquet")
data.head()

Unnamed: 0,StartDate,RecordedDate,IPAddress,Finished,Coder,ID,Mark_ID,Genre,topiccode,Platform,...,rationality_topic_relevance,political_negativity,rationality_background_info,rationality_reasoning,sentiment,offensive,topics,emotions,irony,hate
0,5/30/2021 13:03:17,5/30/2021 13:04:17,62.194.51.29,1,6,UgyPHwv8G0cDE6-wEgl4AaABAg.8_0ZjJKSJty8_0kXGkAd2U,119,0,0,1,...,No,not political/negative,No,No,"[negative, neutral]","{'offensive', 'non-offensive'}",set(),"{'sadness', 'pessimism'}","{'non_irony', 'irony'}",{'NOT-HATE'}
1,10/11/2021 10:34:05,10/11/2021 10:36:46,213.127.109.191,1,6,Ugx2WXq9UdV8mPPjejJ4AaABAg.8yHCKV0Boe58yYRxEQEF45,282,1,2,1,...,Yes,not political/negative,No,Yes,"[negative, neutral]","{'offensive', 'non-offensive'}",{'news_&_social_concern'},"{'disgust', 'anger'}",{'non_irony'},{'NOT-HATE'}
2,9/9/2021 18:49:48,9/9/2021 18:51:32,213.127.110.0,1,6,1110578710648890000,372,2,4,2,...,Yes,political/negative,No,No,"[negative, neutral]",{'non-offensive'},{'news_&_social_concern'},"{'anticipation', 'disgust', 'anger'}",{'non_irony'},{'NOT-HATE'}
3,6/6/2021 16:12:46,6/6/2021 16:16:16,213.127.76.145,1,6,UgwUPFScjJ0MCeaP2F54AaABAg.8lvp3fc9Euf8lvvgsUgEgV,769,0,0,1,...,Yes,political/negative,Yes,Yes,"[negative, neutral]",{'non-offensive'},{'news_&_social_concern'},{'disgust'},{'irony'},{'NOT-HATE'}
4,6/13/2021 13:25:49,6/13/2021 13:27:28,213.127.82.232,1,6,UgwWKCWtSJdFvjGHvTp4AaABAg.8kUC5dGrQ2H8kUDRihE2f3,1206,0,0,1,...,No,not political/negative,No,No,"[positive, neutral]",{'non-offensive'},{'diaries_&_daily_life'},{'sadness'},"{'non_irony', 'irony'}",{'NOT-HATE'}


In [3]:
llm_human_column_pairs = [
    ("Acknowledgement", "interactivity_acknowledgement"),
    ("BackgroundInfo", "rationality_background_info"),
    ("ExternalEvidence", "rationality_external_evidence"),
    ("Reasoning", "rationality_reasoning"),
    ("TopicRelevance", "rationality_topic_relevance"),
]

In [24]:
print("Inter-coder reliablity: classification_report(human as gold, llm as predicted), cohen_kappa(human, llm):")

for human_col, llm_col in llm_human_column_pairs:
    subset = data[[human_col, llm_col]].dropna()
    human = subset[human_col].map({0: "No", 1: "Yes"}).tolist()
    llm = subset[llm_col].tolist()
    
    print("---")
    print(f"{classification_report(human, llm)}")
    print(f"cohen_kappa_score({human_col}): {cohen_kappa_score(human, llm)}")
    print(f"krippendorf({human_col}): {krippendorff.alpha(np.array([human, llm]), level_of_measurement="nominal")}")

Inter-coder reliablity: classification_report(human as gold, llm as predicted), cohen_kappa(human, llm):
---
              precision    recall  f1-score   support

          No       0.83      0.74      0.78      2854
         Yes       0.44      0.58      0.50      1004

    accuracy                           0.70      3858
   macro avg       0.64      0.66      0.64      3858
weighted avg       0.73      0.70      0.71      3858

cohen_kappa_score(Acknowledgement): 0.2889861702288804
krippendorf(Acknowledgement): 0.2829925650557621
---
              precision    recall  f1-score   support

          No       0.96      0.87      0.92      3469
         Yes       0.38      0.68      0.49       389

    accuracy                           0.86      3858
   macro avg       0.67      0.78      0.70      3858
weighted avg       0.90      0.86      0.87      3858

cohen_kappa_score(BackgroundInfo): 0.41023725361398966
krippendorf(BackgroundInfo): 0.4024105964926574
---
              precisio