This notebook compares annotations using different GLLMs with codebooks based prompts of Boukes 2024, Jaidka 2022 and Naab 2025 on their respective datasets

In [2]:
import requests
import os
from dotenv import load_dotenv
load_dotenv("sjoerdAzure.env")  # Load environment variables from .env file
import time

import typing

from sklearn.metrics import cohen_kappa_score, classification_report
import krippendorff
import yaml

import pandas as pd

import config
import src
import tqdm
import json
import numpy as np
import logging

#import cltrier_lib as lib
import pyreadstat
import yaml
pd.set_option('display.max_colwidth', 100) 
#set up helper variables and functions:
CFG = config.Config()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\sstolwi\Github\llmdiv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\sstolwi\Github\llmdiv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\sstolwi\Github\llmdiv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\sstolwi\Git

In [3]:
#load data:

# Jaida2024 data
jaidka = pd.read_parquet('data/jaidka2022/TwitterDeliberativePolitics2.parquet')
# Boukes
boukes = pd.read_parquet('data/publicsphere/publicsphere.cardiff_prompt_classify_anon.parquet')
boukesT = pd.read_csv('data/publicsphere/full_data.csv') # this includes the comments
#the Boukes2024 data is a subset of this, select YT part of Boukes in line with Boukes2024:
boukesTYT = boukesT[boukesT['Platform'] == 1]
#MH_clemm 2024
MHclemm = pd.read_parquet('data/MH_BClemm_data/Ideo_Val_GPT_USA_L33_70b.parquet')


list the variables we want to use:
**rationality** - prompt: 'rationality_simple2', 'rationality_jaidka',        
  manual coding: "Justification" (Jaidka), RATIONALITY_DUMMY
**incivility** - prompt: 'incivility_simple2', 'incivility_jaidka',  civility_jaidka         
  manual coding: INCIVILITY_DUMMY, Incivility_tot ('Uncivil_abuse', 'Empathy_Respect'), Uncivil_abuse, "Empathy_Respect" (jaidka)
**interactivity** - prompt: 'interactivity_acknowledgement_simple', interactivity_acknowledgement_jaidka       
  manual coding: INTERACTIVITY_DUMMY, Reciprocity (Jaidka)
**diversity/ideology** - prompt: 'political_ideology_US', 'political_ideology' (german)  -> no ideology in Jaidka
  manual coding: LIBERAL_DUMMY, CONSERVATIVE_DUMMY
**political_dum** - prompt: 'political_post', political_post_jaidka 
  manual coding: HAS_OPINION_DUMMY

In [None]:
#model variants:
# llama31_8b
# llama31_70b
# gpt4o
# gpt4Turbo

#optional:
# gpt4 (OPenAI, microsoft)
# llama33_70b (Meta)
# Gemma3:22b (US, google) (based on Gemini 2)
# "id":"deepseek-r1:70b","name":"DeepSeek-R1 (china)
# qwen2.5:70b (china)
# mistral-large:123b","name":"Mistral" (europe)


list the annotations we have available per dataset:
**Jaidka**: 'rationality_simple2_Llama8b_dum', 'rationality_simple2_gpt4o_dum','rationality_jaidka_Llama8b_dum','rationality_jaidka_gpt4o_dum', 'civility_jaidka_gpt4o_dum', incivility_simple2_gpt4o_dum, incivility_jaidka_gpt4o_dum, reciprocity_jaidka_gpt4o_dum, interactivity_acknowledgement_simple_gpt4o_dum, political_post_jaidka_gpt4o_dum, political_post_gpt4o_dum
**Boukes**: 
*rationality*: rationality_simple2_dum (+ rationality_simple_dum, rationality_combine_dum, rationality_combine_exactexample_dum, rationality_prompt_dum (aggregation of indicator prompt scores)), rationality_simple2_gpt4o_system_dum, rationality_simple2_gpt4T_system_dum, rationality_simple2_small_dum, 
*incivility*: incivility_simple2_dum (+ incivility_simple_dum, incivility_combine_dum), incivility_prompt_dum (aggregation of indicator prompt scores), incivility_simple2_gpt4o_system_dum, incivility_simple2_gpt4T_system_dum, incivility_simple2_small_dum 
*interactivity*: interactivity_acknowledgement_simple_dum (+ interactivity_acknowledgement_simple2_dum), interactivity_acknowledgement_simple_gpt4o_system_dum, interactivity_acknowledgement_simple_gpt4T_system_dum, interactivity_acknowledgement_simple_small_dum (+interactivity_acknowledgement_simple_small2_dum)
*diversity*: political_liberal_US_dum, political_conservative_US_dum, political_liberal_US_gpt4o_system_dum, political_liberal_US_gpt4T_system_dum, political_conservative_US_gpt4o_system_dum, political_conservative_US_gpt4T_system_dum, political_liberal_US_small_dum, political_conservative_US_small_dum
*political_dum*: political_opinion_US_dum, political_opinion_US_gpt4o_system_dum, political_opinion_US_gpt4T_system_dum, political_opinion_US_small_dum (either liberal/conservative; Boukes)

Update and clarify variable names

In [67]:
MHclemm.columns.to_list()

['text',
 'label',
 'GPT1',
 'GPT2',
 'GPT_Reconciled',
 'political_ideology_US_L33_70b_zero',
 'political_ideology_US_para1_L33_70b_zero',
 'political_ideology_US_L31_8b_zero',
 'political_ideology_US_para1_L31_8b_zero',
 'political_ideology_US_Q25_72b_zero',
 'political_ideology_US_para1_Q25_72b_zero',
 'political_ideology_US_L33_70b_zero_con_dum',
 'political_ideology_US_L33_70b_zero_lib_dum',
 'political_ideology_US_para1_L33_70b_zero_con_dum',
 'political_ideology_US_para1_L33_70b_zero_lib_dum',
 'political_ideology_US_L31_8b_zero_con_dum',
 'political_ideology_US_L31_8b_zero_lib_dum',
 'political_ideology_US_para1_L31_8b_zero_con_dum',
 'political_ideology_US_para1_L31_8b_zero_lib_dum',
 'political_ideology_US_Q25_72b_zero_con_dum',
 'political_ideology_US_Q25_72b_zero_lib_dum',
 'political_ideology_US_para1_Q25_72b_zero_con_dum',
 'political_ideology_US_para1_Q25_72b_zero_lib_dum']

In [66]:
jaidka.head()

Unnamed: 0,message_id,message,Constructiveness,Justification,Justification_internal,Justification_external,Relevance,Reciprocity,Empathy_Respect,Uncivil_abuse,...,political_post_jaidka_L31_8b_zero_dum,civility_jaidka_Q25_72b_zero_dum,incivility_jaidka_Q25_72b_zero_dum,incivility_simple2_Q25_72b_zero_dum,reciprocity_jaidka_Q25_72b_zero_dum,interactivity_acknowledgement_simple_Q25_72b_zero_dum,political_post_Q25_72b_zero_dum,political_post_jaidka_Q25_72b_zero_dum,rationality_jaidka_Q25_72b_zero_dum,rationality_simple2_Q25_72b_zero_dum
0,1,@USER- #GrahamCassidy will devastate #MilitaryFamilies w/ kids like Justin who need #Medicaid. P...,0,1,0,0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,1,1,0,1,0
1,2,@USER- The US people &amp; Minnesotans must see the Senate Ethics investigation committee hearin...,0,1,0,0,1.0,1.0,1.0,1.0,...,0,0,1,1,0,1,1,0,1,0
2,4,"=@USER - ""we all want the same thing when you look at the big picture""",0,1,0,0,1.0,,1.0,,...,0,1,0,0,0,1,0,0,0,0
3,5,@USER - A poison in our island - Rising seas caused by climate change are seeping inside a Unite...,0,1,0,0,1.0,0.0,,0.0,...,0,0,0,1,0,1,1,0,1,1
4,6,=@USER - hypocrite. You are A porn surfer and claim to be holier than thou.,0,0,0,0,,0.0,0.0,,...,0,0,1,1,0,0,0,0,1,0


In [44]:
zerolist = [    'rationality_simple2_Llama8b',
 'rationality_jaidka_Llama8b',  'rationality_jaidka_gpt4o',  'reciprocity_jaidka_gpt4o', 'civility_jaidka_gpt4o',
 'incivility_jaidka_gpt4o',
 'political_post_jaidka_gpt4o',
 'constructiveness_jaidka_gpt4o',
 'interactivity_acknowledgement_simple_gpt4o',
 'incivility_simple2_gpt4o', 'political_post_gpt4o',


 ]


##change column names in boukes for lowlist:
for col in zerolist:
    if col in jaidka.columns:
        jaidka.rename(columns={col: col + '_zero'}, inplace=True)
    if col + '_dum' in jaidka.columns:
        jaidka.rename(columns={col + '_dum': col + '_zero_dum'}, inplace=True)

In [None]:
lowlist = ['rationality_simple2_L33_70b',
 'civility_jaidka_L33_70b',
 'incivility_jaidka_L33_70b',
 'incivility_simple2_L33_70b',
 'rationality_jaidka_L33_70b',
 'reciprocity_jaidka_L33_70b',
 
]

#change column names in boukes for zerolist:
for col in lowlist:
    if col in boukes.columns:
        boukes.rename(columns={col: col + '_low'}, inplace=True)
    if col + '_dum' in boukes.columns:
        boukes.rename(columns={col + '_dum': col + '_low_dum'}, inplace=True)

In [58]:
#create dummy variables:
ideolist = ['political_ideology_US_L33_70b_low', 'political_ideology_US_L33_70b_seed2_low', 'political_ideology_US_para1_L33_70b_low',
 'political_ideology_US_para2_L33_70b_low',
 'political_ideology_US_simpa1_L33_70b_low', 'political_ideology_US_L33_70b_seed2_run2_low',  'political_ideology_US_para1_L31_8b_low',
 'political_ideology_US_para2_L31_8b_low',
 'political_ideology_US_simpa1_L31_8b_low', 'political_ideology_US_L31_8b_seed2_low',  'political_ideology_US_para1_Q25_72b_zero', 'political_ideology_US_Q72b_zero', 'political_ideology_US_para2_Q25_72b_zero',
 'political_ideology_US_simpa1_Q25_72b_zero', 'political_ideology_US_Q72b_seed2_zero', 'political_ideology_US_L33_70b_zero', 'political_ideology_US_L33_70b_zero_seed2',  'political_ideology_US_L31_8b_zero',
 'political_ideology_US_L31_8b_zero_seed2',  'political_ideology_US_Q25_72b_low',  'political_ideology_US_Q25_72b_low_seed2',


]

#make a two dummy variable for each ideology column, one for conservative and one for liberal:
for col in ideolist:
    if col in boukes.columns:
        boukes[col + '_con_dum'] = boukes[col].apply(lambda x: 1 if x == 'conservative' else 0)
        boukes[col + '_lib_dum'] = boukes[col].apply(lambda x: 1 if x == 'liberal' else 0)

In [60]:
#make dummy variables for dummy_list:
dummy_list = [ 'civility_jaidka_L33_70b_low',
 'interactivity_acknowledgement_simple_L33_70b_low',
 'rationality_simple2_para1_L33_70b_low',
 'incivility_simple2_L33_70b_seed2_low',
 'interactivity_acknowledgement_simple_L33_70b_seed2_low',
 'political_post_L33_70b_seed2_low',
 'rationality_simple2_L33_70b_seed2_low',
 'political_post_para1_L33_70b_low',
 'political_post_para2_L33_70b_low',
 'political_post_simpa1_L33_70b_low',
 'rationality_simple2_para2_L33_70b_low',
 'rationality_simple2_simpa1_L33_70b_low',
 'incivility_para1_L33_70b_low',
 'incivility_para2_L33_70b_low',
 'incivility_simpa1_L33_70b_low',
 'interactivity_acknowledgement_simple_para1_L33_70b_low',
 'interactivity_acknowledgement_simple_para2_L33_70b_low',
 'interactivity_acknowledgement_simple_simpa1_L33_70b_low',
 'incivility_simple2_L33_70b_seed2_run2_low',
 'interactivity_acknowledgement_simple_L33_70b_seed2_run2_low',
 'political_post_L33_70b_seed2_run2_low',
 'rationality_simple2_L33_70b_seed2_run2_low',
 'civility_jaidka_L31_8b_low',
 'incivility_jaidka_L31_8b_low',
 'incivility_para1_L31_8b_low',
 'incivility_para2_L31_8b_low',
 'incivility_simpa1_L31_8b_low',
 'reciprocity_jaidka_L31_8b_low',
 'interactivity_acknowledgement_simple_para1_L31_8b_low',
 'interactivity_acknowledgement_simple_para2_L31_8b_low',
 'interactivity_acknowledgement_simple_simpa1_L31_8b_low',
 'political_post_L31_8b_low',
 'political_post_jaidka_L31_8b_low',
 'political_post_para1_L31_8b_low',
 'political_post_para2_L31_8b_low',
 'political_post_simpa1_L31_8b_low',
 'rationality_jaidka_L31_8b_low',
 'rationality_simple2_para1_L31_8b_low',
 'rationality_simple2_para2_L31_8b_low',
 'rationality_simple2_simpa1_L31_8b_low',
 'incivility_simple2_L31_8b_seed2_low',
 'interactivity_acknowledgement_simple_L31_8b_seed2_low',
  'political_post_L31_8b_seed2_low',
 'rationality_simple2_L31_8b_seed2_low',
 'incivility_simple2_Q72b_zero',
 'interactivity_acknowledgement_simple_Q72b_zero',
 'political_post_Q72b_zero',
 'rationality_simple2_Q72b_zero',
 'civility_jaidka_Q25_72b_zero',
 'incivility_jaidka_Q25_72b_zero',
 'incivility_para1_Q25_72b_zero',
 'reciprocity_jaidka_Q25_72b_zero',
 'interactivity_acknowledgement_simple_para1_Q25_72b_zero',
 'political_post_jaidka_Q25_72b_zero',
 'political_post_para1_Q25_72b_zero',
 'rationality_jaidka_Q25_72b_zero',
 'rationality_simple2_para1_Q25_72b_zero',
 'incivility_para2_Q25_72b_zero',
 'incivility_simpa1_Q25_72b_zero',
 'interactivity_acknowledgement_simple_para2_Q25_72b_zero',
 'interactivity_acknowledgement_simple_simpa1_Q25_72b_zero',
  'political_post_para2_Q25_72b_zero',
 'political_post_simpa1_Q25_72b_zero',
 'rationality_simple2_para2_Q25_72b_zero',
 'rationality_simple2_simpa1_Q25_72b_zero',
 'incivility_simple2_Q72b_seed2_zero',
 'interactivity_acknowledgement_simple_Q72b_seed2_zero',
  'political_post_Q72b_seed2_zero',
 'rationality_simple2_Q72b_seed2_zero',
 'incivility_simple2_L33_70b_zero',
 'interactivity_acknowledgement_simple_L33_70b_zero',
  'political_post_L33_70b_zero',
 'rationality_simple2_L33_70b_zero',
 'incivility_simple2_L33_70b_zero_seed2',
 'interactivity_acknowledgement_simple_L33_70b_zero_seed2',
  'political_post_L33_70b_zero_seed2',
 'rationality_simple2_L33_70b_zero_seed2',
 'incivility_simple2_L31_8b_zero',
 'interactivity_acknowledgement_simple_L31_8b_zero',
 'political_post_L31_8b_zero',
 'rationality_simple2_L31_8b_zero',
 'incivility_simple2_L31_8b_zero_seed2',
 'interactivity_acknowledgement_simple_L31_8b_zero_seed2',
 'political_post_L31_8b_zero_seed2',
 'rationality_simple2_L31_8b_zero_seed2',
 'incivility_simple2_Q25_72b_low',
 'interactivity_acknowledgement_simple_Q25_72b_low',
 'political_post_Q25_72b_low',
 'rationality_simple2_Q25_72b_low',
 'incivility_simple2_Q25_72b_low_seed2',
 'interactivity_acknowledgement_simple_Q25_72b_low_seed2',
 'political_post_Q25_72b_low_seed2',
 'rationality_simple2_Q25_72b_low_seed2',
 'rationality_jaidka_gpt4o_system_zero',
 'incivility_jaidka_gpt4o_system_zero',
 'civility_jaidka_gpt4o_system_zero',
 'interactivity_acknowledgement_jaidka_gpt4o_system_zero',
 'political_post_jaidka_gpt4o_system_zero',
 'incivility_para1_gpt4o_system_zero',
 'incivility_para2_gpt4o_system_zero',
 'interactivity_acknowledgement_para1_gpt4o_system_zero',
 'interactivity_acknowledgement_para2_gpt4o_system_zero',
 'incivility_simpa1_gpt4o_system_zero']

#first check if which unique values are in each of these columns:
unique_values = {}
for col in dummy_list:
    if col in boukes.columns:
        unique_values[col] = boukes[col].unique().tolist()
        print(f"Unique values in {col}: {unique_values[col]}")

#if a column starts with political_post, make dummy for 'political' and 'non-political':
#if a column does not start with political_post, make dummy for 'yes' and 'no':
for col in dummy_list:
    if col in boukes.columns:
        if 'political_post' in col:
            boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'political' else 0)
        else:
            boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)



Unique values in civility_jaidka_L33_70b_low: ['No', 'Yes', None]
Unique values in interactivity_acknowledgement_simple_L33_70b_low: ['No', 'Yes']
Unique values in rationality_simple2_para1_L33_70b_low: ['No', None, 'Yes']
Unique values in incivility_simple2_L33_70b_seed2_low: ['No', 'Yes', None]
Unique values in interactivity_acknowledgement_simple_L33_70b_seed2_low: ['No', None, 'Yes']
Unique values in political_post_L33_70b_seed2_low: ['non-political', 'political', None]
Unique values in rationality_simple2_L33_70b_seed2_low: ['No', None, 'Yes']
Unique values in political_post_para1_L33_70b_low: ['non-political', 'political', None]
Unique values in political_post_para2_L33_70b_low: ['non-political', 'political', None]
Unique values in political_post_simpa1_L33_70b_low: ['non-political', 'political', None]
Unique values in rationality_simple2_para2_L33_70b_low: ['No', None, 'Yes']
Unique values in rationality_simple2_simpa1_L33_70b_low: ['No', None, 'Yes']
Unique values in incivility

  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'political' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'political' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == 'Yes' else 0)
  boukes[col + '_dum'] = boukes[col].apply(lambda x: 1 if x == '

In [64]:
#dummies for jaidka:
jaidkadum = ['incivility_jaidka_L33_70b_zero',
 'incivility_simple2_L33_70b_zero',
 'reciprocity_jaidka_L33_70b_zero',
 'interactivity_acknowledgement_simple_L33_70b_zero',
 'political_post_L33_70b_zero',
 'political_post_jaidka_L33_70b_zero',
 'rationality_jaidka_L33_70b_zero',
 'rationality_simple2_L33_70b_zero',
 'civility_jaidka_L31_8b_zero',
 'incivility_jaidka_L31_8b_zero',
 'incivility_simple2_L31_8b_zero',
 'reciprocity_jaidka_L31_8b_zero',
 'interactivity_acknowledgement_simple_L31_8b_zero',
 'political_post_L1_8b_zero',
 'political_post_jaidka_L31_8b_zero',
 'civility_jaidka_Q25_72b_zero',
 'incivility_jaidka_Q25_72b_zero',
 'incivility_simple2_Q25_72b_zero',
 'reciprocity_jaidka_Q25_72b_zero',
 'interactivity_acknowledgement_simple_Q25_72b_zero',
 'political_post_Q25_72b_zero',
 'political_post_jaidka_Q25_72b_zero',
 'rationality_jaidka_Q25_72b_zero',
 'rationality_simple2_Q25_72b_zero']

#if a column starts with political_post, make dummy for 'political' and 'non-political':
#if a column does not start with political_post, make dummy for 'yes' and 'no':
for col in jaidkadum:
    if col in jaidka.columns:
        if 'political_post' in col:
            jaidka[col + '_dum'] = jaidka[col].apply(lambda x: 1 if x == 'political' else 0)
        else:
            jaidka[col + '_dum'] = jaidka[col].apply(lambda x: 1 if x == 'Yes' else 0)

In [None]:
#save the updated dataset with dummy variables:
boukes.to_parquet('data/publicsphere/publicsphere.cardiff_prompt_classify_anon.parquet', index=False)

In [None]:
MHclemm.to_parquet('data/MH_BClemm_data/Ideo_Val_GPT_USA_L33_70b.parquet', index=False)

In [None]:
jaidka.to_parquet('data/jaidka2022/TwitterDeliberativePolitics2.parquet', index=False)

In [None]:
#feasability check:
#do annotations of Llama3.3:70b correlate with gpt4o?
boukes.loc[:, ['rationality_simple2_L33_70b_dum', 'rationality_jaidka_L33_70b_dum', 'rationality_simple2_small_dum', 'rationality_simple2_gpt4o_dum', 'rationality_simple_dum', 'RATIONALITY_DUMMY']] \
    .corr(method='pearson').round(2)

Unnamed: 0,rationality_simple2_L33_70b_dum,rationality_jaidka_L33_70b_dum,rationality_simple2_small_dum,rationality_simple2_gpt4o_dum,rationality_simple_dum,RATIONALITY_DUMMY
rationality_simple2_L33_70b_dum,1.0,0.29,0.46,0.39,0.71,0.41
rationality_jaidka_L33_70b_dum,0.29,1.0,0.12,0.11,0.31,0.3
rationality_simple2_small_dum,0.46,0.12,1.0,0.54,0.42,0.25
rationality_simple2_gpt4o_dum,0.39,0.11,0.54,1.0,0.32,0.25
rationality_simple_dum,0.71,0.31,0.42,0.32,1.0,0.33
RATIONALITY_DUMMY,0.41,0.3,0.25,0.25,0.33,1.0


improved performance of L33_70b compared to L31_70b (default), and the two models also show the highest overlap

In [None]:
#and in crosstabulations:
pd.crosstab(boukes['RATIONALITY_DUMMY'], [boukes['rationality_simple2_L33_70b_dum'], boukes['rationality_simple2_small_dum']], margins=True, margins_name='Total')

rationality_simple2_L33_70b_dum,0,0,1,1,Total
rationality_simple2_small_dum,0,1,0,1,Unnamed: 5_level_1
RATIONALITY_DUMMY,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,2930,14,181,40,3165
1,394,1,202,100,697
Total,3324,15,383,140,3862


#L33_70b and small share 100+2930=3030 correct classifications (78%) and share 40+394=434 errors (11%) and differ on 14+1+181+202=398 errors (10%)-> they differ on 48% of errors
#L33_70b makes 2930+14+202+100=3246 correct classifications
and 394+1+181+40=616 errors = 16%
#small makes 2930+181+1+100=3212 correct classifications
and 394+14+202+40=650 errors = 17%
#we would thus expect 0.17*0.16 = only 3% overlap between errors if the models were random -> they thus do a lot better than that


In [None]:
#and in crosstabulations:
pd.crosstab(boukes['RATIONALITY_DUMMY'], [boukes['rationality_simple2_L33_70b_dum'], boukes['rationality_simple2_gpt4o_dum']], margins=True, margins_name='Total')

rationality_simple2_L33_70b_dum,0,1,1,Total
rationality_simple2_gpt4o_dum,0,0,1,Unnamed: 4_level_1
RATIONALITY_DUMMY,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2944,205,16,3165
1,395,230,72,697
Total,3339,435,88,3862


#L33_70b and gpt4o share only 16 errors (0%) and differ on 205+230=435 errors (11%) -> they differ on 96% of errors
#L33_70b and gpt4o correctly classify 2944+72=3016 (78%)
#so L33_70b shares the same share of correct classifications in combination with small and gpt4o, but errors overlap much more with small than with gpt4o, which makes sense, overlap between errors of L33_70b and gpt4o is equal to chance.
#this indicates that these two models don't agree on which manual coding are actually coding errors -> together they only mark 16 comments as potentially wrong coded even though they haven't seen our annotations in their training data, so should be independently judging the rationality of the comments.

In [None]:
#do annotations of Llama3.3:70b correlate with gpt4o?
boukes.loc[:, ['incivility_simple2_L33_70b_dum', 'incivility_jaidka_L33_70b_dum', 'incivility_simple2_small_dum', 'incivility_simple2_gpt4o_dum', 'INCIVILITY_DUMMY']] \
    .corr(method='pearson').round(2)

Unnamed: 0,incivility_simple2_L33_70b_dum,incivility_jaidka_L33_70b_dum,incivility_simple2_small_dum,incivility_simple2_gpt4o_dum,INCIVILITY_DUMMY
incivility_simple2_L33_70b_dum,1.0,0.76,0.49,0.58,0.54
incivility_jaidka_L33_70b_dum,0.76,1.0,0.53,0.63,0.51
incivility_simple2_small_dum,0.49,0.53,1.0,0.68,0.48
incivility_simple2_gpt4o_dum,0.58,0.63,0.68,1.0,0.55
INCIVILITY_DUMMY,0.54,0.51,0.48,0.55,1.0


In [None]:
#do annotations of Llama3.3:70b correlate with gpt4o?
boukes.loc[:, ['political_post_L33_70b_dum', 'political_post_jaidka_L33_70b_dum', 'TopicRelevance']] \
    .corr(method='pearson').round(2)

Unnamed: 0,political_post_L33_70b_dum,political_post_jaidka_L33_70b_dum,TopicRelevance
political_post_L33_70b_dum,1.0,0.49,0.58
political_post_jaidka_L33_70b_dum,0.49,1.0,0.38
TopicRelevance,0.58,0.38,1.0


In [None]:
#what is the infuence of temperature on the results of intraprompt annotation reliability?

#Note the logic of our comparisons:
#we compare the results of the same prompt with different models, and different seeds, to see if the model and/or seed influences the results.
#we use similar options per model, but the options are not the same for all models, they differ in temperature and seed (since the same seed might mean something different for different models).
#but since we compare intraprompt annotation reliability for the same prompt with different seeds, the difference in temperature is not a problem, the annotation might differ per output of the model for that seed/temperature, but the difference with another seed should be minimal.
#anyway we can test the origins of intraprompt reliability by comparing the result of the same prompt with the same seed and low temperature, different seed and low temperature and same seed and zero temperature
#if it turns out that temperature does have a larger influence, but still the influence of different models or prompts is larger, we can still conclude that the model and prompt are more important than the temperature.
#stronger still a higher temperature intraprompt benchmark is harder to beat especially for the simpa-prompts
#for the between model comparisons temperature per model should not be a problem, since it will only vary the result of the model, not the comparison between models, only downside of low temperature is potential slightly lower reproducibility of the exact results and potetially slightly lower performance due to less creativity, but Barry ea 2025 does not seem to suggest this is the case for such low temperatures.

In [11]:
#calculate correlations between the different models for all columns ending with _dum and containing 'incivility' in boukes, start with the gpt4o model and zero temperature::
incivility_cols = [col for col in boukes.columns if 'incivility' in col]
incivility_cols = [col for col in incivility_cols if col.endswith('_dum')]
incivility_cols = [col for col in incivility_cols if 'gpt4o' in col or 'L33_70b' in col or 'L31_8b' in col or 'Q25_72b' in col or 'Q72b' in col]
incivility_cols = [col for col in incivility_cols if 'zero' in col]  
incivility_corr = boukes[incivility_cols].corr(method='pearson').round(2)
print("Incivility correlations:")
incivility_corr

Incivility correlations:


Unnamed: 0,incivility_simple2_gpt4o_zero_dum,incivility_simple2_gpt4o_system_zero_dum,incivility_simple2_Q72b_zero_dum,incivility_jaidka_Q25_72b_zero_dum,incivility_para1_Q25_72b_zero_dum,incivility_para2_Q25_72b_zero_dum,incivility_simpa1_Q25_72b_zero_dum,incivility_simple2_Q72b_seed2_zero_dum,incivility_simple2_L33_70b_zero_dum,incivility_simple2_L33_70b_zero_seed2_dum,incivility_simple2_L31_8b_zero_dum,incivility_simple2_L31_8b_zero_seed2_dum,incivility_jaidka_gpt4o_system_zero_dum,incivility_para1_gpt4o_system_zero_dum,incivility_para2_gpt4o_system_zero_dum,incivility_simpa1_gpt4o_system_zero_dum
incivility_simple2_gpt4o_zero_dum,1.0,0.88,0.56,0.6,0.55,0.51,0.54,0.56,0.55,0.55,0.64,0.64,0.73,0.79,0.82,0.81
incivility_simple2_gpt4o_system_zero_dum,0.88,1.0,0.55,0.59,0.54,0.5,0.53,0.55,0.53,0.53,0.67,0.67,0.76,0.85,0.84,0.84
incivility_simple2_Q72b_zero_dum,0.56,0.55,1.0,0.87,0.96,0.92,0.96,1.0,0.85,0.85,0.55,0.54,0.55,0.59,0.64,0.59
incivility_jaidka_Q25_72b_zero_dum,0.6,0.59,0.87,1.0,0.86,0.84,0.86,0.87,0.8,0.8,0.57,0.57,0.59,0.62,0.67,0.62
incivility_para1_Q25_72b_zero_dum,0.55,0.54,0.96,0.86,1.0,0.92,0.94,0.96,0.84,0.84,0.54,0.54,0.54,0.58,0.63,0.59
incivility_para2_Q25_72b_zero_dum,0.51,0.5,0.92,0.84,0.92,1.0,0.93,0.92,0.83,0.83,0.51,0.5,0.51,0.54,0.59,0.55
incivility_simpa1_Q25_72b_zero_dum,0.54,0.53,0.96,0.86,0.94,0.93,1.0,0.96,0.85,0.85,0.54,0.53,0.53,0.57,0.62,0.58
incivility_simple2_Q72b_seed2_zero_dum,0.56,0.55,1.0,0.87,0.96,0.92,0.96,1.0,0.85,0.85,0.55,0.54,0.55,0.59,0.64,0.59
incivility_simple2_L33_70b_zero_dum,0.55,0.53,0.85,0.8,0.84,0.83,0.85,0.85,1.0,1.0,0.54,0.54,0.53,0.58,0.63,0.58
incivility_simple2_L33_70b_zero_seed2_dum,0.55,0.53,0.85,0.8,0.84,0.83,0.85,0.85,1.0,1.0,0.54,0.54,0.53,0.58,0.63,0.58


In [27]:
#calculate correlations between the different models for all columns ending with _dum and containing 'rationality' in boukes, start with the gpt4o model and zero temperature::
incivility_cols = [col for col in boukes.columns if 'incivility' in col]
incivility_cols = [col for col in incivility_cols if col.endswith('_dum')]
incivility_cols = [col for col in incivility_cols if 'gpt4o' in col or 'L33_70b' in col or 'L31_8b' in col or 'Q25_72b' in col or 'Q72b' in col]
incivility_cols = [col for col in incivility_cols if 'zero' in col]  
incivility_cols = [col for col in incivility_cols if 'para' not in col and 'simpa' not in col and 'seed2' not in col and 'gpt4o_zero' not in col and 'jaidka' not in col]  #remove para and simpa columns, since these are not comparable with the gpt4o model
#add groundtruth columns:
incivility_cols += ['INCIVILITY_DUMMY']
incivility_corr = boukes[incivility_cols].corr(method='pearson').round(2)

incivility_corr

Unnamed: 0,incivility_simple2_gpt4o_system_zero_dum,incivility_simple2_Q72b_zero_dum,incivility_simple2_L33_70b_zero_dum,incivility_simple2_L31_8b_zero_dum,INCIVILITY_DUMMY
incivility_simple2_gpt4o_system_zero_dum,1.0,0.55,0.53,0.67,0.51
incivility_simple2_Q72b_zero_dum,0.55,1.0,0.85,0.55,0.47
incivility_simple2_L33_70b_zero_dum,0.53,0.85,1.0,0.54,0.48
incivility_simple2_L31_8b_zero_dum,0.67,0.55,0.54,1.0,0.45
INCIVILITY_DUMMY,0.51,0.47,0.48,0.45,1.0


In [None]:
#model has quite a large effect on the results, comparable to the error rate of the groundtruth.

In [29]:
#calculate correlations between the different models for all columns ending with _dum and containing 'rationality' in boukes, start with the gpt4o model and zero temperature::
incivility_cols = [col for col in boukes.columns if 'incivility' in col]
incivility_cols = [col for col in incivility_cols if col.endswith('_dum')]
incivility_cols = [col for col in incivility_cols if 'L33_70b' in col]
#add groundtruth columns:
incivility_cols += ['INCIVILITY_DUMMY']
incivility_corr = boukes[incivility_cols].corr(method='pearson').round(2)

incivility_corr

Unnamed: 0,incivility_simple2_L33_70b_low_dum,incivility_jaidka_L33_70b_low_dum,incivility_simple2_L33_70b_seed2_low_dum,incivility_para1_L33_70b_low_dum,incivility_para2_L33_70b_low_dum,incivility_simpa1_L33_70b_low_dum,incivility_simple2_L33_70b_seed2_run2_low_dum,incivility_simple2_L33_70b_zero_dum,incivility_simple2_L33_70b_zero_seed2_dum,INCIVILITY_DUMMY
incivility_simple2_L33_70b_low_dum,1.0,0.76,0.85,0.82,0.79,0.81,0.85,0.85,0.85,0.54
incivility_jaidka_L33_70b_low_dum,0.76,1.0,0.67,0.68,0.68,0.69,0.67,0.67,0.67,0.51
incivility_simple2_L33_70b_seed2_low_dum,0.85,0.67,1.0,0.96,0.94,0.95,0.99,1.0,1.0,0.48
incivility_para1_L33_70b_low_dum,0.82,0.68,0.96,1.0,0.94,0.95,0.96,0.96,0.96,0.48
incivility_para2_L33_70b_low_dum,0.79,0.68,0.94,0.94,1.0,0.92,0.94,0.94,0.94,0.48
incivility_simpa1_L33_70b_low_dum,0.81,0.69,0.95,0.95,0.92,1.0,0.95,0.95,0.95,0.48
incivility_simple2_L33_70b_seed2_run2_low_dum,0.85,0.67,0.99,0.96,0.94,0.95,1.0,0.99,0.99,0.48
incivility_simple2_L33_70b_zero_dum,0.85,0.67,1.0,0.96,0.94,0.95,0.99,1.0,1.0,0.48
incivility_simple2_L33_70b_zero_seed2_dum,0.85,0.67,1.0,0.96,0.94,0.95,0.99,1.0,1.0,0.48
INCIVILITY_DUMMY,0.54,0.51,0.48,0.48,0.48,0.48,0.48,0.48,0.48,1.0


In [None]:
#it appears rewording, reformatting, changing seed and temperature has about equal influence, changing to Jaidka prompt has a larger influence, the zero temperature prompt does have a better correlation with a different seed than the low temperature prompt, 
#suprisingly low correlation between incivility_simple2_L33_70b_seed2_low_dum/incivility_simple2_L33_70b_seed2_run2_low_dum and incivility_simple2_L33_70b_low_dum -> temperature can sometimes have a larger effect than expected, but correlation is still much higher than with the groundtruth

In [30]:
#calculate correlations between the different models for all columns ending with _dum and containing 'rationality' in boukes, start with the gpt4o model and zero temperature::
incivility_cols = [col for col in boukes.columns if 'incivility' in col]
incivility_cols = [col for col in incivility_cols if col.endswith('_dum')]
incivility_cols = [col for col in incivility_cols if 'gpt4o' in col]
#add groundtruth columns:
incivility_cols += ['INCIVILITY_DUMMY']
incivility_corr = boukes[incivility_cols].corr(method='pearson').round(2)

incivility_corr

Unnamed: 0,incivility_simple2_gpt4o_zero_dum,incivility_simple2_gpt4o_system_zero_dum,incivility_jaidka_gpt4o_system_zero_dum,incivility_para1_gpt4o_system_zero_dum,incivility_para2_gpt4o_system_zero_dum,incivility_simpa1_gpt4o_system_zero_dum,INCIVILITY_DUMMY
incivility_simple2_gpt4o_zero_dum,1.0,0.88,0.73,0.79,0.82,0.81,0.55
incivility_simple2_gpt4o_system_zero_dum,0.88,1.0,0.76,0.85,0.84,0.84,0.51
incivility_jaidka_gpt4o_system_zero_dum,0.73,0.76,1.0,0.81,0.78,0.79,0.45
incivility_para1_gpt4o_system_zero_dum,0.79,0.85,0.81,1.0,0.9,0.91,0.47
incivility_para2_gpt4o_system_zero_dum,0.82,0.84,0.78,0.9,1.0,0.89,0.5
incivility_simpa1_gpt4o_system_zero_dum,0.81,0.84,0.79,0.91,0.89,1.0,0.49
INCIVILITY_DUMMY,0.55,0.51,0.45,0.47,0.5,0.49,1.0


In [None]:
#calculate correlations between the different models for all columns ending with _dum and containing 'rationality' in boukes, start with the gpt4o model and zero temperature::
rationality_cols = [col for col in boukes.columns if 'rationality' in col]
rationality_cols = [col for col in rationality_cols if col.endswith('_dum')]
rationality_cols = [col for col in rationality_cols if 'L33_70b' in col]
#add groundtruth columns:
rationality_cols += ['RATIONALITY_DUMMY']
rationality_corr = boukes[rationality_cols].corr(method='pearson').round(2)

rationality_corr

Unnamed: 0,rationality_simple2_L33_70b_low_dum,rationality_jaidka_L33_70b_low_dum,rationality_simple2_para1_L33_70b_low_dum,rationality_simple2_L33_70b_seed2_low_dum,rationality_simple2_para2_L33_70b_low_dum,rationality_simple2_simpa1_L33_70b_low_dum,rationality_simple2_L33_70b_seed2_run2_low_dum,rationality_simple2_L33_70b_zero_dum,rationality_simple2_L33_70b_zero_seed2_dum,RATIONALITY_DUMMY
rationality_simple2_L33_70b_low_dum,1.0,0.29,0.9,0.97,0.92,0.88,0.95,0.97,0.97,0.41
rationality_jaidka_L33_70b_low_dum,0.29,1.0,0.28,0.28,0.28,0.27,0.29,0.28,0.28,0.3
rationality_simple2_para1_L33_70b_low_dum,0.9,0.28,1.0,0.92,0.96,0.91,0.91,0.92,0.92,0.39
rationality_simple2_L33_70b_seed2_low_dum,0.97,0.28,0.92,1.0,0.94,0.91,0.97,1.0,1.0,0.4
rationality_simple2_para2_L33_70b_low_dum,0.92,0.28,0.96,0.94,1.0,0.91,0.93,0.94,0.94,0.4
rationality_simple2_simpa1_L33_70b_low_dum,0.88,0.27,0.91,0.91,0.91,1.0,0.89,0.91,0.91,0.38
rationality_simple2_L33_70b_seed2_run2_low_dum,0.95,0.29,0.91,0.97,0.93,0.89,1.0,0.97,0.97,0.4
rationality_simple2_L33_70b_zero_dum,0.97,0.28,0.92,1.0,0.94,0.91,0.97,1.0,1.0,0.4
rationality_simple2_L33_70b_zero_seed2_dum,0.97,0.28,0.92,1.0,0.94,0.91,0.97,1.0,1.0,0.4
RATIONALITY_DUMMY,0.41,0.3,0.39,0.4,0.4,0.38,0.4,0.4,0.4,1.0


In [None]:
#it appears rewording, reformatting, changing seed and temperature has about equal influence, changing to Jaidka prompt has a larger influence, the zero temperature prompt does have a better correlation with a different seed than the low temperature prompt, but effects are small

