In [86]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import norm


In [49]:
# Helper functions
def counter_to_relative(counter):
    """Convert absolute counts into shares of the total.

    Parameters
    ----------
    counter : mapping of key -> count (for example a ``collections.Counter``).

    Returns
    -------
    dict
        The same keys, each mapped to ``count / sum(all counts)``.
    """
    total_count = sum(counter.values())
    return {key: count / total_count for key, count in counter.items()}

def calculate_mean_and_ci(L, conf_level=0.95):
    """Estimate a proportion and the half-width of its normal-approximation interval.

    Parameters
    ----------
    L : sequence of 0/1 outcomes.
    conf_level : float, default 0.95
        Two-sided confidence level used to pick the z-score.

    Returns
    -------
    (p_hat, ME)
        ``p_hat`` is the sample mean of ``L``. ``ME`` is the margin of error,
        i.e. the interval is ``[p_hat - ME, p_hat + ME]``. Note that despite the
        function name the interval itself is not returned.
    """
    p_hat = np.mean(L)
    # Standard error of a proportion: sqrt(p * (1 - p) / n).
    SE = np.sqrt(p_hat * (1 - p_hat) / len(L))
    # Two-sided critical value, e.g. about 1.96 for a 95% level.
    z_score = norm.ppf((1 + conf_level) / 2)
    ME = z_score * SE
    return p_hat, ME

# Load MTurk data

In [17]:
# MTurk responses; the next cells read its 'question' column and join it to the tweet table on 'id'.
turk = pd.read_csv('MTURK.csv')

In [20]:
# Reference table for the US tweet sample: merged on 'id' below, and its 'party'
# column is what the answers are scored against.
facit = pd.read_csv('US_sample_tweets.csv')

In [21]:
# Question labels starting with 'q_KNOWN' get no tweet id (presumably control
# questions - TODO confirm); every other label carries the id after its first
# two characters.
question_ids = []
for label in turk['question']:
    if label.startswith('q_KNOWN'):
        question_ids.append(None)
    else:
        question_ids.append(int(label[2:]))
turk['id'] = question_ids

# Inner join: rows without a matching tweet id drop out here.
mturk = facit.merge(turk, on='id')

# A response counts as correct when it equals the party, ignoring case and
# surrounding whitespace.
mturk['correct'] = [
    given.lower().strip() == truth.lower().strip()
    for given, truth in zip(mturk['answer'], mturk['party'])
]

In [90]:
# Accuracy per worker: the share of a worker's responses whose answer names the
# tweet's party (exact, case-sensitive match on the two spellings below).
democrat_hit = (mturk.party == 'Democrat') & (mturk.answer == 'democrat')
republican_hit = (mturk.party == 'Republican') & (mturk.answer == 'republican')
exact_hit = democrat_hit | republican_hit

mturkcorrectbyworker = []
for wid in mturk.workerid.unique():
    own_hits = exact_hit[mturk.workerid == wid]
    mturkcorrectbyworker.append(int(own_hits.sum()) / len(own_hits))

In [92]:
# Wisdom of the crowd
# Count the responses per (tweet, answer) pair ...
votes = mturk.groupby(['id', 'answer']).count()['created_at'].reset_index()
# ... and keep, for every tweet, the answer with the largest count.
wisdomofcrowd = (
    votes.sort_values('created_at', ascending=False)
    .drop_duplicates('id')
    .sort_index()
)
wischeck = facit.merge(wisdomofcrowd, on='id')
wisanswers = list(wischeck.answer)
# Share of tweets where the most frequent answer equals the lower-cased party.
majority_is_right = wischeck.party.str.lower() == wischeck.answer
crowdcombined = len(wischeck.loc[majority_is_right]) / len(wischeck)

# Load expert data

In [24]:
# Load expert answers
expert_frames = []
for expert_id, csv_path in (('1', 'Expert1.csv'), ('2', 'Expert2.csv'), ('3', 'Expert3.csv')):
    answers = pd.read_csv(csv_path)
    answers['expert'] = expert_id  # tag every row with the expert it came from
    expert_frames.append(answers)
exp1, exp2, exp3 = expert_frames
experts = pd.concat(expert_frames)

# Clean up and merge
experts = (
    experts.dropna()  # rows with any missing value are discarded
    .rename(columns={'Your answer: Republican/Democrat. Type: "r" or "d"': 'answer'})
    .astype({'id': 'int'})
    .drop(columns=['Tweet text'])
)
facit = pd.read_csv('US_sample_tweets.csv')
expertmerge = facit.merge(experts, on='id')


In [25]:
# Experts answer with a single letter; these are the (letter, party) pairs that count as correct.
matching_pairs = {('d', 'Democrat'), ('r', 'Republican')}
expertmerge['correct'] = [
    (answer, party) in matching_pairs
    for answer, party in zip(expertmerge.answer, expertmerge.party)
]

In [26]:
# Share of correct answers pooled over all experts.
expert_hits = Counter(expertmerge['correct'])
averageexpert = expert_hits[True] / len(expertmerge)

In [27]:
# Accuracy per expert. The denominator is the constant 500 rather than the
# number of rows that expert actually has - presumably the number of tweets
# each expert was given; TODO confirm.
answers_by_expert = expertmerge.groupby(['expert', 'correct'])['id'].count()
expertcorrect = [answers_by_expert[(expert_id, True)] / 500 for expert_id in ('1', '2', '3')]

# Load LLM data

In [79]:
# Load, clean and make initial checks on LLM data
llm = pd.read_pickle("US_sample_tweets_llm.pkl")


def _hits_per_run(answer_column):
    """For every row, compare each entry of `answer_column` with the row's party."""
    return [
        [party == guess for guess in guesses]
        for party, guesses in zip(llm.party, llm[answer_column])
    ]


def _pooled_accuracy(hit_column):
    """Share of True values after pooling all entries of all rows in `hit_column`."""
    pooled = Counter()
    for hits in llm[hit_column]:
        pooled.update(hits)
    return counter_to_relative(pooled)[True]


def _accuracy_by_run(hit_column):
    """Share of True values at each list position, one value per position."""
    return [
        counter_to_relative(Counter(hits[run] for hits in llm[hit_column]))[True]
        for run in range(nrruns)
    ]


# Correctness flags; the GPT-4 columns hold a list per row, gpt35_guess a single value.
llm['gpt4_temp02_correct'] = _hits_per_run('gpt4_temp02')
llm['gpt4_temp10_correct'] = _hits_per_run('gpt4_temp10')
llm['gpt35_correct'] = [party == guess for party, guess in zip(llm.party, llm.gpt35_guess)]
llm['gpt4_temp02_variation1_correct'] = _hits_per_run('gpt4_temp02_variation1')
llm['gpt4_temp02_variation2_correct'] = _hits_per_run('gpt4_temp02_variation2')

# Overall accuracy per model / setting.
averagecorrect35 = counter_to_relative(Counter(llm.gpt35_correct))[True]
averagecorrecttemp02 = _pooled_accuracy('gpt4_temp02_correct')
averagecorrecttemp10 = _pooled_accuracy('gpt4_temp10_correct')
averagecorrecttemp02variation1 = _pooled_accuracy('gpt4_temp02_variation1_correct')
averagecorrecttemp02variation2 = _pooled_accuracy('gpt4_temp02_variation2_correct')

llmcorrect35 = [averagecorrect35]

# Number of entries per row, read off the row labelled 0 of gpt4_temp10.
nrruns = len(llm.gpt4_temp10[0])
llmcorrect02 = _accuracy_by_run('gpt4_temp02_correct')
llmcorrect10 = _accuracy_by_run('gpt4_temp10_correct')

# Accuracy

In [96]:
# Plot 1
fig, ax1 = plt.subplots(figsize=(8, 5), dpi=300)

# Accuracy distributions, all drawn on the same 20 bins over [0.5, 1].
accuracy_series = [
    (llmcorrect02, 'LLM t=0.2'),
    (llmcorrect10, 'LLM t=1.0'),
    (expertcorrect, 'Experts'),
    (mturkcorrectbyworker, 'MTurk'),
]
for values, series_label in accuracy_series:
    plt.hist(values, bins=20, range=[0.5, 1], density=True, alpha=0.5, label=series_label)

# Averages
average_lines = [
    (averagecorrect35, dict(color='yellow', label='Mean GPT3.5', linestyle='--')),
    (averagecorrecttemp02, dict(color='blue', label='Mean GPT4 t=0.2')),
    (averagecorrecttemp10, dict(color='orange', label='Mean GPT4 t=1.0', linestyle=':')),
    (averageexpert, dict(color='green', label='Mean experts', linestyle='--')),
    (crowdcombined, dict(color='red', label='Combined MTurk', linestyle='-.')),
]
for position, line_style in average_lines:
    plt.axvline(x=position, **line_style)

# Fix for bug to normalize density.
# Each of the 20 bins over [0.5, 1] is 0.025 wide, so a density of 40 corresponds
# to a share of 1.0; the ticks are relabelled as density / 40.
y_ticks = list(range(0, 41, 10))
y_tick_labels = [str(tick / 40) for tick in y_ticks]
plt.yticks(y_ticks, y_tick_labels)

plt.xlabel('Accuracy', fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.legend(fontsize=12)

plt.tight_layout(pad=0)
plt.savefig('./figure_accuracy.png', dpi=300)

# Krippendorff's alpha

In [None]:
# Compare Krippendorff's alpha between MTurkers, expert coders, and the LLM

In [410]:
!pip install simpledorff


Collecting simpledorff
  Downloading simpledorff-0.0.2-py3-none-any.whl (5.6 kB)
Installing collected packages: simpledorff
Successfully installed simpledorff-0.0.2


In [63]:
import simpledorff
import pandas as pd
import random
import numpy as np


In [64]:
# Resampling simulation to get a confidence interval for Krippendorff's alpha
def manual_bootstrap(df, experiment_col, annotator_col, class_col, ci=0.95, samplesize=300, iterations=1000, seed=None):
    """Estimate the spread of Krippendorff's alpha by repeated subsampling of items.

    Each iteration draws ``samplesize`` distinct values of ``experiment_col``
    (without replacement), keeps all rows of ``df`` belonging to them, and
    computes alpha on that subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format annotations, one row per (item, annotator) pair.
    experiment_col, annotator_col, class_col : str
        Column names passed through to simpledorff.
    ci : float, default 0.95
        Central coverage of the returned percentile interval.
    samplesize : int, default 300
        Number of distinct items per draw. numpy raises ``ValueError`` when this
        exceeds the number of distinct items in ``df``.
    iterations : int, default 1000
        Number of draws.
    seed : int or None, default None
        When given, draws come from ``np.random.default_rng(seed)`` so the result
        is reproducible. When ``None``, numpy's global random state is used,
        as before.

    Returns
    -------
    (mean_alpha, bounds)
        Mean of the per-draw alphas, and an array with the lower and upper
        percentile bounds for the requested coverage.
    """
    # Both numpy's global module and a Generator expose a compatible `choice`.
    sampler = np.random if seed is None else np.random.default_rng(seed)
    # The set of distinct items does not change between draws; compute it once.
    item_ids = df[experiment_col].unique()
    res = []
    for _ in range(iterations):
        randomids = set(sampler.choice(item_ids, samplesize, replace=False))
        sample = df.loc[df[experiment_col].isin(randomids)]
        res.append(simpledorff.calculate_krippendorffs_alpha_for_df(sample, experiment_col=experiment_col, annotator_col=annotator_col, class_col=class_col))
    tail = 100 * (1 - ci) / 2
    return np.mean(res), np.percentile(res, [tail, 100 - tail])

In [65]:
#LLM 0.2
# Long format: one record per (tweet, entry), treating the first five entries
# of gpt4_temp02 as five separate coders.
records = []
for _, row in llm.iterrows():
    for run in range(5):
        records.append({'document_id': row['id'], 'coder_id': run, 'annotation': row['gpt4_temp02'][run]})
data = pd.DataFrame(records)
ka_columns = dict(experiment_col='document_id', annotator_col='coder_id', class_col='annotation')
llm02KA = simpledorff.calculate_krippendorffs_alpha_for_df(data, **ka_columns)
llm02KAci = manual_bootstrap(data, **ka_columns)
# Distance from the resampled mean down to the lower percentile bound.
llm02interval = llm02KAci[0] - llm02KAci[1][0]

In [66]:
# LLM 1.0
# Long format: one record per (tweet, entry), treating the first five entries
# of gpt4_temp10 as five separate coders.
records = []
for _, row in llm.iterrows():
    for run in range(5):
        records.append({'document_id': row['id'], 'coder_id': run, 'annotation': row['gpt4_temp10'][run]})
data = pd.DataFrame(records)
ka_columns = dict(experiment_col='document_id', annotator_col='coder_id', class_col='annotation')
llm10KA = simpledorff.calculate_krippendorffs_alpha_for_df(data, **ka_columns)
llm10KAci = manual_bootstrap(data, **ka_columns)
# Distance from the resampled mean down to the lower percentile bound.
llm10interval = llm10KAci[0] - llm10KAci[1][0]

In [67]:
#LLM Variations
# Three "coders": the first entry of the base column and of each variation column.
variant_columns = ['gpt4_temp02', 'gpt4_temp02_variation1', 'gpt4_temp02_variation2']
records = [
    {'document_id': row['id'], 'coder_id': coder, 'annotation': row[column][0]}
    for coder, column in enumerate(variant_columns)
    for _, row in llm.iterrows()
]
data = pd.DataFrame(records)
ka_columns = dict(experiment_col='document_id', annotator_col='coder_id', class_col='annotation')
llm02varKA = simpledorff.calculate_krippendorffs_alpha_for_df(data, **ka_columns)
llm02varKAci = manual_bootstrap(data, **ka_columns)
# Distance from the resampled mean down to the lower percentile bound.
llm02varinterval = llm02varKAci[0] - llm02varKAci[1][0]

In [68]:
#Mturk
# Every worker is a coder, every tweet id an item.
mturk_columns = dict(experiment_col='id', annotator_col='workerid', class_col='answer')
mturkKA = simpledorff.calculate_krippendorffs_alpha_for_df(mturk, **mturk_columns)
mturkKAci = manual_bootstrap(mturk, **mturk_columns)
# Distance from the resampled mean down to the lower percentile bound.
mturkinterval = mturkKAci[0] - mturkKAci[1][0]

In [69]:
#Experts
# Every expert is a coder, every tweet id an item.
expert_columns = dict(experiment_col='id', annotator_col='expert', class_col='answer')
expertKA = simpledorff.calculate_krippendorffs_alpha_for_df(experts, **expert_columns)
expertsKAci = manual_bootstrap(experts, **expert_columns)
# Distance from the resampled mean down to the lower percentile bound.
expertinterval = expertsKAci[0] - expertsKAci[1][0]

In [95]:
# Plot the reliability

import matplotlib.pyplot as plt
import numpy as np

# Point estimates and error-bar lengths for the five groups.
# NOTE(review): each bar is the full-sample alpha, while the error length is
# "resampled mean minus lower percentile bound" and is drawn symmetrically -
# confirm that this is the intended interval.
means = [llm02KA, llm10KA, llm02varKA, expertKA, mturkKA]
confidence_intervals = [llm02interval, llm10interval, llm02varinterval, expertinterval, mturkinterval]

# Define the y-axis labels for each group (the bars are horizontal)
x_labels = ['LLM t=0.2', 'LLM t=1.0', 'LLM t=0.2 Variations', 'Expert', 'MTurk']

# Set the figure size and dpi
fig, ax = plt.subplots(figsize=(7, 4), dpi=300)

colors = ['blue', 'orange', 'yellow', 'green', 'red']

# The loop variable is named `color` so that it does not overwrite the `colors` list.
for pos, y, err, color in zip(x_labels, means, confidence_intervals, colors):
    ax.barh(pos, y, xerr=err, capsize=4, alpha=0.4, color=color)

ax.tick_params(axis='both', which='major', labelsize=10)
ax.set_xlabel('Krippendorf\'s Alpha', fontsize=12)

# Keep the full frame around the plot.
ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(True)

# Set the padding between the plot and the edge of the figure
plt.tight_layout(pad=1)

# Show the plot
# plt.show()
plt.savefig('./figure_krippen.png', dpi=300)
plt.savefig('./figure_krippen.eps', dpi=300)
plt.savefig('./figure_krippen.pdf', dpi=300)

## Bias

In [97]:
# Is there the same bias in guessing democrat or republican as the manual classifiers?

In [98]:
# Calculate bias and significance
# Every answer is coded 1 when it names the Democrats and 0 otherwise; the mean
# of those codes is the share of "Democrat" answers for that rater group.
llmoutcome10 = [int(guess == 'Democrat') for guesses in llm.gpt4_temp10 for guess in guesses]
llm10bias, llm10biasci = calculate_mean_and_ci(llmoutcome10)

llmoutcome02 = [int(guess == 'Democrat') for guesses in llm.gpt4_temp02 for guess in guesses]
llm02bias, llm02biasci = calculate_mean_and_ci(llmoutcome02)

# Experts answer with the letter 'd'.
expertoutcome = [int(letter == 'd') for letter in expertmerge.answer]
expertbias, expertbiasci = calculate_mean_and_ci(expertoutcome)

# MTurk answers are matched against the lower-case spelling.
mturkoutcome = [int(reply == 'democrat') for reply in mturk.answer]
mturkbias, mturkbiasci = calculate_mean_and_ci(mturkoutcome)

In [82]:
# CALCULATE AND PLOT THE BIAS
import matplotlib.pyplot as plt
import numpy as np

# Share of "Democrat" answers and its margin of error for the four groups
means = [llm02bias, llm10bias, expertbias, mturkbias]
confidence_intervals = [llm02biasci, llm10biasci, expertbiasci, mturkbiasci]

# Define the x-axis labels for each group
x_labels = ['LLM t=0.2', 'LLM t=1.0', 'Expert', 'MTurk']

# Set the figure size and dpi
fig, ax = plt.subplots(figsize=(5, 5), dpi=300)

colors = ['blue', 'orange', 'green', 'red']

# The loop variable is named `color` so that it does not overwrite the `colors` list.
for pos, y, err, color in zip(x_labels, means, confidence_intervals, colors):
    ax.errorbar(pos, y, err, capsize=4, markersize=8, alpha=0.4, fmt='o', color=color)

# Set the font size for the tick labels and the y-axis label
ax.tick_params(axis='both', which='major', labelsize=10)
ax.set_ylabel('Democratic bias', fontsize=12)

# Keep the top and right spines visible, so the plot has a full frame
ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(True)

# Reference line at 0.5, i.e. both parties answered equally often
ax.axhline(y=0.5, c="black", linewidth=1, zorder=0, linestyle=':')
plt.tight_layout(pad=1)
plt.ylim([0.45, 0.65])

# Show the plot
# plt.show()
plt.savefig('./figure_bias.png', dpi=300)
plt.savefig('./figure_bias.pdf', dpi=300)
plt.savefig('./figure_bias.eps', dpi=300)


# Across different countries

In [122]:
# Per-country accuracy of the first gpt4_temp02 entry of every tweet.
# This is a bit messy due to the different use of party names in the van Vliet et al database


def _first_guess(df):
    """First entry of each row's `gpt4_temp02` value."""
    return df['gpt4_temp02'].str[0]


def _same_name(df):
    """Correct when the guess equals the party label exactly."""
    return df.party == _first_guess(df)


def _same_initial(df):
    """Correct when the lower-cased guess equals the lower-cased first element of `party`.

    Presumably the model answers with a one-letter party abbreviation here -
    TODO confirm against the Swedish data.
    """
    return df.party.str[0].str.lower() == _first_guess(df).str.lower()


def _via_aliases(aliases):
    """Build a matcher for data where `party` and the guess use different names.

    `aliases` maps a party label to the guess accepted for it; every other
    combination counts as incorrect.
    """
    def matcher(df):
        guess = _first_guess(df)
        hit = pd.Series(False, index=df.index)
        for party, accepted in aliases.items():
            hit = hit | ((df.party == party) & (guess == accepted))
        return hit
    return matcher


# (country label, pickle path, correctness rule), in the order they are reported.
COUNTRY_SOURCES = [
    ('Canada', 'tweet_process_canada.pkl', _same_name),
    ('Germany', 'tweet_process_germany.pkl', _same_name),
    ('Spain', 'tweet_process_spain.pkl', _same_name),
    ('Sweden', 'tweet_process_sweden.pkl', _same_initial),
    ('New Zealand', 'tweet_process_NZ.pkl',
     _via_aliases({'Labour Party': 'Labour', 'National Party': 'National'})),
    ('Poland', 'tweet_process_Poland.pkl',
     _via_aliases({'Civic Coalition': 'KO', 'Law and Justice': 'PiS'})),
    ('United Kingdom', 'tweet_process_UK.pkl', _same_name),
    ('Turkey', 'tweet_process_TURKEY.pkl', _same_name),
    ('Denmark', 'tweet_process_DENMARK.pkl',
     _via_aliases({'The Social Democratic Party': 'Social Democrats', 'The Liberal Party': 'Venstre'})),
    ('Australia', 'tweet_process_AUSTRALIA.pkl',
     _via_aliases({'Australian Labor Party': 'Labor', 'Liberal Party of Australia': 'Liberal'})),
    # US
    ('United States', 'US_sample_tweets.pkl', _same_name),
]

counts = []
for country, pickle_path, is_correct in COUNTRY_SOURCES:
    df = pd.read_pickle(pickle_path)
    df['correct'] = is_correct(df)
    c = Counter(df['correct'])
    # Standard error of the mean of the 0/1 correctness indicator.
    stderr = stats.sem([1 if e else 0 for e in df['correct']])
    counts.append({'country': country, 'correct': c[True], 'incorrect': c[False], 'stderr': stderr})

countrycounts = pd.DataFrame(counts)

In [123]:
# Accuracy per country: correct answers over all evaluated tweets.
evaluated = countrycounts['correct'] + countrycounts['incorrect']
countrycounts['mean'] = countrycounts['correct'] / evaluated

In [124]:
# 95% Student-t interval around each country's accuracy, with n - 1 degrees of freedom.
sample_sizes = countrycounts['correct'] + countrycounts['incorrect']
countrycounts['interval'] = [
    stats.t.interval(0.95, n - 1, loc=n_correct / n, scale=std_error)
    for n_correct, n, std_error in zip(countrycounts['correct'], sample_sizes, countrycounts['stderr'])
]


In [83]:
# Plot cross country comparison

df = countrycounts.sort_values(['mean'])

import matplotlib.pyplot as plt
import numpy as np

# Extract the data from the dataframe
countries = df['country']
means = df['mean']
# Half the width of each (lower, upper) interval, drawn as a symmetric error bar.
intervals = [np.abs(upper - lower) / 2 for lower, upper in df['interval']]

fig, ax = plt.subplots(figsize=(8, 5))

# Every artist carries its own label, so the legend does not depend on the
# order in which matplotlib happens to enumerate the artists.
ax.errorbar(x=means, y=countries, xerr=intervals, fmt='o', capsize=5, markersize=8, color='black', label='LLM t=0.2')

# Human baselines, drawn on the United States row.
ax.plot([averageexpert], ['United States'], 'x', color='green', markersize=8, alpha=0.8, label='Experts mean')
ax.plot([crowdcombined], ['United States'], 's', color='r', markersize=8, alpha=0.5, label='MTurk combined')
ax.legend(loc='lower right')


# Set the x-axis label
ax.set_xlabel('Accuracy')

# Set the x-axis limits
ax.set_xlim(0.5, 1)

plt.tight_layout()
plt.savefig('figure_crosscountries.png', dpi=300)
plt.savefig('figure_crosscountries.eps', dpi=300)
plt.savefig('figure_crosscountries.pdf', dpi=300)
plt.show()

# Regular people

In [75]:
# LLM labels for the "regular people" sample; joined to the expert labels on 'ind' below.
randomllm = pd.read_pickle('regularpeoplellm.pkl')

In [76]:
# First expert's labels: keep only the join key 'ind' and the label column 'label1'.
randomexpert1 = pd.read_csv('RegularPeopleExpert1.csv')[['ind','label1']]

In [77]:
# Second expert's labels: rename 'Column1' -> 'ind' and 'label' -> 'label2' so the
# columns line up with the first expert's file, then keep only those two.
randomexpert2 = pd.read_csv('RegularPeopleExpert2.csv').rename(columns={'Column1':'ind','label':'label2'})[['ind','label2']]

In [78]:
# One row per 'ind' value present in all three tables (inner joins).
randommerge = (
    randomexpert1
    .merge(randomexpert2, on='ind')
    .merge(randomllm, on='ind')
)


In [None]:
# Average distance between model and human classifiers. 

In [796]:
# Agreement between the first expert and the LLM, as 0/1 per row, with a 95% t-interval.
agree_expert1_llm = randommerge.label1 == randommerge.gpt4_temp02
c1 = [int(flag) for flag in agree_expert1_llm]
i1 = stats.t.interval(0.95, len(c1) - 1, loc=np.mean(c1), scale=stats.sem(c1))


In [797]:
# Agreement between the second expert and the LLM, as 0/1 per row, with a 95% t-interval.
agree_expert2_llm = randommerge.label2 == randommerge.gpt4_temp02
c2 = [int(flag) for flag in agree_expert2_llm]
i2 = stats.t.interval(0.95, len(c2) - 1, loc=np.mean(c2), scale=stats.sem(c2))

In [807]:
# Agreement between the two experts, as 0/1 per row, with a 95% t-interval.
agree_experts = randommerge.label1 == randommerge.label2
c0 = [int(flag) for flag in agree_experts]
i0 = stats.t.interval(0.95, len(c0) - 1, loc=np.mean(c0), scale=stats.sem(c0))

In [84]:
import matplotlib.pyplot as plt
import numpy as np

# Pairwise agreement: label, 0/1 agreement list, and its (lower, upper) interval.
comparisons = [
    ('LLM vs Expert 2', c2, i2),
    ('LLM vs Expert 1', c1, i1),
    ('Expert 1 vs Expert 2', c0, i0),
]
comp = [name for name, _, _ in comparisons]
means = [np.mean(agreement) for _, agreement, _ in comparisons]
# Half the interval width, drawn as a symmetric error bar.
intervals = [(bounds[1] - bounds[0]) / 2 for _, _, bounds in comparisons]

fig, ax = plt.subplots(figsize=(8, 2))
ax.errorbar(x=means, y=comp, xerr=intervals, fmt='o', capsize=5, markersize=8, color='black')
ax.set_xlabel('Correspondance')
ax.set_xlim(0., 1)

plt.tight_layout()
plt.margins(y=0.3, tight=True)
for target in ('figure_randompeople.png', 'figure_randompeople.eps', 'figure_randompeople.pdf'):
    plt.savefig(target, dpi=300)
plt.show()