# Result Analysis

This carries out evaluation of the performance of the various models that have been run. (The supervised models' performance is calculated in the file SupervisedModels.ipynb)

## Define performance metrics

These functions provide standardized calculations of performance measures: precision, recall, accuracy, balanced F1, etc. 

In [2]:
# Imports 
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from scipy.stats import norm
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import statsmodels.api as sm
from statsmodels.stats.proportion import proportion_confint   

#Help functions: We first define functions to calculate model performance.

# Calculate performance scores.
def calculate_accuracy(S, P):
    S = np.array(S)
    P = np.array(P)
    accuracy = accuracy_score(S, P)
    correct_predictions = sum(P == S)
    confidence_lower, confidence_upper = proportion_confint(correct_predictions, len(S), method='wilson')
    return accuracy, confidence_lower, confidence_upper 

def calculate_precision(true_labels, predicted_labels):
    # Assumes binary classification; for multi-class, set the 'average' parameter
    return precision_score(true_labels, predicted_labels, average='binary')

def calculate_recall(true_labels, predicted_labels):
    # Assumes binary classification; for multi-class, set the 'average' parameter
    return recall_score(true_labels, predicted_labels, average='binary')

def calculate_f1_score(true_labels, predicted_labels):
    # Assumes binary classification; for multi-class, set the 'average' parameter
    return f1_score(true_labels, predicted_labels, average='binary')

def invert_labels(labels):
    return [1 if l == 0 else 0 if l==1 else None for l in labels]

def estimate_accuracies(true_labels, predicted_labels):
    accuracy,lower_acc,upper_acc = calculate_accuracy(true_labels, predicted_labels)
    
    precision = calculate_precision(true_labels, predicted_labels)
    recall = calculate_recall(true_labels, predicted_labels)
    
    f1 = calculate_f1_score(true_labels, predicted_labels)
    f1_second = calculate_f1_score(invert_labels(true_labels), invert_labels(predicted_labels))

    macro_f1 = np.mean([f1,f1_second])
    
    print(f"Accuracy: {accuracy} [{lower_acc},{upper_acc}]")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Second F1 Score: {f1_second}")
    print(f"Macro F1: {macro_f1}")
    
    return accuracy, precision, recall, f1, f1_second, macro_f1, lower_acc,upper_acc


## MTurk performance analysis

We start by calculating the performance of the average and plurality decison of the MTurkers. 

In [25]:
# Average accuracy by MTurkers
ap = pd.read_csv('Data/mturk/MTURK.csv')
solution = [1 if a=='Democrat' else 0 for a in ap['party']]
predicted = [1 if a=='democrat' else 0 for a in ap['answer']] 
res = estimate_accuracies(solution,predicted)


Accuracy: 0.8042 [0.7929692518481564,0.814963678288471]
Precision: 0.7598223436966177
Recall: 0.8896
F1 Score: 0.8196056753270684
Second F1 Score: 0.7859173409140608
Macro F1: 0.8027615081205646


In [32]:
# Wisdom of crowds accuracy on MTurkers (plurality vote)

#Load answers
ap = pd.read_csv('Data/mturk/MTURK.csv')

#Create an question:answer dict
answerdict = ap.set_index('id')['party'].to_dict()

# Group by question id and answer, pick the answer with the most votes
wisdomofcrowd = ap.groupby(['id','answer']).count()['created_at'].reset_index().sort_values('created_at', ascending=False).drop_duplicates('id').sort_index()

# Map the answers back using the answerdict
wisdomofcrowd['answer'] = [answerdict[id] for id in wisdomofcrowd['id']]

# Calculate performance
solution = [1 if a=='Democrat' else 0 for a in wischeck['party']]
predicted = [1 if a=='democrat' else 0 for a in wischeck['answer']] 
res = estimate_accuracies(solution,predicted)


Accuracy: 0.854 [0.8203507337796904,0.8822512331735307]
Precision: 0.8020477815699659
Recall: 0.94
F1 Score: 0.8655616942909761
Second F1 Score: 0.8402625820568929
Macro F1: 0.8529121381739344


## Expert performance analysis

Next, we calculate the performance of the experts, looking both at the average and the plurality decision. 

In [44]:
# Load the data on the experts
experts = pd.read_csv('Data/expert/experts.csv')

In [54]:
# Average expert
solution = [1 if a=='Democrat' else 0 for a in experts['party']]
predicted = [1 if a=='d' else 0 for a in experts['answer']] 
print("Average Expert")
res = estimate_accuracies(solution,predicted)

# Expert per expert
for expert in experts.expert.unique():
    print(f'\n Expert {expert}')
    onexp = experts.loc[experts['expert']==expert]
    solution = [1 if a=='Democrat' else 0 for a in onexp['party']]
    predicted = [1 if a=='d' else 0 for a in onexp['answer']] 
    res = estimate_accuracies(solution,predicted)
    
    print(f'{expert}: \t {res[0]} \t\t {res[5]} \t {2*abs(0.5-np.mean(predicted))}\n')




Average Expert
Accuracy: 0.83 [0.8101533340185327,0.8481607417061474]
Precision: 0.803680981595092
Recall: 0.8733333333333333
F1 Score: 0.8370607028753994
Second F1 Score: 0.8222996515679442
Macro F1: 0.8296801772216718

 Expert 1
Accuracy: 0.826 [0.790318402100132,0.8567105279189359]
Precision: 0.7942238267148014
Recall: 0.88
F1 Score: 0.8349146110056925
Second F1 Score: 0.8160676532769556
Macro F1: 0.8254911321413241
1: 	 0.826 		 0.8254911321413241 	 0.1080000000000001


 Expert 2
Accuracy: 0.846 [0.811734636678946,0.8749893197216598]
Precision: 0.8502024291497976
Recall: 0.84
F1 Score: 0.8450704225352113
Second F1 Score: 0.8469184890656064
Macro F1: 0.8459944558004089
2: 	 0.846 		 0.8459944558004089 	 0.01200000000000001


 Expert 3
Accuracy: 0.818 [0.7817973646706083,0.8493535547958443]
Precision: 0.7731958762886598
Recall: 0.9
F1 Score: 0.8317929759704252
Second F1 Score: 0.8017429193899783
Macro F1: 0.8167679476802018
3: 	 0.818 		 0.8167679476802018 	 0.16399999999999992



In [57]:
# Wisdom of crowds: plurality choice by the experts 

#Create an question:answer dict
answerdict = experts.set_index('id')['party'].to_dict()

wisdomofexperts = experts.groupby(['id','answer']).count()['created_at'].reset_index().sort_values('created_at', ascending=False).drop_duplicates('id').sort_index()

# Map the answers back using the answerdict
wisdomofexperts['party'] = [answerdict[id] for id in wisdomofexperts['id']]

prediction = [0 if a == 'd' else 1 for a in wisdomofexperts['answer']]
solution = [0 if a == 'Democrat' else 1 for a in wisdomofexperts['party']]

res = estimate_accuracies(solution,prediction)


Accuracy: 0.86 [0.826833135055546,0.8876773398121365]
Precision: 0.8879310344827587
Recall: 0.824
F1 Score: 0.8547717842323651
Second F1 Score: 0.8648648648648648
Macro F1: 0.859818324548615


## Calculate LLM performance

Here is how the LLM's performance is calculated. We focus here on the case of the United States, but all the countries are calculated in the same way. The LLM output are stored in the pickled dataframes in the countries folder.

In [75]:
ap.to_pickle("Data/countries/US_sample_tweets_llm.pkl")

In [76]:
prediction = [0 if a == 'Democrat' else 1 for a in ap['gpt4']]
solution = [0 if a == 'Democrat' else 1 for a in ap['party']]

res = estimate_accuracies(solution,prediction)

Accuracy: 0.934 [0.9087605988391466,0.9526214736402262]
Precision: 0.9779735682819384
Recall: 0.888
F1 Score: 0.9308176100628931
Second F1 Score: 0.9369024856596558
Macro F1: 0.9338600478612744


## Bias

Here we examine the level of bias in the probability that the different classifiers select Republicans vs Democrats. 

In [98]:
# Calculate bias and significance
llmoutcome10 = [1 if b == 'Democrat' else 0 for a in llm.gpt4_temp10 for b in a ]
llm10bias,llm10biasci = calculate_mean_and_ci(llmoutcome10)

llmoutcome02 = [1 if b == 'Democrat' else 0 for a in llm.gpt4_temp02 for b in a ]
llm02bias,llm02biasci = calculate_mean_and_ci(llmoutcome02)

expertoutcome = [1 if b == 'd' else 0 for b in expertmerge.answer]
expertbias,expertbiasci = calculate_mean_and_ci(expertoutcome)

mturkoutcome = [1 if b == 'democrat' else 0 for b in mturk.answer]
mturkbias,mturkbiasci = calculate_mean_and_ci(mturkoutcome)

In [82]:
# CALCULATE AND PLOT THE BIAS
import matplotlib.pyplot as plt
import numpy as np

# Define the means and confidence intervals for the four groups
means = [llm02bias, llm10bias, expertbias, mturkbias]
confidence_intervals = [llm02biasci, llm10biasci, expertbiasci, mturkbiasci]

# Define the x-axis labels for each group
x_labels = ['LLM t=0.2', 'LLM t=1.0', 'Expert', 'MTurk' ]

# Set the figure size and dpi
fig, ax = plt.subplots(figsize=(5, 5), dpi=300)

colors = ['blue','orange','green','red']

for pos, y, err, colors in zip(x_labels, means, confidence_intervals, colors):
    ax.errorbar(pos, y, err, capsize = 4, markersize=8, alpha=0.4,fmt='o', color = colors)

# Set the font size for the axis labels and title
ax.tick_params(axis='both', which='major', labelsize=10)
ax.set_ylabel('Democratic bias', fontsize=12)

# Remove the top and right spines
ax.spines['top'].set_visible(True)
ax.spines['right'].set_visible(True)

ax.axhline(y=0.5,  c="black", linewidth=1, zorder=0,linestyle=':')
plt.tight_layout(pad=1)
plt.ylim([0.45, 0.65])

# Show the plot
# plt.show()
plt.savefig('./figure_bias.png',dpi=300)
plt.savefig('./figure_bias.pdf',dpi=300)
plt.savefig('./figure_bias.eps',dpi=300)
