In [1]:
import glob
import os

import pandas as pd

# Directory containing the per-task result files. The D3_OUT_DIR environment
# variable overrides it so the notebook can run on another machine; when the
# variable is unset, the original hard-coded location is used.
folder = os.environ.get(
    "D3_OUT_DIR",
    "/Users/johnmorris/Projects/interpretable-embeddings/scripts/d3/out",
)
paths = glob.glob(os.path.join(folder, "d3_*.csv"))

# Fail with a descriptive message instead of a bare IndexError on paths[0]
# when the folder is missing or holds no matching files.
if not paths:
    raise FileNotFoundError(f"no d3_*.csv files found in {folder!r}")

paths[0]

'/Users/johnmorris/Projects/interpretable-embeddings/scripts/d3/out/d3_16_hillary.csv'

In [2]:
def read_csv(path, n_header_lines=7):
    """Parse one result file into a DataFrame of per-example predictions.

    The file is not parsed as a regular CSV. The first ``n_header_lines``
    lines are discarded; every remaining non-blank line must be a Python
    expression evaluating to a pair ``(idx, data_str)``, where ``data_str``
    is itself the source text of a 6-item record.

    Args:
        path: Location of the file to read.
        n_header_lines: Number of leading lines to skip. Defaults to 7, the
            value that used to be hard-coded.

    Returns:
        pandas.DataFrame with the columns ``task``, ``idx``, ``true_label``,
        ``pred_label``, ``answer`` and ``model``, one row per parsed line.
    """
    columns = ['task', 'idx', 'true_label', 'pred_label', 'answer', 'model']

    # Context manager so the file handle is closed even if parsing fails.
    with open(path) as f:
        lines = f.readlines()[n_header_lines:]  # discard header

    all_data = []
    for line in lines:
        # Blank (e.g. trailing) lines carry no record; eval would raise
        # SyntaxError on them.
        if not line.strip():
            continue
        # SECURITY: eval executes arbitrary code found in the file. Only use
        # this on trusted, locally generated outputs. If every record consists
        # purely of Python literals, ast.literal_eval is a safer substitute.
        _, data_str = eval(line)
        all_data.append(eval(data_str))
    return pd.DataFrame(all_data, columns=columns)
    
# Parse every matched file, then stack the per-file frames into one table.
dfs = list(map(read_csv, paths))
df = pd.concat(dfs)

In [3]:
# A prediction counts as correct when it matches the true label.
df['correct'] = (df['pred_label'] == df['true_label'])
df.head(5)

Unnamed: 0,task,idx,true_label,pred_label,answer,model,correct
0,d3_16_hillary,143,0,1,yes,openai,False
1,d3_16_hillary,143,0,0,no,meta,True
2,d3_16_hillary,143,0,0,no,mistral,True
3,d3_16_hillary,143,0,0,no,gpt4,True
4,d3_16_hillary,5,1,1,yes,mistral,True


In [4]:
# Mean of the `correct` flag (i.e. accuracy) for each model.
df.groupby('model')['correct'].agg('mean')

model
gpt4       0.817391
meta       0.795652
mistral    0.808696
openai     0.813043
Name: correct, dtype: float64

In [5]:
# Mean of the `correct` flag for every (model, task) pair.
df.groupby(['model', 'task'])['correct'].agg('mean')

model   task                
gpt4    d3_0_irony              0.90
        d3_10_infrastructure    0.65
        d3_13_water             0.85
        d3_14_search            0.45
        d3_15_utility           0.70
                                ... 
openai  d3_5_evacuate           0.80
        d3_6_terorrism          0.95
        d3_7_crime              0.80
        d3_8_shelter            0.80
        d3_9_food               0.80
Name: correct, Length: 184, dtype: float64

In [6]:
# Standard error of the mean of the `correct` flag for each model.
df.groupby(['model'])['correct'].agg('sem')

model
gpt4       0.012744
meta       0.013301
mistral    0.012975
openai     0.012861
Name: correct, dtype: float64

In [9]:
import sklearn
import sklearn.metrics

# Print the F1 score of each model's predictions against the true labels.
for model in df['model'].unique():
    is_model = df['model'] == model
    model_df = df.loc[is_model]
    score = sklearn.metrics.f1_score(
        y_true=model_df['true_label'],
        y_pred=model_df['pred_label'],
    )
    print(model, score)

openai 0.7712765957446809
meta 0.7614213197969543
mistral 0.7628032345013477
gpt4 0.78125


In [8]:
# Task-by-model table of mean correctness, rendered as LaTeX source.
(
    df.groupby(['model', 'task'])['correct']
    .mean()
    .reset_index()
    .pivot(index='task', columns='model', values='correct')
    .to_latex()
)

'\\begin{tabular}{lrrrr}\n\\toprule\nmodel & gpt4 & meta & mistral & openai \\\\\ntask &  &  &  &  \\\\\n\\midrule\nd3_0_irony & 0.900000 & 0.750000 & 0.900000 & 0.800000 \\\\\nd3_10_infrastructure & 0.650000 & 0.550000 & 0.700000 & 0.650000 \\\\\nd3_13_water & 0.850000 & 0.800000 & 0.800000 & 0.800000 \\\\\nd3_14_search & 0.450000 & 0.500000 & 0.450000 & 0.700000 \\\\\nd3_15_utility & 0.700000 & 0.550000 & 0.650000 & 0.600000 \\\\\nd3_16_hillary & 0.800000 & 0.800000 & 0.800000 & 0.850000 \\\\\nd3_17_hillary & 0.950000 & 0.950000 & 1.000000 & 1.000000 \\\\\nd3_18_offensive & 0.600000 & 0.550000 & 0.550000 & 0.700000 \\\\\nd3_19_offensive & 0.650000 & 0.650000 & 0.700000 & 0.600000 \\\\\nd3_1_objective & 0.950000 & 0.750000 & 0.850000 & 0.650000 \\\\\nd3_20_pro-life & 0.700000 & 0.900000 & 0.750000 & 0.700000 \\\\\nd3_21_pro-choice & 0.900000 & 0.950000 & 0.900000 & 0.950000 \\\\\nd3_25_math & 0.700000 & 0.900000 & 0.800000 & 0.650000 \\\\\nd3_27_grammar & 0.750000 & 0.750000 & 0.80000