In [None]:
import glob
import os
import pandas as pd
import sklearn
import sklearn.metrics
import seaborn as sns
import matplotlib.pyplot as plt

# Folder holding the result files to analyse (matched as d3_*.csv below).
# Alternative results folder: 'out__one_shot'
folder = 'out'

# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# order in which files are concatenated later the same on every run.
paths = sorted(glob.glob(os.path.join(folder, "d3_*.csv")))
if not paths:
    # Fail here with a readable message instead of a bare IndexError below.
    raise FileNotFoundError(f"no d3_*.csv result files found in {folder!r}")

# Sanity check: show the first matched file.
paths[0]

In [None]:
def read_csv(path, n_header_lines=7):
    """Parse one result file into a DataFrame with one row per record.

    The file is not parsed as regular CSV. The first ``n_header_lines`` lines
    are discarded, and every remaining non-blank line is evaluated as a Python
    expression that unpacks into ``(idx, data_str)``. ``data_str`` is then
    evaluated as well and must yield the six fields listed under Returns.

    Parameters
    ----------
    path : str
        Path of the file to read.
    n_header_lines : int, default 7
        Number of leading lines to skip before the records start.

    Returns
    -------
    pandas.DataFrame
        Columns ``task, idx, true_label, pred_label, answer, model``, in file
        order. Empty (but with those columns) if the file has no records.

    Raises
    ------
    OSError
        If the file cannot be opened.
    SyntaxError, ValueError
        If a record line is not a valid two-element Python expression.
    """
    # Context manager so the file handle is closed deterministically.
    with open(path) as f:
        lines = f.readlines()[n_header_lines:]  # discard header

    records = []
    for line in lines:
        if not line.strip():
            # A blank (e.g. trailing) line is not a record; eval would raise on it.
            continue
        # SECURITY NOTE(review): eval executes arbitrary Python found in the
        # file. Only use this on result files you generated yourself; consider
        # ast.literal_eval if every record is a plain literal.
        _, data_str = eval(line)  # the outer index is not needed
        records.append(eval(data_str))
    return pd.DataFrame(
        records,
        columns=['task', 'idx', 'true_label', 'pred_label', 'answer', 'model'])


# Load every result file and stack them into one long frame (one row per
# model prediction). ignore_index=True gives each row a unique label; the
# per-file row labels would otherwise repeat across files.
dfs = [read_csv(path) for path in paths]
df = pd.concat(dfs, ignore_index=True)

# add example_len: number of whitespace-separated tokens of each example's input
df['example_len'] = None
for task in df.task.unique():
    # The processed-examples file is named after the task with its last
    # "_"-separated component removed.
    task_str = '_'.join(task.split('_')[:-1])
    examples = pd.read_csv(f'd3_processed/{task_str}.csv')
    examples['example_len'] = examples['input'].apply(lambda x: len(x.split()))
    # "idx" is looked up as a row label of the examples file (KeyError if it
    # is missing). to_numpy() makes the assignment positional, so it does not
    # depend on index alignment between the two frames.
    task_rows = df.task == task
    df.loc[task_rows, 'example_len'] = (
        examples['example_len'].loc[df.loc[task_rows, 'idx'].tolist()].to_numpy())
# The column was seeded with None (object dtype); store it as a real numeric column.
df['example_len'] = pd.to_numeric(df['example_len'])

# add ensemble model: majority vote over every model's prediction for the
# same (task, idx) pair
df['majority_vote'] = df.groupby(['task', 'idx'])['pred_label'].transform(
    lambda x: x.value_counts().idxmax())
# The 'meta' rows serve as the template for the ensemble rows, so the ensemble
# gets one row per 'meta' prediction; their 'answer' column still holds the
# 'meta' answer. .copy() makes d_ens an independent frame, so the assignments
# below do not write into a slice of df (SettingWithCopyWarning).
d_ens = df[df.model == 'meta'].copy()
d_ens['model'] = 'ensemble'
d_ens['pred_label'] = d_ens['majority_vote']
df = pd.concat([df, d_ens], ignore_index=True)

# A prediction is correct when it equals the ground-truth label.
df['correct'] = df['true_label'] == df['pred_label']

In [None]:
# Average example length over all rows of df.
df.example_len.mean()

In [None]:
# Longest example length over all rows of df.
df.example_len.max()

In [None]:
# Accuracy per model: mean of the boolean `correct` column within each model.
df.groupby('model').correct.mean()
# Variant restricted to examples up to a length cap:
# df[df.example_len <= 100000].groupby('model')['correct'].mean()

In [None]:
sns.set_context('talk')

# Raw model key -> label shown on the x axis; unknown keys are kept as-is.
model_labels = {'meta': 'LLaMA-3 (8B)', 'ensemble': 'Ensemble', 'mistral': 'Mistral (7B)', 'openai': 'GPT-3.5', 'gpt4': 'GPT-4'}
# Left-to-right order of the models, shared by the bars and the point overlay.
model_order = ['LLaMA-3 (8B)', 'Mistral (7B)', 'GPT-3.5', 'GPT-4']

d_plot = df
# d_plot = df[df.example_len <= 1000]
d_plot = d_plot[~d_plot.model.isin(['ensemble'])]  # the ensemble is not plotted
# One accuracy value per (model, task) pair.
d_plot = d_plot.groupby(['model', 'task'])['correct'].mean().reset_index()
# Fold accuracy around 0.5: each value x becomes max(x, 1 - x).
# NOTE(review): presumably because label polarity is arbitrary per task - confirm.
d_plot['correct'] = d_plot['correct'].apply(lambda x: max(x, 1 - x))
d_plot['model'] = d_plot['model'].apply(lambda x: model_labels.get(x, x))
# display(d_plot)

display(d_plot.groupby('model')['correct'].mean())
# Bar plot of mean per-task accuracy for each model, with a 95% CI error bar
plt.figure(figsize=(10, 5))
barplot = sns.barplot(x='model', y='correct', data=d_plot,
                      order=model_order,
                      estimator='mean', errorbar=('ci', 95), err_kws={'color': 'black'}, capsize=0.1, alpha=0.3)

# Annotate the bars with the mean values
for p in barplot.patches:
    height = p.get_height()
    barplot.annotate(f'{height:.3f}',
                     xy=(p.get_x() + p.get_width() / 2., height),
                     # offset in points from the bar's top centre: 33 right, 6 up
                     xytext=(33, 6),
                     fontsize='x-small',
                     textcoords='offset points',
                     color='#72a6c2',
                     ha='center', va='center')

# show points: one per (model, task). Pass the same explicit order as the bars
# so each point column is guaranteed to sit on its own model's bar.
sns.stripplot(x='model', y='correct', data=d_plot, order=model_order,
              color='gray', alpha=0.5)
sns.despine()
plt.xlabel('Question-answering LLM')
plt.ylabel('Accuracy')
plt.savefig('d3_accuracy.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Standard error of the mean of `correct`, computed per model over all rows.
df.groupby('model')['correct'].sem()

In [None]:
# Print one F1 score per model (scikit-learn's default f1_score settings),
# computed over all of that model's rows.
for model in df.model.unique():
    model_df = df.loc[df.model == model]
    print(model, sklearn.metrics.f1_score(
        y_true=model_df.true_label, y_pred=model_df.pred_label))

In [None]:
# Mean accuracy for every (model, task) pair, as flat columns model / task / correct.
# Possible follow-up to export a task-by-model LaTeX table:
# .reset_index().pivot(index='task', columns='model', values='correct').to_latex()
d = (
    df.groupby(['model', 'task'])['correct']
    .mean()
    .reset_index()
)

In [None]:
# Fold accuracy around 0.5: each value acc becomes max(acc, 1 - acc).
# NOTE(review): presumably because label polarity is arbitrary per task - confirm.
d['correct'] = d['correct'].map(lambda acc: max(acc, 1 - acc))

In [None]:
# Average of the per-(model, task) values in d, per model (each task weighs equally).
d.groupby('model').correct.mean()

In [None]:
# Display d in full: lift pandas' row and column display limits inside this
# block only.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
):
    display(d)