In [1]:
import pandas as pd
import sys
import os

In [2]:
df = pd.read_csv(os.path.join('..', 'cefr_results.csv'))

In [3]:
df.head()

Unnamed: 0,model,cefr,precision,recall,f05
0,gpt-4-0613,A,0.5561,0.5157,0.5475
1,gpt-4-0613,B,0.5243,0.4849,0.5159
2,gpt-4-0613,C,0.4196,0.4573,0.4267
3,gpt-4-0613,N,0.4732,0.6056,0.4948
4,StableBeluga2,A,0.4937,0.4766,0.4902


In [4]:
dfmelted = pd.melt(df, id_vars=['model', 'cefr'], value_vars=['precision', 'recall', 'f05'],
        var_name='metric', value_name='value')
dfmelted.head()


Unnamed: 0,model,cefr,metric,value
0,gpt-4-0613,A,precision,0.5561
1,gpt-4-0613,B,precision,0.5243
2,gpt-4-0613,C,precision,0.4196
3,gpt-4-0613,N,precision,0.4732
4,StableBeluga2,A,precision,0.4937


In [5]:
dfmelted.pivot(index=['model', 'metric'], columns='cefr', values=['value'])


Unnamed: 0_level_0,Unnamed: 1_level_0,value,value,value,value
Unnamed: 0_level_1,cefr,A,B,C,N
model,metric,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
InstructPalmyra-20b,f05,0.4084,0.3753,0.2799,0.3882
InstructPalmyra-20b,precision,0.4446,0.4099,0.2857,0.3826
InstructPalmyra-20b,recall,0.3082,0.2807,0.2588,0.4121
Llama-2-70b-chat-hf,f05,0.4125,0.3801,0.2735,0.3151
Llama-2-70b-chat-hf,precision,0.4027,0.3665,0.249,0.2824
Llama-2-70b-chat-hf,recall,0.4567,0.4463,0.451,0.5869
StableBeluga2,f05,0.4902,0.4618,0.344,0.4341
StableBeluga2,precision,0.4937,0.4659,0.3289,0.4077
StableBeluga2,recall,0.4766,0.4463,0.4214,0.5858
bloomz-7b1,f05,0.349,0.328,0.3276,0.3958


In [6]:
models = df.model.unique()
cefrs = df.cefr.unique()

In [7]:
rows = []
for model in sorted(models):
    row = [model]
    for cefr in ['A', 'B', 'C', 'N']:
        # select precision, recall, and f05 from row where model == model and cefr == cefr

        # precision
        precision = df[(df['model'] == model) & (df['cefr'] == cefr)]['precision'].values[0]
        row.append(precision)
        # recall
        recall = df[(df['model'] == model) & (df['cefr'] == cefr)]['recall'].values[0]
        row.append(recall)
        # f05
        f05 = df[(df['model'] == model) & (df['cefr'] == cefr)]['f05'].values[0]
        row.append(f05)
    
    rows.append(row)

df = pd.DataFrame(rows, columns=['model', 'A_precision', 'A_recall', 'A_f05', 'B_precision', 'B_recall', 'B_f05', 'C_precision', 'C_recall', 'C_f05', 'N_precision', 'N_recall', 'N_f05'])


In [8]:
df

Unnamed: 0,model,A_precision,A_recall,A_f05,B_precision,B_recall,B_f05,C_precision,C_recall,C_f05,N_precision,N_recall,N_f05
0,InstructPalmyra-20b,0.4446,0.3082,0.4084,0.4099,0.2807,0.3753,0.2857,0.2588,0.2799,0.3826,0.4121,0.3882
1,Llama-2-70b-chat-hf,0.4027,0.4567,0.4125,0.3665,0.4463,0.3801,0.249,0.451,0.2735,0.2824,0.5869,0.3151
2,StableBeluga2,0.4937,0.4766,0.4902,0.4659,0.4463,0.4618,0.3289,0.4214,0.344,0.4077,0.5858,0.4341
3,bloomz-7b1,0.531,0.1472,0.349,0.5256,0.131,0.328,0.455,0.1545,0.3276,0.4876,0.2258,0.3958
4,command,0.465,0.3604,0.4395,0.4223,0.33,0.3999,0.2833,0.2866,0.284,0.363,0.4412,0.3764
5,falcon-40b-instruct,0.5086,0.4163,0.487,0.488,0.3893,0.4645,0.3771,0.3558,0.3727,0.4225,0.488,0.4342
6,flan-t5-xxl,0.6405,0.1842,0.4283,0.6178,0.1545,0.3861,0.5514,0.1447,0.3529,0.6408,0.3174,0.5323
7,gpt-3.5-turbo-0613,0.4801,0.5241,0.4883,0.4482,0.4976,0.4573,0.322,0.478,0.3445,0.3661,0.6431,0.4006
8,gpt-4-0613,0.5561,0.5157,0.5475,0.5243,0.4849,0.5159,0.4196,0.4573,0.4267,0.4732,0.6056,0.4948
9,opt-iml-max-30b,0.6078,0.189,0.4211,0.5668,0.1453,0.3587,0.4935,0.1375,0.3251,0.5853,0.2893,0.4858


In [9]:
# print to latex with 3 decimal places
print(df.to_latex(index=False, float_format="%.3f"))

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
model & A_precision & A_recall & A_f05 & B_precision & B_recall & B_f05 & C_precision & C_recall & C_f05 & N_precision & N_recall & N_f05 \\
\midrule
InstructPalmyra-20b & 0.445 & 0.308 & 0.408 & 0.410 & 0.281 & 0.375 & 0.286 & 0.259 & 0.280 & 0.383 & 0.412 & 0.388 \\
Llama-2-70b-chat-hf & 0.403 & 0.457 & 0.412 & 0.366 & 0.446 & 0.380 & 0.249 & 0.451 & 0.274 & 0.282 & 0.587 & 0.315 \\
StableBeluga2 & 0.494 & 0.477 & 0.490 & 0.466 & 0.446 & 0.462 & 0.329 & 0.421 & 0.344 & 0.408 & 0.586 & 0.434 \\
bloomz-7b1 & 0.531 & 0.147 & 0.349 & 0.526 & 0.131 & 0.328 & 0.455 & 0.154 & 0.328 & 0.488 & 0.226 & 0.396 \\
command & 0.465 & 0.360 & 0.440 & 0.422 & 0.330 & 0.400 & 0.283 & 0.287 & 0.284 & 0.363 & 0.441 & 0.376 \\
falcon-40b-instruct & 0.509 & 0.416 & 0.487 & 0.488 & 0.389 & 0.465 & 0.377 & 0.356 & 0.373 & 0.422 & 0.488 & 0.434 \\
flan-t5-xxl & 0.640 & 0.184 & 0.428 & 0.618 & 0.154 & 0.386 & 0.551 & 0.145 & 0.353 & 0.641 & 0.317 & 0.532 \\
gpt-3.5-turb