In [1]:
import os
import pandas as pd

In [2]:
# Read the test-set results CSV that sits one directory above this notebook,
# then preview the first rows.
results_csv_path = os.path.join('..', 'test_results.csv')
df = pd.read_csv(results_csv_path)
df.head()

Unnamed: 0,model,prompt_type_index,dev_split,prompt_type,prompt_index,test_split,run,TP,FP,FN,Prec,Rec,F0.5,GLEU,prompt_template
0,gpt-4-0613,0-shot_7,fce-dev,0-shot,7,fce-test,1,2233,2110,2356,0.5142,0.4866,0.5084,,system: You are a grammatical error correction...
1,gpt-3.5-turbo-0613,0-shot_5,fce-dev,0-shot,5,fce-test,1,2345,3136,2244,0.4278,0.511,0.4422,,system: You are an English language teacher. A...
2,stabilityai/StableBeluga2,0-shot_7,fce-dev,0-shot,7,fce-test,1,2077,2491,2512,0.4547,0.4526,0.4543,,### System: You are a grammatical error correc...
3,facebook/opt-iml-max-30b,0-shot_7,fce-dev,0-shot,7,fce-test,1,835,627,3754,0.5711,0.182,0.4,,You are a grammatical error correction tool. Y...
4,command,0-shot_6,fce-dev,0-shot,6,fce-test,1,1731,2426,2858,0.4164,0.3772,0.4079,,You are a grammatical error correction tool. Y...


In [3]:
# Build one LaTeX table fragment ("score & shots & prompt") for a model.
# In the final table each row is a model and each column group is a split.

# Placeholder written into a cell that has no usable value.
_MISSING_CELL = '--'

# 0-shot prompt index -> prompt label. Indices not listed here are left
# as the raw index string.
_ZERO_SHOT_PROMPT_LABELS = {
    10: '\\textsc{coyne}',
    6: '\\textsc{tool}',
    7: '\\textsc{tool}',
    5: '\\textsc{elt}',
}

# Few-shot prompt index -> prompt label. Any other index is labelled ELT.
_FEW_SHOT_PROMPT_LABELS = {
    1: '\\textsc{coyne}',
    2: '\\textsc{tool}',
}
_FEW_SHOT_DEFAULT_LABEL = '\\textsc{elt}'


def write_latex_row_expanded_subcript(df, model, metric):
    r"""Return the LaTeX cells ``"<score> & <shots> & <prompt>"`` for one model.

    Parameters
    ----------
    df : pandas.DataFrame
        Results to read from. Must contain the columns ``'model'``,
        ``'prompt_type_index'`` and the column named by ``metric``. The
        caller is expected to have filtered it to a single test split;
        only the first row matching ``model`` is used.
    model : str
        Value to look up in the ``'model'`` column.
    metric : str
        Name of the score column (e.g. ``'F0.5'`` or ``'GLEU'``).

    Returns
    -------
    str
        Three cells joined by ``' & '``:

        * the score formatted to three decimals, wrapped in ``\textbf{}``
          when it equals the maximum of ``df[metric]``; ``--`` if it is NaN;
        * the number of shots parsed from ``prompt_type_index``;
        * the prompt label (``\textsc{coyne}``, ``\textsc{tool}``,
          ``\textsc{elt}``, ...).

        If ``model`` has no row in ``df`` the result is ``'-- & -- & --'``
        so that the table row keeps its column count.

    Raises
    ------
    ValueError
        If the prompt index part of ``prompt_type_index`` is not an integer
        (for every prompt type except the ``Coyne`` few-shot variant).
    IndexError
        If ``prompt_type_index`` contains no ``'_'`` separator.
    """
    # Filter once and reuse for both the score and the prompt lookup.
    model_rows = df[df['model'] == model]
    if model_rows.empty:
        # Previously this surfaced as a bare IndexError from ``.values[0]``.
        return ' & '.join([_MISSING_CELL] * 3)

    metric_value = model_rows[metric].values[0]
    if pd.isna(metric_value):
        # Avoid printing the literal text "nan" into the table.
        metric_cell = _MISSING_CELL
    else:
        metric_cell = '{:.3f}'.format(metric_value)
        # Bold the best score among all rows of the frame that was passed in.
        if metric_value == df[metric].max():
            metric_cell = '\\textbf{' + metric_cell + '}'

    # ``prompt_type_index`` has the form "<prompt_type>_<prompt_index>".
    prompt_type_index = model_rows['prompt_type_index'].values[0]
    parts = prompt_type_index.split('_')
    prompt_type, prompt_index = parts[0], parts[1]

    if prompt_type == '0-shot':
        prompt_type = '0'
        prompt_index = _ZERO_SHOT_PROMPT_LABELS.get(int(prompt_index), prompt_index)
    elif 'Coyne' in prompt_type:
        # The shot count is hard-coded to 2 for this variant, and the label
        # carries an asterisk to distinguish it from the regular coyne prompt.
        prompt_type = '2'
        prompt_index = '\\textsc{coyne}$^{*}$'
    else:
        # "<k>-shot" -> "<k>"
        prompt_type = prompt_type.split('-')[0]
        prompt_index = _FEW_SHOT_PROMPT_LABELS.get(
            int(prompt_index), _FEW_SHOT_DEFAULT_LABEL
        )

    return f'{metric_cell} & {prompt_type} & {prompt_index}'

In [4]:

# Print the body of the LaTeX results table: one line per model, holding the
# model's short name followed by one "score & shots & prompt" group per split.
# print(' & '.join([''] + [split for split in sorted(splits)]) + ' \\\\')

# Row order of the table.
model_order = [
    'bigscience/bloomz-7b1',
    'google/flan-t5-xxl',
    'Writer/InstructPalmyra-20b',
    'facebook/opt-iml-max-30b',
    'tiiuae/falcon-40b-instruct',
    'meta-llama/Llama-2-70b-chat-hf',
    'stabilityai/StableBeluga2',
    'command',
    'gpt-3.5-turbo-0613',
    # 'gpt-4-0613',
]

# Column-group order of the table.
split_order = ['fce-test', 'jfleg-test', 'conll14-test']

for model in model_order:
    # Only models that appear somewhere in the results get a row.
    if model in df['model'].values:
        cells = [
            write_latex_row_expanded_subcript(
                df[df['test_split'] == split],
                model,
                # jfleg splits are scored with GLEU, all others with F0.5
                'GLEU' if 'jfleg' in split else 'F0.5',
            )
            for split in split_order
        ]
        # Drop the organisation prefix, e.g. "google/flan-t5-xxl" -> "flan-t5-xxl".
        short_name = model.split('/')[-1]
        print(' & '.join([short_name] + cells) + ' \\\\')
        # print('\\hline')

bloomz-7b1 & 0.358 & 3 & \textsc{coyne} & 0.498 & 2 & \textsc{coyne}$^{*}$ & 0.405 & 3 & \textsc{coyne} \\
flan-t5-xxl & 0.463 & 1 & \textsc{tool} & 0.508 & 1 & \textsc{tool} & 0.397 & 3 & \textsc{tool} \\
InstructPalmyra-20b & 0.396 & 2 & \textsc{coyne} & 0.572 & 0 & \textsc{tool} & 0.499 & 2 & \textsc{coyne} \\
opt-iml-max-30b & 0.400 & 0 & \textsc{tool} & 0.521 & 2 & \textsc{coyne}$^{*}$ & 0.396 & 3 & \textsc{elt} \\
falcon-40b-instruct & 0.456 & 2 & \textsc{tool} & 0.602 & 4 & \textsc{coyne} & 0.560 & 4 & \textsc{tool} \\
Llama-2-70b-chat-hf & 0.374 & 0 & \textsc{tool} & 0.560 & 0 & \textsc{tool} & 0.517 & 0 & \textsc{tool} \\
StableBeluga2 & 0.454 & 0 & \textsc{tool} & 0.613 & 0 & \textsc{coyne} & 0.572 & 0 & \textsc{tool} \\
command & 0.408 & 0 & \textsc{tool} & 0.592 & 2 & \textsc{coyne}$^{*}$ & 0.538 & 0 & \textsc{tool} \\
gpt-3.5-turbo-0613 & 0.442 & 0 & \textsc{elt} & 0.625 & 4 & \textsc{tool} & 0.572 & 1 & \textsc{tool} \\


In [5]:
# Show only the rows whose test split is 'conll14-test'.
is_conll14 = df['test_split'] == 'conll14-test'
df[is_conll14]

Unnamed: 0,model,prompt_type_index,dev_split,prompt_type,prompt_index,test_split,run,TP,FP,FN,Prec,Rec,F0.5,GLEU,prompt_template
20,gpt-4-0613,0-shot_6,wibea-dev,0-shot,6,conll14-test,1,0,0,0,0.6229,0.5447,0.6055,,system: You are a grammatical error correction...
21,stabilityai/StableBeluga2,0-shot_7,wibea-dev,0-shot,7,conll14-test,1,0,0,0,0.582,0.5335,0.5716,,### System: You are a grammatical error correc...
22,command,0-shot_6,wibea-dev,0-shot,6,conll14-test,1,0,0,0,0.5712,0.4376,0.5384,,You are a grammatical error correction tool. Y...
23,meta-llama/Llama-2-70b-chat-hf,0-shot_6,wibea-dev,0-shot,6,conll14-test,1,0,0,0,0.5115,0.5428,0.5175,,<s>[INST] <<SYS>> You are a grammatical error ...
24,tiiuae/falcon-40b-instruct,4-shot_2,wibea-dev,4-shot,2,conll14-test,1,0,0,0,0.5917,0.4616,0.5601,,You are a grammatical error correction tool. Y...
25,gpt-3.5-turbo-0613,1-shot_2,wibea-dev,1-shot,2,conll14-test,1,0,0,0,0.5743,0.563,0.572,,system: You are a grammatical error correction...
26,google/flan-t5-xxl,3-shot_2,wibea-dev,3-shot,2,conll14-test,1,0,0,0,0.6284,0.1603,0.3967,,You are a grammatical error correction tool. Y...
27,facebook/opt-iml-max-30b,3-shot_3,wibea-dev,3-shot,3,conll14-test,1,0,0,0,0.5821,0.1734,0.3956,,You are an English language teacher. A student...
28,Writer/InstructPalmyra-20b,2-shot_1,wibea-dev,2-shot,1,conll14-test,1,0,0,0,0.5486,0.3671,0.4993,,"Below is an instruction that describes a task,..."
29,bigscience/bloomz-7b1,3-shot_1,wibea-dev,3-shot,1,conll14-test,1,0,0,0,0.5764,0.1847,0.4047,,Reply with a corrected version of the input se...
