In [1]:
import os
import glob
import pandas as pd

In [2]:
# Load the error-type results, store prompt_index as text, and append a composite
# key column of the form "<model>_<prompt_type>_<prompt_index>_<split>".
df = (
    pd.read_csv('../all_error_type_results.csv')
    # prompt_index must be a string so it can be concatenated into the key below
    .astype({'prompt_index': str})
    .assign(
        model_prompt_type_prompt_index_split=lambda d: (
            d['model'] + '_' + d['prompt_type'] + '_' + d['prompt_index'] + '_' + d['split']
        )
    )
)

In [3]:
# Load the best dev-set results and reshape them into the same key layout as df:
#   1. drop the GLEU and F0.5 columns
#   2. split 'prompt_type_index' on '_' -- piece 0 becomes prompt_type and
#      piece 1 becomes prompt_index (cast to str)
#   3. drop the now-redundant 'prompt_type_index' column
#   4. append the composite key "<model>_<prompt_type>_<prompt_index>_<split>"
dev_df = (
    pd.read_csv(os.path.join('..', 'best_dev_set_results.csv'))
    .drop(columns=['GLEU', 'F0.5'])
    .assign(
        prompt_type=lambda d: d['prompt_type_index'].map(lambda s: s.split('_')[0]),
        prompt_index=lambda d: d['prompt_type_index'].map(lambda s: s.split('_')[1]).astype(str),
    )
    .drop(columns=['prompt_type_index'])
    .assign(
        model_prompt_type_prompt_index_split=lambda d: (
            d['model'] + '_' + d['prompt_type'] + '_' + d['prompt_index'] + '_' + d['split']
        )
    )
)

In [4]:
def check_unique(df, key_columns=('model', 'prompt_type', 'prompt_index', 'split')):
    """Return the shape of ``df`` restricted to its distinct key combinations.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``key_columns``.
    key_columns : sequence of str, optional
        Columns that together identify one run. Defaults to
        ``('model', 'prompt_type', 'prompt_index', 'split')``.

    Returns
    -------
    tuple of int
        ``(n_distinct_combinations, len(key_columns))``.

    Raises
    ------
    KeyError
        If any of ``key_columns`` is missing from ``df``.
    """
    # Selecting columns with a list already produces a new frame, so the input
    # is never mutated and no up-front copy of the full frame is required.
    return df[list(key_columns)].drop_duplicates().shape

In [5]:
check_unique(dev_df)

(30, 4)

In [6]:
dev_df.head()

Unnamed: 0,model,split,prompt_type,prompt_index,model_prompt_type_prompt_index_split
0,gpt-4-0613,jfleg-dev,0-shot,8,gpt-4-0613_0-shot_8_jfleg-dev
1,stabilityai/StableBeluga2,jfleg-dev,0-shot,10,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
2,meta-llama/Llama-2-70b-chat-hf,jfleg-dev,0-shot,6,meta-llama/Llama-2-70b-chat-hf_0-shot_6_jfleg-dev
3,Writer/InstructPalmyra-20b,jfleg-dev,0-shot,7,Writer/InstructPalmyra-20b_0-shot_7_jfleg-dev
4,facebook/opt-iml-max-30b,jfleg-dev,2-shot-Coyne,1,facebook/opt-iml-max-30b_2-shot-Coyne_1_jfleg-dev


In [7]:
df.head()

Unnamed: 0,model,prompt_type,prompt_index,split,Category,TP,FP,FN,P,R,F0.5,model_prompt_type_prompt_index_split
0,bigscience/bloomz-7b1,1-shot,1,fce-dev,M:ADJ,0,0,10,1.0,0.0,0.0,bigscience/bloomz-7b1_1-shot_1_fce-dev
1,bigscience/bloomz-7b1,1-shot,1,fce-dev,M:ADV,1,4,26,0.2,0.037,0.1064,bigscience/bloomz-7b1_1-shot_1_fce-dev
2,bigscience/bloomz-7b1,1-shot,1,fce-dev,M:CONJ,0,0,16,1.0,0.0,0.0,bigscience/bloomz-7b1_1-shot_1_fce-dev
3,bigscience/bloomz-7b1,1-shot,1,fce-dev,M:CONTR,0,0,1,1.0,0.0,0.0,bigscience/bloomz-7b1_1-shot_1_fce-dev
4,bigscience/bloomz-7b1,1-shot,1,fce-dev,M:DET,16,14,198,0.5333,0.0748,0.2395,bigscience/bloomz-7b1_1-shot_1_fce-dev


In [8]:
# spot-check dev_df: model 'stabilityai/StableBeluga2', prompt_type '0-shot',
# split 'jfleg-dev', prompt_index '10'
dev_df.loc[
    dev_df['model'].eq('stabilityai/StableBeluga2')
    & dev_df['prompt_type'].eq('0-shot')
    & dev_df['split'].eq('jfleg-dev')
    & dev_df['prompt_index'].eq('10')
]

Unnamed: 0,model,split,prompt_type,prompt_index,model_prompt_type_prompt_index_split
1,stabilityai/StableBeluga2,jfleg-dev,0-shot,10,stabilityai/StableBeluga2_0-shot_10_jfleg-dev


In [9]:
# check before merge
check_unique(df)

(561, 4)

In [10]:
# Use dev_df to select rows from df: the inner merge keeps only the rows of df
# whose run also appears in dev_df.
# The four identifying columns and the derived composite key are all listed in
# `on`, so the columns shared by both frames are not duplicated with _x/_y suffixes.
# validate='many_to_one' makes pandas raise a MergeError if dev_df ever contains a
# duplicated key, which would otherwise silently multiply the matching rows of df.
dfmerge = df.merge(
    dev_df,
    how='inner',
    on=['model', 'prompt_type', 'prompt_index', 'split', 'model_prompt_type_prompt_index_split'],
    validate='many_to_one',
)

In [11]:
dfmerge

Unnamed: 0,model,prompt_type,prompt_index,split,Category,TP,FP,FN,P,R,F0.5,model_prompt_type_prompt_index_split
0,bigscience/bloomz-7b1,3-shot,1,fce-dev,M:ADJ,0,0,10,1.0000,0.0000,0.0000,bigscience/bloomz-7b1_3-shot_1_fce-dev
1,bigscience/bloomz-7b1,3-shot,1,fce-dev,M:ADV,2,2,25,0.5000,0.0741,0.2326,bigscience/bloomz-7b1_3-shot_1_fce-dev
2,bigscience/bloomz-7b1,3-shot,1,fce-dev,M:CONJ,0,3,16,0.0000,0.0000,0.0000,bigscience/bloomz-7b1_3-shot_1_fce-dev
3,bigscience/bloomz-7b1,3-shot,1,fce-dev,M:CONTR,0,1,1,0.0000,0.0000,0.0000,bigscience/bloomz-7b1_3-shot_1_fce-dev
4,bigscience/bloomz-7b1,3-shot,1,fce-dev,M:DET,38,21,176,0.6441,0.1776,0.4222,bigscience/bloomz-7b1_3-shot_1_fce-dev
...,...,...,...,...,...,...,...,...,...,...,...,...
1604,stabilityai/StableBeluga2,0-shot,7,fce-dev,U:PRON,8,25,10,0.2424,0.4444,0.2667,stabilityai/StableBeluga2_0-shot_7_fce-dev
1605,stabilityai/StableBeluga2,0-shot,7,fce-dev,U:PUNCT,18,50,27,0.2647,0.4000,0.2839,stabilityai/StableBeluga2_0-shot_7_fce-dev
1606,stabilityai/StableBeluga2,0-shot,7,fce-dev,U:VERB,7,12,17,0.3684,0.2917,0.3500,stabilityai/StableBeluga2_0-shot_7_fce-dev
1607,stabilityai/StableBeluga2,0-shot,7,fce-dev,U:VERB:FORM,1,4,1,0.2000,0.5000,0.2273,stabilityai/StableBeluga2_0-shot_7_fce-dev


In [12]:
# spot-check dfmerge: model 'stabilityai/StableBeluga2', prompt_type '0-shot',
# split 'jfleg-dev', prompt_index '10'
dfmerge.loc[
    dfmerge['model'].eq('stabilityai/StableBeluga2')
    & dfmerge['prompt_type'].eq('0-shot')
    & dfmerge['split'].eq('jfleg-dev')
    & dfmerge['prompt_index'].eq('10')
]

Unnamed: 0,model,prompt_type,prompt_index,split,Category,TP,FP,FN,P,R,F0.5,model_prompt_type_prompt_index_split
1232,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:ADJ,0,5,5,0.0,0.0,0.0,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1233,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:ADV,3,7,13,0.3,0.1875,0.2679,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1234,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:CONJ,4,3,13,0.5714,0.2353,0.4444,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1235,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:CONTR,1,0,0,1.0,1.0,1.0,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1236,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:DET,93,27,31,0.775,0.75,0.7699,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1237,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:NOUN,9,17,6,0.3462,0.6,0.3782,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1238,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:NOUN:POSS,4,0,0,1.0,1.0,1.0,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1239,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:OTHER,13,36,53,0.2653,0.197,0.2481,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1240,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:PART,0,1,0,0.0,1.0,0.0,stabilityai/StableBeluga2_0-shot_10_jfleg-dev
1241,stabilityai/StableBeluga2,0-shot,10,jfleg-dev,M:PREP,27,13,17,0.675,0.6136,0.6618,stabilityai/StableBeluga2_0-shot_10_jfleg-dev


In [13]:
check_unique(dfmerge)

(30, 4)

In [14]:
dev_df.shape, dfmerge.shape

((30, 5), (1609, 12))

In [15]:
# rows of dev_df whose composite key does NOT occur in dfmerge
# (an empty frame means every dev_df row found a match in the merge)
dev_df.loc[
    lambda d: ~d['model_prompt_type_prompt_index_split'].isin(
        dfmerge['model_prompt_type_prompt_index_split']
    )
]

Unnamed: 0,model,split,prompt_type,prompt_index,model_prompt_type_prompt_index_split


In [16]:
# save dfmerge to CSV in the parent directory; index=False leaves the
# row index out of the written file
dfmerge.to_csv('../best_dev_set_error_type_results.csv', index=False)