In [1]:
# notebook to create a .csv with the best results on the dev sets for each model,
#  evaluated across both zero-shot and few-shot prompts

In [1]:
import os
import pandas as pd

In [2]:
# zero-shot file
# assume 1-index prompts
zero_shot_df = pd.read_csv(os.path.join('..', 'best_zero_shot_dev_results.csv'))

zero_shot_df.head()

Unnamed: 0,model,prompt_type_index,GLEU,Prec,Rec,split,F0.5
0,facebook/opt-iml-max-30b,0-shot_10,0.501,0.8347,0.4881,jfleg-dev,
1,gpt-4-0613,0-shot_8,0.582,0.6824,0.6359,jfleg-dev,
2,google/flan-t5-xxl,0-shot_5,0.459,0.8055,0.3681,jfleg-dev,
3,gpt-3.5-turbo-0613,0-shot_10,0.574,0.6382,0.638,jfleg-dev,
4,tiiuae/falcon-40b-instruct,0-shot_6,0.541,0.6452,0.5596,jfleg-dev,


In [3]:
zero_shot_df.prompt_type_index.unique()

array(['0-shot_10', '0-shot_8', '0-shot_5', '0-shot_6', '0-shot_7'],
      dtype=object)

In [4]:
# few-shot file
# assume 1-index prompts
few_shot_df = pd.read_csv(os.path.join('..', 'best_few_shot_dev_results.csv'))
few_shot_df.head()

Unnamed: 0,model,prompt_type_index,GLEU,Prec,Rec,split,F0.5
0,facebook/opt-iml-max-30b,2-shot-Coyne_1,0.506,0.7768,0.4899,jfleg-dev,
1,gpt-3.5-turbo-0613,4-shot_2,0.577,0.6356,0.6377,jfleg-dev,
2,google/flan-t5-xxl,1-shot_2,0.463,0.7792,0.3593,jfleg-dev,
3,stabilityai/StableBeluga2,4-shot_1,0.56,0.6312,0.6174,jfleg-dev,
4,tiiuae/falcon-40b-instruct,4-shot_1,0.548,0.6474,0.5521,jfleg-dev,


In [5]:
# concatenate zero_shot_dev_df and few_shot_df
best_results_dev_df = pd.concat([zero_shot_df, few_shot_df])
best_results_dev_df.head()

Unnamed: 0,model,prompt_type_index,GLEU,Prec,Rec,split,F0.5
0,facebook/opt-iml-max-30b,0-shot_10,0.501,0.8347,0.4881,jfleg-dev,
1,gpt-4-0613,0-shot_8,0.582,0.6824,0.6359,jfleg-dev,
2,google/flan-t5-xxl,0-shot_5,0.459,0.8055,0.3681,jfleg-dev,
3,gpt-3.5-turbo-0613,0-shot_10,0.574,0.6382,0.638,jfleg-dev,
4,tiiuae/falcon-40b-instruct,0-shot_6,0.541,0.6452,0.5596,jfleg-dev,


In [6]:
best_results_dev_df.prompt_type_index.unique()

array(['0-shot_10', '0-shot_8', '0-shot_5', '0-shot_6', '0-shot_7',
       '2-shot-Coyne_1', '4-shot_2', '1-shot_2', '4-shot_1', '2-shot_2',
       '3-shot_2', '3-shot_3', '2-shot_1', '3-shot_1', '1-shot_3',
       '1-shot_1'], dtype=object)

In [7]:
best_results_dev_df[best_results_dev_df['split'] == 'jfleg-dev']

Unnamed: 0,model,prompt_type_index,GLEU,Prec,Rec,split,F0.5
0,facebook/opt-iml-max-30b,0-shot_10,0.501,0.8347,0.4881,jfleg-dev,
1,gpt-4-0613,0-shot_8,0.582,0.6824,0.6359,jfleg-dev,
2,google/flan-t5-xxl,0-shot_5,0.459,0.8055,0.3681,jfleg-dev,
3,gpt-3.5-turbo-0613,0-shot_10,0.574,0.6382,0.638,jfleg-dev,
4,tiiuae/falcon-40b-instruct,0-shot_6,0.541,0.6452,0.5596,jfleg-dev,
5,stabilityai/StableBeluga2,0-shot_10,0.563,0.6131,0.6103,jfleg-dev,
6,command,0-shot_10,0.535,0.6147,0.5633,jfleg-dev,
7,meta-llama/Llama-2-70b-chat-hf,0-shot_6,0.5,0.5893,0.6054,jfleg-dev,
8,Writer/InstructPalmyra-20b,0-shot_7,0.517,0.5628,0.5269,jfleg-dev,
9,bigscience/bloomz-7b1,0-shot_10,0.402,0.6667,0.1735,jfleg-dev,


In [8]:
# function to return a table of max F0.5 scores for each model, filtered by split
def get_max_f05(df, split):
    # filter by split
    df_split = df[df['split'] == split]
    # group df by model and select the rows with the maximum F0.5 score
    max_metric = 'F0.5'
    if 'jfleg' in split:
        max_metric = 'GLEU'

    idx = df_split.groupby(['model'])[max_metric].transform(max) == df_split[max_metric]
    df_max = df_split[idx]
    return df_max

In [10]:
df_max_splits = {}
for split in best_results_dev_df.split.unique():
    # metric is gleu if split is jfleg
    metric = 'F0.5' if 'jfleg' not in split else 'GLEU'
    df_temp = get_max_f05(best_results_dev_df, split)
    df_max_splits[split] = df_temp[['model', 'prompt_type_index', metric, 'Prec', 'Rec']]
    # convert metric to float
    df_max_splits[split][metric] = df_max_splits[split][metric].astype(float)
    # format metric to 3 decimal places
    df_max_splits[split][metric] = df_max_splits[split][metric].apply(lambda x: round(x, 3))


  idx = df_split.groupby(['model'])[max_metric].transform(max) == df_split[max_metric]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_max_splits[split][metric] = df_max_splits[split][metric].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_max_splits[split][metric] = df_max_splits[split][metric].apply(lambda x: round(x, 3))
  idx = df_split.groupby(['model'])[max_metric].transform(max) == df_split[max_metric]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

In [11]:
df_max_splits

{'jfleg-dev':                             model prompt_type_index   GLEU    Prec     Rec
 1                      gpt-4-0613          0-shot_8  0.582  0.6824  0.6359
 5       stabilityai/StableBeluga2         0-shot_10  0.563  0.6131  0.6103
 7  meta-llama/Llama-2-70b-chat-hf          0-shot_6  0.500  0.5893  0.6054
 8      Writer/InstructPalmyra-20b          0-shot_7  0.517  0.5628  0.5269
 0        facebook/opt-iml-max-30b    2-shot-Coyne_1  0.506  0.7768  0.4899
 1              gpt-3.5-turbo-0613          4-shot_2  0.577  0.6356  0.6377
 2              google/flan-t5-xxl          1-shot_2  0.463  0.7792  0.3593
 4      tiiuae/falcon-40b-instruct          4-shot_1  0.548  0.6474  0.5521
 5                         command    2-shot-Coyne_1  0.543  0.5948  0.5873
 8           bigscience/bloomz-7b1    2-shot-Coyne_1  0.456  0.6389  0.3588,
 'wibea-dev':                              model prompt_type_index   F0.5    Prec     Rec
 10                      gpt-4-0613          0-shot_6  0.510

In [12]:
# for each dataframe in df_max_splits, add a column with the split name
for split in sorted(df_max_splits):
    df_max_splits[split]['split'] = split

# concatenate all dataframes from df_max_splits
df_max_all = pd.concat(df_max_splits.values(), axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_max_splits[split]['split'] = split


In [13]:
df_max_all

Unnamed: 0,model,prompt_type_index,GLEU,Prec,Rec,split,F0.5
1,gpt-4-0613,0-shot_8,0.582,0.6824,0.6359,jfleg-dev,
5,stabilityai/StableBeluga2,0-shot_10,0.563,0.6131,0.6103,jfleg-dev,
7,meta-llama/Llama-2-70b-chat-hf,0-shot_6,0.5,0.5893,0.6054,jfleg-dev,
8,Writer/InstructPalmyra-20b,0-shot_7,0.517,0.5628,0.5269,jfleg-dev,
0,facebook/opt-iml-max-30b,2-shot-Coyne_1,0.506,0.7768,0.4899,jfleg-dev,
1,gpt-3.5-turbo-0613,4-shot_2,0.577,0.6356,0.6377,jfleg-dev,
2,google/flan-t5-xxl,1-shot_2,0.463,0.7792,0.3593,jfleg-dev,
4,tiiuae/falcon-40b-instruct,4-shot_1,0.548,0.6474,0.5521,jfleg-dev,
5,command,2-shot-Coyne_1,0.543,0.5948,0.5873,jfleg-dev,
8,bigscience/bloomz-7b1,2-shot-Coyne_1,0.456,0.6389,0.3588,jfleg-dev,


In [14]:
# save results to csv
df_max_all.to_csv(os.path.join('..', 'best_dev_set_results.csv'), index=False)