In [2]:
import pandas as pd
import re

def extract_content(text):
    """Return the lower-cased text inside the first ``[[...]]`` marker of ``text``.

    Args:
        text: Raw output to scan. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for empty cells) is treated
            as containing no marker.

    Returns:
        The content of the first marker, lower-cased, or ``None`` when
        ``text`` is not a string or contains no ``[[...]]`` marker.
    """
    # Guard first: re.search raises TypeError on non-string input.
    if not isinstance(text, str):
        return None
    # Non-greedy capture so the match stops at the first closing "]]".
    match = re.search(r'\[\[(.*?)\]\]', text)
    if match is None:
        return None
    return match.group(1).lower()
    
def extract_number(text):
    """Return the integer inside the last ``[[<digits>]]`` marker of ``text``.

    Only markers whose content is made up solely of digits are considered;
    when several are present the last one wins.

    Args:
        text: Raw output to scan. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for empty cells) is treated
            as containing no marker.

    Returns:
        The number as an ``int``, or ``None`` when ``text`` is not a string
        or contains no all-digit ``[[...]]`` marker.
    """
    # Guard first: re.findall raises TypeError on non-string input.
    if not isinstance(text, str):
        return None
    numbers = re.findall(r'\[\[(\d+)\]\]', text)
    return int(numbers[-1]) if numbers else None

In [24]:
# Per-breakdown accumulators: one summary frame is appended per evaluation type.
metrics_cat_ls = []
metrics_entity_look_ls = []
metrics_landmark_look_ls = []
metrics_ans_form_ls = []

# Seeds of the three runs stored under out_<eval_type>/ for each evaluation type.
# A dict lookup fails loudly on an unknown evaluation type instead of silently
# reusing the seeds of the previous iteration.
SEEDS_BY_EVAL_TYPE = {
    'openai': (1131, 2352, 3158),
    'full_hf_140': (1095, 2109, 2455),
    'no_entity': (1048, 7463, 2705),
    'no_reference': (2559, 3032, 2798),
}

# Per-question 0/1 metric columns, in reporting order.
metrics = [
    'success_1',
    'success_3',
    'consistency_2',
    'consistency_3',
    'info_gain_3_3',
    'info_gain_3_2',
    'info_gain_2_2',
    'info_gain_3_3_zero',
]

# Display labels for the metric columns (also read by the export cell).
metrics_rename = {
    'success_1': "Success Rate (at least 1 seed)",
    'success_3': "Success Rate (all 3 seeds)",
    'consistency_2': "Consistency (at least 2 seeds)",
    'consistency_3': "Consistency (all 3 seeds)",
    'info_gain_3_3': "Info Gain (Consistency 3 + Info Gain 3)",
    'info_gain_3_2': "Info Gain (Consistency 3 + Info Gain 2)",
    'info_gain_2_2': "Info Gain (Consistency 2 + Info Gain 2)",
    'info_gain_3_3_zero': "Info Gain (Consistency 3 + Info Gain 3) => 0",
}


def _load_seed_output(eval_type, seed, run):
    """Load one seed's output file and extract its answer and info-gain columns.

    Args:
        eval_type: Evaluation type; selects the ``out_<eval_type>`` directory.
        seed: Seed identifying the ``out_info_<seed>.csv`` file.
        run: 1-based run number used to suffix the extracted columns.

    Returns:
        DataFrame with the columns ``id``, ``code_out_<run>`` and
        ``info_out_<run>``. Rows with a missing ``output`` or ``info_output``
        are excluded, as are entity-name questions whose extracted answer
        is ``'nan'``.
    """
    code_col, info_col = f'code_out_{run}', f'info_out_{run}'
    raw = pd.read_csv(f'out_{eval_type}/out_info_{seed}.csv')

    # .copy() so the column assignments below act on an independent frame
    # instead of a filtered slice (which triggers SettingWithCopyWarning).
    usable = raw[raw['output'].notna() & raw['info_output'].notna()].copy()

    usable[code_col] = usable['output'].apply(extract_content)
    usable[info_col] = usable['info_output'].apply(extract_number)

    # An extracted answer of 'nan' becomes 'no' for yes/no questions and '0'
    # for numeric questions; for entity-name questions the row is discarded.
    is_nan = usable[code_col] == 'nan'
    usable.loc[is_nan & (usable['answer_format'] == 'yes or no'), code_col] = 'no'
    usable.loc[is_nan & (usable['answer_format'] == 'a single number'), code_col] = '0'
    discard = is_nan & (usable['answer_format'] == 'a single entity name')

    return usable.loc[~discard, ['id', code_col, info_col]]


# read the questions (identical for every evaluation type, so read them once)
questions = pd.read_csv('data/questions_140.csv')[['id', 'question', 'category', 'answer_format', 'n_matches', 'entity_match']]

# select questions that requires entity/landmark look up
questions['requires_landmark_look_up'] = (questions['n_matches'] > 0) & (questions['entity_match'] == 0)
questions['requires_entity_look_up'] = questions['entity_match'] == 1

for eval_type, seeds in SEEDS_BY_EVAL_TYPE.items():
    # Attach the three runs' extracted outputs; questions without a usable
    # row in a run keep missing values in that run's columns.
    results = questions.copy()
    for run, seed in enumerate(seeds, start=1):
        results = pd.merge(results, _load_seed_output(eval_type, seed, run), on='id', how='left')

    # Building blocks: which runs produced an answer, and which pairs of runs agree.
    has_1, has_2, has_3 = (results[f'code_out_{run}'].notna() for run in (1, 2, 3))
    code_12 = results['code_out_1'] == results['code_out_2']
    code_13 = results['code_out_1'] == results['code_out_3']
    code_23 = results['code_out_2'] == results['code_out_3']
    info_12 = results['info_out_1'] == results['info_out_2']
    info_13 = results['info_out_1'] == results['info_out_3']
    info_23 = results['info_out_2'] == results['info_out_3']

    # success rate
    success_1 = has_1 | has_2 | has_3
    success_3 = has_1 & has_2 & has_3

    # consistency score
    consistency_2 = success_1 & (code_12 | code_13 | code_23)
    consistency_3 = success_3 & code_12 & code_13

    # information gain
    # 2_2: some pair of runs agrees on both the answer and the info value.
    info_gain_2_2 = consistency_2 & ((info_12 & code_12) | (info_23 & code_23) | (info_13 & code_13))
    info_gain_3_2 = consistency_3 & (info_12 | info_23 | info_13)
    info_gain_3_3 = consistency_3 & info_12 & info_23

    # information gain = 0 (all three runs agree and the agreed value is 0)
    info_gain_3_3_zero = info_gain_3_3 & (results['info_out_1'] == 0)

    # Store the flags as 0/1 integers so that a group mean is a rate.
    flags = {
        'success_1': success_1,
        'success_3': success_3,
        'consistency_2': consistency_2,
        'consistency_3': consistency_3,
        'info_gain_2_2': info_gain_2_2,
        'info_gain_3_2': info_gain_3_2,
        'info_gain_3_3': info_gain_3_3,
        'info_gain_3_3_zero': info_gain_3_3_zero,
    }
    for flag_name, flag in flags.items():
        results[flag_name] = flag.astype(int)

    # compute average metrics for each breakdown
    for group_col, summaries in (
        ('category', metrics_cat_ls),
        ('requires_entity_look_up', metrics_entity_look_ls),
        ('requires_landmark_look_up', metrics_landmark_look_ls),
        ('answer_format', metrics_ans_form_ls),
    ):
        summary = results.groupby(group_col, as_index=False)[metrics].mean().rename(metrics_rename, axis=1)
        summary['eval_type'] = eval_type
        summaries.append(summary)

In [27]:
# The summary frames were renamed to the display labels, so select columns by label.
metrics = [*metrics_rename.values()]

# Stack the per-eval-type summaries of each breakdown into a single frame.
metrics_cat_all = pd.concat(metrics_cat_ls)
metrics_entity_look_all = pd.concat(metrics_entity_look_ls)
metrics_landmark_look_all = pd.concat(metrics_landmark_look_ls)
metrics_ans_form_all = pd.concat(metrics_ans_form_ls)

# Write one workbook per breakdown, indexed by (group value, eval_type).
for combined, group_col, workbook in (
    (metrics_cat_all, 'category', 'metrics_cat.xlsx'),
    (metrics_entity_look_all, 'requires_entity_look_up', 'metrics_entity_lookup.xlsx'),
    (metrics_landmark_look_all, 'requires_landmark_look_up', 'metrics_landmark_lookup.xlsx'),
    (metrics_ans_form_all, 'answer_format', 'metrics_ans_format.xlsx'),
):
    indexed = combined.groupby([group_col, 'eval_type'])[metrics].first()
    indexed.to_excel(workbook)

In [26]:
# Display the combined metrics of all evaluation types, split by `requires_entity_look_up`.
metrics_entity_look_all

Unnamed: 0,requires_entity_look_up,Success Rate (at least 1 seed),Success Rate (all 3 seeds),Consistency (at least 2 seeds),Consistency (all 3 seeds),Info Gain (Consistency 3 + Info Gain 3),Info Gain (Consistency 3 + Info Gain 2),Info Gain (Consistency 2 + Info Gain 2),Info Gain (Consistency 3 + Info Gain 3) => 0,eval_type
0,False,1.0,0.97561,0.878049,0.536585,0.329268,0.512195,0.731707,0.0,openai
1,True,1.0,0.931034,0.862069,0.551724,0.37931,0.551724,0.844828,0.068966,openai
0,False,0.987805,0.560976,0.52439,0.170732,0.097561,0.121951,0.243902,0.012195,full_hf_140
1,True,1.0,0.655172,0.586207,0.258621,0.137931,0.224138,0.37931,0.051724,full_hf_140
0,False,0.914634,0.365854,0.414634,0.085366,0.02439,0.073171,0.268293,0.0,no_entity
1,True,0.844828,0.431034,0.5,0.293103,0.172414,0.275862,0.448276,0.172414,no_entity
0,False,0.97561,0.292683,0.365854,0.073171,0.012195,0.060976,0.219512,0.0,no_reference
1,True,1.0,0.344828,0.586207,0.206897,0.172414,0.206897,0.327586,0.086207,no_reference
