In [76]:
import pandas as pd
import os
import numpy as np
import ast 

np.random.seed(0)

# Begin Processing Auto-Eval Files

In [106]:
# This notebook does several things, including:
    # 1. Removes precision and T2V annotations for instances where coverage was set to -1 due to display issues
    # 2. Joins in the utility and fluency results from the re-done human evaluation
    # 3. Ensures each dataset is represented by 120 query-generation pairs per OP instantiation
    # 4. Joins in Vertex API results about which sentences require citation (resolves discrepancies in sentence parsing)
    # 5. Creates a copy of results with data only for sentences requiring citation

# When True, the loading cell reads the baseline result files (see baseline_fps) instead of the
# per-model OP result files (see op_fps), and the output file name says "baselines" instead of "ops".
baselines = False
# Model whose auto-eval results are processed; it is part of both the input and the output file names.
model_type = 'sonnet4.5'  # 'gpt4', 'gpt5', 'sonnet4.5'


# Dataset keys iterated by every cell below; they are also the keys of df_dict and of the corrections dicts.
data_str_ls = ['nq', 'multihop', 'mash', 'eli3g']

def op_fps(data_str, model_type):
    """Return the extension-less path of the OP auto-eval results for one dataset and model.

    Args:
        data_str: Dataset key; one of 'nq', 'multihop', 'mash', 'eli3g'.
        model_type: Model name appended to the file stem.

    Raises:
        KeyError: If ``data_str`` is not a known dataset key.
    """
    file_tags = dict(nq='nq', multihop='mh', mash='mash', eli3g='eli3')
    dataset_tag = file_tags[data_str]
    return '../../attrib/autoEval_results/RESOLVED_with_needs_citation_labels_{}_{}'.format(
        dataset_tag, model_type
    )
    
def baseline_fps(data_str):
    """Return the extension-less path of the baseline auto-eval results for one dataset.

    The file stem always ends in ``_gpt4``, regardless of the notebook's ``model_type``.

    Args:
        data_str: Dataset key; one of 'nq', 'multihop', 'mash', 'eli3g'.

    Raises:
        KeyError: If ``data_str`` is not a known dataset key.
    """
    file_tags = dict(nq='nq', multihop='mh', mash='mash', eli3g='eli3')
    dataset_tag = file_tags[data_str]
    return '../../attrib/autoEval_results/RESOLVED_with_needs_citation_labels_baseline_{}_gpt4'.format(
        dataset_tag
    )


# Remove irrelevant T2V and precision annotations

In [107]:
def remove_irrelevant_t2v_and_precision_annotations(df):
    """Drop precision and T2V measurements recorded for sentences that were never displayed.

    Occasionally, the annotation interface fails to display a cited sentence (coverage = -1).
    In these cases, precision and T2V were still collected. For every non-'Snippet' row whose
    coverage, precision and T2V lists disagree, this function:
      * replaces the precision entry of each undisplayed sentence with an empty annotation list, and
      * removes the T2V values that belong to undisplayed sentences.

    Rows whose T2V list is longer than the coverage list cannot be aligned; they are reported
    and left unchanged.

    Args:
        df: Results frame with string-encoded list columns 'is_covered', 'precise_citations'
            and 't2v_coverage', plus 'op' and 'query_id'. It is modified in place.

    Returns:
        ``df.reset_index()`` of the corrected frame (the previous index becomes a column).
    """
    # Positional column locations, so that every write targets the same row that iloc[i] read.
    precise_col = df.columns.get_loc('precise_citations')
    t2v_col = df.columns.get_loc('t2v_coverage')

    for i in range(len(df)):
        if df['op'].iloc[i] == 'Snippet':
            continue

        # NOTE(review): eval() executes whatever text is stored in the CSV cell. That is only
        # acceptable for trusted, self-generated result files.
        t2vs = eval(df['t2v_coverage'].iloc[i])
        is_covered = eval(df['is_covered'].iloc[i])
        is_precise = eval(df['precise_citations'].iloc[i])

        # Per sentence: was it displayed, and does it carry any precision annotation?
        displayed = [item['coverage'] != -1 for item in is_covered]
        annotated = [len(is_precise[j]['annotations']) != 0 for j in range(len(is_covered))]

        n_displayed = sum(displayed)
        if n_displayed == len(t2vs) and n_displayed == sum(annotated):
            continue  # coverage, precision and T2V already agree

        # Blank out the precision annotations of sentences that were not displayed.
        new_is_precise = [
            is_precise[j] if displayed[j]
            else {"annotations": [], "sentence_id": is_covered[j]["sentence_id"]}
            for j in range(len(is_covered))
        ]

        if len(is_covered) == len(t2vs):
            # All of the sentences have a citation, but some weren't displayed properly.
            new_t2vs = [t2vs[j] for j in range(len(is_covered)) if displayed[j]]
        elif len(is_covered) > len(t2vs):
            # Some of the sentences had no citations and some weren't displayed properly.
            new_t2vs = []
            k = 0  # index into t2vs
            for j in range(len(is_covered)):
                if displayed[j] or annotated[j]:  # a T2V was recorded for this sentence
                    if displayed[j]:
                        new_t2vs.append(t2vs[k])  # keep the corresponding T2V
                    k += 1
        else:
            # More T2V values than sentences: there is no way to align them. Previously this
            # branch fell through and wrote an undefined (or a previous row's) T2V list.
            print('!!!!!! not handled', (df['query_id'].iloc[i], df['op'].iloc[i]))
            continue

        # Single positional assignment; chained `df[col].iloc[i] = ...` may write to a copy.
        df.iloc[i, precise_col] = str(new_is_precise)
        df.iloc[i, t2v_col] = str(new_t2vs)

    df = df.reset_index()
    return df

In [108]:
def check_annotations(df):
    """Print diagnostics for rows whose T2V list does not match the displayed coverage items.

    'Snippet' rows are ignored. For every other row, the coverage items with
    ``coverage != -1`` are counted and compared with the number of T2V values; on a
    mismatch the query id and the lists involved are printed. The precision items with a
    non-empty annotation list are collected as well, but they are not compared.

    Args:
        df: Results frame with string-encoded 't2v_coverage', 'is_covered' and
            'precise_citations' columns, plus 'op' and 'query_id'.

    Returns:
        None. Output is printed only.
    """
    non_snippet = df[df['op'] != 'Snippet']

    for row_pos in range(len(non_snippet)):
        t2vs = eval(non_snippet['t2v_coverage'].iloc[row_pos])
        is_covered = eval(non_snippet['is_covered'].iloc[row_pos])
        is_precise = eval(non_snippet['precise_citations'].iloc[row_pos])

        displayed_items = []
        annotated_items = []  # gathered alongside the coverage items; not compared below
        for sent_pos, coverage_entry in enumerate(is_covered):
            if coverage_entry['coverage'] != -1:
                displayed_items.append(coverage_entry)
            precision_entry = is_precise[sent_pos]
            if len(precision_entry['annotations']) != 0:
                annotated_items.append(precision_entry)

        if len(displayed_items) == len(t2vs):
            continue

        print(non_snippet['query_id'].iloc[row_pos])
        print('is_precise', is_precise)
        print('actual_is_covered len', len(displayed_items))
        print('actual_is_covered', displayed_items)
        print('is_covered len', len(is_covered))
        print('t2vs', t2vs)
        print('t2vs len', len(t2vs))

In [109]:
# Load one results frame per dataset and give the auto-eval columns the column names
# that the rest of this notebook works with.
auto_to_shared_columns = {
    'auto_fluency_rating': 'human_fluency_rating',
    'auto_utility_rating': 'human_utility_rating',
    'auto_precise_citations': 'precise_citations',
    'auto_is_covered': 'is_covered',
    'auto_t2v_coverage': 't2v_coverage',
    'auto_t2v_precision': 't2v_precision',
}

df_dict = {}

for data_str in data_str_ls:
    # Baseline files and OP files live under different file stems.
    results_stem = baseline_fps(data_str) if baselines else op_fps(data_str, model_type)
    loaded_df = pd.read_csv(results_stem + '.csv', index_col=False)
    df_dict[data_str] = loaded_df.rename(columns=auto_to_shared_columns)

for data_str in data_str_ls:
    # df_dict[data_str] = remove_irrelevant_t2v_and_precision_annotations(df_dict[data_str])
    check_annotations(df_dict[data_str])

print('Checked that annotations are consistent across precision, T2V, and coverage.')

Checked that annotations are consistent across precision, T2V, and coverage.


In [110]:
def identify_mismatches(df):
    """Print every non-'Snippet' row whose sentence count differs from the Vertex label count.

    The Vertex API sometimes parses sentences differently than our implementation; these
    conflicts are resolved by hand. For each conflicting row a short report is printed,
    and finally the list of conflicting row positions is printed.

    Args:
        df: Results frame with string-encoded 'Sent', 'Sentences Need Citation' and
            't2v_coverage' columns, plus 'op' and 'query_id'.

    Returns:
        None. Output is printed only.
    """
    flagged_positions = []
    for row_pos in range(len(df)):
        op_name = df['op'].iloc[row_pos]
        if op_name == 'Snippet':
            continue

        t2v_count = len(eval(df['t2v_coverage'].iloc[row_pos]))
        sentences_text = df['Sent'].iloc[row_pos]
        own_count = len(eval(sentences_text))
        vertex_labels = eval(df['Sentences Need Citation'].iloc[row_pos])
        vertex_count = len(vertex_labels)

        if own_count == vertex_count:
            continue

        print('MISMATCH')
        print(row_pos)
        print("('" + op_name + "', '" + str(df['query_id'].iloc[row_pos]) + "')")
        print('curr sentence count:', own_count)
        print('coverage annotation count:', t2v_count)
        print('sentences:', sentences_text)
        print('vertex sentence count:', vertex_count)
        print('curr vertex label:', vertex_labels)
        print()
        flagged_positions.append(row_pos)
    print(flagged_positions)

In [111]:
# TEST
# List every non-Snippet row whose coverage list and "needs citation" list differ in length.
for data_str in data_str_ls:
    df = df_dict[data_str]

    for i in range(len(df)):
        if df['op'].iloc[i] == 'Snippet':
            continue  # Snippet rows are skipped

        sentences_need_citation = eval(df['Sentences Need Citation'].iloc[i])
        coverage_count = len(eval(df['is_covered'].iloc[i]))
        if coverage_count == len(sentences_need_citation):
            continue

        query_id = df['query_id'].iloc[i]
        op_name = df['op'].iloc[i]
        print(f"COVERAGE data: {data_str}, id: {i}, query_id: {query_id}, op: {op_name} | {coverage_count} | {len(sentences_need_citation)}")

# Ensure there are 120 queries per method
First, check how many query-generation pairs each method has available to keep.

In [117]:
def check_trimmed_annotations_soft(df, n, strict=True):
    """Report every method ('op') that has fewer than ``n`` rows.

    One line is printed per method that falls short, stating how many more rows it needs.
    Previously an ``assert`` fired before that line could be printed, so the report was
    unreachable; the report is now printed first.

    Args:
        df: Results frame with 'op' and 'query_id' columns.
        n: Minimum number of rows required per method.
        strict: When True (default, the original behavior), raise ``AssertionError`` after
            reporting if any method falls short. When False, only report.

    Raises:
        AssertionError: If ``strict`` is True and at least one method has fewer than ``n`` rows.
    """
    query_counts_by_method = df.groupby('op')['query_id'].count()
    shortfalls = query_counts_by_method[query_counts_by_method < n]

    for method, count in shortfalls.items():
        print(f'\tNeed more for {method}: {n - count}')

    if strict:
        assert shortfalls.empty, (
            f'{len(shortfalls)} method(s) have fewer than {n} queries: {list(shortfalls.index)}'
        )

In [118]:
# For each dataset, check that every method has at least 150 rows available.
for data_str in data_str_ls:
    df = df_dict[data_str]
    check_trimmed_annotations_soft(df, 150)

In [119]:
def check_trimmed_annotations(df, n):
    """Assert that every method ('op') has exactly ``n`` rows.

    Args:
        df: Results frame with 'op' and 'query_id' columns.
        n: Required number of rows per method.

    Raises:
        AssertionError: If any method's row count differs from ``n``.
    """
    query_counts_by_method = df.groupby('op')['query_id'].count()
    # The counts are indexed by method name, so they are compared as a whole and not
    # looked up with integer keys (integer keys on a label index are not positional).
    off_target = query_counts_by_method[query_counts_by_method != n]
    assert off_target.empty, f'Methods without exactly {n} queries: {off_target.to_dict()}'

In [120]:
def trim_annotations(df, n):
    """Keep at most ``n`` rows per method ('op'), choosing the smallest 'query_id' values.

    Methods are processed in sorted order and their kept rows are stacked in that order.

    Args:
        df: Results frame with 'op' and 'query_id' columns.
        n: Maximum number of rows to keep per method.

    Returns:
        A new frame with a fresh 0..k-1 index and an extra leading 'index' column
        produced by the final ``reset_index()``.
    """
    kept_per_op = [
        df[df['op'] == op_name].sort_values(by='query_id').iloc[:n]
        for op_name in np.unique(df['op'])
    ]
    stacked = pd.concat(kept_per_op, ignore_index=True)
    return stacked.reset_index()

In [121]:
# Trim every dataset to at most n rows per method and report how many rows were dropped.
n = 150
num_trimmmed = 0
for data_str in data_str_ls:
    df = df_dict[data_str]
    trimmed_df = trim_annotations(df, n)
    num_trimmmed += len(df) - len(trimmed_df)
    # Store the TRIMMED frame. Previously the untrimmed `df` was written back, so the count
    # below was reported but no rows were actually removed.
    # trim_annotations adds a helper 'index' column via reset_index(); drop it so that the
    # reset_index() calls in the later processing steps do not collide with it.
    # NOTE(review): the corrections dicts are keyed by row position, so any hand-entered
    # positions must refer to the trimmed (op- and query_id-sorted) frames.
    df_dict[data_str] = trimmed_df.drop(columns=['index'], errors='ignore')
print('Number of points removed:', num_trimmmed)

Number of points removed: 190


# Create the results files accounting for "needs citation"

In [123]:
# Show, per dataset, the rows whose sentence count disagrees with the Vertex label count.
# Any row listed here needs its "needs citation" labels determined by hand.
for data_str in data_str_ls:
    print(f'Showing {data_str} Baseline={baselines}')
    df = df_dict[data_str]

    print(data_str)

    identify_mismatches(df)
    # Determine the "needs citation" labels by hand for this case

Showing nq Baseline=False
nq
[]
Showing multihop Baseline=False
multihop
[]
Showing mash Baseline=False
mash
[]
Showing eli3g Baseline=False
eli3g
[]


In [124]:
# Hand-made "Sentences Need Citation" labels for rows where the sentence counts disagree.
# Layout: dataset key -> {row position in that dataset's frame: list of booleans, one per sentence}.
baseline_corrections = {
    'mash': {},
    'eli3g': {70: [True, True]},  # ('Quoted', '504')
    'nq': {},
    'multihop': {},
}

op_corrections = {
    'mash': {
        # 88: [False, True, True, True, True, True, True, True, True]
    },
    'eli3g': {
        # 70: [True]*2, # ('Quoted', '504')
    },
    'nq': {
        # 43: [False, False, False, True, False, True]
    },
    'multihop': {},
}


In [125]:
def fix_mismatches(df, corrections_dict):
    """Make each row's "Sentences Need Citation" list as long as its sentence list.

    For every non-'Snippet' row whose counts differ:
      * if Vertex produced more labels than there are sentences and all of them are True,
        the labels become ``[True] * sentence_count``;
      * otherwise the labels are taken from ``corrections_dict``; if the row has no entry
        there, its position is printed and the row is left unchanged.

    Rows are addressed by position for both reading and writing, so the function also
    works when the frame's index is not 0..n-1.

    Args:
        df: Results frame with string-encoded 'Sent' and 'Sentences Need Citation'
            columns, plus 'op'. It is modified in place.
        corrections_dict: Maps row position -> list of booleans to use as labels.

    Returns:
        ``df.reset_index()`` of the corrected frame (the previous index becomes a column).

    Raises:
        AssertionError: If any non-'Snippet' row still has mismatched counts afterwards.
    """
    labels_col = df.columns.get_loc('Sentences Need Citation')

    for i in range(len(df)):
        if df['op'].iloc[i] == 'Snippet':
            continue
        # NOTE(review): eval() executes the cell text; only use on trusted result files.
        gpt4_sentence_count = len(eval(df['Sent'].iloc[i]))
        vertex_labels = eval(df['Sentences Need Citation'].iloc[i])
        vertex_sentence_count = len(vertex_labels)
        if gpt4_sentence_count == vertex_sentence_count:
            continue

        if all(vertex_labels) and vertex_sentence_count > gpt4_sentence_count:
            # Every Vertex sentence needs citation, so every one of ours does too.
            df.iloc[i, labels_col] = str([True] * gpt4_sentence_count)
        elif i in corrections_dict:
            df.iloc[i, labels_col] = str(corrections_dict[i])
        else:
            print(i)  # no hand-made correction available; caught by the check below

    # Verify that no mismatch is left.
    for i in range(len(df)):
        if df['op'].iloc[i] == 'Snippet':
            continue
        gpt4_sentence_count = len(eval(df['Sent'].iloc[i]))
        vertex_sentence_count = len(eval(df['Sentences Need Citation'].iloc[i]))
        assert gpt4_sentence_count == vertex_sentence_count, (
            f'row {i}: {gpt4_sentence_count} sentences but {vertex_sentence_count} labels'
        )

    df = df.reset_index()

    return df

In [126]:
# Fix the mismatches between vertex and the current sentence count,
# using the hand-made "needs citation" labels that match the current run (baselines or OPs).
corrections_by_dataset = baseline_corrections if baselines else op_corrections

for data_str in data_str_ls:
    df = fix_mismatches(df_dict[data_str], corrections_by_dataset[data_str])
    df_dict[data_str] = df

print('Fixed mismatches between the Vertex API and the annotated sentence count')

Fixed mismatches between the Vertex API and the annotated sentence count


In [127]:
def make_only_needs_citation(df):
    """Keep coverage, precision, T2V and citation data only for sentences that need citation.

    For every non-'Snippet' row, the "Sentences Need Citation" labels (indexed by sentence id)
    are used to filter:
      * 'is_covered' and 'precise_citations' items, by their 'sentence_id';
      * 't2v_coverage' values, which are aligned with the coverage items whose
        ``coverage != -1``;
      * 'Citation Dict' entries, by their key converted to int.

    Diagnostics are printed when the label and coverage lists differ in length, or when the
    number of kept coverage/precision items differs from the number of sentences needing
    citation.

    Rows are addressed by position for both reading and writing, so the function also
    works when the frame's index is not 0..n-1.

    Args:
        df: Results frame with the string-encoded columns named above, plus 'op' and
            'query_id'. It is modified in place.

    Returns:
        ``df.reset_index()`` of the filtered frame (the previous index becomes a column).
    """
    covered_col = df.columns.get_loc('is_covered')
    precise_col = df.columns.get_loc('precise_citations')
    t2v_col = df.columns.get_loc('t2v_coverage')
    citation_dict_col = df.columns.get_loc('Citation Dict')

    for i in range(len(df)):
        if df['op'].iloc[i] == 'Snippet':
            continue

        # NOTE(review): eval() executes the cell text; only use on trusted result files.
        sentences_need_citation = eval(df['Sentences Need Citation'].iloc[i])
        is_covered = eval(df['is_covered'].iloc[i])
        n_needed = np.sum(sentences_need_citation)

        if len(sentences_need_citation) != len(is_covered):
            print(sentences_need_citation)
            print(is_covered)
            print('OP', df['op'].iloc[i])
            print('ID', df['query_id'].iloc[i])
            print('id', i)
            print()

        # First the coverage.
        new_is_covered = [
            item for item in is_covered
            if sentences_need_citation[int(item['sentence_id'])]
        ]
        df.iloc[i, covered_col] = str(new_is_covered)
        if len(new_is_covered) != n_needed:
            print(i)

        # Now the precision.
        is_precise = eval(df['precise_citations'].iloc[i])
        new_is_precise = [
            item for item in is_precise
            if sentences_need_citation[int(item['sentence_id'])]
        ]
        df.iloc[i, precise_col] = str(new_is_precise)
        if len(new_is_precise) != n_needed:
            print(i)

        # Now T2V: keep the values that correspond to coverage items that
        # a) exist (coverage != -1, taken from the unfiltered list) and b) need citation.
        t2vs = eval(df['t2v_coverage'].iloc[i])
        actual_coverage_items = [item for item in is_covered if item['coverage'] != -1]
        new_t2vs = [
            t2vs[j] for j, item in enumerate(actual_coverage_items)
            if sentences_need_citation[int(item['sentence_id'])]
        ]
        df.iloc[i, t2v_col] = str(new_t2vs)

        # Finally the citations dict, whose keys are sentence ids.
        citations_dict = eval(df['Citation Dict'].iloc[i])
        actual_citations_dict = {
            k: v for k, v in citations_dict.items()
            if sentences_need_citation[int(k)]
        }
        df.iloc[i, citation_dict_col] = str(actual_citations_dict)

    df = df.reset_index()

    return df

In [128]:
def check_needs_citation(df):
    """Assert that each non-'Snippet' row holds data only for sentences needing citation.

    For every such row, the number of True "Sentences Need Citation" labels must equal the
    number of coverage items and the number of precision items, and there must be no more
    T2V values than coverage items.

    Args:
        df: Results frame with string-encoded 'Sentences Need Citation', 'is_covered',
            'precise_citations' and 't2v_coverage' columns, plus 'op'.

    Raises:
        AssertionError: If any row violates one of these conditions.
    """
    for row_pos in range(len(df)):
        if df['op'].iloc[row_pos] == 'Snippet':
            continue

        required_count = np.sum(eval(df['Sentences Need Citation'].iloc[row_pos]))
        coverage_items = eval(df['is_covered'].iloc[row_pos])
        precision_items = eval(df['precise_citations'].iloc[row_pos])

        assert required_count == len(coverage_items)
        assert required_count == len(precision_items)

        t2v_values = eval(df['t2v_coverage'].iloc[row_pos])
        assert len(t2v_values) <= len(coverage_items)

In [129]:
for data_str in data_str_ls:
    source_df = df_dict[data_str]
    print(data_str)

    # Filter the coverage, precision, and T2V annotations by the "needs citation" labels,
    # then verify that only the relevant sentences were kept before storing the result.
    df = make_only_needs_citation(source_df)
    check_needs_citation(df)

    df_dict[data_str] = df
print('Discard coverage, precision, and T2V annotations for sentences that do not require citation')

nq
multihop
mash
eli3g
Discard coverage, precision, and T2V annotations for sentences that do not require citation


# Save op files in the right folder with consistent naming

In [133]:
# Output file prefixes, paired by position with data_str_ls.
# NOTE(review): 'eta3g' is paired with the 'eli3g' dataset; confirm this is the name the
# plotting code expects and not a typo.
data_name_ls = ['nq', 'mh', 'mash', 'eta3g']
columns_to_remove = ['n-gram precision', 'Citation Count', 'n sentences', 'Fluency Rating', 'Perceived Utility Rating']

# Only the middle of the file name depends on whether baselines or OPs are processed.
if (baselines):
    name_suffix = '_auto_eval_byQueryOP_baselines_needs_citation_'
else:
    name_suffix = '_auto_eval_byQueryOP_ops_needs_citation_'

for data_str, data_name in zip(data_str_ls, data_name_ls):
    op_df = df_dict[data_str].drop(columns=columns_to_remove)
    is_unnamed = op_df.columns.str.startswith("Unnamed")
    op_df = op_df.loc[:, ~is_unnamed]
    check_needs_citation(op_df) # check again that only sentences requiring citation are kept
    save_path = '../mturk_results/processed_results/' + data_name + name_suffix + model_type + '.csv' # this is a file used in plotting_by_metric
    op_df.to_csv(save_path)