In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random

In [2]:
input_tag = '_cleaned_trimmed_needs_citation_only_NEW.csv' # Only has precision, coverage, and T2V for sentences that need citation (Vertex)
results_tag = 'sentences_that_need_citation_only'
datasets = ['NQ', 'Eta3G', 'MH', 'MASH']
ops = ['Snippet', 'Quoted', 'Paraphrased', 'Entailed', 'Abstractive', 'Post Hoc', 'Gemini']
op_names = ['Reference\nExtractive', 'Reference\nQuoted', 'Reference\nParaphrased', 'Reference\nEntailed', 'Reference\nAbstractive', 'GPT-4 + Vertex\nAbstractive', 'Gemini']
all_ops = ['Snippet', 'Quoted', 'Quoted Reeval', 'Paraphrased', 'Entailed', 'Abstractive', 'Gemini', 'Post Hoc']
op_fps = {'NQ': '../mturk_results/nq_mturk_with_needs_citation_labels2',
       'MH': '../mturk_results/mh_mturk_with_needs_citation_labels',
       'MASH': '../mturk_results/mash_mturk_with_needs_citation_labels',
       'Eta3G': '../mturk_results/eli3_mturk_with_needs_citation_labels',
      }
baseline_fps = {'NQ': '../mturk_results/nq_baseline_mturk_with_needs_citation_labels',
       'MH': '../mturk_results/mh_baseline_mturk_with_needs_citation_labels',
       'MASH': '../mturk_results/mash_baseline_mturk_with_needs_citation_labels',
       'Eta3G': '../mturk_results/eli3_baseline_mturk_with_needs_citation_labels',
      }
all_results_dict = {}
for k in op_fps.keys():
    op_df = pd.read_csv(op_fps[k]+input_tag, index_col=False)
    baseline_df = pd.read_csv(baseline_fps[k]+input_tag, index_col=False)
    quoted_baseline_df = pd.read_csv(baseline_fps[k]+'.csv', index_col=False)
    quoted_baseline_df = quoted_baseline_df[quoted_baseline_df['op']=='Quoted']
    baseline_df = pd.concat([baseline_df, quoted_baseline_df])
    # rename the reeval quoted op for the outputs
    baseline_df_outputs_no_quoted = baseline_df[baseline_df['op']!='Quoted']
    baseline_df_outputs_quoted = baseline_df[baseline_df['op']=='Quoted']
    baseline_df_outputs_quoted.loc[:,'op'] = ['Quoted Reeval']*len(baseline_df_outputs_quoted)
    baseline_df = pd.concat([baseline_df_outputs_no_quoted, baseline_df_outputs_quoted])
    
    dataset_results = pd.concat([op_df, baseline_df])
    all_results_dict[k] = dataset_results
    
# Trim to 120 queries per OP per dataset
n_to_keep = 120
for dataset in datasets:
    new_dataset_results = pd.DataFrame()
    dataset_results = all_results_dict[dataset]
    for op in all_ops:
        dataset_op_results = dataset_results[dataset_results['op']==op]
        dataset_op_results = dataset_op_results.sort_values('query_id')
        if (op != 'Quoted Reeval'):
            dataset_op_results = dataset_op_results.iloc[:120]
        new_dataset_results = pd.concat([new_dataset_results, dataset_op_results], ignore_index=True)
    new_dataset_results['dataset'] = dataset
    all_results_dict[dataset] = new_dataset_results

for dataset in datasets:
    assert len(all_results_dict[dataset]) > n_to_keep*len(all_ops)
    
assert len(all_results_dict) == 4

all_results = pd.DataFrame()
for dataset in datasets:
    dataset_results = all_results_dict[dataset]
    all_results = pd.concat([dataset_results, all_results], ignore_index=True)

print(len(all_results))

3938


# Citation Coverage

In [8]:
for op in ['Quoted', 'Paraphrased', 'Entailed', 'Abstractive', 'Post Hoc', 'Gemini']:
    print(op)
    num_with_no_citations = 0
    num_with_at_least_one_precise_citation = 0
    num_with_all_imprecise_citations = 0
    num_coverage_errors = 0
    num_sentences = 0
    for k in datasets:
        # print(k)
        ds_results = all_results_dict[k]
        ds_results = ds_results[ds_results['op']==op]
        
        for i in range(len(ds_results)):
            # get the 
            if (ds_results['op'].iloc[i]=='Snippet'):
                continue
            coverage_results = eval(ds_results['is_covered'].iloc[i])
            precision_results = eval(ds_results['precise_citations'].iloc[i])
            for sentence_i in range(len(precision_results)):
                # assert coverage_results[sentence_i]['sentence_id'] == sentence_i
                num_sentences += 1
                coverage_value = coverage_results[sentence_i]['coverage']
                if (coverage_value == 1):
                    continue
                num_coverage_errors += 1
                precision_values = precision_results[sentence_i]['annotations']
                if (len(precision_values) == 0):
                    num_with_no_citations += 1
                elif (1 in precision_values):
                    num_with_at_least_one_precise_citation += 1
                else:
                    num_with_all_imprecise_citations += 1
                    
    print(num_coverage_errors, '/ '+str(num_sentences))
    results = np.array([num_with_no_citations, num_with_all_imprecise_citations, num_with_at_least_one_precise_citation])
    results = 100*results/num_coverage_errors
    print(np.round(results, decimals=1))
    print()

Quoted
134 / 1381
[67.9  6.  26.1]

Paraphrased
145 / 1288
[29.  12.4 58.6]

Entailed
115 / 776
[20. 13. 67.]

Abstractive
216 / 907
[42.1  6.  51.9]

Post Hoc
520 / 920
[67.7 11.7 20.6]

Gemini
1157 / 1361
[83.8  5.5 10.7]



# Improvement rates over OPs

### Fluency

In [9]:
# Fluency Rating: percentage of <3's in quoted that change to 3's in paraphrased (or entailed; just change comparison_op)
def get_quoted_improvement_rate(metric, comparison_op):
    denominator = 0
    numerator = 0
    for k in datasets:
        dataset_results = all_results_dict[k]
        quoted_1fluency_df = dataset_results[(dataset_results['op']=='Quoted')&(dataset_results[metric]<3)]
        qids_quoted_1fluency = quoted_1fluency_df['query_id']
        corresponding_pp_quoted_1fluency_df = dataset_results[(dataset_results['op']==comparison_op)&(dataset_results['query_id'].isin(qids_quoted_1fluency))]
        corresponding_3fluency_pp_quoted_1fluency_df = corresponding_pp_quoted_1fluency_df[corresponding_pp_quoted_1fluency_df[metric]==3]
        denominator += len(corresponding_pp_quoted_1fluency_df)
        numerator += len(corresponding_3fluency_pp_quoted_1fluency_df)
    print(numerator, denominator, numerator/denominator)
    return numerator/denominator

get_quoted_improvement_rate('human_fluency_rating', 'Paraphrased')
get_quoted_improvement_rate('human_fluency_rating', 'Entailed')

64 70 0.9142857142857143
67 68 0.9852941176470589


0.9852941176470589

### Utility

In [11]:
get_quoted_improvement_rate('human_utility_rating', 'Paraphrased')
get_quoted_improvement_rate('human_utility_rating', 'Entailed');

85 130 0.6538461538461539
120 134 0.8955223880597015


## Quoted Utility Failure Type Prevalences

We perform three batches of utility failure analysis

In [18]:
def print_all_instances(sampled_quoted_utility_failures):
    for i in range(len(sampled_quoted_utility_failures)):
        query = sampled_quoted_utility_failures['Question'].iloc[i]
        ds = sampled_quoted_utility_failures['dataset'].iloc[i]
        query_id = sampled_quoted_utility_failures['query_id'].iloc[i]
        output = sampled_quoted_utility_failures['Output (cited)'].iloc[i]
        sources = eval(sampled_quoted_utility_failures['All Sources'].iloc[i])
        print('(\"'+ds+'\", \"Quoted\", '+str(query_id)+')')
        print(query)
        print()
        print(output)
        for s in sources:
            print(s)
        print(query_id)
        print()
        print('--------------------------------------------------------------------------------------------------------')
        print()

### First batch of utility analysis

In [19]:
random.seed(10)
np.random.seed(10)
# First, sample 10 low utility quote responses from each dataset
sampled_quoted_utility_failures = pd.DataFrame()
for k in datasets:
    dataset_results = all_results_dict[k]
    low_utility_quoted_dataset_results = dataset_results[(dataset_results['op']=='Quoted')&(dataset_results['human_utility_rating']<3)]
    sample_of_10 = low_utility_quoted_dataset_results.sample(10)
    sampled_quoted_utility_failures = pd.concat([sampled_quoted_utility_failures, sample_of_10])
assert len(sampled_quoted_utility_failures) == 40
# Then, read each query and output. Take note of the failure categories each one falls into.

print_all_instances(sampled_quoted_utility_failures)

("NQ", "Quoted", 351)
what kind of guns are allowed in switzerland

[92m"Firearms regulation in Switzerland allows the acquisition of semi-automatic, and – with a may-issue permit – fully automatic firearms, by Swiss citizens and foreigners with or without permanent residence" [1][0m. [92m"Bolt-action rifles do not require an acquisition permit, and can be acquired with just a background check" [2][0m. [92m"The acquisition of fully automatic weapons, suppressors and target lasers requires special permits issued by the cantonal firearms office" [3][0m.
Firearms regulation in Switzerland allows the acquisition of semi-automatic, and – with a may-issue permit – fully automatic firearms, by Swiss citizens and foreigners with or without permanent residence.[note 1] The laws pertaining to the acquisition of firearms in Switzerland are amongst the most liberal in the world. Swiss gun laws are primarily about the acquisition of arms, and not ownership. As such a license is not required t

### Second half of utility analysis

In [20]:
random.seed(10)
np.random.seed(10)

new_sampled_quoted_utility_failures = pd.DataFrame()
for ds in datasets:
    # get rows for this dataset
    ds_df = all_results_dict[ds]
    ds_sampled_quoted_utility_failures = sampled_quoted_utility_failures[sampled_quoted_utility_failures['dataset']==ds]
    # ds_df = ds_df.drop_duplicates()
    duplicate_idx = ds_df.duplicated(subset=['op', 'query_id'], keep='first')
    print('Removing n duplicates:', np.sum(duplicate_idx))
    ds_df = ds_df[~duplicate_idx]
    
    # create unique row identifier from the op and the query_id
    ds_df['op_query_id'] = ds_df['op']+ds_df['query_id'].astype(str)
    print(len(ds_df['op_query_id']))
    ds_sampled_quoted_utility_failures['op_query_id'] = ds_sampled_quoted_utility_failures['op']+ds_sampled_quoted_utility_failures['query_id'].astype(str)
    print(len(ds_sampled_quoted_utility_failures['op_query_id']))
    
    # only keep rows that have not yet been sampled for this dataset
    result = ds_df.merge(ds_sampled_quoted_utility_failures, on=['op_query_id'], how='left', indicator=True)
    result = result[['_merge', 'op_query_id']].merge(ds_df, on=['op_query_id'], how='right')
    remainder_ds_df = result[result['_merge']=='left_only'].drop(columns=['_merge'])
    print(len(remainder_ds_df))

    # sample 10 more utility failures
    low_utility_quoted_dataset_results = remainder_ds_df[(remainder_ds_df['op']=='Quoted')&(remainder_ds_df['human_utility_rating']<3)]
    sample_of_10 = low_utility_quoted_dataset_results.sample(10)
    new_sampled_quoted_utility_failures = pd.concat([new_sampled_quoted_utility_failures, sample_of_10])
assert len(new_sampled_quoted_utility_failures) == 40

print_all_instances(new_sampled_quoted_utility_failures)

Removing n duplicates: 4
979
10
969
Removing n duplicates: 15
969
10
959
Removing n duplicates: 2
988
10
978
Removing n duplicates: 2
979
10
969
("NQ", "Quoted", 331)
what is the role of hcl in the stomach

[92m"The secretion is a complex and relatively energetically expensive process" [1][0m. [92m"Parietal cells contain an extensive secretory network (called canaliculi) from which the hydrochloric acid is secreted into the lumen of the stomach" [2][0m. [92m"The acidic gastric juice also contains digestive enzymes produced by other cells in the gastric glands – gastric chief cells" [3][0m. [96m"Acid in the stomach serves several purposes but is mostly associated with degrading proteins and polysaccharides so they can cross the intestinal epithelium" [4][0m. [96m"An increase in HCl and decreasing pH level also signal gastric motility to turn on to move the partially digested bolus of food along and help kill bacteria normally ingested with food" [5][0m.
The secretion is a comp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds_df['op_query_id'] = ds_df['op']+ds_df['query_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds_sampled_quoted_utility_failures['op_query_id'] = ds_sampled_quoted_utility_failures['op']+ds_sampled_quoted_utility_failures['query_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [48]:
# Utility failure types by example for the 2nd half of the analysis
n=40
too_verbose = [("NQ", "Quoted", 331), ("NQ", "Quoted", 285), ("NQ", "Quoted", 253), ("Eta3G", "Quoted", 515), ("Eta3G", "Quoted", 442), 
               ("Eta3G", "Quoted", 565), ("MH", "Quoted", 188), ("MH", "Quoted", 148), ("MH", "Quoted", 119), ("MH", "Quoted", 92),
              ("MH", "Quoted", 183), ("MASH", "Quoted", 107), ("MASH", "Quoted", 165), ("MASH", "Quoted", 155), ("MASH", "Quoted", 79),
               ("MASH", "Quoted", 191), ("MASH", "Quoted", 183), ("MASH", "Quoted", 123), ("MH", "Quoted", 73),
               ("MH", "Quoted", 81), ("MASH", "Quoted", 175)
              ]

premises_without_conclusion = [("NQ", "Quoted", 369), ("NQ", "Quoted", 264), ("MH", "Quoted", 99), ("MH", "Quoted", 188), 
                               ("MH", "Quoted", 191), ("MH", "Quoted", 157), ("MASH", "Quoted", 150), ("MASH", "Quoted", 165)]

not_appropriate_style = [("NQ", "Quoted", 258), ("Eta3G", "Quoted", 515), ("Eta3G", "Quoted", 456), ("Eta3G", "Quoted", 460),
                        ("Eta3G", "Quoted", 572), ("Eta3G", "Quoted", 536), ("Eta3G", "Quoted", 512), ("Eta3G", "Quoted", 542)]

failed_info_retrieval = [("NQ", "Quoted", 245), ("NQ", "Quoted", 253), ("Eta3G", "Quoted", 442), ("MASH", "Quoted", 195),
                        ("Eta3G", "Quoted", 442)]

failed_in_context_answer = [("NQ", "Quoted", 288), ("NQ", "Quoted", 305), ("Eta3G", "Quoted", 572)]

In [49]:
def print_instance(curr_query_id, ds):
    mash_results = all_results_dict[ds]
    for i in range(len(mash_results)):
        op = mash_results['op'].iloc[i]
        if (op == 'Snippet'):
            continue
        query = mash_results['Question'].iloc[i]
        ds = mash_results['dataset'].iloc[i]
        query_id = mash_results['query_id'].iloc[i]
        if (query_id != curr_query_id):
            continue
        output = mash_results['Output (cited)'].iloc[i]
        # sources = eval(mash_results['All Sources'].iloc[i])
        utility_rating = mash_results['human_utility_rating'].iloc[i]
        # urls = eval(mash_results['All URLs'].iloc[i])
        print(query)
        print(utility_rating)
        print('(\"'+ds+'\", \"'+op+'\", '+str(query_id)+')')
        print()
        print(output)
        print()
        # for u, s in zip(urls, sources):
        #     print(u)
        #     print(s)
        #     print()
        print(query_id)
        print()
        print('--------------------------------------------------------------------------------------------------------')
        print()
print_instance(183, 'MASH')

How often should I check my blood sugar if I have diabetes?
1
("MASH", "Quoted", 183)

[92m"When you have diabetes, you may need to check your blood sugar throughout the day" [1][0m. [92m"Give yourself a blood sugar test as often as your doctor advises" [2][0m. [92m"To do it, you use a gadget that pricks your finger with a tiny needle" [3][0m. [92m"You'll put a drop of blood onto a test strip" [4][0m. [92m"The strip goes into a handheld device that measures your blood sugar level" [5][0m. [92m"Record the test results, so you can share it with your doctor" [6][0m. [92m"A1c Test: This is a blood test you'll get in your doctor's office at least twice a year, or as often as he recommends" [7][0m. [92m"The results show your average blood sugar control for the past 2 to 3 months" [8][0m. [92m"Continuous Glucose Monitoring System: If you choose this method, your doctor will place a tiny sensor under your skin to check blood sugar levels every 5 minutes" [9][0m.

183

-------

### Third and final batch of utility analysis

In [50]:
random.seed(10)
np.random.seed(10)

last_sampled_quoted_utility_failures = pd.DataFrame()
for ds in datasets:
    # get rows for this dataset
    ds_df = all_results_dict[ds]
    all_sampled_quoted_utility_failures = pd.concat([sampled_quoted_utility_failures, new_sampled_quoted_utility_failures])
    ds_sampled_quoted_utility_failures = all_sampled_quoted_utility_failures[all_sampled_quoted_utility_failures['dataset']==ds]
    duplicate_idx = ds_df.duplicated(subset=['op', 'query_id'], keep='first')
    ds_df = ds_df[~duplicate_idx]
    
    # create unique row identifier from the op and the query_id
    ds_df['op_query_id'] = ds_df['op']+ds_df['query_id'].astype(str)
    ds_sampled_quoted_utility_failures['op_query_id'] = ds_sampled_quoted_utility_failures['op']+ds_sampled_quoted_utility_failures['query_id'].astype(str)
    
    # only keep rows that have not yet been sampled for this dataset
    result = ds_df.merge(ds_sampled_quoted_utility_failures, on=['op_query_id'], how='left', indicator=True)
    result = result[['_merge', 'op_query_id']].merge(ds_df, on=['op_query_id'], how='right')
    remainder_ds_df = result[result['_merge']=='left_only'].drop(columns=['_merge'])

    # sample 10 more utility failures
    low_utility_quoted_dataset_results = remainder_ds_df[(remainder_ds_df['op']=='Quoted')&(remainder_ds_df['human_utility_rating']<3)]
    sample_of_10 = low_utility_quoted_dataset_results.sample(min(20, len(low_utility_quoted_dataset_results)))
    print(len(sample_of_10))
    last_sampled_quoted_utility_failures = pd.concat([last_sampled_quoted_utility_failures, sample_of_10])
    
print_all_instances(last_sampled_quoted_utility_failures)

20
19
20
5
("NQ", "Quoted", 283)
who did cora marry in once upon a time

Cora [96m"seizes her moment at a masquerade ball, slyly sidling up to the eligible Prince Henry and charming him into asking her to dance" [1][0m.
Cora and Regina arrive at the shop and overcome the protection spell. While David, Neal, and Emma stand against them, Mary Margaret sneaks away to Regina's mausoleum and uses the candle to curse Cora's heart. Regina follows after Cora senses that someone is there. Emma and Neal retreat to the back room, where she casts a new protection spell. Believing he will die, Gold asks to call Belle (Emilie de Ravin). Although Belle still doesn't remember Gold, she is moved when he tells her he loves her, and that she is a hero for loving a monster like him. He says that she inspires him to be his best self. Neal is surprised to hear such heartfelt words from his father, who then also apologizes to him. Neal affirms that he is still angry, but he tearfully embraces Gold. Regina 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds_df['op_query_id'] = ds_df['op']+ds_df['query_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds_sampled_quoted_utility_failures['op_query_id'] = ds_sampled_quoted_utility_failures['op']+ds_sampled_quoted_utility_failures['query_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [51]:
# Utility failure types by example for the second and third part of the utility failure analysis
# Results from the first part of the failure analysis were only tracked by count below

too_verbose = [("NQ", "Quoted", 397), ("NQ", "Quoted", 320), ("NQ", "Quoted", 348), ("NQ", "Quoted", 262), ("NQ", "Quoted", 345),
              ("NQ", "Quoted", 294), ("NQ", "Quoted", 362), ("NQ", "Quoted", 402), ("Eta3G", "Quoted", 471), ("Eta3G", "Quoted", 469),
              ("Eta3G", "Quoted", 465), ("Eta3G", "Quoted", 457), ("Eta3G", "Quoted", 524), ("Eta3G", "Quoted", 549), 
               ("Eta3G", "Quoted", 473), ("Eta3G", "Quoted", 463), ("Eta3G", "Quoted", 467), ("MH", "Quoted", 123),
              ("MH", "Quoted", 129), ("MH", "Quoted", 93), ("MH", "Quoted", 110), ("MH", "Quoted", 105), ("MH", "Quoted", 198),
              ("MH", "Quoted", 194), ("MH", "Quoted", 61), ("MH", "Quoted", 102), ("MH", "Quoted", 143), ("MH", "Quoted", 155),
              ("MH", "Quoted", 182), ("MH", "Quoted", 71), ("MH", "Quoted", 127), ("MH", "Quoted", 114), ("MH", "Quoted", 111),
              ("MH", "Quoted", 167), ("MH", "Quoted", 189), ("MASH", "Quoted", 144), ("MASH", "Quoted", 70), ("MASH", "Quoted", 92),
              ("MASH", "Quoted", 105), ("MASH", "Quoted", 84), ("NQ", "Quoted", 298), ("NQ", "Quoted", 275), ("NQ", "Quoted", 331), 
               ("NQ", "Quoted", 285), ("NQ", "Quoted", 253), ("Eta3G", "Quoted", 515), ("Eta3G", "Quoted", 442), 
               ("Eta3G", "Quoted", 565), ("MH", "Quoted", 188), ("MH", "Quoted", 148), ("MH", "Quoted", 119), ("MH", "Quoted", 92),
              ("MH", "Quoted", 183), ("MASH", "Quoted", 107), ("MASH", "Quoted", 165), ("MASH", "Quoted", 155), ("MASH", "Quoted", 79),
               ("MASH", "Quoted", 191), ("MASH", "Quoted", 183), ("MASH", "Quoted", 123), ("MH", "Quoted", 73),
               ("MH", "Quoted", 81), ("MASH", "Quoted", 175)]

premises_without_conclusion = [("NQ", "Quoted", 279), ("Eta3G", "Quoted", 487), ("Eta3G", "Quoted", 462), ("MH", "Quoted", 123),
                              ("MH", "Quoted", 129), ("MH", "Quoted", 93), ("MH", "Quoted", 102), ("MH", "Quoted", 71), 
                               ("MH", "Quoted", 111), ("MH", "Quoted", 167), ("NQ", "Quoted", 369), ("NQ", "Quoted", 264), 
                               ("MH", "Quoted", 99), ("MH", "Quoted", 188), ("MH", "Quoted", 191), ("MH", "Quoted", 157), 
                               ("MASH", "Quoted", 150), ("MASH", "Quoted", 165)]

not_appropriate_style = [("Eta3G", "Quoted", 484), ("Eta3G", "Quoted", 471), ("Eta3G", "Quoted", 469), ("Eta3G", "Quoted", 443),
                        ("Eta3G", "Quoted", 563), ("Eta3G", "Quoted", 457), ("Eta3G", "Quoted", 524), ("Eta3G", "Quoted", 468), 
                         ("Eta3G", "Quoted", 495), ("Eta3G", "Quoted", 473), ("Eta3G", "Quoted", 463), ("Eta3G", "Quoted", 574),
                        ("Eta3G", "Quoted", 467), ("Eta3G", "Quoted", 513), ("NQ", "Quoted", 258), ("Eta3G", "Quoted", 515), 
                         ("Eta3G", "Quoted", 456), ("Eta3G", "Quoted", 460), ("Eta3G", "Quoted", 572), ("Eta3G", "Quoted", 536), 
                         ("Eta3G", "Quoted", 512), ("Eta3G", "Quoted", 542)]

failed_info_retrieval = [("NQ", "Quoted", 283), ("NQ", "Quoted", 358), ("NQ", "Quoted", 254), ("NQ", "Quoted", 362), ("MH", "Quoted", 159),
                        ("NQ", "Quoted", 324), ("NQ", "Quoted", 245), ("NQ", "Quoted", 253), ("MASH", "Quoted", 195),
                        ("Eta3G", "Quoted", 442)]

failed_in_context_answer = [("NQ", "Quoted", 343), ("NQ", "Quoted", 311), ("Eta3G", "Quoted", 484), 
                            ("Eta3G", "Quoted", 539), ("Eta3G", "Quoted", 468), ("MH", "Quoted", 72), ("NQ", "Quoted", 290),
                           ("NQ", "Quoted", 242), ("NQ", "Quoted", 288), ("NQ", "Quoted", 305), ("Eta3G", "Quoted", 572)]

In [52]:
# Tally of the utility failure types by dataset (across the three parts of the utility failure analysis)
# ['NQ', 'Eta3G', 'MH', 'MASH']
n=40
too_verbose = [15, 17, 25, 20]
premises_without_conclusion = [4, 3, 21, 2]
not_appropriate_style = [1, 28, 0, 0]
failed_info_retrieval = [11,2,1,2]
failed_in_context_answer = [10,5,1,1]
all_records = [too_verbose, premises_without_conclusion, not_appropriate_style, failed_info_retrieval, failed_in_context_answer]
for i in range(4):
    total = 0
    for r in all_records:
        total += r[i]
    print(total)

41
55
48
25


In [53]:
def count_by_ds(failures_instances):
    num_nq = 0
    num_eta3g = 0
    num_mh = 0
    num_mash = 0
    for item in failures_instances:
        if (item[0]=='NQ'):
            num_nq += 1
        elif (item[0]=='Eta3G'):
            num_eta3g += 1
        elif (item[0]=='MH'):
            num_mh += 1
        else:
            num_mash += 1
    return [num_nq, num_eta3g, num_mh, num_mash]

In [54]:
# Get the number of instances that the utility failures are out of
num_failures_analysed = {'NQ':40, 'Eta3G':39, 'MH':40, 'MASH':25}
# First, get the rates by dataset
for ds in datasets:
    ds_df = all_results_dict[ds]
    failure_rate = len(ds_df[ds_df['human_utility_rating']<3])/len(ds_df)
    print('failure rate:', failure_rate)
    denom = int(num_failures_analysed[ds]/failure_rate)
    print('failures out of:', denom)
    print()

failure rate: 0.3051881993896236
failures out of: 131

failure rate: 0.2764227642276423
failures out of: 141

failure rate: 0.2606060606060606
failures out of: 153

failure rate: 0.1926605504587156
failures out of: 129



In [58]:
# Overall utility failure prevalences
n = 40+39+40+25
print('Proportion of utility failures that were too verbose:', sum(too_verbose)/n)
print('Proportion of utility failures that had premises without conclusions:', sum(premises_without_conclusion)/n)
print('Proportion of utility failures that did not have appropriate style:', sum(not_appropriate_style)/n)
print('Proportion of utility failures resulting from failure to interpret information in-context:', sum(failed_info_retrieval)/n)
print('Proportion of utility failures resulting from retrieval failures:', sum(failed_info_retrieval)/(n/2))

Proportion of utility failures that were too verbose: 0.5347222222222222
Proportion of utility failures that had premises without conclusions: 0.20833333333333334
Proportion of utility failures that did not have appropriate style: 0.2013888888888889
Proportion of utility failures resulting from failure to interpret information in-context: 0.1111111111111111
Proportion of utility failures resulting from retrieval failures: 0.2222222222222222


In [61]:
for i in range(len(datasets)):
    m = num_failures_analysed[datasets[i]]
    print('Proportion of utility failures that were too verbose:', too_verbose[i]/m*100)
    print('Proportion of utility failures that had premises without conclusions:', premises_without_conclusion[i]/m*100)
    print('Proportion of utility failures that did not have appropriate style:', not_appropriate_style[i]/m*100)
    print('Proportion of utility failures resulting from failure to interpret information in-context:', failed_in_context_answer[i]/m*100)
    print('Proportion of utility failures resulting from retrieval failures:', failed_info_retrieval[i]/m*100)
    print()

Proportion of utility failures that were too verbose: 37.5
Proportion of utility failures that had premises without conclusions: 10.0
Proportion of utility failures that did not have appropriate style: 2.5
Proportion of utility failures resulting from failure to interpret information in-context: 25.0
Proportion of utility failures resulting from retrieval failures: 27.500000000000004

Proportion of utility failures that were too verbose: 43.58974358974359
Proportion of utility failures that had premises without conclusions: 7.6923076923076925
Proportion of utility failures that did not have appropriate style: 71.7948717948718
Proportion of utility failures resulting from failure to interpret information in-context: 12.82051282051282
Proportion of utility failures resulting from retrieval failures: 5.128205128205128

Proportion of utility failures that were too verbose: 62.5
Proportion of utility failures that had premises without conclusions: 52.5
Proportion of utility failures that di

# Precision Failure Analysis

In [3]:
def has_precision_error(all_results, i):
    precision_annotations = eval(all_results['precise_citations'].iloc[i])
    for sentence_dict in precision_annotations:
        if (0 in sentence_dict['annotations']):
            return True
    return False

def get_num_precision_errors(element):
    num_precision_errors = 0
    precision_annotations = eval(element)
    for sentence_dict in precision_annotations:
        num_precision_errors += len(sentence_dict['annotations']) - np.sum(sentence_dict['annotations'])
    return num_precision_errors

def get_num_citations(element):
    num_citations = 0
    precision_annotations = eval(element)
    for sentence_dict in precision_annotations:
        num_citations += len(sentence_dict['annotations'])
    return num_citations

In [22]:
random.seed(10)
np.random.seed(10)
# First, obtain 10 responses with precision errors for each combination of dataset and OP of interest
n = 40

sampled_precision_failures_df = pd.DataFrame()
op_implementations = ['Quoted', 'Entailed', 'Post Hoc', 'Gemini']

for k in op_implementations:
    print(k)
    # Get the responses with at least one citation with a precision failure
    op_dataset_results = all_results[all_results['op']==k]
    more_op_dataset_results = all_results[all_results['op']==k]
    op_dataset_results['num_citations'] = op_dataset_results['precise_citations'].apply(get_num_citations)
    more_op_dataset_results['num_citations'] = more_op_dataset_results['precise_citations'].apply(get_num_citations)
    total_num_citations = np.sum(op_dataset_results['num_citations'])+np.sum(more_op_dataset_results['num_citations'])
    
    failure_idxs = []
    for i in range(len(op_dataset_results)):
        if (has_precision_error(op_dataset_results, i)):
            failure_idxs.append(i)
    op_precision_failure_df = op_dataset_results.iloc[failure_idxs]

    failure_idxs = []
    for i in range(len(more_op_dataset_results)):
        if (has_precision_error(more_op_dataset_results, i)):
            failure_idxs.append(i)
    more_op_precision_failure_df = more_op_dataset_results.iloc[failure_idxs]

    # figure out how many precision failures are in each output
    op_precision_failure_df['num_precision_errors'] = op_precision_failure_df['precise_citations'].apply(get_num_precision_errors)
    more_op_precision_failure_df['num_precision_errors'] = more_op_precision_failure_df['precise_citations'].apply(get_num_precision_errors)
    total_num_prec_failures = np.sum(op_precision_failure_df['num_precision_errors'])+np.sum(more_op_precision_failure_df['num_precision_errors'])
    
    print('Precision Success Rate: ', 1-total_num_prec_failures/total_num_citations)
    op_precision_failure_df = op_precision_failure_df.sample(frac=1)
    more_op_precision_failure_df = more_op_precision_failure_df.sample(frac=1)
    num_failures = 0
    idx = 0
    idx_to_keep = []
    while ((num_failures < n) and (idx < len(op_precision_failure_df))):
        curr_num_failures = op_precision_failure_df['num_precision_errors'].iloc[idx]
        # curr_num_failures2 = get_num_precision_errors(precision)
        if ((curr_num_failures+num_failures<=n) or (idx == len(op_precision_failure_df)-1)):
            num_failures += curr_num_failures
            idx_to_keep.append(idx)
        idx += 1
    if (k == 'Entailed'):
        idx = 0
        more_idx_to_keep = []
        while ((num_failures < n) and (idx < len(more_op_precision_failure_df))):
            curr_num_failures = more_op_precision_failure_df['num_precision_errors'].iloc[idx]
            if ((curr_num_failures+num_failures<=n) or (idx == len(op_precision_failure_df)-1)):
                num_failures += curr_num_failures
                more_idx_to_keep.append(idx)
            idx += 1
    # assert num_failures >= n
    corresponding_n_citations = (num_failures / total_num_prec_failures) * total_num_citations
    print('Precision failures examined: '+str(num_failures)+'/'+str(int(corresponding_n_citations)))
    sample_to_analyze = op_precision_failure_df.iloc[idx_to_keep]
    if (k == 'Entailed'):
        sample_to_analyze = pd.concat([sample_to_analyze, more_op_precision_failure_df.iloc[more_idx_to_keep]])
    sampled_precision_failures_df = pd.concat([sampled_precision_failures_df, sample_to_analyze])
print()

Quoted
Precision Success Rate:  0.9895424836601308
Precision failures examined: 16.0/1530
Entailed
Precision Success Rate:  0.9656862745098039
Precision failures examined: 40.0/1165
Post Hoc
Precision Success Rate:  0.7838745800671892
Precision failures examined: 40.0/185
Gemini
Precision Success Rate:  0.8290816326530612
Precision failures examined: 40.0/234



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  op_dataset_results['num_citations'] = op_dataset_results['precise_citations'].apply(get_num_citations)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_op_dataset_results['num_citations'] = more_op_dataset_results['precise_citations'].apply(get_num_citations)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

In [6]:
# Then, read each query and output. Take note of the failure categories each one falls into.
for i in range(len(sampled_precision_failures_df)):
    query = sampled_precision_failures_df['Question'].iloc[i]
    ds = sampled_precision_failures_df['dataset'].iloc[i]
    op = sampled_precision_failures_df['op'].iloc[i]
    query_id = sampled_precision_failures_df['query_id'].iloc[i]
    output = sampled_precision_failures_df['Output (cited)'].iloc[i]
    sources = eval(sampled_precision_failures_df['Used Sources (cited)'].iloc[i])
    precision = sampled_precision_failures_df['precise_citations'].iloc[i]
    n_precision_errors = get_num_precision_errors(precision)
    curr_tuple = (ds, op, query_id)
    print('QUERY: ', query)
    print('N ERRORS: ', n_precision_errors)
    print('(\"'+ds+'\", \"'+op+'\", '+str(query_id)+')')
    print()
    print('PRECISION: ', precision)
    print()
    print('OUTPUT: ', output)
    print()
    for s in sources:
        print('SOURCE: ', s)
    print('(\"'+ds+'\", \"'+op+'\", '+str(query_id)+')')
    print()
    print('--------------------------------------------------------------------------------------------------------')
    print()

QUERY:  How should I plan my summer vacation if I have summer depression?
N ERRORS:  1
ANNOTATOR:  A15X8ATAWSRXIF
("MASH", "Quoted", 136)

PRECISION:  [{'annotations': [1, 1, 1], 'sentence_id': 0}, {'annotations': [1, 1, 1, 1, 1], 'sentence_id': 1}, {'annotations': [1, 1, 1, 1, 1], 'sentence_id': 2}, {'annotations': [1, 1], 'sentence_id': 3}, {'annotations': [1, 0], 'sentence_id': 4}]

OUTPUT:  Summer depression, also known as [92m"summertime SAD" [1][0m, is a form of seasonal affective disorder that [92m"affects about 4% to 6% of the U. S. population" [2][0m where [92m"the onset of summer triggers their depression symptoms" [3][0m. Triggers for summer depression include [92m"the longer days, and increasing heat and humidity" [4][0m, [92m"disrupted schedules in summer" [5][0m, [92m"body image issues" [6][0m, [92m"financial worries" [7][0m, and [92m"the heat" [8][0m. To manage summer depression effectively, one should [92m"Get help" [9][0m from a professional if feelin

In [20]:
# Code each response and mark the number of citations from that response that fall into the failure case (_n)
# Claim taken out of context
outofcontext = [("MASH", "Quoted", 86), ("MASH", "Quoted", 191), ("Eta3G", "Entailed", 472), ("MASH", "Post Hoc", 66), 
                ("NQ", "Post Hoc", 316), ("MASH", "Post Hoc", 114), ("MASH", "Post Hoc", 162), ("Eta3G", "Post Hoc", 507),
                ("MASH", "Gemini", 155), ("NQ", "Quoted", 302)
]
outofcontext_n = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Irrelevant source quote: keywords as proxies (something went wrong in the retrieval/quote selection process)
isource = [("NQ", "Entailed", 340), ("MH", "Post Hoc", 180), ("Eta3G", "Post Hoc", 486), 
           ("MASH", "Post Hoc", 105), ("NQ", "Post Hoc", 328), ("MH", "Post Hoc", 146), ("MASH", "Post Hoc", 85), ("MASH", "Post Hoc", 119),
           ("MH", "Post Hoc", 60), ("Eta3G", "Post Hoc", 530), ("Eta3G", "Post Hoc", 514), ("Eta3G", "Post Hoc", 507), ("Eta3G", "Post Hoc", 507),
           ("MASH", "Post Hoc", 133), ("MASH", "Post Hoc", 100), ("Eta3G", "Post Hoc", 461), ("Eta3G", "Gemini", 503), 
           ("Eta3G", "Gemini", 573), ("NQ", "Gemini", 275), ("NQ", "Gemini", 257), ("Eta3G", "Gemini", 449), ("Eta3G", "Gemini", 517),
           ("MASH", "Gemini", 155), ("NQ", "Gemini", 250), ("MH", "Gemini", 228), ("Eta3G", "Gemini", 561), ("NQ", "Gemini", 330),
           ("MASH", "Gemini", 83), ("MASH", "Gemini", 113), ("MASH", "Gemini", 113), ("NQ", "Gemini", 292), ("NQ", "Gemini", 369),
           ("MH", "Gemini", 79), ("Eta3G", "Gemini", 492), ("NQ", "Gemini", 299), ("NQ", "Gemini", 341)         
          ]+[("MASH", "Entailed", 108), ("MH", "Entailed", 126), ("MH", "Entailed", 100), ("MASH", "Entailed", 60), ("MASH", "Entailed", 136),
           ("Eta3G", "Entailed", 482), ("MH", "Post Hoc", 182)]
isource_n = [1, 3, 3, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1]

# Relevant source quote: claim too specific (something went wrong in the generation process)
rsource = [("MASH", "Gemini", 83), ("MASH", "Post Hoc", 114), ("MH", "Post Hoc", 64), ("MH", "Entailed", 207), ("MH", "Entailed", 119), ("Eta3G", "Entailed", 505), ("MASH", "Entailed", 98),
           ("MH", "Entailed", 89), ("Eta3G", "Entailed", 463), ("Eta3G", "Entailed", 472), ("MASH", "Entailed", 192), 
           ("Eta3G", "Entailed", 577), ("MH", "Post Hoc", 180), ("Eta3G", "Post Hoc", 497), ("NQ", "Gemini", 243), ("Eta3G", "Gemini", 569),
           ("NQ", "Gemini", 252), ("NQ", "Gemini", 331), ("NQ", "Gemini", 314)
          ]+[("MH", "Entailed", 184), ("MASH", "Entailed", 151), ("Eta3G", "Entailed", 540), ("Eta3G", "Entailed", 586)]
rsource_n = [1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Annotation mistake due to not reading the source quote in context of the source (user perceives it as a precision error, even if it is not)
mistake_ignored_context = [("MASH", "Quoted", 157), ("MASH", "Quoted", 136), ("MASH", "Quoted", 164), ("MASH", "Quoted", 155),
                           ("MASH", "Entailed", 80), ("MASH", "Post Hoc", 114), ("MASH", "Post Hoc", 85),
                           ("MH", "Post Hoc", 139), ("MASH", "Gemini", 86), ("NQ", "Gemini", 369), ("MASH", "Gemini", 123)
]
mistake_ignored_context_n = [1, 1, 3, 2, 1, 1, 2, 2, 1, 1, 1]

# Annotation mistake due to a different claim being evaluated (often is same as coverage being evaluated instead)
mistake_coverage = [("MH", "Entailed", 155), ("MH", "Entailed", 100), ("MH", "Entailed", 131), ("MH", "Entailed", 112), 
                    ("MH", "Entailed", 201), ("MASH", "Entailed", 146), ("MH", "Entailed", 195), ("MH", "Entailed", 75),
                    ("MASH", "Post Hoc", 100), ("NQ", "Gemini", 355), ("NQ", "Gemini", 282),
                    ("MASH", "Gemini", 69), ("NQ", "Gemini", 250), ("NQ", "Gemini", 384), ("NQ", "Gemini", 331), ("Eta3G", "Gemini", 496)
]
mistake_coverage_n = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Annotation mistake due to other error
mistake_other = [("MASH", "Quoted", 95), ("Eta3G", "Quoted", 550), ("MASH", "Quoted", 185), ("MASH", "Quoted", 100), ("Eta3G", "Quoted", 477),
                 ("NQ", "Quoted", 388), ("MASH", "Entailed", 90), ("MASH", "Entailed", 130), ("MASH", "Entailed", 163),
                 ("MASH", "Entailed", 172), ("NQ", "Entailed", 281), ("NQ", "Gemini", 335), ("Eta3G", "Gemini", 455)
]
mistake_other_n = [1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1]

def report_n_for_op(op, tuple_list, n_list):
    assert len(tuple_list) == len(n_list)
    n = 0
    for i in range(len(tuple_list)):
        curr_tuple = tuple_list[i]
        dataset, curr_op, query_id = curr_tuple
        if (curr_op == op):
            n += n_list[i]
    return n

all_tuples = outofcontext+isource+rsource+mistake_ignored_context+mistake_coverage+mistake_other
all_tuples_n = outofcontext_n+isource_n+rsource_n+mistake_ignored_context_n+mistake_coverage_n+mistake_other_n
number_of_precision_failures = 0

for op in op_implementations:
    print(op)
    a = report_n_for_op(op, outofcontext, outofcontext_n)
    b = report_n_for_op(op, isource, isource_n)
    c = report_n_for_op(op, rsource, rsource_n)
    d = report_n_for_op(op, mistake_ignored_context, mistake_ignored_context_n)
    e = report_n_for_op(op, mistake_coverage, mistake_coverage_n)
    f = report_n_for_op(op, mistake_other, mistake_other_n)
    n = a+b+c
    print('Out of Context:', a) #, a/n*100)
    print('Irrelevant Source:', b) #, b/n*100)
    print('Relevant Source:', c) #, c/n*100)
    print('Mistake (info elsewhere):', d) #, d/n*100)
    print('Mistake (coveraged evaluated):', e) #, e/n*100)
    print('Mistake (other):', f) #, f/n*100)
    print('Total precision failures:', a+b+c)
    print('Total examined:', a+b+c+d+e+f)
    print('____________________________________________________________________')
    print()

Quoted
Out of Context: 3
Irrelevant Source: 0
Relevant Source: 0
Mistake (info elsewhere): 7
Mistake (coveraged evaluated): 0
Mistake (other): 6
Total precision failures: 3
Total examined: 16
____________________________________________________________________

Entailed
Out of Context: 1
Irrelevant Source: 8
Relevant Source: 14
Mistake (info elsewhere): 1
Mistake (coveraged evaluated): 8
Mistake (other): 8
Total precision failures: 23
Total examined: 40
____________________________________________________________________

Post Hoc
Out of Context: 5
Irrelevant Source: 23
Relevant Source: 6
Mistake (info elsewhere): 5
Mistake (coveraged evaluated): 1
Mistake (other): 0
Total precision failures: 34
Total examined: 40
____________________________________________________________________

Gemini
Out of Context: 1
Irrelevant Source: 21
Relevant Source: 6
Mistake (info elsewhere): 3
Mistake (coveraged evaluated): 7
Mistake (other): 2
Total precision failures: 28
Total examined: 40
____________