In [None]:
import os
import json
import pickle
from sklearn.metrics import precision_recall_curve, average_precision_score, auc, roc_curve

In [None]:
# Load the training split: a JSON Lines file holding one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default, and blank lines (e.g. a stray trailing newline) are skipped
# instead of making json.loads raise.
with open('data/nyt_data/train.json', encoding='utf-8') as fin:
    train_examples = [json.loads(line) for line in fin if line.strip()]

In [None]:
# Load the dev (validation) split: a JSON Lines file holding one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default, and blank lines (e.g. a stray trailing newline) are skipped
# instead of making json.loads raise.
with open('data/nyt_data/dev.json', encoding='utf-8') as fin:
    val_examples = [json.loads(line) for line in fin if line.strip()]

In [None]:
# Pull the 'text' field out of every validation example, keeping the original order.
val_articles = [example['text'] for example in val_examples]

In [None]:
# Indices of the training examples identified (according to Spacy) as containing
# a PERSON hallucination. Stored as a set for fast membership checks.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('./data/nyt_data/train_bad_inds.pk', 'rb') as pickle_file:
    train_bad_inds = set(pickle.load(pickle_file))

In [None]:
# Binary label per training example: start everything at 0, then flag each
# index listed in train_bad_inds with 1.
labels = [0 for _ in range(len(train_examples))]
for bad_index in train_bad_inds:
    labels[bad_index] = 1

In [None]:
# Summaries generated for the validation set by the BART model trained on the NYT data.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('./data/nyt_data/val_pred_summaries.pk', 'rb') as pickle_file:
    val_summaries = pickle.load(pickle_file)

In [None]:
# Indices of the generated summaries identified (according to Spacy) as containing
# a PERSON hallucination.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('./data/nyt_data/val_preds_bad_inds.pk', 'rb') as pickle_file:
    bad_preds = pickle.load(pickle_file)

In [None]:
# Pick 20 generations containing person hallucinations. The tuple lists hand-chosen
# positions within bad_preds; their order (including the out-of-sequence 9/6 and
# 48/34/43/47) is kept as-is because the manually fixed summaries are aligned to it.
selected = [
    bad_preds[position]
    for position in (
        1, 2, 3, 9, 6,
        11, 13, 15, 17, 18,
        20, 21, 27, 28, 29,
        30, 48, 34, 43, 47,
    )
]

In [None]:
# Gather the generated summary and the source article for every selected index;
# both lists follow the order of `selected`.
summaries = [val_summaries[idx] for idx in selected]
articles = [val_articles[idx] for idx in selected]

In [None]:
# Display the selected generated summaries so they can be inspected by hand.
summaries

In [None]:
# Hand-corrected versions of the generated summaries, with each PERSON
# hallucination manually fixed. Entries are listed in the same order in which
# they were originally appended.
fixed_summaries = [
    'Sports of The Times column discusses outlook for women\'s college basketball season; photo (M)',
    'A review of Naked Angels production of David Marshall Grant play Snakebit, directed by Jace Alexander; Geoffrey Nauffts, Jodie Markell and David Alan Basche star; photo (M)',
    'Travel article on Naples, Italy, describes sights and sounds of city\'s Spanish Quarter and Vomero, two neighborhoods that have distinctly European flavor; photos; maps (L)',
    'Metro Matters column discusses New York City\'s battle to keep its many sex shops open despite new zoning laws that would force them to close down; photo (M)',
    'A review of premiere of NBC sitcom Just Shoot Me, starring George Segal and Laura San Giacomo; photo (M)',
    'A review of a salsa performance by Pete Rodriguez and his orchestra at Copacabana; photo (S)',
    'A review of an instructional piano playing computer program from Voyetra Technologies; photo (S)',
    'A review of a performance by Heath Brothers at Iridium (S)',
    'A review of a performance by Dutch jazz improviser Misha Mengelberg at Knitting Factory (S)',
    'A column on making crisp-roasted rack of lamb; recipe (M)',
    'Sports of The Times column discusses performance of Cleveland Indians outfielder Tony Fernandez in victory over Baltimore Orioles in Game 7 of American League Championship Series; photo (M)',
    'Sleeping arrangements article on being bundled up in Arctic winter gear to get to China to adopt baby from orphanage; drawing (M)',
    'Beliefs column on whether years should be designated as BC or BCE, BCE or BCE (M)',
    'A review of an exhibit of old hand-woven baskets at Pound Ridge Museum in Pound Ridge, NY; photos (M)',
    'A review of Randy Newman\'s play Faust, starring David Garrison and directed by Michael Greif, at Goodman Theater; photo (M)',
    'A review of a concert by tenor John Aler and pianist Kenneth Merrill at Alice Tully Hall; photo (M)',
    'A review of Amarone, Italian restaurant in Clinton section of Manhattan (M)',
    'A review of a recital by String Sextets at Carnegie Hall; photo (M)',
    'Travel essay on visiting emergency and emergency hospitals while traveling; drawing (M)',
    'A review of a concert by Peter Schickele at Carnegie Hall; photo (M)',
]

In [None]:
# NOTE: These examples can then be used for comparing error attribution methods.
# See cae_e2e.ipynb for how to use our method for error attribution.

In [None]:
# Read the classifier's scores for the training examples; the metrics are computed from these.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('./data/classifier_distillation/train_scored.pk', 'rb') as pickle_file:
    scores = pickle.load(pickle_file)

In [None]:
# Average precision (as a percentage) of the classifier scores against the 0/1
# hallucination labels. `labels` is a flat binary target, for which scikit-learn
# ignores `average`; the multilabel-only `average="samples"` option is dropped
# so the call does not suggest per-sample averaging is taking place.
average_precision_score(labels, scores)*100

In [None]:
# Build the ROC curve with label 1 as the positive class, then report the area
# under it as a percentage.
fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
roc_auc = auc(fpr, tpr)
roc_auc*100