In [1]:
from xsum_dataset import XsumDataset
from generate_xsum_summary import load_summarization_model_and_tokenizer, generate_summaries, generate_token_entropy_metadata

import datasets
import random

# Seed Python's `random` module so the bbcid drawn later with random.choices
# is the same on every fresh kernel run.
random.seed(0)

## load model and tokenizer
---

In [2]:
# Checkpoint name handed to the project loader
# (looks like a Hugging Face Hub id — confirm in generate_xsum_summary).
model_name = "facebook/bart-large-xsum"
# `model` and `tokenizer` are passed to every generate_summaries call below.
model, tokenizer = load_summarization_model_and_tokenizer(model_name)

## load dataset and concat train/val/test
---

In [3]:
# Load XSum and merge its train/validation/test splits into one dataset so that
# a document from any split can be looked up by id later on.
xsum_data_raw = datasets.load_dataset("xsum")
xsum_data_raw_cc = datasets.concatenate_datasets(
    [xsum_data_raw[split] for split in ("train", "validation", "test")]
)
# Wrap the merged dataset in the project's XsumDataset (accessed below via .data_by_id).
xsum_concat_data = XsumDataset(xsum_data_raw_cc)


Using custom data configuration default
Reusing dataset xsum (/home/wk247/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

## select a sample from dataset with bbcid
---
* we can select multiple samples, but let's just sample one for easy comparison

In [4]:
# Draw one bbcid at random from all ids known to the dataset.
# NOTE: random.choices samples with replacement, so raising k may yield duplicate ids.
candidate_ids = list(xsum_concat_data.data_by_id.keys())
bbc_ids = random.choices(candidate_ids, k=1)
# or fix one: bbc_ids = ["33858956"]
bbc_ids

['38424799']

In [5]:
# selected_data - one dict per sampled id with keys:
# (id, document, true_summary, (factuality_data, faithfulness_data))
selected_data = [xsum_concat_data.data_by_id[bbc_id] for bbc_id in bbc_ids]

# original_docs - the documents to summarize, in the same order as bbc_ids
original_docs = [sample["document"] for sample in selected_data]
print("original docs to summarize:\n", original_docs)

original docs to summarize:
 ['In very blustery conditions the Sons secured victory courtesy of a goal from Mark Docherty, who had scored in August\'s 1-0 win over the Tangerines.\nHis cross evaded everyone and the wind played a part as the ball whipped into the net past goalkeeper Cammy Bell.\nWith Hibernian winning earlier in the day at Falkirk, United slip to second going into 2017.\nDumbarton began on the front foot. Docherty fired in a shot from 25 yards, but it was comfortably held by Bell.\nSam Stanton also tried his luck for the hosts, firing just wide of the post.\nEventually Dumbarton would get their reward for a positive start. Docherty slung in a cross from the left and soon celebrated a deserved lead for the Sons.\nIt took half-an-hour for United to muster any meaningful sight of goal. Stewart Murdoch shot from 25 yards but it was easily held by Alan Martin.\nUnited did have the ball in the back of the net just before half-time when Simon Murray fired home from close range

## generate summaries of original documents
---

In [6]:
# Summarize the unperturbed documents. With return_generation_metadata=True the
# call is unpacked into a (summaries, metadata) pair; the metadata is what the
# token-entropy cell below consumes.
gen_summaries, gen_metadata = generate_summaries(
    model,
    tokenizer,
    original_docs,
    num_beams=4,  # same value as the perturbed-document run further down
    return_generation_metadata=True
)

In [7]:
# Build token entropy metadata for the generated summaries. As the output below
# shows, the result is a dict keyed by bbcid whose 'tokens_with_entropy' entry
# is a list of (token, entropy) pairs.
gen_token_entropy_metadata = generate_token_entropy_metadata(bbc_ids, gen_metadata)

In [8]:
# Display the summaries generated for original_docs.
gen_summaries

['Dundee United lost ground at the top of the Scottish Championship after being beaten at Dumbarton.']

In [9]:
# Display the (token, entropy) pairs recorded for each generated summary.
gen_token_entropy_metadata

{'38424799': {'tokens_with_entropy': [('D', 1.593770980834961),
   ('und', 3.4541940689086914),
   ('ee', 1.0558691024780273),
   (' United', 1.2431459426879883),
   (' lost', 3.7446138858795166),
   (' ground', 3.280916690826416),
   (' at', 2.2022035121917725),
   (' the', 1.1939692497253418),
   (' top', 1.1276772022247314),
   (' of', 1.3361749649047852),
   (' the', 1.6074154376983643),
   (' Scottish', 2.3245315551757812),
   (' Championship', 1.8109066486358643),
   (' after', 2.332247018814087),
   (' being', 1.5816192626953125),
   (' beaten', 1.6345798969268799),
   (' at', 4.622220993041992),
   (' Dumb', 2.504394292831421),
   ('arton', 2.3805301189422607),
   ('.', 2.6738195419311523),
   ('</s>', 2.9492383003234863)]}}

## input perturbed document
---
* (CHECK) escape sequences may differ depending on whether the original document was copied from a terminal or from streamlit
* first check if original doc and perturbed doc are the same

In [10]:
# new input: paste the document text at the prompt
raw_perturbed_doc = input("perturbed doc:")
# First turn each literal backslash-n pair back into a real newline, then drop
# every backslash that is left (e.g. the ones escaping quotes in a pasted repr).
unescaped_doc = raw_perturbed_doc.replace('\\n', '\n')
unescaped_doc = unescaped_doc.replace("\\", "")
perturbed_docs = [unescaped_doc]

In [11]:
# Round-trip sanity check for the un-escaping of the pasted text: after pasting
# the *unmodified* document, it must equal the original exactly.
# Only the first document (index 0) is compared.
# NOTE(review): this also raises for an intentionally perturbed document —
# presumably the cell is meant only for the initial check; confirm.
assert original_docs[0] == perturbed_docs[0], "original doc and perturbed doc are different"

## generate summaries of perturbed documents
---

In [12]:
# Summarize the perturbed documents with the same model, tokenizer and
# generation arguments as the original-document run, so the only difference
# between the two runs is the input text.
ptb_summaries, ptb_metadata = generate_summaries(
    model,
    tokenizer,
    perturbed_docs,
    num_beams=4,  # same value as the original-document run
    return_generation_metadata=True
)

In [13]:
# Token entropy metadata for the perturbed-document summaries; the same bbc_ids
# are passed in, so the result is keyed by the same ids as the original run.
ptb_token_entropy_metadata = generate_token_entropy_metadata(bbc_ids, ptb_metadata)


In [14]:
# Display the summaries generated for perturbed_docs.
ptb_summaries

['Dundee United lost ground at the top of the Scottish Championship after being beaten at Dumbarton.']

In [15]:
# Display the (token, entropy) pairs recorded for each perturbed-document summary.
ptb_token_entropy_metadata

{'38424799': {'tokens_with_entropy': [('D', 1.593770980834961),
   ('und', 3.4541940689086914),
   ('ee', 1.0558691024780273),
   (' United', 1.2431459426879883),
   (' lost', 3.7446138858795166),
   (' ground', 3.280916690826416),
   (' at', 2.2022035121917725),
   (' the', 1.1939692497253418),
   (' top', 1.1276772022247314),
   (' of', 1.3361749649047852),
   (' the', 1.6074154376983643),
   (' Scottish', 2.3245315551757812),
   (' Championship', 1.8109066486358643),
   (' after', 2.332247018814087),
   (' being', 1.5816192626953125),
   (' beaten', 1.6345798969268799),
   (' at', 4.622220993041992),
   (' Dumb', 2.504394292831421),
   ('arton', 2.3805301189422607),
   ('.', 2.6738195419311523),
   ('</s>', 2.9492383003234863)]}}

## print summary and metadata
---

In [16]:
# Side-by-side report per sample: the reference summary, both generated
# summaries, and whether the document and the summary changed.
# zip stops at its shortest input, so only samples that have a perturbed
# document are reported.
paired_results = zip(selected_data, perturbed_docs, gen_summaries, ptb_summaries)
for source, perturbed_doc, gen_summary, ptb_summary in paired_results:
    print("XSUM ID", source["id"])
    print("* GROUND TRUTH SUMMARY:", source["true_summary"])
    print()

    print("* GENERATED SUMMARY:", gen_summary)
    print("* PERTURBED SUMMARY:", ptb_summary)
    print()

    doc_unchanged = source["document"] == perturbed_doc
    summary_unchanged = gen_summary == ptb_summary
    print("* original document == perturbed document:", doc_unchanged)
    print("* generated summary == perturbed summary:", summary_unchanged)

XSUM ID 38424799
* GROUND TRUTH SUMMARY: Dundee United relinquished their lead at the top of the Championship with a dismal defeat at Dumbarton.

* GENERATED SUMMARY: Dundee United lost ground at the top of the Scottish Championship after being beaten at Dumbarton.
* PERTURBED SUMMARY: Dundee United lost ground at the top of the Scottish Championship after being beaten at Dumbarton.

* original document == perturbed document: True
* generated summary == perturbed summary: True
