In [1]:
from xsum_dataset import XsumDataset
import datasets

## 1. Check that train/val/test have the same structure

In [2]:
# Load the XSum dataset through Hugging Face `datasets`; the result is indexed
# by split name ("train", "validation", "test") in the cells below.
xsum_data_raw = datasets.load_dataset("xsum")

Using custom data configuration default
Reusing dataset xsum (/home/wk247/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Wrap each raw split in the project's XsumDataset, in train / validation / test order.
train_data, val_data, test_data = (
    XsumDataset(xsum_data_raw[split_name])
    for split_name in ("train", "validation", "test")
)

In [4]:
# Field names of the first training sample; used as the reference for the
# per-sample structure check that follows.
first_train_sample = train_data.dataset[0]
keys = list(first_train_sample.keys())
keys

['id', 'document', 'true_summary', 'factuality_data', 'faithfulness_data']

In [5]:
# Walk every sample of every split and stop at the first one that breaks an
# expectation: same keys as the reference sample, no factuality/faithfulness
# annotations yet, and a non-empty reference summary.
for dataset in (train_data, val_data, test_data):
    for sample in dataset:
        sample_keys = sample.keys()
        if list(sample_keys) != keys:
            # show the unexpected key set before failing
            print(sample_keys)
            assert False

        has_unexpected_content = (
            sample["factuality_data"] != {}  # factuality data already present
            or sample["faithfulness_data"] != {}  # faithfulness data already present
            or len(sample["true_summary"]) == 0  # reference summary missing
        )
        if has_unexpected_content:
            # show the offending sample before failing
            print(sample)
            assert False

## concat train/val/test

In [6]:
# Stack the three raw splits into a single dataset, keeping the
# train -> validation -> test order.
splits_in_order = [
    xsum_data_raw[split_name] for split_name in ("train", "validation", "test")
]
xsum_data_raw_cc = datasets.concatenate_datasets(splits_in_order)

In [7]:
# Inspect the concatenated dataset (feature names and total row count).
xsum_data_raw_cc

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 226711
})

In [8]:
# Wrap the concatenated raw dataset in the project's XsumDataset.
xsum_concat_data = XsumDataset(xsum_data_raw_cc)

In [9]:
# Number of samples held by the wrapper; compare with `num_rows` of the raw
# concatenated dataset to confirm nothing was dropped.
len(xsum_concat_data.dataset)

226711

# 2. take 'generate_summaries' apart

In [10]:
import torch
# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

from utils import entropy
from generate_xsum_summary import load_summarization_model_and_tokenizer

In [11]:
# Checkpoint name handed to the project's loader.
model_name = "facebook/bart-large-xsum"
# Loader imported from generate_xsum_summary; it returns the summarization
# model together with its tokenizer.
model, tokenizer = load_summarization_model_and_tokenizer(model_name)

In [12]:
# Two short toy documents that tokenize to different lengths, so encoding them
# as one batch requires padding.
docs_to_summarize = [
    "Load summary generation model and move to GPU, if possible.",
    "Given a trained summary generation model and appropriate tokenizer,",
]

In [13]:
# Tokenize with default options (no padding, no tensors) to look at the raw
# `input_ids` and `attention_mask` lists for each document.
tokenizer(docs_to_summarize)

{'input_ids': [[0, 47167, 4819, 2706, 1421, 8, 517, 7, 22794, 6, 114, 678, 4, 2], [0, 18377, 10, 5389, 4819, 2706, 1421, 8, 3901, 19233, 6315, 6, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## inputs

In [14]:
# Encode both documents as one truncated, padded PyTorch batch.
# max_length is left unset: the default is 1024 for 'facebook/bart-large-xsum'.
encode_options = {
    "truncation": True,
    "return_tensors": "pt",
    "padding": True,
}
inputs = tokenizer(docs_to_summarize, **encode_options)

# Only the token ids are moved to the target device here.
input_token_ids = inputs.input_ids.to(device)

In [15]:
# Inspect the batched ids: one row per document, with the shorter document
# padded at the end to the length of the longer one.
input_token_ids

tensor([[    0, 47167,  4819,  2706,  1421,     8,   517,     7, 22794,     6,
           114,   678,     4,     2],
        [    0, 18377,    10,  5389,  4819,  2706,  1421,     8,  3901, 19233,
          6315,     6,     2,     1]], device='cuda:0')

In [16]:
# Ids of every special token the tokenizer defines (compare with the ids that
# appear at the start/end of the encoded rows).
tokenizer.all_special_ids

[0, 2, 3, 1, 50264]

In [17]:
# Map the first document's ids back to the tokenizer's sub-word token strings.
tokenizer.convert_ids_to_tokens(input_token_ids[0])

['<s>',
 'Load',
 'Ġsummary',
 'Ġgeneration',
 'Ġmodel',
 'Ġand',
 'Ġmove',
 'Ġto',
 'ĠGPU',
 ',',
 'Ġif',
 'Ġpossible',
 '.',
 '</s>']

## output

In [18]:
# Beam-search generation. The dict-style output with scores is requested because
# later cells read `.sequences`, `.scores` and `.beam_indices` from the result.
generation_options = {
    "num_beams": 4,
    "max_length": 150,
    "early_stopping": True,
    "return_dict_in_generate": True,
    "output_scores": True,
}
model_output = model.generate(input_token_ids, **generation_options)

In [19]:
# Turn the first generated sequence back into text, dropping special tokens and
# leaving tokenization spaces untouched.
first_sequence = model_output.sequences[0]
tokenizer.decode(
    first_sequence,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

'If you are using a computer with a high-performance graphics card, you may need to change the way you load data.'

In [20]:
# Decode every generated sequence in one call. `batch_decode` applies `decode`
# to each row with the same options, and avoids a loop variable named `id`
# that would shadow the Python builtin.
generated_summaries = tokenizer.batch_decode(
    model_output.sequences,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
generated_summaries

['If you are using a computer with a high-performance graphics card, you may need to change the way you load data.',
 'The following is a list of some of the most popular phrases in the English language.']

## sequence metadata

In [21]:
# Flatten the whole padded batch into one plain Python list of ids.
# Despite its name this is a list (duplicates kept), and it mixes the ids of
# all documents in the batch.
input_set = input_token_ids.reshape(-1).tolist()
input_set

[0,
 47167,
 4819,
 2706,
 1421,
 8,
 517,
 7,
 22794,
 6,
 114,
 678,
 4,
 2,
 0,
 18377,
 10,
 5389,
 4819,
 2706,
 1421,
 8,
 3901,
 19233,
 6315,
 6,
 2,
 1]

In [22]:
# Raw generated ids, one row per input document; shorter summaries are padded
# at the end to the length of the longest one.
model_output.sequences

tensor([[    2,  1106,    47,    32,   634,    10,  3034,    19,    10,   239,
            12, 15526, 12774,  1886,     6,    47,   189,   240,     7,   464,
             5,   169,    47,  7511,   414,     4,     2],
        [    2,   133,   511,    16,    10,   889,     9,   103,     9,     5,
           144,  1406, 22810,    11,     5,  2370,  2777,     4,     2,     1,
             1,     1,     1,     1,     1,     1,     1]], device='cuda:0')

In [23]:
# Per-token metadata for the first generated summary only.
# The first token of the generated sequence is skipped, so loop position `idx`
# indexes both `model_output.scores` and `model_output.beam_indices[0]`.
first_summary_ids = model_output.sequences[0][1:]
first_document_ids = input_token_ids[0]

seq_metadata = []
for idx, output_token_id in enumerate(first_summary_ids):
    beam_idx = model_output.beam_indices[0][idx]
    # exponentiated scores of the beam that produced this token at step `idx`
    selected_beam_probs = torch.exp(model_output.scores[idx][beam_idx])

    # the three highest-valued candidates of that beam at this step
    top_probs = torch.topk(selected_beam_probs, k=3)
    beam_top_alternatives = [
        {
            "token": tokenizer.decode(alt_id),
            "token_id": alt_id.item(),
            "beam_token_prob": alt_prob.item(),
        }
        for alt_id, alt_prob in zip(top_probs.indices, top_probs.values)
    ]

    token_record = {
        "token_id": output_token_id,
        "token": tokenizer.decode(output_token_id),
        # `entropy` (from utils) applied to the selected beam's values
        "entropy": entropy(selected_beam_probs),
        # value assigned to the token that was actually generated
        "beam_token_prob": selected_beam_probs[output_token_id].item(),
        # index of the beam the generated token came from
        "beam_idx": beam_idx.item(),
        # token, token_id and value of the top-3 alternatives
        "beam_top_probs": beam_top_alternatives,
        # does the generated token also occur in its own source document? (overlap)
        "token_in_input": output_token_id in first_document_ids,
        # Open question kept from the original note ("bug?"): testing membership
        # in `input_set` (all documents flattened together) would also count
        # tokens that only occur in the other documents of the batch.
        # "token_in_input": output_token_id in input_set
    }
    seq_metadata.append(token_record)

In [24]:
# Metadata is recorded starting from the second token, so there is exactly one
# entry fewer than there are tokens in the generated sequence.
len(seq_metadata) == len(model_output.sequences[0]) - 1

True

In [25]:
# Metadata record of the first recorded token of the first summary.
seq_metadata[0]

{'token_id': tensor(1106, device='cuda:0'),
 'token': 'If',
 'entropy': 7.595674991607666,
 'beam_token_prob': 0.04578085616230965,
 'beam_idx': 0,
 'beam_top_probs': [{'token': 'If',
   'token_id': 1106,
   'beam_token_prob': 0.04578085616230965},
  {'token': 'Check',
   'token_id': 26615,
   'beam_token_prob': 0.021572448313236237},
  {'token': 'As', 'token_id': 1620, 'beam_token_prob': 0.021526599302887917}],
 'token_in_input': False}

In [26]:
# Generate per-token metadata for every generated summary.
# `token_metadata[s]` is the list of token records for summary `s`; as in the
# single-sequence version, the first generated token is skipped so that loop
# position `idx` indexes `model_output.scores` and `model_output.beam_indices`.
#
# NOTE(review): summaries shorter than the longest one are padded at the end
# (see the trailing ids in `model_output.sequences`); records are still produced
# for those padded positions - confirm whether they should be skipped.
token_metadata = []
for seq_idx in range(model_output.sequences.shape[0]):  # for each summary
    # Ids of the document this summary was generated from. Using row 0 for
    # every summary (as before) compared later summaries with the wrong document.
    source_token_ids = input_token_ids[seq_idx]

    seq_metadata = []
    for idx, output_token_id in enumerate(model_output.sequences[seq_idx][1:]):
        beam_idx = model_output.beam_indices[seq_idx][idx]
        # exponentiated scores of the beam that produced this token at step `idx`
        selected_beam_probs = torch.exp(model_output.scores[idx][beam_idx])

        # top alternatives during beam search
        beam_top_alternatives = []
        top_probs = torch.topk(selected_beam_probs, k=3)
        for i, v in zip(top_probs.indices, top_probs.values):
            beam_top_alternatives.append({
                "token": tokenizer.decode(i),
                "token_id": i.item(),
                "beam_token_prob": v.item()
            })

        seq_metadata.append({
            "token_id": output_token_id,
            "token": tokenizer.decode(output_token_id),
            "entropy": entropy(selected_beam_probs),  # `entropy` (from utils) over the selected beam's values
            "beam_token_prob": selected_beam_probs[output_token_id].item(),  # value of the generated token
            "beam_idx": beam_idx.item(),  # beam the generated token came from
            "beam_top_probs": beam_top_alternatives,  # token, token_id, value of the top-3 alternatives
            "token_in_input": output_token_id in source_token_ids,  # overlap with its own document
        })

    token_metadata.append(seq_metadata)

In [27]:
len(token_metadata)  # number of sequences: one metadata list per generated summary

2