huggingface summarization example:
https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb

## load dataset
---

In [4]:
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("xsum")
metric = load_metric("rouge")

Using custom data configuration default
Reusing dataset xsum (/home/wk247/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [7]:
raw_datasets["train"][0]

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142'}

## load tokenizer, model
---

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
model_checkpoint = "google/pegasus-xsum"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

## preprocess data
---

In [9]:
sample_sentence_1 = "Hello, this one sentence!"
sample_sentence_2 = "This is another sentence."

In [10]:
# tokenize one sentence
tokenizer(sample_sentence_1)

{'input_ids': [8087, 108, 136, 156, 5577, 147, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [11]:
# tokenize multiple sentences
tokenizer([sample_sentence_1, sample_sentence_2])

{'input_ids': [[8087, 108, 136, 156, 5577, 147, 1], [182, 117, 372, 5577, 107, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [None]:
# To prepare the targets for our model, 
# we need to tokenize them inside the as_target_tokenizer context manager
with tokenizer.as_target_tokenizer():
    print(tokenizer(["Hello, this one sentence!", "This is another sentence."]))

In [None]:
# T5 checkpoints we have to prefix the inputs with "summarize:"
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [20]:
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    inputs = [doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [21]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[139, 357, 519, 113, 1303, 115, 12455, 9246, 108, 156, 113, 109, 633, 3741, 2790, 108, 117, 309, 270, 9068, 107, 4308, 201, 117, 3121, 115, 6334, 12358, 111, 223, 4194, 115, 91586, 14626, 1686, 8867, 2790, 141, 2716, 336, 107, 41558, 124, 109, 3381, 3682, 54521, 749, 12071, 640, 112, 1303, 134, 109, 1946, 59340, 15258, 22947, 107, 1478, 1098, 111, 3564, 1129, 195, 2790, 141, 10233, 115, 12455, 9246, 244, 109, 1951, 29821, 17198, 316, 190, 109, 1120, 107, 1485, 2965, 23164, 59614, 3333, 109, 345, 112, 11028, 109, 1303, 107, 139, 6500, 31670, 114, 11210, 1075, 108, 10233, 223, 1162, 1746, 124, 5156, 1411, 233, 109, 674, 1553, 55976, 107, 62273, 20678, 108, 170, 11216, 109, 30220, 9372, 162, 140, 8867, 2790, 108, 243, 265, 256, 146, 5709, 109, 1546, 121, 44224, 1407, 559, 109, 6172, 1194, 107, 611, 108, 265, 243, 154, 16530, 201, 256, 133, 174, 2777, 165, 112, 615, 109, 11210, 1075, 368, 146, 3656, 107, 198, 362, 117, 1011, 155, 125, 171, 311, 186, 117, 167, 249, 14447, 118

In [22]:
# tokenize the whole dataset
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/205 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [26]:
import argparse
import torch
import datasets
from typing import List, Tuple
# from sumtool.utils import entropy
from xsum_dataset import XsumDataset
# from sumtool.storage import store_model_summaries
from transformers import BartTokenizer, BartForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
def load_summarization_model_and_tokenizer() -> Tuple[
    BartForConditionalGeneration, BartTokenizer
]:
    """
    Load summary generation model and move to GPU, if possible.
    Returns:
        (model, tokenizer)
    """
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-xsum")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-xsum")
    model.to(device)

    return model, tokenizer

In [28]:
def generate_summaries(
    model: BartForConditionalGeneration,
    tokenizer: BartTokenizer,
    docs_to_summarize: List[str],
    num_beams: int = 4,
    return_generation_metadata: bool = False
):
    """
    Given a trained summary generation model and appropriate tokenizer,
    1. Tokenize text (and move to device, if possible)
    2. Run inference on model to generate output vocabulary tokens for summary
    3. Decode tokens to a sentence using the tokenizer
    Args:
        model: model to run inference on
        tokenizer: tokenizer corresponding to model
        docs_to_summarize: documents to summarize
        num_beams: number of beams for beam search
        return_generation_metadata: whether generation metadata should be returned
    Returns:
        decoded_sentence
    """
    inputs = tokenizer(
        docs_to_summarize,
        max_length=1024,
        truncation=True,
        return_tensors="pt",
        padding=True,
    )
    input_token_ids = inputs.input_ids.to(device)

    model_output = model.generate(
        input_token_ids,
        num_beams=num_beams,
        max_length=150,
        early_stopping=True,
        return_dict_in_generate=True,
        output_scores=True,
    )

    generated_summaries = [
        tokenizer.decode(
            id, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        for id in model_output.sequences
    ]

    if not return_generation_metadata:
        return generated_summaries
    else:
        token_metadata = []
        input_set = input_token_ids.view(-1).tolist()
        for seq_idx in range(model_output.sequences.shape[0]):
            seq_metadata = []
            token_metadata.append(seq_metadata)
            for idx, output_token_id in enumerate(model_output.sequences[seq_idx][1:]):
                beam_idx = model_output.beam_indices[seq_idx][idx]
                selected_beam_probs = torch.exp(model_output.scores[idx][beam_idx])

                beam_top_alternatives = []
                top_probs = torch.topk(selected_beam_probs, k=3)
                for i, v in zip(top_probs.indices, top_probs.values):
                    beam_top_alternatives.append({
                        "token": tokenizer.decode(i),
                        "token_id": i.item(),
                        "beam_token_prob": v.item()
                    })

                seq_metadata.append({
                    "token_id": output_token_id,
                    "token": tokenizer.decode(output_token_id),
                    "entropy": entropy(selected_beam_probs),
                    "beam_token_prob": selected_beam_probs[output_token_id].item(),
                    "beam_idx": beam_idx.item(),
                    "beam_top_probs": beam_top_alternatives,
                    "token_in_input": output_token_id in input_set,
                })

        return generated_summaries, token_metadata


In [31]:
data_split = "test"  # "train", "test", "validation"

In [29]:
model, tokenizer = load_summarization_model_and_tokenizer()

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

In [32]:
xsum_data = XsumDataset(datasets.load_dataset("xsum")[data_split])

Using custom data configuration default
Reusing dataset xsum (/home/wk247/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934)


  0%|          | 0/3 [00:00<?, ?it/s]

In [39]:
bbc_ids = ["38264402", "34227252"]

In [42]:
selected_data = [xsum_data.data_by_id[x.strip()] for x in bbc_ids]

In [44]:
summaries, generation_metadata = generate_summaries(
        model,
        tokenizer,
        [x["document"] for x in selected_data],
        num_beams=4,
        return_generation_metadata=True
    )

AttributeError: 'BeamSearchEncoderDecoderOutput' object has no attribute 'beam_indices'

In [None]:
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Script to run inference on an xsum example using a pre-trained model"
    )

    parser.add_argument(
        "--bbc_ids",
        type=str,
        required=True,
        help="Comma-separated document BBC IDs in the Xsum dataset",
    )

    parser.add_argument(
        "--data_split",
        type=str,
        required=True,
        choices=["train", "test", "validation"],
        help="xsum data split to index into with `data_index`",
    )

    args = parser.parse_args()

    model, tokenizer = load_summarization_model_and_tokenizer()

    xsum_data = XsumDataset(datasets.load_dataset("xsum")[args.data_split])
    selected_data = [xsum_data.data_by_id[x.strip()] for x in args.bbc_ids.split(",")]

    summaries, generation_metadata = generate_summaries(
        model,
        tokenizer,
        [x["document"] for x in selected_data],
        num_beams=4,
        return_generation_metadata=True
    )

    summary_metadata = {}

    for source, gen_summary, seq_metadata in zip(selected_data, summaries, generation_metadata):
        print("XSUM ID", source["id"])
        print("GOLD STANDARD SUMMARY:", source["true_summary"])
        print("PREDICTED SUMMARY:", gen_summary)

        tokens_with_entropy = []
        for token_metadata in seq_metadata:
            tokens_with_entropy.append((
                token_metadata["token"],
                token_metadata["entropy"]
            ))

        summary_metadata[source["id"]] = {
            "tokens_with_entropy": tokens_with_entropy
        }

#     store_model_summaries(
#         "xsum",
#         model.config.name_or_path,
#         model.config.to_dict(),
#         {
#             source["id"]: gen_summary
#             for source, gen_summary in zip(selected_data, summaries)
#         },
#         summary_metadata
#     )