In [None]:
# |default_exp text.data.seq2seq.summarization
# |default_cls_lvl 3

In [None]:
# | nbflags skip_exec

In [None]:
# |hide
%reload_ext autoreload
%autoreload 2

# Data

> The `text.data.seq2seq.summarization` module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for summarization tasks using architectures like BART and T5. Summarization tasks attempt to generate a human-understandable and sensible representation of a larger body of text (e.g., capture the meaning of a larger document in 1-3 sentences).

In [None]:
# |export
import warnings
from typing import Optional

import numpy as np
import pandas as pd

from datasets import Dataset
from fastai.data.block import DataBlock
from transformers import AutoModelForSeq2SeqLM, PreTrainedTokenizerBase
from transformers.utils import logging as hf_logging

from blurr.text.data.seq2seq.core import (
    Seq2SeqBatchTokenizeTransform,
    Seq2SeqPreprocessor,
    Seq2SeqTextBlock,
)
from blurr.text.utils import get_hf_objects

In [None]:
# | hide
import os, ast, pdb
from functools import reduce

from datasets import load_dataset
from fastai.data.transforms import *
from fastai.torch_core import *
from fastai.torch_imports import *
from fastcore.all import *
from fastcore.test import *
from nbdev import nbdev_export
from nbdev.showdoc import show_doc

from blurr.utils import print_versions
from blurr.text.utils import BlurrText

What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.6
transformers: 4.16.2


In [None]:
# |export
# silence all the HF warnings
# NOTE(review): this cell is exported, so importing the generated module installs a
# process-wide "ignore" filter for *all* Python warnings (not only Hugging Face ones)
# and sets the transformers logger to error-only verbosity.
warnings.simplefilter("ignore")
hf_logging.set_verbosity_error()

In [None]:
# | echo: false
# Helper used further down (in the "Tests" section) to list the available model classes
NLP = BlurrText()

# Turn off parallelism in the Hugging Face tokenizers library for this notebook session
os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")

In [None]:
# |hide
# |cuda
# Prefer GPU #1 (the device these docs were generated on), but fall back to GPU #0 on
# single-GPU machines, where `torch.cuda.set_device(1)` raises an invalid-device error.
gpu_idx = 1 if torch.cuda.device_count() > 1 else 0

# Skip device selection entirely when no CUDA device is visible
if torch.cuda.is_available():
    torch.cuda.set_device(gpu_idx)
    print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")

Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `cnn_dailymail` to demonstrate how to configure BLURR for summarization tasks.

In [None]:
# Load the "train" and "validation" splits of CNN/DailyMail (config "3.0.0");
# passing a list of splits returns a list with one Dataset per split, in that order.
raw_datasets = load_dataset("cnn_dailymail", "3.0.0", split=["train", "validation"])
raw_datasets

Reusing dataset cnn_dailymail (/home/wgilliam/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/2 [00:00<?, ?it/s]

[Dataset({
     features: ['article', 'highlights', 'id'],
     num_rows: 287113
 }),
 Dataset({
     features: ['article', 'highlights', 'id'],
     num_rows: 13368
 })]

In [None]:
# Peek at the first example of each split (train, then validation): its keys and its summary
for split_ds in raw_datasets:
    first_example = split_ds[0]
    print(first_example.keys())
    print(first_example["highlights"])

dict_keys(['article', 'highlights', 'id'])
Syrian official: Obama climbed to the top of the tree, "doesn't know how to get down"
Obama sends a letter to the heads of the House and Senate .
Obama to seek congressional approval on military action against Syria .
Aim is to determine whether CW were used, not by whom, says U.N. spokesman .
dict_keys(['article', 'highlights', 'id'])
Accident happens in Santa Ynez, California, near where Crosby lives .
The jogger suffered multiple fractures; his injuries are not believed to be life-threatening .


In [None]:
# Keep the notebook fast: shuffle each split with a fixed seed (reproducible) and keep
# only the first 1,000 training and 200 validation examples.
raw_train_ds = raw_datasets[0].shuffle(seed=42).select(range(1000))
raw_valid_ds = raw_datasets[1].shuffle(seed=42).select(range(200))

# Total number of examples we'll work with
len(raw_train_ds) + len(raw_valid_ds)

Loading cached shuffled indices for dataset at /home/wgilliam/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-516bef66c83f0d37.arrow
Loading cached shuffled indices for dataset at /home/wgilliam/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-e7e93c0052828394.arrow


1200

In [None]:
# Convert the Hugging Face Datasets to pandas DataFrames for the DataFrame-based examples below
raw_train_df = pd.DataFrame(raw_train_ds)
raw_valid_df = pd.DataFrame(raw_valid_ds)

raw_train_df.head(2)

Unnamed: 0,article,highlights,id
0,"A protester in Ferguson was arrested during a demonstration on Thursday night - and live-tweeted her entire experience. Brittany Ferrell, a nursing student at the University of Missouri-Saint Louis, was one of 13 people detained by officers in the conflicted Missouri city for 'noise disruption'. The detention has sparked an investigation by the American Civil Liberties Union as lawyers accuse officers of overstretching their powers. Scroll down for video . Arrested: This is Brittany Ferrell, the nursing student and protester who live-tweeted her arrest in Ferguson . Tweeting in handcuffs, ...","Brittany Ferrell, nursing student, was arrested with 12 people on Thursday .\nThey were calling on police take responsibility for Michael Brown's death .\nMs Ferrell tweeted as she was arrested, piled in a small wagon with 7 others .\nThey were accused of 'noise disruption', put in orange jumpsuits and cuffed .\nOfficers now being investigated, lawyers claim they 'overstretched powers'",1e01f238418c31d4e9093f6334e0232babeb639a
1,"A day after confirming it had lost the ability to display Instagram images, Twitter has rolled out its own library of retro filters for its Android and iPhone apps. The eight filters are the usual suspects we've come to expect from mobile photo apps, including desaturated, black and white and high contrast. There are auto-adjust and cropping options, as well as a helpful grid view that lets you see what each filter will look like at once. ""The latest versions of Twitter for iPhone and Twitter for Android introduce a few new ways to enhance the images you tweet,"" said Twitter senior designe...",Twitter has added photo filters to its Android and iOS mobile apps .\nThe addition will help Twitter compete against Facebook-owned Instagram .\nThis is the first time the social network has offered image editing tools .,6f89645bff243fe9ce2a0509e5ca01912abf0d10


In [None]:
# Checkpoint and model class used for the preprocessing examples
pretrained_model_name = "sshleifer/distilbart-cnn-6-6"
model_cls = AutoModelForSeq2SeqLM

# `get_hf_objects` returns the architecture name, config, tokenizer and model for the checkpoint
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)

('bart',
 transformers.models.bart.tokenization_bart_fast.BartTokenizerFast,
 transformers.models.bart.configuration_bart.BartConfig,
 transformers.models.bart.modeling_bart.BartForConditionalGeneration)

## Preprocessing

Starting with version 2.0, BLURR provides a preprocessing base class that can be used to build task specific pre-processed datasets from pandas DataFrames or Hugging Face Datasets

### `SummarizationPreprocessor` -

In [None]:
# |export
class SummarizationPreprocessor(Seq2SeqPreprocessor):
    """Preprocess summarization data held in pandas DataFrames or Hugging Face Datasets.

    On top of whatever `Seq2SeqPreprocessor` does, each processed example gets:

    * `{attr}_start_char_idx` / `{attr}_end_char_idx` columns, derived from the tokenizer's
      offset mappings, for both the input text attribute and the target text attribute
    * `proc_{attr}` columns holding the slice of the raw text delimited by those indices
      (e.g., the text that survives truncation to the configured max token lengths)

    Examples whose target text is shorter than `min_summary_char_length` characters are
    dropped when that option is set.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the text
        text_attr: str = "text",
        # The maximum length (# of tokens) allowed for inputs. Will default to the max length allowed
        # by the model if not provided
        max_input_tok_length: Optional[int] = None,
        # The attribute holding the summary
        target_text_attr: str = "summary",
        # The maximum length (# of tokens) allowed for targets
        max_target_tok_length: Optional[int] = None,
        # If not "None", any examples where "target_text_attr" is < "min_summary_char_length" will be removed
        min_summary_char_length: Optional[int] = None,
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied when calling the tokenizer
        # (the shared `{}` default is safe here: it is copied below, never mutated)
        tok_kwargs: dict = {},
    ):
        # we need to use the offset mappings to get back at the raw text from its tokenized representation
        tok_kwargs = {**tok_kwargs, "return_offsets_mapping": True}

        super().__init__(
            hf_tokenizer,
            batch_size,
            text_attr,
            max_input_tok_length,
            target_text_attr,
            max_target_tok_length,
            is_valid_attr,
            tok_kwargs,
        )

        # stored for callers; not read by any method defined in this class
        self.id_attr = id_attr
        self.min_summary_char_length = min_summary_char_length

    def process_df(
        self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None
    ):
        """Return a single processed DataFrame built from `training_df` (and `validation_df`).

        The base class prepares the DataFrame first; it is then processed in chunks of
        `self.batch_size` rows and the chunks are concatenated under a fresh 0..n-1 index.
        """
        df = super().process_df(training_df, validation_df)

        # process df in mini-batches, collecting the results so we concatenate only once
        # (`DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, and
        # growing a DataFrame inside a loop is quadratic)
        batch_dfs = [
            self._process_df_batch(batch_df)
            for _, batch_df in df.groupby(np.arange(len(df)) // self.batch_size)
        ]

        # `pd.concat` raises on an empty list; an empty input yields an empty DataFrame
        if not batch_dfs:
            return pd.DataFrame()

        return pd.concat(batch_dfs).reset_index(drop=True)

    def process_hf_dataset(
        self, training_ds: Dataset, validation_ds: Optional[Dataset] = None
    ):
        """Same as `process_df`, but takes and returns Hugging Face `Dataset` objects."""
        ds = super().process_hf_dataset(training_ds, validation_ds)
        # round-trip through pandas so the DataFrame logic above is reused as is
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_df_batch(self, batch_df):
        """Filter, tokenize and annotate one mini-batch; returns the augmented DataFrame."""
        # remove summaries that are too short if a min character length is specified
        if self.min_summary_char_length:
            batch_df = batch_df[
                batch_df[self.target_text_attr].str.len()
                >= self.min_summary_char_length
            ]

        # Re-number rows 0..n-1 so they line up with the freshly built index of the
        # char-index DataFrame in the `pd.concat(..., axis=1)` below. Done out-of-place
        # so we never mutate a filtered slice (or the group handed to us by `groupby`).
        batch_df = batch_df.reset_index(drop=True)

        # grab our inputs and targets batch encoding objects
        inputs, targets = self._tokenize_function(batch_df.to_dict(orient="list"))

        # add our processed text and target texts to the batched DataFrame
        for txt_attr, batch_enc in zip(
            [self.text_attr, self.target_text_attr], [inputs, targets]
        ):
            if txt_attr is None:
                break

            char_idxs = []
            for idx, offset_mapping in enumerate(batch_enc["offset_mapping"]):
                # NOTE(review): the sequence ids are iterated but not used as a filter, so
                # every token's offsets are kept, including those of any special/padding
                # tokens (presumably reported as (0, 0), which would pin the start index
                # to 0) - confirm this is intended.
                text_offsets = [
                    offset_mapping[i]
                    for i, _ in enumerate(batch_enc.sequence_ids(idx))
                ]
                # smallest start offset and largest end offset across the example's tokens
                char_idxs.append([min(text_offsets)[0], max(text_offsets)[1]])

            batch_df = pd.concat(
                [
                    batch_df,
                    pd.DataFrame(
                        char_idxs,
                        columns=[
                            f"{txt_attr}_start_char_idx",
                            f"{txt_attr}_end_char_idx",
                        ],
                    ),
                ],
                axis=1,
            )
            # NOTE(review): Hugging Face offset mappings appear to be end-exclusive, in which
            # case the `+ 1` keeps one extra trailing character of the raw text - confirm
            # this is intended before changing it, since it alters the `proc_*` values.
            batch_df.insert(
                0,
                f"proc_{txt_attr}",
                batch_df.apply(
                    lambda r: r[txt_attr][
                        r[f"{txt_attr}_start_char_idx"] : r[f"{txt_attr}_end_char_idx"]
                        + 1
                    ],
                    axis=1,
                ),
            )

        return batch_df

This class can be used for preprocessing summarization tasks. It adds `proc_{your_text_attr}` and `proc_{target_text_attr}` attributes containing your input and target texts as modified by tokenization (e.g., if you specify a `max_input_tok_length`, `proc_{your_text_attr}` may contain truncated text).

#### Using a `DataFrame`

In [None]:
# Truncate articles to 128 tokens and summaries to 30 tokens, and drop any example
# whose summary is shorter than 10 characters
preprocessor = SummarizationPreprocessor(
    hf_tokenizer,
    id_attr="id",
    text_attr="article",
    target_text_attr="highlights",
    max_input_tok_length=128,
    max_target_tok_length=30,
    min_summary_char_length=10,
)

# Passing both DataFrames yields one DataFrame with an `is_valid` column marking the split
proc_df = preprocessor.process_df(raw_train_df, raw_valid_df)
proc_df.head(2)

Unnamed: 0,proc_highlights,proc_article,article,highlights,id,is_valid,article_start_char_idx,article_end_char_idx,highlights_start_char_idx,highlights_end_char_idx
0,"Brittany Ferrell, nursing student, was arrested with 12 people on Thursday .\nThey were calling on police take responsibility for Michael Brown's death","A protester in Ferguson was arrested during a demonstration on Thursday night - and live-tweeted her entire experience. Brittany Ferrell, a nursing student at the University of Missouri-Saint Louis, was one of 13 people detained by officers in the conflicted Missouri city for 'noise disruption'. The detention has sparked an investigation by the American Civil Liberties Union as lawyers accuse officers of overstretching their powers. Scroll down for video . Arrested: This is Brittany Ferrell, the nursing student and protester who live-tweeted her arrest in Ferguson . Tweeting in handcuffs, ...","A protester in Ferguson was arrested during a demonstration on Thursday night - and live-tweeted her entire experience. Brittany Ferrell, a nursing student at the University of Missouri-Saint Louis, was one of 13 people detained by officers in the conflicted Missouri city for 'noise disruption'. The detention has sparked an investigation by the American Civil Liberties Union as lawyers accuse officers of overstretching their powers. Scroll down for video . Arrested: This is Brittany Ferrell, the nursing student and protester who live-tweeted her arrest in Ferguson . Tweeting in handcuffs, ...","Brittany Ferrell, nursing student, was arrested with 12 people on Thursday .\nThey were calling on police take responsibility for Michael Brown's death .\nMs Ferrell tweeted as she was arrested, piled in a small wagon with 7 others .\nThey were accused of 'noise disruption', put in orange jumpsuits and cuffed .\nOfficers now being investigated, lawyers claim they 'overstretched powers'",1e01f238418c31d4e9093f6334e0232babeb639a,False,0,648,0,150
1,Twitter has added photo filters to its Android and iOS mobile apps .\nThe addition will help Twitter compete against Facebook-owned Instagram .\nThis,"A day after confirming it had lost the ability to display Instagram images, Twitter has rolled out its own library of retro filters for its Android and iPhone apps. The eight filters are the usual suspects we've come to expect from mobile photo apps, including desaturated, black and white and high contrast. There are auto-adjust and cropping options, as well as a helpful grid view that lets you see what each filter will look like at once. ""The latest versions of Twitter for iPhone and Twitter for Android introduce a few new ways to enhance the images you tweet,"" said Twitter senior designe...","A day after confirming it had lost the ability to display Instagram images, Twitter has rolled out its own library of retro filters for its Android and iPhone apps. The eight filters are the usual suspects we've come to expect from mobile photo apps, including desaturated, black and white and high contrast. There are auto-adjust and cropping options, as well as a helpful grid view that lets you see what each filter will look like at once. ""The latest versions of Twitter for iPhone and Twitter for Android introduce a few new ways to enhance the images you tweet,"" said Twitter senior designe...",Twitter has added photo filters to its Android and iOS mobile apps .\nThe addition will help Twitter compete against Facebook-owned Instagram .\nThis is the first time the social network has offered image editing tools .,6f89645bff243fe9ce2a0509e5ca01912abf0d10,False,0,635,0,147


## Examples

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

In [None]:
# Checkpoint used for the batch-time tokenization example
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

# Architecture name, config, tokenizer and model for the checkpoint
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)

#####  Step 2: Create your `DataBlock`

Two lines!  Notice we pass in `noop` for our targets (e.g., our summaries) because the batch transform will take care of both our inputs and targets.

In [None]:
# The text block handles both inputs and targets at batch time, hence `noop` for the targets
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
dblock = DataBlock(
    blocks=blocks,
    # raw (unprocessed) article/summary columns
    get_x=ColReader("article"),
    get_y=ColReader("highlights"),
    # no `is_valid` column in the raw data, so split randomly
    splitter=RandomSplitter(),
)

In [None]:
# Uncomment to debug how the DataBlock processes the data:
# dblock.summary(raw_train_df)

##### Step 3: Build your `DataLoaders`

In [None]:
# Build the DataLoaders from the raw training DataFrame, 4 examples per batch
dls = dblock.dataloaders(raw_train_df, bs=4)

In [None]:
# Grab a single batch to inspect what the model will receive
b = dls.one_batch()

In [None]:
# The batch has two parts: a dict of inputs (which also carries the "labels") and the targets
len(b), b[0]["input_ids"].shape, b[0]["labels"].shape, b[1].shape

(2, torch.Size([4, 1024]), torch.Size([4, 152]), torch.Size([4, 152]))

In [None]:
# Compare the first example's "labels" (inside the inputs dict) with its target tensor
b[0]["labels"][0], b[1][0]

(tensor([    0,   270,  3905,  2950,   516,     9,   908,    25,    37,  5586,
           940,  2355,   375,   479, 50118,  9167,   703,    15,     5,   276,
           183,  1284,  2922, 11137,  4457,    30,   299,   940,  2355,  3504,
            11,   188,   469,   479,     2,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

In [None]:
# Show two decoded examples, truncating the displayed input and target text
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000, target_trunc_at=250)

Unnamed: 0,text,target
0,"<s> By. Daily Mail Reporter. PUBLISHED:. 08:16 EST, 14 May 2012. |. UPDATED:. 22:07 EST, 14 May 2012. Barack Obama's latest campaign gambit follows a familiar line of attack as it uses Mitt Romney's private equity past to cast the Republican candidate as greedy, job-killing corporate titan with little concern for the working class. The President is not the first of Mr Romney's opponents to try and paint the former governor of Massachusetts as a heartless uber-capitalist - even his Republican rivals used the same tactic during the heated primary battle. But Mr Obama's campaign seems to have been particularly unoriginal - as his attack ad is almost identical to one produced by Ted Kennedy for his Senate campaign against Mr Romney in 1994, featuring unemployed workers complaining about Bain Capital, the firm founded by Mr Romney. The timing of the Obama assault on private equity is also unfortunate, as on Monday night the President attended a fundraiser hosted by Democratic supporter Ham",President follows familiar line of attack as he highlights private equity past.\nAd released on the same day Obama attended fundraiser hosted by top private equity boss in New York.
1,"<s> (CNN) -- Voters in North Carolina, Indiana and Ohio on Tuesday kick off five straight weeks of primary contests that could give us a clearer indication of whether establishment Republicans have the upper hand against the tea party movement for control of the party. The results could back up recent tough talk from Senate GOP leader Mitch McConnell, who predicted big wins for incumbents facing primary challenges from the right, saying, ""I think we are going to crush them everywhere."" And they may have a major impact in determining whether Republicans retake the majority in the Senate. Since the birth of the tea party movement in 2009, primary challenges from the right have produced major headlines and headaches for the GOP and hurt the party's chances of winning back the Senate from Democrats in the past two election cycles. Candidates backed by the tea party movement and other grass-roots conservatives effectively cost the GOP five winnable Senate elections the last two cycles in Ne","Establishment Republicans are fighting back more strongly against challenges from the right.\nWith a number of vulnerable Democrats in the Senate, GOP thinks it can win control.\nNorth Carolina primary seen as a key test of establishment-vs.-tea party"


#### Using a preprocessed dataset

##### Step 1a: Get your Hugging Face objects.

In [None]:
# Checkpoint used for the preprocessed-dataset example
pretrained_model_name = "sshleifer/distilbart-cnn-6-6"
model_cls = AutoModelForSeq2SeqLM

# Architecture name, config, tokenizer and model for the checkpoint
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)

##### Step 1b. Preprocess dataset

In [None]:
# Same configuration as in the "Preprocessing" section: 128-token inputs, 30-token targets,
# and summaries shorter than 10 characters removed
preprocessor = SummarizationPreprocessor(
    hf_tokenizer,
    id_attr="id",
    text_attr="article",
    target_text_attr="highlights",
    max_input_tok_length=128,
    max_target_tok_length=30,
    min_summary_char_length=10,
)
# One DataFrame holding both splits, with `proc_article` / `proc_highlights` columns added
proc_df = preprocessor.process_df(raw_train_df, raw_valid_df)

##### Step 2: Create your `DataBlock`

In [None]:
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
dblock = DataBlock(
    blocks=blocks,
    # read the preprocessed text columns produced by `SummarizationPreprocessor`
    get_x=ColReader("proc_article"),
    get_y=ColReader("proc_highlights"),
    # split on the column created by the preprocessor
    # (presumably `ColSplitter` defaults to "is_valid" - verify against fastai)
    splitter=ColSplitter(),
)

##### Step 3: Build your `DataLoaders`

In [None]:
# Build the DataLoaders from the preprocessed DataFrame, 4 examples per batch
dls = dblock.dataloaders(proc_df, bs=4)

In [None]:
# Show two decoded examples
# NOTE(review): the earlier `show_batch` calls use `input_trunc_at` / `target_trunc_at`;
# confirm `trunc_at` is the intended keyword here.
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)

Unnamed: 0,text,target
0,"<s> Washington (CNN) -- A post-mortem Sunday of the mid-term elections provided little evidence that Democrats and Republicans will work together to address major issues such as deficit reduction any better than they have in recent years. Republicans interviewed on talk shows promised congressional investigations, an all-out effort to repeal health care reform, and steadfast opposition to any form of higher taxes. Democrats, meanwhile, said the losses they suffered in the congressional elections reflected voter dissatisfaction with lingering high unemployment in the slow recovery from economic recession, rather than an outright repudiation of their policies. Republicans won more than 60 seats formerly held by Democrats to take majority control of </s>","GOP targets health care reform, government spending.\n""Are we willing to work with him?"" Cantor says of President Obama.\nObama says"
1,"<s> Scientists believe they have discovered how to'switch off' autoimmune diseases, prompting hope the breakthrough could pave the way for a new treatment for multiple sclerosis. Researchers at the University of Bristol, who describe the work as an 'important breakthrough', say it could improve the lives of millions around the world. The study reveals how to stop cells from attacking healthy body tissue. The team discovered how cells convert from being aggressive to protecting against disease, rather than the body's immune system destroying its own tissue by mistake. Scientists at the University of Bristol have discovered how to'switch off' autoimmune diseases, which they hope will pave the way for new </s>",Team at Bristol University have described their work as a 'breakthrough'\nDiscovered a way to stop cells from attacking healthy body tissue.\n


## Tests

The purpose of the following tests is to ensure, as much as possible, that the core DataBlock code above works for the pretrained **summarization models** below.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained summarization models you are working with ... and if any of your pretrained summarization models fail, please submit a GitHub issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# Conditional-generation model classes known to BLURR, leaving out the "TF"-prefixed ones
[m for m in NLP.get_models(task="ConditionalGeneration") if not m.startswith("TF")]

['BartForConditionalGeneration',
 'BigBirdPegasusForConditionalGeneration',
 'BlenderbotForConditionalGeneration',
 'BlenderbotSmallForConditionalGeneration',
 'FSMTForConditionalGeneration',
 'LEDForConditionalGeneration',
 'M2M100ForConditionalGeneration',
 'MBartForConditionalGeneration',
 'MT5ForConditionalGeneration',
 'PegasusForConditionalGeneration',
 'ProphetNetForConditionalGeneration',
 'Speech2TextForConditionalGeneration',
 'T5ForConditionalGeneration',
 'XLMProphetNetForConditionalGeneration']

In [None]:
# Checkpoints exercised by the test loop below (one per architecture under test)
pretrained_model_names = [
    "facebook/bart-base",
    "facebook/blenderbot_small-90M",
    "allenai/led-base-16384",
    "google/mt5-small",
    "google/pegasus-cnn_dailymail",
    "t5-small",
    "microsoft/prophetnet-large-uncased",
    "microsoft/xprophetnet-large-wiki100-cased",  # XLMProphetNet
]

In [None]:
# Small CNN/DailyMail sample expected to sit next to this notebook; the tests below
# read its "article" and "highlights" columns
path = Path("./")
cnndm_df = pd.read_csv(path / "cnndm_sample.csv")

In [None]:
# |slow
# | output: false
model_cls = AutoModelForSeq2SeqLM
bsz = 2  # batch size
seq_sz = 256  # inputs are padded/truncated to this many tokens
trg_seq_sz = 40  # targets are padded/truncated to this many tokens

# One (architecture, tokenizer class, model name, status, error) tuple per checkpoint
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
        model_name, model_cls=model_cls
    )
    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    # pad everything to fixed lengths so the batch shapes can be asserted exactly below
    batch_tokenize_tfm = Seq2SeqBatchTokenizeTransform(
        hf_arch,
        hf_config,
        hf_tokenizer,
        hf_model,
        padding="max_length",
        max_length=seq_sz,
        max_target_length=trg_seq_sz,
    )

    # only T5 gets the "summarize: " task prefix; other architectures pass through unchanged
    def add_t5_prefix(inp):
        return f"summarize: {inp}" if (hf_arch == "t5") else inp

    blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=batch_tokenize_tfm), noop)
    dblock = DataBlock(
        blocks=blocks,
        get_x=Pipeline([ColReader("article"), add_t5_prefix]),
        get_y=ColReader("highlights"),
        splitter=RandomSplitter(),
    )

    try:
        # Building the DataLoaders and pulling a batch is part of what is under test, so it
        # lives inside the `try`: a failure is recorded as FAILED for this checkpoint
        # instead of aborting the run for all remaining checkpoints.
        dls = dblock.dataloaders(cnndm_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)
        test_eq(b[1].shape, torch.Size([bsz, trg_seq_sz]))

        # LED is excluded from the `add_prefix_space` check
        if hasattr(hf_tokenizer, "add_prefix_space") and hf_arch not in ["led"]:
            test_eq(hf_tokenizer.add_prefix_space, True)

        test_results.append(
            (hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", "")
        )
        dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000)

    except Exception as err:
        # keep going with the next checkpoint; the error is kept for later inspection
        test_results.append(
            (hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err)
        )

=== facebook/bart-base ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"<s> London (CNN) -- In 1948, a hospital outside London witnessed the birth of the Paralympic movement, as a Jewish doctor who had fled Nazi Germany sought to change the lives of patients with spinal injuries -- and inspire new hope in them through sport. The first ""Stoke Mandeville Games"" were organized in 1948 to coincide with the London Olympics, the second to be held in Britain. Named for the hospital in Buckinghamshire where Prof. Ludwig Guttmann's pioneering spinal injuries unit was based, the competitors in those initial Games -- 14 men and two women -- took part in a wheelchair archery contest. Many were military veterans injured on the battlefields of World War II. Just a year later, six teams competed at Stoke Mandeville -- with wheelchair netball, a forerunner of wheelchair basketball, being introduced -- as sport became a central part of a rehabilitation process that had been revolutionized by Guttmann. In 1956, a ""statement of intent"" was unveiled for the Games, which were","Paralympic movement was born in Stoke Mandeville, outside London, in 1948.\n2012 Games will be the biggest yet, with 4,200 competitors from 165 countries.\nIn"
1,"<s> (CNN Student News) -- March 23, 2010. Download PDF maps related to today's show:. • Haiti • China. Transcript. THIS IS A RUSH TRANSCRIPT. THIS COPY MAY NOT BE IN ITS FINAL FORM AND MAY BE UPDATED. CARL AZUZ, CNN STUDENT NEWS ANCHOR: Happy birthday, Roger Bannister -- first man to run the mile in less than four minutes. In more than twice that time, you'll be up to speed on today's headlines. I'm Carl Azuz. First Up: Health Care. AZUZ: First up, it's the biggest expansion of the United States health care system in more than forty years. And by a vote of 219-212, the U.S. House of Representatives passed a health care reform bill late Sunday night. This is the same bill that the Senate passed last December. This means that when President Obama signs it, it's law. The House also passed a set of changes to the Senate bill. We're gonna get back to that in just a second. But first, you know this health care issue has been controversial. We want you to check out some of the reaction to las",Find out what comes next after the passage of a health care reform bill.\nLearn about a proposal that would change how student loans are funded.\nFollow the steps that led to a showdown


=== facebook/blenderbot_small-90M ===

architecture:	blenderbot_small
tokenizer:	BlenderbotSmallTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"__unk__ cnn ) __unk__ - home to up to 10 percent of all known species, mexico is recognized as one of the most biodiverse regions on the planet. the twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. and there is a great deal to lose. in the united nations environment program __unk__ unep ) world conservation monitoring centre's list of megadiverse countries mexico ranks 11th. the list represents a group of 17 countries that harbor the majority of the earth's species and are therefore considered extremely biodiverse. from its coral reefs in the caribbean sea to its tropical jungles in chiapas and the yucatan peninsula and its de__unk__ and prairies in the north, mexico boasts an incredibly rich variety of flora and fauna. some 574 out of 717 reptile species found in mexico __unk__ - the most in any country __unk__ - can only be encountered within its borders. it is home to 502 types of ma","mexico hosts to up to 10 percent of all known species on earth. __newln__ it is home to 502 types of mammals, 290 bird species and 26 000 types of plants. __newln__ human development"
1,"__unk__ cnn ) __unk__ - if you can't stand the heat, get out of the kitchen __unk__ - or so the saying goes. but in the pressure cooker atmosphere of the white house, where world__unk__ changing decisions are made on a daily basis, the kitchen could well be the coolest room in the building. the chef feeding the most powerful man on the planet uses few words as she kneads, stirs, and whips; her style likened to a __unk__ baseball coach"" who calmly relays orders through hand signals. __unk__ i think'baseball coach'is a great analogy because everybody has their own positions, everybody has their own plays to make __unk__ "" says chef cristeta comerford, in an accent that wavers between her native philippines, and adopted home of chicago. __unk__ you look at everyone's strengths, everyone's abilities and knowledge. so hopefully towards the end of your meal you hit a home run because you're basically trying to rally your team to be the best at whatever they do __unk__ "" while foul__unk__ mou","exclusive interview with white house chef cristeta comerford. __newln__ coolheaded cook relays orders with hand signals, not outbursts. __newln__ filipino started off as salad girl"" working"


=== allenai/led-base-16384 ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"<s>(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 v","Mexico hosts to up to 10 percent of all known species on Earth.\nIt is home to 502 types of mammals, 290 bird species and 26,000 types of plants.\nHuman development"
1,"<s>London (CNN) -- In 1948, a hospital outside London witnessed the birth of the Paralympic movement, as a Jewish doctor who had fled Nazi Germany sought to change the lives of patients with spinal injuries -- and inspire new hope in them through sport. The first ""Stoke Mandeville Games"" were organized in 1948 to coincide with the London Olympics, the second to be held in Britain. Named for the hospital in Buckinghamshire where Prof. Ludwig Guttmann's pioneering spinal injuries unit was based, the competitors in those initial Games -- 14 men and two women -- took part in a wheelchair archery contest. Many were military veterans injured on the battlefields of World War II. Just a year later, six teams competed at Stoke Mandeville -- with wheelchair netball, a forerunner of wheelchair basketball, being introduced -- as sport became a central part of a rehabilitation process that had been revolutionized by Guttmann. In 1956, a ""statement of intent"" was unveiled for the Games, which were b","Paralympic movement was born in Stoke Mandeville, outside London, in 1948.\n2012 Games will be the biggest yet, with 4,200 competitors from 165 countries.\n"


=== google/mt5-small ===

architecture:	mt5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds,</s>","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of"
1,"Dan Condon believes in recycling. Just not when it comes to his hotel towels. Condon composts when he's at home in Boulder, Colorado. He eats local, organic and fair-trade food and drives a Honda CR-Z hybrid sports car. You might call him green. Except he's not so green when he travels for his work at an education nonprofit and stays in a hotel, which happens about 10 weeks per year. There, he uses a new towel every day. And don't try to bribe him with a drink or dessert coupon to get him to reuse the same one. ""I could care less about rewards for environmentally conscious behavior unless it's miles,"" Condon wrote in an e-mail. If hotels can't convince a hybrid-driving recycling enthusiast like Condon to go green while traveling, how can they possibly convince everyone else? 9 glamorous movie-star hotels. That's the problem of hotels trying to ""green"" your hotel stay. After guests have paid a</s>","Hotel guests who ""go green"" are happier with their stay. Increasing water and energy costs are pushing hotels to cut costs wherever they"


=== google/pegasus-cnn_dailymail ===

architecture:	pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Dan Condon believes in recycling. Just not when it comes to his hotel towels. Condon composts when he's at home in Boulder, Colorado. He eats local, organic and fair-trade food and drives a Honda CR-Z hybrid sports car. You might call him green. Except he's not so green when he travels for his work at an education nonprofit and stays in a hotel, which happens about 10 weeks per year. There, he uses a new towel every day. And don't try to bribe him with a drink or dessert coupon to get him to reuse the same one. ""I could care less about rewards for environmentally conscious behavior unless it's miles,"" Condon wrote in an e-mail. If hotels can't convince a hybrid-driving recycling enthusiast like Condon to go green while traveling, how can they possibly convince everyone else? 9 glamorous movie-star hotels. That's the problem of hotels trying to ""green"" your hotel stay. After guests have paid a pretty penny for a night at the inn, even the most environmental guests may want to treat them","Hotel guests who ""go green"" are happier with their stay. Increasing water and energy costs are pushing hotels to cut costs wherever they can. Many hotels find that guests don't mind"
1,"(CNN Student News) -- March 23, 2010. Download PDF maps related to today's show:. • Haiti • China. Transcript. THIS IS A RUSH TRANSCRIPT. THIS COPY MAY NOT BE IN ITS FINAL FORM AND MAY BE UPDATED. CARL AZUZ, CNN STUDENT NEWS ANCHOR: Happy birthday, Roger Bannister -- first man to run the mile in less than four minutes. In more than twice that time, you'll be up to speed on today's headlines. I'm Carl Azuz. First Up: Health Care. AZUZ: First up, it's the biggest expansion of the United States health care system in more than forty years. And by a vote of 219-212, the U.S. House of Representatives passed a health care reform bill late Sunday night. This is the same bill that the Senate passed last December. This means that when President Obama signs it, it's law. The House also passed a set of changes to the Senate bill. We're gonna get back to that in just a second. But first, you know this health care issue has been controversial. We want you to check out some of the reaction to last ni",Find out what comes next after the passage of a health care reform bill. Learn about a proposal that would change how student loans are funded. Follow the steps that led to a showdown between


=== t5-small ===

architecture:	t5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"summarize: (CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds,","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of plants. Human development"
1,"summarize: It's an international air disaster in a war zone -- a commercial flight with almost 300 people on board shot down in eastern Ukraine. As new details emerge, here is a look at basic questions about the tragedy:. Was the plane shot down? All evidence so far says yes. President Barack Obama declared Friday that a surface-to-air missile blasted the Malaysia Airlines Boeing 777 on Thursday over the Donetsk region of Ukraine near the Russian border. According to a senior American official, a U.S. radar system saw a surface-to-air missile system turn on and track an aircraft right before plane went down. A second system saw a heat signature, which would indicate a missile rising from the ground into the air at the time the airliner was hit, the official explained. Does anyone dispute that? Not at this point. While the Ukrainian government trades accusations of blame with pro-Russian rebels it is fighting in eastern Ukraine and Russia itself, no one has offered evidence of an altern","Donetsk rebel official: Plane shot down, but not by us. Malaysian official says the crash site's integrity has been compromised. President Obama says evidence points to"


=== microsoft/prophetnet-large-uncased ===





architecture:	prophetnet
tokenizer:	ProphetNetTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"( cnn ) - - home to up to 10 percent of all known species, mexico is recognized as one of the most biodiverse regions on the planet. the twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. and there is a great deal to lose. in the united nations environment program ( unep ) world conservation monitoring centre's list of megadiverse countries mexico ranks 11th. the list represents a group of 17 countries that harbor the majority of the earth's species and are therefore considered extremely biodiverse. from its coral reefs in the caribbean sea to its tropical jungles in chiapas and the yucatan peninsula and its deserts and prairies in the north, mexico boasts an incredibly rich variety of flora and fauna. some 574 out of 717 reptile species found in mexico - - the most in any country - - can only be encountered within its borders. it is home to 502 types of mammals, 290 species of birds, 1,","mexico hosts to up to 10 percent of all known species on earth. it is home to 502 types of mammals, 290 bird species and 26, 000 types of plants. human development and climate"
1,"some u. s. officials this year are expected to get smartphones capable of handling classified government documents over cellular networks, according to people involved in the project. the phones will run a modified version of google's android software, which is being developed as part of an initiative that spans multiple federal agencies and government contractors, these people said. the smartphones are first being deployed to u. s. soldiers, people familiar with the project said. later, federal agencies are expected to get phones for sending and receiving government cables while away from their offices, sources said. eventually, local governments and corporations could give workers phones with similar software. the army has been testing touchscreen devices at u. s. bases for nearly two years, said michael mccarthy, a director for the army's brigade modernization command, in a phone interview. about 40 phones were sent to fighters overseas a year ago, and the army plans to ship 50 more","government, military officials to get android phones capable of sharing secret documents. the phones will run a modified version of google's android software, sources say. contractor : google "" more cooperative "" than"


=== microsoft/xprophetnet-large-wiki100-cased ===

architecture:	xlm_prophetnet
tokenizer:	XLMProphetNetTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 vari","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of plants"
1,"The irony of Dianne Feinstein's center-stage moment this week was palpable. Just as she is leaving her coveted position as chair of the Senate Intelligence Committee, handing the reins to the incoming Republican majority â€” and as questions abound in her home state about whether she will retire -- she seemed to reach the pinnacle of her power, once again showing willingness to buck members of her party and wave off critics. Facing few political consequences in California where her Senate seat is firmly in her grasp, Feinstein, 81, was defiant as she took the Senate floor Tuesday to announce the release of a 525-page summary of the Intelligence Committee's report on the CIA's treatment of detainees â€”the product of five years of line-by-line scrutiny of some six million internal government documents. It was a scorching condemnation of the harsh interrogation techniques of the Bush era with the striking conclusion that those ""deeply flawed"" tactics had failed to produce information tha","Torture report was a powerful moment for Feinstein. While some in California clamor for new leaders, Feinstein mum on plans. Over her long career, Feinstein"


In [None]:
# |slow
# | echo: false
# Tabulate the collected test outcomes (one row per entry in `test_results`,
# which is built by an earlier cell not shown here) and render the summary.
summary_columns = ["arch", "tokenizer", "model_name", "result", "error"]
test_results_df = pd.DataFrame(data=test_results, columns=summary_columns)
display_df(test_results_df)

Unnamed: 0,arch,tokenizer,model_name,result,error
0,bart,BartTokenizerFast,facebook/bart-base,PASSED,
1,blenderbot_small,BlenderbotSmallTokenizer,facebook/blenderbot_small-90M,PASSED,
2,led,LEDTokenizerFast,allenai/led-base-16384,PASSED,
3,mt5,T5TokenizerFast,google/mt5-small,PASSED,
4,pegasus,PegasusTokenizerFast,google/pegasus-cnn_dailymail,PASSED,
5,t5,T5TokenizerFast,t5-small,PASSED,
6,prophetnet,ProphetNetTokenizer,microsoft/prophetnet-large-uncased,PASSED,
7,xlm_prophetnet,XLMProphetNetTokenizer,microsoft/xprophetnet-large-wiki100-cased,PASSED,


## Export -

In [None]:
# |hide
# Run nbdev's export step; the cell output lists each notebook it converted.
nbdev_export()

Converted 00_callbacks.ipynb.
Converted 00_utils.ipynb.
Converted 01_text-callbacks.ipynb.
Converted 01_text-utils.ipynb.
Converted 11_text-data-core.ipynb.
Converted 11_text-modeling-core.ipynb.
Converted 12_text-data-language-modeling.ipynb.
Converted 12_text-modeling-language-modeling.ipynb.
Converted 13_text-data-token-classification.ipynb.
Converted 13_text-modeling-token-classification.ipynb.
Converted 14_text-data-question-answering.ipynb.
Converted 14_text-modeling-question-answering.ipynb.
Converted 20_text-data-seq2seq-core.ipynb.
Converted 20_text-modeling-seq2seq-core.ipynb.
Converted 21_text-data-seq2seq-summarization.ipynb.
Converted 21_text-modeling-seq2seq-summarization.ipynb.
Converted 22_text-data-seq2seq-translation.ipynb.
Converted 22_text-modeling-seq2seq-translation.ipynb.
Converted 99a_text-examples-high-level-api.ipynb.
Converted 99b_text-examples-glue.ipynb.
Converted 99c_text-examples-glue-plain-pytorch.ipynb.
Converted 99d_text-examples-multilabel.ipynb.
Conv