In [None]:
# |default_exp text.data.language_modeling
# |default_cls_lvl 3

In [None]:
# |hide
%reload_ext autoreload
%autoreload 2

# Data

> The `text.data.language_modeling` module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for causal and masked language modeling tasks. This includes things like training BERT from scratch or fine-tuning a particular pre-trained LM on your own corpus.

In [None]:
# |export
import os, random, warnings
from abc import ABC, abstractmethod
from enum import Enum

from datasets import Dataset
from fastcore.all import *
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    PretrainedConfig,
    PreTrainedTokenizerBase,
    PreTrainedModel,
    BatchEncoding,
)
from transformers.utils import logging as hf_logging

from blurr.text.data.core import (
    TextInput,
    BatchTokenizeTransform,
    Preprocessor,
    first_blurr_tfm,
)
from blurr.text.utils import get_hf_objects

In [None]:
# | hide
import pdb

from fastai.data.block import DataBlock, ColReader, ColSplitter
from fastai.data.core import DataLoader, DataLoaders, TfmdDL
from fastai.data.external import untar_data, URLs
from fastai.data.transforms import *
from fastcore.test import *
from nbdev import nbdev_export
from nbdev.showdoc import show_doc

from blurr.utils import print_versions
from blurr.text.data.core import TextBlock
from blurr.text.utils import BlurrText

What we're running with at the time this documentation was generated:
torch: 1.9.0+cu102
fastai: 2.7.9
transformers: 4.21.2


In [None]:
# |export
# Silence noisy output: `simplefilter("ignore")` suppresses *every* Python warning (not only the ones
# raised by Hugging Face), and the transformers logger is restricted to error-level messages
warnings.simplefilter("ignore")
hf_logging.set_verbosity_error()

In [None]:
# | echo: false
# Helper object from blurr; it is not referenced again in the visible part of this notebook
NLP = BlurrText()

# Set the tokenizers parallelism flag to "false" for this process
# (presumably to avoid the tokenizers parallelism/fork warning when DataLoader workers are used - confirm)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Record the library versions this documentation was generated with
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")

In [None]:
# |hide
# |cuda
# Prefer the second GPU (index 1), as in the original run, but only when it exists: on a machine with
# fewer than two GPUs `set_device(1)` raises, so we keep the current default device there instead
if torch.cuda.device_count() > 1:
    torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")

Using GPU #1: GeForce GTX 1080 Ti


## Setup

For this example, we'll use the `WIKITEXT_TINY` dataset available from fastai.  In addition to using the `Datasets` library from Hugging Face, fastai provides a lot of smaller datasets that are really useful when experimenting and/or in the early development of your training/validation/inference coding.

In [None]:
# Fetch fastai's WIKITEXT_TINY dataset and list the files found at the returned path
wiki_path = untar_data(URLs.WIKITEXT_TINY)
wiki_path.ls()

(#2) [Path('/home/wgilliam/.fastai/data/wikitext-2/train.csv'),Path('/home/wgilliam/.fastai/data/wikitext-2/test.csv')]

In [None]:
# The CSVs are read without a header row (`header=None`), so the single text column is named with the
# integer 0 (this is why `text_attr=0` is used further down)
train_df = pd.read_csv(wiki_path / "train.csv", header=None)
valid_df = pd.read_csv(wiki_path / "test.csv", header=None)

# Number of documents in each split, followed by a peek at the training data
print(len(train_df), len(valid_df))
train_df.head()

615 47


Unnamed: 0,0
0,"\n = 2013 – 14 York City F.C. season = \n \n The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . \n Nigel Worthington , starting his first full season as York manager , made eight permanent summer signings . By the turn of the year York were only above the relegation z..."
1,"\n = Big Boy ( song ) = \n \n "" Big Boy "" <unk> "" I 'm A Big Boy Now "" was the first single ever recorded by the Jackson 5 , which was released by Steeltown Records in January 1968 . The group played instruments on many of their Steeltown compositions , including "" Big Boy "" . The song was neither a critical nor commercial success , but the Jackson family were delighted with the outcome nonetheless . \n The Jackson 5 would release a second single with Steeltown Records before moving to Motown Records . The group 's recordings at Steeltown Records were thought to be lost , but they were re..."
2,"\n = The Remix ( Lady Gaga album ) = \n \n The Remix is a remix album by American recording artist Lady Gaga . Released in Japan on March 3 , 2010 , it contains remixes of the songs from her first studio album , The Fame ( 2008 ) , and her third extended play , The Fame Monster ( 2009 ) . A revised version of the track list was prepared for release in additional markets , beginning with Mexico on May 3 , 2010 . A number of recording artists have produced the songs , including Pet Shop Boys , Passion Pit and The Sound of Arrows . The remixed versions feature both uptempo and <unk> composit..."
3,"\n = New Year 's Eve ( Up All Night ) = \n \n "" New Year 's Eve "" is the twelfth episode of the first season of the American comedy television series Up All Night . The episode originally aired on NBC in the United States on January 12 , 2012 . It was written by Erica <unk> and was directed by Beth McCarthy @-@ Miller . The episode also featured a guest appearance from Jason Lee as Chris and Reagan 's neighbor and Ava 's boyfriend , Kevin . \n During Reagan ( Christina Applegate ) and Chris 's ( Will <unk> ) first New Year 's Eve game night , Reagan 's competitiveness comes out causing Ch..."
4,"\n = Geopyxis carbonaria = \n \n Geopyxis carbonaria is a species of fungus in the genus Geopyxis , family <unk> . First described to science in 1805 , and given its current name in 1889 , the species is commonly known as the charcoal loving elf @-@ cup , dwarf <unk> cup , <unk> <unk> cup , or pixie cup . The small , <unk> @-@ shaped fruitbodies of the fungus are reddish @-@ brown with a whitish fringe and measure up to 2 cm ( 0 @.@ 8 in ) across . They have a short , tapered stalk . Fruitbodies are commonly found on soil where brush has recently been burned , sometimes in great numbers ...."


In [None]:
# Flag which split each row belongs to.
# NOTE: this adds the `is_valid` column to `train_df` / `valid_df` in place
train_df["is_valid"] = False
valid_df["is_valid"] = True

# Stack both splits into one frame; `pd.concat` is called without `ignore_index`, so each frame keeps
# its own index and index labels repeat in `df`
df = pd.concat([train_df, valid_df])
df.head()

Unnamed: 0,0,is_valid
0,"\n = 2013 – 14 York City F.C. season = \n \n The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . \n Nigel Worthington , starting his first full season as York manager , made eight permanent summer signings . By the turn of the year York were only above the relegation z...",False
1,"\n = Big Boy ( song ) = \n \n "" Big Boy "" <unk> "" I 'm A Big Boy Now "" was the first single ever recorded by the Jackson 5 , which was released by Steeltown Records in January 1968 . The group played instruments on many of their Steeltown compositions , including "" Big Boy "" . The song was neither a critical nor commercial success , but the Jackson family were delighted with the outcome nonetheless . \n The Jackson 5 would release a second single with Steeltown Records before moving to Motown Records . The group 's recordings at Steeltown Records were thought to be lost , but they were re...",False
2,"\n = The Remix ( Lady Gaga album ) = \n \n The Remix is a remix album by American recording artist Lady Gaga . Released in Japan on March 3 , 2010 , it contains remixes of the songs from her first studio album , The Fame ( 2008 ) , and her third extended play , The Fame Monster ( 2009 ) . A revised version of the track list was prepared for release in additional markets , beginning with Mexico on May 3 , 2010 . A number of recording artists have produced the songs , including Pet Shop Boys , Passion Pit and The Sound of Arrows . The remixed versions feature both uptempo and <unk> composit...",False
3,"\n = New Year 's Eve ( Up All Night ) = \n \n "" New Year 's Eve "" is the twelfth episode of the first season of the American comedy television series Up All Night . The episode originally aired on NBC in the United States on January 12 , 2012 . It was written by Erica <unk> and was directed by Beth McCarthy @-@ Miller . The episode also featured a guest appearance from Jason Lee as Chris and Reagan 's neighbor and Ava 's boyfriend , Kevin . \n During Reagan ( Christina Applegate ) and Chris 's ( Will <unk> ) first New Year 's Eve game night , Reagan 's competitiveness comes out causing Ch...",False
4,"\n = Geopyxis carbonaria = \n \n Geopyxis carbonaria is a species of fungus in the genus Geopyxis , family <unk> . First described to science in 1805 , and given its current name in 1889 , the species is commonly known as the charcoal loving elf @-@ cup , dwarf <unk> cup , <unk> <unk> cup , or pixie cup . The small , <unk> @-@ shaped fruitbodies of the fungus are reddish @-@ brown with a whitish fringe and measure up to 2 cm ( 0 @.@ 8 in ) across . They have a short , tapered stalk . Fruitbodies are commonly found on soil where brush has recently been burned , sometimes in great numbers ....",False


In [None]:
# Fetch the architecture name, config, tokenizer, and model for the pretrained "gpt2" checkpoint,
# using the causal LM auto-class
model_cls = AutoModelForCausalLM
hf_logging.set_verbosity_error()

pretrained_model_name = "gpt2"
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)

# some tokenizers like gpt and gpt2 do not have a pad token, so we add it here mainly for the purpose
# of setting the "labels" key appropriately (see below).
# NOTE: only the tokenizer attribute is assigned; the model's token embeddings are not resized here
if hf_tokenizer.pad_token is None:
    hf_tokenizer.pad_token = "[PAD]"

# Display the pad token and the id the tokenizer resolves it to
hf_tokenizer.pad_token, hf_tokenizer.pad_token_id

Using pad_token, but it is not set yet.


('[PAD]', 50256)

In [None]:
# special_tokens_dict = {'additional_special_tokens': ['[C1]']}
# num_added_toks = hf_tokenizer.add_special_tokens(special_tokens_dict)
# hf_model.resize_token_embeddings(len(hf_tokenizer))

## Preprocessing

Starting with version 2.0, `BLURR` provides a language preprocessing class that can be used to preprocess DataFrames or Hugging Face Datasets for both causal and masked language modeling tasks.

### `LMPreprocessor` -

In [None]:
# |export
class LMPreprocessor(Preprocessor):
    """Turn raw documents into fixed-size text chunks suitable for language modeling.

    The rows are processed in mini-batches of `batch_size` documents. Within a mini-batch the documents
    are joined with `sep_token`, tokenized once (without truncation), and the token stream is cut into
    consecutive chunks. Each chunk is mapped back to its source text through the tokenizer's offset
    mapping and emitted as one row of a `proc_{text_attr}` column. A trailing partial chunk in each
    mini-batch is dropped.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # How big each chunk of text should be (default: hf_tokenizer.model_max_length)
        chunk_size: Optional[int] = None,
        # How to indicate the beginning of a new text example (default: hf_tokenizer.eos_token, falling
        # back to hf_tokenizer.sep_token)
        sep_token: Optional[str] = None,
        # The attribute holding the text
        text_attr: str = "text",
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # A new dict is built here, so the (shared) default `tok_kwargs` is never mutated. Truncation is
        # forced off and offsets are forced on because chunking relies on the full token stream and on
        # mapping tokens back to character positions
        tok_kwargs = {**tok_kwargs, "truncation": False, "return_offsets_mapping": True}
        super().__init__(
            hf_tokenizer, batch_size, text_attr, None, is_valid_attr, tok_kwargs
        )

        self.chunk_size = chunk_size or hf_tokenizer.model_max_length
        self.sep_token = sep_token or hf_tokenizer.eos_token or hf_tokenizer.sep_token

    def process_df(
        self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None
    ):
        """Chunk the training (and optional validation) frame, then hand both to the parent's `process_df`.

        Returns whatever the parent implementation returns for the chunked frames.
        """
        final_train_df = self._process_in_batches(training_df)
        final_val_df = (
            self._process_in_batches(validation_df)
            if validation_df is not None
            else None
        )

        return super().process_df(final_train_df, final_val_df)

    def process_hf_dataset(
        self, training_ds: Dataset, validation_ds: Optional[Dataset] = None
    ):
        """Chunk Hugging Face `Dataset`s by round-tripping through a `DataFrame`.

        NOTE(review): `process_df` is called with a single frame here, so no validation frame is
        forwarded to it - confirm this is the intended handling of the train/validation split.
        """
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_in_batches(self, df: pd.DataFrame) -> pd.DataFrame:
        """Chunk `df` one mini-batch of `batch_size` rows at a time and stack the results.

        The per-batch frames are collected in a list and concatenated once. `DataFrame.append` (used
        previously) was removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
        """
        batch_ids = np.arange(len(df)) // self.batch_size
        frames = [
            self._process_df_batch(batch_df) for _, batch_df in df.groupby(batch_ids)
        ]

        # `pd.concat` rejects an empty list; an empty input yields an empty frame
        if not frames:
            return pd.DataFrame()

        return pd.concat(frames, ignore_index=True)

    def _process_df_batch(self, batch_df):
        """Join the texts of one mini-batch, tokenize them, and return the chunked texts as a frame.

        Raises `ValueError` if `chunk_size` leaves no room for at least one token per chunk.
        """
        # concatenate our texts (only the raw values are needed, so `batch_df` itself is left untouched)
        concat_txts = {
            self.text_attr: f" {self.sep_token} ".join(
                batch_df[self.text_attr].values.tolist()
            )
        }
        inputs = self._tokenize_function(concat_txts)

        # compute the length of our concatenated texts
        n_total_toks = len(inputs["input_ids"])

        # need to modify chunk_size to account for the # of special tokens the tokenizer adds
        # (one additional token of headroom is reserved as well)
        max_chunk_size = (
            self.chunk_size - self.hf_tokenizer.num_special_tokens_to_add() - 1
        )
        if max_chunk_size < 1:
            raise ValueError(
                f"chunk_size={self.chunk_size} is too small: it must leave room for at least one token "
                "after reserving space for the tokenizer's special tokens"
            )

        # drop the last chunk of text if it is smaller than chunk size (see the HF course, section 7 on training MLMs)
        total_length = (n_total_toks // max_chunk_size) * max_chunk_size

        # break our concatenated text into chunks of max_chunk_size tokens, recovering each chunk's text
        # from the character offsets of the tokens it contains
        full_text = concat_txts[self.text_attr]
        offsets = inputs["offset_mapping"]

        examples = []
        for i in range(0, total_length, max_chunk_size):
            chunked_offsets = offsets[i : i + max_chunk_size]
            examples.append(
                full_text[min(chunked_offsets)[0] : max(chunked_offsets)[1]]
            )

        return pd.DataFrame(examples, columns=[f"proc_{self.text_attr}"])

#### Using a `DataFrame`

In [None]:
# `text_attr=0` because the text column is the integer-named column 0; the chunked text is written to
# a `proc_0` column. With `chunk_size=128`, the tokens per chunk are 128 minus the tokenizer's
# special-token count, minus one
preprocessor = LMPreprocessor(hf_tokenizer, chunk_size=128, text_attr=0)
proc_df = preprocessor.process_df(train_df, valid_df)

# Number of chunked examples across both splits, followed by the first two of them
print(len(proc_df))
proc_df.head(2)

21330


Unnamed: 0,proc_0,is_valid
0,"\n = 2013 – 14 York City F.C. season = \n \n The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . \n Nigel Worthington , starting his first full season as York manager , made eight permanent summer signings . By the turn of the year York were only",False
1,"above the relegation zone on goal difference , before a 17 @-@ match unbeaten run saw the team finish in seventh @-@ place in the 24 @-@ team 2013 – 14 Football League Two . This meant York qualified for the play @-@ offs , and they were eliminated in the semi @-@ final by Fleetwood Town . York were knocked out of the 2013 – 14 FA Cup , Football League Cup and Football League Trophy in their opening round matches . \n 35 players made at least one appearance in nationally organised first @-@ team competition , and there were 12 different <unk> . Defender Ben Davies missed",False


#### Using a Hugging Face `Dataset`

In [None]:
# TODO

## LM Strategies

### `LMType` -

In [None]:
# |export
class LMType(Enum):
    """The kind of language modeling objective being trained (reported by each strategy's `get_lm_type`)"""

    # next-token prediction (returned by `CausalLMStrategy.get_lm_type`)
    CAUSAL = 1
    # masked token prediction (returned by `BertMLMStrategy.get_lm_type`)
    MASKED = 2

### `BaseLMStrategy` and implementations -

In [None]:
# |export
class BaseLMStrategy(ABC):
    """Abstract base for language modeling strategies (e.g., causal, BertMLM, WholeWordMLM, etc...).

    A strategy is built from a tokenizer plus the id to ignore in the loss, and knows how to turn
    tokenized samples into `(inputs, targets)` pairs for its objective.
    """

    def __init__(
        self, hf_tokenizer, ignore_token_id=CrossEntropyLossFlat().ignore_index
    ):
        # keep both constructor arguments as attributes of the same name
        store_attr(["hf_tokenizer", "ignore_token_id"])

    @classmethod
    @abstractmethod
    def get_lm_type(cls):
        """Report which language modeling objective the strategy implements."""

    @abstractmethod
    def build_inputs_targets(
        self,
        samples,
        include_labels: bool = True,
        inputs: Optional[BatchEncoding] = None,
    ):
        """Turn tokenized `samples` into the inputs/targets required by the strategy's objective."""

    # utility methods
    def _get_random_token_id(self, n):
        """Draw `n` distinct token ids at random from the tokenizer's vocabulary."""
        vocab_ids = list(self.hf_tokenizer.get_vocab().values())
        return random.sample(vocab_ids, n)

Here we include a `BaseLMStrategy` abstract class and several different strategies for building your inputs and targets for causal and masked language modeling tasks.  With CLMs, the objective is to simply predict the next token, but with MLMs, a variety of masking strategies may be used (e.g., mask random tokens, mask random words, mask spans, etc...).  A `BertMLMStrategy` is introduced below that follows the "mask random tokens" strategy used in the BERT paper, but users can create their own `BaseLMStrategy` subclass to support any masking strategy they desire.

#### `CausalLMStrategy` -

In [None]:
# |export
class CausalLMStrategy(BaseLMStrategy):
    """For next token prediction language modeling tasks, we want to use the `CausalLMStrategy` which makes the
    necessary changes in your inputs/targets for causal LMs
    """

    def build_inputs_targets(
        self,
        samples,
        include_labels: bool = True,
        inputs: Optional[BatchEncoding] = None,
    ):
        """Attach causal LM labels to each sample's inputs and build next-token targets.

        `samples` is a sequence whose items hold the inputs dict (with an "input_ids" tensor) at
        position 0. The inputs dict is updated in place. Returns a list of `(inputs, targ_ids)` tuples.
        `inputs` (the batch encoding) is accepted for interface compatibility and is not used here.
        """
        updated_samples = []
        for s in samples:
            if include_labels:
                # labels are a copy of the input ids with every position equal to the pad id replaced
                # by the ignore id (any token sharing the pad token's id is therefore ignored as well)
                s[0]["labels"] = s[0]["input_ids"].clone()
                s[0]["labels"][
                    s[0]["labels"] == self.hf_tokenizer.pad_token_id
                ] = self.ignore_token_id

            # targets are the input ids shifted left by one, with EOS appended to keep the length;
            # note that pad positions in the targets are not replaced with the ignore id
            targ_ids = torch.cat(
                [s[0]["input_ids"][1:], tensor([self.hf_tokenizer.eos_token_id])]
            )

            updated_samples.append((s[0], targ_ids))

        return updated_samples

    @classmethod
    def get_lm_type(cls) -> LMType:
        """This strategy implements the causal (next-token prediction) objective."""
        return LMType.CAUSAL

#### `BertMLMStrategy` - 

In [None]:
# |export
class BertMLMStrategy(BaseLMStrategy):
    """A masked language modeling strategy using the default BERT masking definition."""

    def __init__(
        self, hf_tokenizer, ignore_token_id=CrossEntropyLossFlat().ignore_index
    ):
        super().__init__(hf_tokenizer, ignore_token_id)

        # "do not mask" ids: every special token id except the mask token's own id.
        # `all_special_ids` is used rather than looking up `special_tokens_map` values in the vocab:
        # that map holds a *list* under "additional_special_tokens" (unhashable as a dict key), and a
        # special token that is missing from the vocab would raise a KeyError
        self.dnm_tok_ids = [
            tok_id
            for tok_id in hf_tokenizer.all_special_ids
            if tok_id != hf_tokenizer.mask_token_id
        ]

    def build_inputs_targets(
        self,
        samples,
        include_labels: bool = True,
        inputs: Optional[BatchEncoding] = None,
    ):
        """Apply BERT-style random token masking to each sample and build the matching targets.

        `samples` is a sequence whose items hold the inputs dict (with a 1-D "input_ids" tensor) at
        position 0. The inputs dict is updated in place: "input_ids" is replaced by the masked ids and,
        if `include_labels`, "labels" is set to a copy of the targets. Returns a list of
        `(inputs, targ_ids)` tuples where `targ_ids` holds the original id at every selected position
        and the ignore id everywhere else. `inputs` (the batch encoding) is not used here.
        """
        dnm_tok_ids = set(self.dnm_tok_ids)

        updated_samples = []
        for s in samples:
            orig_input_ids = s[0]["input_ids"]
            masked_input_ids = orig_input_ids.clone()

            # work with plain ints so membership checks are cheap (comparing 0-d tensors is not)
            tok_ids = orig_input_ids.tolist()
            idxs = torch.randperm(len(masked_input_ids)).tolist()

            # select up to 15% of the positions (the budget is computed from the full sequence length;
            # special tokens such as [CLS], [SEP], etc... are excluded from the candidates below)
            total_masked_idxs = int(len(idxs) * 0.15)

            # of the 15% for masking, replace 80% with [MASK] token, 10% with random token, and 10% with correct token
            n_mask_idxs = int(total_masked_idxs * 0.8)
            n_rnd_idxs = int(total_masked_idxs * 0.1)

            # we only want non-special tokens
            mask_idxs = [idx for idx in idxs if tok_ids[idx] not in dnm_tok_ids][
                :total_masked_idxs
            ]

            # replace 80% with [MASK]
            if n_mask_idxs > 0 and len(mask_idxs) >= n_mask_idxs:
                masked_input_ids[
                    mask_idxs[:n_mask_idxs]
                ] = self.hf_tokenizer.mask_token_id

            # replace 10% with a random token
            if n_rnd_idxs > 0 and len(mask_idxs) >= (n_mask_idxs + n_rnd_idxs):
                rnd_tok_ids = self._get_random_token_id(n_rnd_idxs)
                masked_input_ids[
                    mask_idxs[n_mask_idxs : (n_mask_idxs + n_rnd_idxs)]
                ] = tensor(rnd_tok_ids)

            # only the selected positions contribute to the loss; everything else (padding included)
            # is set to the ignore id
            lbls = torch.full_like(orig_input_ids, self.ignore_token_id)
            if mask_idxs:
                lbls[mask_idxs] = orig_input_ids[mask_idxs]

            # update the inputs to use our masked input_ids and labels; set targ_ids = labels (will use when
            # we calculate the loss ourselves)
            s[0]["input_ids"] = masked_input_ids
            targ_ids = lbls

            if include_labels:
                s[0]["labels"] = targ_ids.clone()

            updated_samples.append((s[0], targ_ids))

        return updated_samples

    @classmethod
    def get_lm_type(cls) -> LMType:
        """This strategy implements the masked language modeling objective."""
        return LMType.MASKED

Follows the masking strategy used in the [BERT paper](https://arxiv.org/abs/1810.04805) for random token masking

## Mid-level API

#### `CausalLMTextInput` and `MLMTextInput` -

In [None]:
# |export
class CausalLMTextInput(TextInput):
    """Input type for causal language modeling; gives `@typedispatch`ed methods (e.g., `show_batch`) a type to target"""


class MLMTextInput(TextInput):
    """Input type for masked language modeling; gives `@typedispatch`ed methods a type to target"""

Again, we define custom classes for the `@typedispatch`ed methods to use so that we can override how both causal and masked language modeling inputs/targets are assembled, as well as how the data is shown via methods like `show_batch` and `show_results`.

#### `LMBatchTokenizeTransform` -

In [None]:
# |export
class LMBatchTokenizeTransform(BatchTokenizeTransform):
    """Batch-time tokenization for language modeling.

    Tokenizes via `BatchTokenizeTransform` and then lets the configured language modeling strategy
    build the `labels` and targets for each sample.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # The language modeling strategy (or objective). This is the strategy *class*; it is instantiated
        # here with `hf_tokenizer` and `ignore_token_id`
        lm_strategy_cls: BaseLMStrategy = CausalLMStrategy,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs={},
        # Any keyword arguments you want included when generating text
        # See [How to generate text](https://huggingface.co/blog/how-to-generate)
        text_gen_kwargs={},
        # Keyword arguments to apply to `BatchTokenizeTransform`
        **kwargs
    ):
        super().__init__(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            include_labels=include_labels,
            ignore_token_id=ignore_token_id,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=is_split_into_words,
            tok_kwargs=tok_kwargs.copy(),
            **kwargs
        )

        self.lm_strategy = lm_strategy_cls(
            hf_tokenizer=hf_tokenizer, ignore_token_id=ignore_token_id
        )
        # copy `text_gen_kwargs` (as is done for `tok_kwargs` above) so that the shared default dict,
        # or a dict owned by the caller, is never aliased by this instance
        self.text_gen_kwargs, self.ignore_token_id = (
            dict(text_gen_kwargs),
            ignore_token_id,
        )

    def encodes(self, samples, return_batch_encoding=False):
        """Tokenize `samples` and let the LM strategy build their labels/targets.

        Returns the updated samples, or an `(updated_samples, batch_encoding)` tuple when
        `return_batch_encoding` is True.
        """
        # because no target is specified for language modeling, fastai will duplicate the inputs
        # (which is just the raw text)
        samples, inputs = super().encodes(samples, return_batch_encoding=True)

        # samples holding a single element have nothing to build targets from and are passed through
        # untouched; the `return_batch_encoding` contract below is honored for them as well
        if len(samples[0]) > 1:
            samples = self.lm_strategy.build_inputs_targets(
                samples, self.include_labels, inputs
            )

        if return_batch_encoding:
            return samples, inputs

        return samples

Our `LMBatchTokenizeTransform` allows us to update the input's `labels` and our targets appropriately given any language modeling task. 

The `labels` argument allows you to forgo calculating the loss yourself by letting Hugging Face return it for you should you choose to do that. Padding tokens are set to -100 by default (e.g., `CrossEntropyLossFlat().ignore_index`), which prevents cross entropy loss from considering token predictions for tokens it should ignore (i.e., the padding tokens). For more information on the meaning of this argument, see the [Hugging Face glossary entry for "Labels"](https://huggingface.co/transformers/glossary.html#labels)

## Examples

### Using the mid-level API

#### Causal LM

##### Step 1: Get your Hugging Face objects.

In [None]:
# Fetch the architecture name, config, tokenizer, and model for the pretrained "gpt2" checkpoint,
# using the causal LM auto-class (this repeats the cell from the "Setup" section)
model_cls = AutoModelForCausalLM
hf_logging.set_verbosity_error()

pretrained_model_name = "gpt2"
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)

# some tokenizers like gpt and gpt2 do not have a pad token, so we add it here mainly for the purpose
# of setting the "labels" key appropriately (see below).
# NOTE: only the tokenizer attribute is assigned; the model's token embeddings are not resized here
if hf_tokenizer.pad_token is None:
    hf_tokenizer.pad_token = "[PAD]"

Using pad_token, but it is not set yet.


#####  Step 2: Preprocess data

In [None]:
# `text_attr=0` because the text column is the integer-named column 0; the chunked text is written to
# a `proc_0` column, which is the column the `DataBlock` reads from in the next step
preprocessor = LMPreprocessor(hf_tokenizer, chunk_size=128, text_attr=0)
proc_df = preprocessor.process_df(train_df, valid_df)

# Number of chunked examples across both splits, followed by the first two of them
print(len(proc_df))
proc_df.head(2)

21330


Unnamed: 0,proc_0,is_valid
0,"\n = 2013 – 14 York City F.C. season = \n \n The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . \n Nigel Worthington , starting his first full season as York manager , made eight permanent summer signings . By the turn of the year York were only",False
1,"above the relegation zone on goal difference , before a 17 @-@ match unbeaten run saw the team finish in seventh @-@ place in the 24 @-@ team 2013 – 14 Football League Two . This meant York qualified for the play @-@ offs , and they were eliminated in the semi @-@ final by Fleetwood Town . York were knocked out of the 2013 – 14 FA Cup , Football League Cup and Football League Trophy in their opening round matches . \n 35 players made at least one appearance in nationally organised first @-@ team competition , and there were 12 different <unk> . Defender Ben Davies missed",False


#####  Step 3: Create your `DataBlock`

In [None]:
# The batch transform tokenizes the raw text and, via `CausalLMStrategy`, builds the labels/targets
batch_tok_tfm = LMBatchTokenizeTransform(
    hf_arch, hf_config, hf_tokenizer, hf_model, lm_strategy_cls=CausalLMStrategy
)

# Inputs are typed as `CausalLMTextInput` so the matching `@typedispatch`ed `show_batch` is used;
# the second block is `noop` since the targets are produced by the batch transform itself
blocks = (
    TextBlock(batch_tokenize_tfm=batch_tok_tfm, input_return_type=CausalLMTextInput),
    noop,
)

# Read the chunked text from "proc_0" and split train/validation on the "is_valid" column
dblock = DataBlock(
    blocks=blocks, get_x=ColReader("proc_0"), splitter=ColSplitter(col="is_valid")
)

##### Step 4: Build your `DataLoaders`

In [None]:
# Build the `DataLoaders` from the preprocessed frame using a batch size of 4
dls = dblock.dataloaders(proc_df, bs=4)

In [None]:
# Grab a single batch to inspect
b = dls.one_batch()

In [None]:
# Shapes of the batch's input ids, the labels stored alongside them, and the separate targets
b[0]["input_ids"].shape, b[0]["labels"].shape, b[1].shape

(torch.Size([4, 129]), torch.Size([4, 129]), torch.Size([4, 129]))

In [None]:
# Show the container/element types that make up the batch
explode_types(b)

{tuple: [dict, torch.Tensor]}

In [None]:
# |export
@typedispatch
def show_batch(
    # Dispatch on `CausalLMTextInput`: this implementation handles causal LM inputs
    x: CausalLMTextInput,
    # The targets
    y,
    # The raw inputs/targets to decode, one `(input_ids, target_ids)` pair per item
    samples,
    # The `DataLoaders`; needed to reach the Hugging Face tokenizer used for decoding
    dataloaders,
    # The `show_batch` context (returned unchanged)
    ctxs=None,
    # The maximum number of rows to display
    max_n=6,
    # Where to truncate each decoded string (None = no truncation)
    trunc_at=None,
    # Any other keyword arguments passed along to `show_batch`
    **kwargs
):
    # the first blurr transform carries both the tokenizer and the id that marks ignored targets
    blurr_tfm = first_blurr_tfm(dataloaders)
    tokenizer, ignore_id = blurr_tfm.hf_tokenizer, blurr_tfm.ignore_token_id

    rows = []
    for sample in samples:
        inp_ids, targ_ids = sample[0], sample[1]

        # inputs keep their special tokens; targets drop ignored positions and special tokens
        inp_txt = tokenizer.decode(inp_ids, skip_special_tokens=False)[:trunc_at]
        targ_txt = tokenizer.decode(
            targ_ids[targ_ids != ignore_id], skip_special_tokens=True
        )[:trunc_at]

        rows.append((inp_txt, targ_txt))

    results_df = pd.DataFrame(L(rows), columns=["text", "target"])
    display_df(results_df[:max_n])
    return ctxs

In [None]:
# Display 2 decoded samples, truncating each decoded string to 500 characters
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)

Unnamed: 0,text,target
0,"₹ 40 million ( US $ 590 @,@ 000 ) was spent solely on VFX for Magadheera. \n \n = = = <unk> = = = \n \n During the film's shoot at Ramoji Film City in late November 2008, a 500 square feet ( 46 m2 ) film can, containing two or three scenes, was discovered missing from Rainbow lab. The filmmakers filed a case at <unk> police station. Security personnel and film unit members searched, but failed to recover the reels. Rajamouli's unit said it was not important if the scenes from","�� 40 million ( US $ 590 @,@ 000 ) was spent solely on VFX for Magadheera. \n \n = = = <unk> = = = \n \n During the film's shoot at Ramoji Film City in late November 2008, a 500 square feet ( 46 m2 ) film can, containing two or three scenes, was discovered missing from Rainbow lab. The filmmakers filed a case at <unk> police station. Security personnel and film unit members searched, but failed to recover the reels. Rajamouli's unit said it was not important if the scenes from"
1,"ederation. Described as "" the most organized of the Northern Arabian tribes "", at the peak of its power in the 6th century BCE it controlled a large region between the Persian Gulf and the Sinai Peninsula. \n Biblical tradition holds that the Qedarites are named for Qedar, the second son of Ishmael, mentioned in the Bible's books of Genesis ( 25 : 13 ) and 1 Chronicles ( 1 : 29 ), where there are also frequent references to Qedar as a tribe. The earliest <unk> inscriptions discovered by archaeol","eration. Described as "" the most organized of the Northern Arabian tribes "", at the peak of its power in the 6th century BCE it controlled a large region between the Persian Gulf and the Sinai Peninsula. \n Biblical tradition holds that the Qedarites are named for Qedar, the second son of Ishmael, mentioned in the Bible's books of Genesis ( 25 : 13 ) and 1 Chronicles ( 1 : 29 ), where there are also frequent references to Qedar as a tribe. The earliest <unk> inscriptions discovered by archaeologi"


#### Masked LM

##### Step 1: Get your Hugging Face objects.

In [None]:
# Model class handed to `get_hf_objects` below for the masked LM example
model_cls = AutoModelForMaskedLM
hf_logging.set_verbosity_error()

# Fetch the architecture name, config, tokenizer, and model for the chosen checkpoint
pretrained_model_name = "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)

# some tokenizers like gpt and gpt2 do not have a pad token, so we add it here mainly for the purpose
# of setting the "labels" key appropriately (see below); this guard only acts when the tokenizer
# has no pad token defined
if hf_tokenizer.pad_token is None:
    hf_tokenizer.pad_token = "[PAD]"

##### Step 2: Preprocess data

In [None]:
# Configure the LM preprocessor with the tokenizer and a chunk size of 128
# NOTE(review): `chunk_size` is presumably measured in tokens and `text_attr=0` presumably
# selects the first column of the raw DataFrames — confirm against `LMPreprocessor`
preprocessor = LMPreprocessor(hf_tokenizer, chunk_size=128, text_attr=0)
# Process the training and validation DataFrames into a single DataFrame; the output below
# shows the resulting `proc_0` and `is_valid` columns
proc_df = preprocessor.process_df(train_df, valid_df)

# Report the number of rows produced and preview the first two
print(len(proc_df))
proc_df.head(2)

Using eos_token, but it is not set yet.


21227


Unnamed: 0,proc_0,is_valid
0,"\n = 2013 – 14 York City F.C. season = \n \n The 2013 – 14 season was the <unk> season of competitive association football and 77th season in the Football League played by York City Football Club , a professional football club based in York , North Yorkshire , England . Their 17th @-@ place finish in 2012 – 13 meant it was their second consecutive season in League Two . The season ran from 1 July 2013 to 30 June 2014 . \n Nigel Worthington , starting his first full season as York manager , made eight permanent summer signings . By the turn of the year York were only above the relegation z...",False
1,"goal difference , before a 17 @-@ match unbeaten run saw the team finish in seventh @-@ place in the 24 @-@ team 2013 – 14 Football League Two . This meant York qualified for the play @-@ offs , and they were eliminated in the semi @-@ final by Fleetwood Town . York were knocked out of the 2013 – 14 FA Cup , Football League Cup and Football League Trophy in their opening round matches . \n 35 players made at least one appearance in nationally organised first @-@ team competition , and there were 12 different <unk> . Defender Ben Davies missed only five of the fifty @",False


##### Step 3: Create your `DataBlock`

In [None]:
# Batch-level tokenization transform, configured with the BERT masked LM strategy class
batch_tok_tfm = LMBatchTokenizeTransform(
    hf_arch, hf_config, hf_tokenizer, hf_model, lm_strategy_cls=BertMLMStrategy
)

# Inputs are typed as `MLMTextInput`; the target block is `noop`.
# NOTE(review): the batch inspected further down still carries a target tensor, so the
# targets are presumably produced by the batch transform itself — confirm.
blocks = (
    TextBlock(batch_tokenize_tfm=batch_tok_tfm, input_return_type=MLMTextInput),
    noop,
)

# Read the text from the `proc_0` column and split train/validation on the `is_valid` column
dblock = DataBlock(
    blocks=blocks, get_x=ColReader("proc_0"), splitter=ColSplitter(col="is_valid")
)

##### Step 4: Build your `DataLoaders`

In [None]:
# Build the `DataLoaders` from the preprocessed DataFrame, using a batch size of 4
dls = dblock.dataloaders(proc_df, bs=4)

In [None]:
# Grab a single batch and compare the shapes of the input ids, the labels stored in the
# input dict (`b[0]`), and the target tensor (`b[1]`)
b = dls.one_batch()
b[0]["input_ids"].shape, b[0]["labels"].shape, b[1].shape

(torch.Size([4, 128]), torch.Size([4, 128]), torch.Size([4, 128]))

In [None]:
# First 20 positions of the first example: input ids, labels from the input dict, and the
# target tensor (in the output below the labels and targets are identical)
b[0]["input_ids"][0][:20], b[0]["labels"][0][:20], b[1][0][:20]

(tensor([ 101, 2003, 2098, 2340,  103, 2101,  103, 1026, 4895,  103, 1028, 1026,
          103, 2243, 1028, 1998, 1996, 2674, 2736, 1037], device='cuda:1'),
 tensor([-100, -100, -100, -100, 2781, -100, 2083, -100, -100, 2243, -100, -100,
         4895, -100, 1028, -100, -100, -100, -100, -100], device='cuda:1'),
 tensor([-100, -100, -100, -100, 2781, -100, 2083, -100, -100, 2243, -100, -100,
         4895, -100, 1028, -100, -100, -100, -100, -100], device='cuda:1'))

In [None]:
# Show the nested container types that make up the batch
explode_types(b)

{tuple: [dict, torch.Tensor]}

In [None]:
# |export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `MLMTextInput` typed inputs
    x: MLMTextInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation you want applied to your decoded inputs. Here it limits the number of
    # displayed tokens (not characters) per sample
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs,
):
    "Display masked LM samples as a (`text`, `target`) table, bracketing the positions that carry a label."
    # grab our tokenizer and ignore token to decode
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer
    ignore_token_id = tfm.ignore_token_id

    # grab our mask token id and do-not-mask token ids
    mask_token_id = hf_tokenizer.mask_token_id

    # Every special token id except the mask token. `all_special_ids` is used instead of looking
    # up `special_tokens_map.values()` in the vocab because that map may hold a *list* under
    # `additional_special_tokens`, which cannot be used as a dictionary key.
    # Kept as a list (not a set) so membership tests compare by value against tensor elements.
    dnm_tok_ids = [
        tok_id for tok_id in hf_tokenizer.all_special_ids if tok_id != mask_token_id
    ]

    res = L()
    for s in samples:
        # exclude dnm tokens from the input; a token is shown as-is when it is the mask token or
        # when its position has no label, otherwise it is wrapped as "[{token}]" (a labelled
        # position whose input token is not the mask token)
        inps = [
            hf_tokenizer.decode(tok_id)
            if (tok_id == mask_token_id or s[1][idx] == ignore_token_id)
            else f"[{hf_tokenizer.decode(tok_id)}]"
            for idx, tok_id in enumerate(s[0])
            if (tok_id not in dnm_tok_ids)
        ]

        # build the target view: positions without a label show the input token, labelled
        # positions show "[{actual_token}]"; dnm positions are dropped to stay aligned with `inps`
        trgs = [
            hf_tokenizer.decode(s[0][idx])
            if (tok_id == ignore_token_id)
            else f"[{hf_tokenizer.decode(tok_id)}]"
            for idx, tok_id in enumerate(s[1])
            if (s[0][idx] not in dnm_tok_ids)
        ]

        # truncate by token count, then join into a single display string
        res.append(
            (" ".join(inps[:trunc_at]).strip(), " ".join(trgs[:trunc_at]).strip())
        )

    display_df(pd.DataFrame(res, columns=["text", "target"])[:max_n])
    return ctxs

In [None]:
# Display 2 samples; for masked LM inputs `trunc_at` limits the number of tokens shown per sample
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=250)

Unnamed: 0,text,target
0,"[##las] ##ed 11 minutes [MASK] through < [un] ##k > [MASK] un ##k > and the match finished a 1 [MASK] 1 [MASK] . york [were] knocked out of the fa cup after losing 3 – 2 at home to bristol rovers in a first round [MASK] ; the [MASK] were 3 [–] 0 up by 50 @ - @ minutes before fletcher pulled two back [MASK] york with a penalty [MASK] a long @ - @ range strike . defender keith [MASK] , of cheltenham , and [MASK] nick pope [MASK] of charlton athletic , were signed on loan until january 2014 . they [MASK] played in york ' s first league [MASK] [MASK] four weeks , 2 – 1 [MASK] , to southend united","[is] ##ed 11 minutes [later] through < [un] ##k > [<] un ##k > and the match finished a 1 [–] 1 [draw] . york [were] knocked out of the fa cup after losing 3 – 2 at home to bristol rovers in a first round [replay] ; the [visitors] were 3 [–] 0 up by 50 @ - @ minutes before fletcher pulled two back [for] york with a penalty [and] a long @ - @ range strike . defender keith [lowe] , of cheltenham , and [goalkeeper] nick pope [,] of charlton athletic , were signed on loan until january 2014 . they [both] played in york ' s first league [defeat] [in] four weeks , 2 – 1 [away] , to southend united"
1,"[MASK] ##on . [MASK] 134 ##5 [MASK] iii was planning a [MASK] assault on france . a three [MASK] [MASK] [MASK] < un ##k > attack would have the earl of northampton attacking from brittany [,] the [MASK] himself from flanders , while gr [bubble] ##mont was dispatched [MASK] < un ##k > to prepare a campaign in the south [MASK] moving rapidly through the country , he confronted the [comte] d ’ [MASK] at < un ##k > [on] [MASK] october and there achieved a victory described as "" [MASK] greatest single achievement of lancaster ' s entire military career "" . the ransom from the prisoners has been [MASK] at £ 50 @ , @ 000 . [MASK] next year , while edward was","[ign] ##on . [in] 134 ##5 [edward] iii was planning a [major] assault on france . a three [@] [-] [@] < un ##k > attack would have the earl of northampton attacking from brittany [,] the [king] himself from flanders , while gr [##os] ##mont was dispatched [to] < un ##k > to prepare a campaign in the south [.] moving rapidly through the country , he confronted the [comte] d ’ [isle] at < un ##k > [on] [21] october and there achieved a victory described as "" [the] greatest single achievement of lancaster ' s entire military career "" . the ransom from the prisoners has been [estimated] at £ 50 @ , @ 000 . [the] next year , while edward was"


## Export -

In [None]:
# |hide
# Run nbdev's export step (presumably writes the `# |export` cells to the module named by
# `default_exp` at the top of this notebook — confirm against the nbdev docs)
nbdev_export()