In [None]:
# |default_exp text.data.question_answering
# |default_cls_lvl 3

In [None]:
# | nbflags skip_exec

In [None]:
# |hide
%reload_ext autoreload
%autoreload 2

# Data

> The `text.data.question_answering` module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for question/answering tasks. Question/answering tasks require two text inputs (the question and a context that includes the answer). The objective is to predict the start/end tokens of the answer in the context.

In [None]:
# |export
import ast, warnings
from functools import reduce

from datasets import Dataset
from fastcore.all import *
from fastai.data.block import DataBlock, CategoryBlock, ColReader, ColSplitter
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    AutoModelForQuestionAnswering,
    PretrainedConfig,
    PreTrainedTokenizerBase,
    PreTrainedModel,
)
from transformers.utils import logging as hf_logging

from blurr.text.data.core import (
    TextInput,
    BatchTokenizeTransform,
    Preprocessor,
    first_blurr_tfm,
)
from blurr.text.utils import get_hf_objects

In [None]:
# | hide
import pdb

from datasets import load_dataset
from fastai.data.core import DataLoader, DataLoaders, TfmdDL
from fastai.data.external import untar_data, URLs
from fastai.data.transforms import *
from fastcore.test import *
from nbdev import nbdev_export
from nbdev.showdoc import show_doc

from blurr.utils import print_versions
from blurr.text.data.core import TextBlock
from blurr.text.utils import BlurrText

In [None]:
# |export
# Silence warnings for anyone importing the exported module: `simplefilter("ignore")` suppresses
# every Python warning (not only the Hugging Face ones), and the transformers logger is limited
# to errors.
warnings.simplefilter("ignore")
hf_logging.set_verbosity_error()

In [None]:
# | echo: false
# Documentation-only setup (this cell is not exported)
NLP = BlurrText()

# NOTE(review): presumably set to avoid the tokenizers fork/parallelism warnings when used with
# multi-process DataLoaders - confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# truncate long text columns when DataFrames are displayed below
pd.set_option("display.max_colwidth", 100)

print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")

What we're running with at the time this documentation was generated:
torch: 1.9.0+cu102
fastai: 2.7.9
transformers: 4.21.2


In [None]:
# |hide
# |cuda
# Prefer GPU #1 (the device used when these docs were generated), but fall back to GPU #0 on
# single-GPU machines where a hard-coded index of 1 is an invalid device ordinal.
torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")

Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `squad_v2` to demonstrate how to configure your blurr code for extractive question answering

In [None]:
raw_datasets = load_dataset("squad_v2", split=["train[:1000]", "validation[:200]"])

Reusing dataset squad_v2 (/home/wgilliam/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
raw_train_ds, raw_valid_ds = raw_datasets[0], raw_datasets[1]

In [None]:
# Convert each split to a DataFrame and tag it with the `is_valid` flag used later for splitting
raw_train_df = pd.DataFrame(raw_train_ds).assign(is_valid=False)
raw_valid_df = pd.DataFrame(raw_valid_ds).assign(is_valid=True)

# one count per line: training examples first, then validation examples
print(len(raw_train_df), len(raw_valid_df), sep="\n")

1000
200


In [None]:
raw_train_df.head(2)

Unnamed: 0,id,title,context,question,answers,is_valid
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",False
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}",False


In [None]:
raw_valid_df.head(2)

Unnamed: 0,id,title,context,question,answers,is_valid
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10...,In what country is Normandy located?,"{'text': ['France', 'France', 'France', 'France'], 'answer_start': [159, 159, 159, 159]}",True
1,56ddde6b9a695914005b9629,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10...,When were the Normans in Normandy?,"{'text': ['10th and 11th centuries', 'in the 10th and 11th centuries', '10th and 11th centuries'...",True


In [None]:
# Stack the training and validation frames; `is_valid` records which split each row came from.
# The original row labels are kept, so index values repeat across the two splits.
squad_df = pd.concat([raw_train_df, raw_valid_df])
len(squad_df)

1200

In [None]:
# Keep only the first annotated answer per example. Examples with an empty answer list get a
# start index of 0 (an integer, so the column is not a mix of ints and strings) and an empty
# answer text.
squad_df["ans_start_char_idx"] = squad_df.answers.apply(
    lambda v: v["answer_start"][0] if len(v["answer_start"]) > 0 else 0
)
squad_df["answer_text"] = squad_df.answers.apply(
    lambda v: v["text"][0] if len(v["text"]) > 0 else ""
)
# the end index is exclusive: start index + number of characters in the answer
squad_df["ans_end_char_idx"] = (
    squad_df["ans_start_char_idx"].astype(int) + squad_df["answer_text"].str.len()
)

print(len(squad_df))
squad_df[squad_df.is_valid == True].head(2)

1200


Unnamed: 0,id,title,context,question,answers,is_valid,ans_start_char_idx,answer_text,ans_end_char_idx
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10...,In what country is Normandy located?,"{'text': ['France', 'France', 'France', 'France'], 'answer_start': [159, 159, 159, 159]}",True,159,France,165
1,56ddde6b9a695914005b9629,Normans,The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10...,When were the Normans in Normandy?,"{'text': ['10th and 11th centuries', 'in the 10th and 11th centuries', '10th and 11th centuries'...",True,94,10th and 11th centuries,117


In [None]:
# Load the architecture name, config, tokenizer, and model for extractive question answering
model_cls = AutoModelForQuestionAnswering
hf_logging.set_verbosity_error()

pretrained_model_name = "roberta-base"  #'xlm-mlm-ende-1024'
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=model_cls
)

# Targets are token positions, so the category vocab is simply the identity mapping
# {0: 0, 1: 1, ..., max_seq_len - 1: max_seq_len - 1}
max_seq_len = 128
vocab = dict(enumerate(range(max_seq_len)))

## Preprocessing

With version 2.0.0 of `BLURR`, we include a `Preprocessor` for question answering that can either truncate texts or else chunk long documents into multiple examples.

**Note**: Unlike other NLP tasks in BLURR, extractive question answering ***requires*** preprocessing in order to convert our raw start/end character indices into start/end token indices, unless your dataset already includes the latter.  Token indices, rather than character indices, will be used as our targets and are dependent on your tokenizer of choice.

In [None]:
# |export
class QAPreprocessor(Preprocessor):
    """Preprocessor for extractive question answering.

    Tokenizes question/context pairs (optionally chunking long contexts into several examples) and
    adds, for every resulting example, the processed question/context text, the start/end *token*
    indices of the answer, and an `is_answerable` flag.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The unique identifier in the dataset. If not specified and "return_overflowing_tokens": True, an "_id" attribute
        # will be added to your dataset with its value a unique, sequential integer, assigned to each record
        id_attr: Optional[str] = None,
        # The attribute in your dataset that contains the context (where the answer is included) (default: 'context')
        ctx_attr: str = "context",
        # The attribute in your dataset that contains the question being asked (default: 'question')
        qst_attr: str = "question",
        # The attribute in your dataset that contains the actual answer (default: 'answer_text')
        ans_attr: str = "answer_text",
        # The attribute in your dataset that contains the starting character index of the answer
        # (default: 'ans_start_char_idx')
        ans_start_char_idx: str = "ans_start_char_idx",
        # The attribute in your dataset that contains the ending character index of the answer
        # (default: 'ans_end_char_idx')
        ans_end_char_idx: str = "ans_end_char_idx",
        # The attribute that should be created if your are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied with calling the tokenizer (default: {"return_overflowing_tokens": True})
        tok_kwargs: dict = {"return_overflowing_tokens": True},
    ):
        # these values are mandatory (a new dict is built so the caller's/default dict is never mutated)
        tok_kwargs = {**tok_kwargs, "return_offsets_mapping": True}

        # shift the question and context appropriately based on the tokenizers padding strategy
        if hf_tokenizer.padding_side == "right":
            tok_kwargs["truncation"] = "only_second"
            text_attrs = [qst_attr, ctx_attr]
        else:
            tok_kwargs["truncation"] = "only_first"
            text_attrs = [ctx_attr, qst_attr]

        super().__init__(
            hf_tokenizer,
            batch_size,
            text_attr=text_attrs[0],
            text_pair_attr=text_attrs[1],
            tok_kwargs=tok_kwargs,
        )
        store_attr()

    def process_df(
        self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None
    ):
        """Return a new DataFrame with one row per tokenized example (or chunk), including the
        processed texts, answer token indices, and the `is_answerable` flag."""
        df = super().process_df(training_df, validation_df)

        # a unique Id for each example is required to properly score question answering results when chunking long
        # documents (e.g., return_overflowing_tokens=True)
        chunk_docs = self.tok_kwargs.get("return_overflowing_tokens", False)
        max_length = self.tok_kwargs.get(
            "max_length", self.hf_tokenizer.model_max_length
        )

        if self.id_attr is None and chunk_docs:
            df.insert(0, "_id", range(len(df)))

        # process df in mini-batches, collecting the results so they are concatenated only once
        # (`DataFrame.append` in a loop is quadratic and no longer exists in pandas >= 2.0)
        batch_dfs = [
            self._process_df_batch(batch_df, chunk_docs, max_length)
            for _, batch_df in df.groupby(np.arange(len(df)) // self.batch_size)
        ]

        # nothing to process (`pd.concat` raises on an empty list)
        if len(batch_dfs) == 0:
            return pd.DataFrame()

        return pd.concat(batch_dfs).reset_index(drop=True)

    def process_hf_dataset(
        self, training_ds: Dataset, validation_ds: Optional[Dataset] = None
    ):
        """Same as `process_df`, but takes and returns Hugging Face `Dataset` objects."""
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_df_batch(self, batch_df, is_chunked, max_length):
        """Tokenize one mini-batch and build a row for every resulting example/chunk."""
        # work on a re-indexed copy so positions line up with the tokenizer's sample mapping
        # (and so the groupby slice passed in is not modified in place)
        batch_df = batch_df.reset_index(drop=True)
        padding_right = self.hf_tokenizer.padding_side == "right"

        # grab our inputs
        inputs = self._tokenize_function(batch_df.to_dict(orient="list"))

        offset_mapping = inputs.pop("offset_mapping")
        # without chunking there is no overflow mapping: example i maps to row i
        sample_map = inputs.pop("overflow_to_sample_mapping", batch_df.index.tolist())

        proc_data = []
        for idx, offsets in enumerate(offset_mapping):
            example_idx = sample_map[idx]
            row = batch_df.iloc[example_idx]
            input_ids = inputs["input_ids"][idx]
            seq_ids = inputs.sequence_ids(idx)

            # get question and context associated with the inputs at "idx"
            # (special tokens have a sequence id of None and are therefore flagged as "question"
            # in the mask, but are excluded from both offset lists below)
            qst_mask = [i != 1 if padding_right else i != 0 for i in seq_ids]
            qst_offsets = [
                offsets[i]
                for i, is_qst in enumerate(qst_mask)
                if is_qst and seq_ids[i] is not None
            ]
            ctx_offsets = [
                offsets[i]
                for i, is_qst in enumerate(qst_mask)
                if not is_qst and seq_ids[i] is not None
            ]

            # slice the raw texts down to the character span actually covered by this example's tokens
            proc_qst = row[self.qst_attr][min(qst_offsets)[0] : max(qst_offsets)[1]]
            proc_ctx = row[self.ctx_attr][min(ctx_offsets)[0] : max(ctx_offsets)[1]]

            # if we are chunking long documents, we need to tokenize the chunked question, context in order to correctly assign
            # the start/end token indices, else we can just use the above since we are only looking at one example at a time
            if is_chunked:
                chunk_texts = (
                    (proc_qst, proc_ctx) if padding_right else (proc_ctx, proc_qst)
                )
                chunk_inputs = self.hf_tokenizer(chunk_texts[0], chunk_texts[1])
                chunk_input_ids = chunk_inputs["input_ids"]
                chunk_qst_mask = [
                    i != 1 if padding_right else i != 0
                    for i in chunk_inputs.sequence_ids()
                ]
            else:
                chunk_input_ids, chunk_qst_mask = input_ids, qst_mask

            # lastly we iterate over the input tokens to see if we can find the answer tokens within (ignoring the input tokens
            # belonging to the "question" as we only want to find answers that exist in the "context")
            tok_input = self.hf_tokenizer.convert_ids_to_tokens(chunk_input_ids)
            tok_ans = self.hf_tokenizer.tokenize(str(row[self.ans_attr]))
            n_ans_toks = len(tok_ans)

            # (0, 0) means "no answer found in this example"
            start_idx, end_idx = 0, 0

            # an empty answer (e.g., an unanswerable question) can never be located, so only
            # search when there is at least one answer token
            if n_ans_toks > 0:
                for tok_idx, (tok, is_qst_tok) in enumerate(
                    zip(tok_input, chunk_qst_mask)
                ):
                    if is_qst_tok or tok != tok_ans[0]:
                        continue
                    if tok_input[tok_idx : tok_idx + n_ans_toks] != tok_ans:
                        continue

                    # first full match: keep it only if it lies within the max_length, and stop
                    # searching either way
                    last_idx = tok_idx + n_ans_toks
                    if last_idx < max_length:
                        start_idx, end_idx = tok_idx, last_idx
                    break

            # update the original example information with the processed question, context, start/end "token" indices, and
            # a boolean indicating whether the question is answerable
            overflow_row = row.copy()
            overflow_row[f"proc_{self.qst_attr}"] = proc_qst
            overflow_row[f"proc_{self.ctx_attr}"] = proc_ctx
            overflow_row["ans_start_token_idx"] = start_idx
            overflow_row["ans_end_token_idx"] = end_idx
            overflow_row["is_answerable"] = start_idx != 0 and end_idx != 0

            proc_data.append(overflow_row)

        return pd.DataFrame(proc_data)

#### How to preprocess your data

In [None]:
# Chunk long contexts into multiple examples instead of truncating them.
# NOTE(review): per the Hugging Face tokenizer docs, "stride" is the number of tokens that
# consecutive chunks overlap by - confirm against your transformers version
tok_kwargs = {
    "return_overflowing_tokens": True,
    "max_length": max_seq_len,
    "stride": 64,
}
# `id_attr="id"` reuses the dataset's own identifier, so every chunk keeps the Id of its source example
preprocessor = QAPreprocessor(hf_tokenizer, id_attr="id", tok_kwargs=tok_kwargs)
proc_df = preprocessor.process_df(squad_df)

print(len(proc_df))
proc_df.head(4)

3560


Unnamed: 0,id,title,context,question,answers,is_valid,ans_start_char_idx,answer_text,ans_end_char_idx,proc_question,proc_context,ans_start_token_idx,ans_end_token_idx,is_answerable
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",False,269,in the late 1990s,286,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",84,89,True
1,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",False,269,in the late 1990s,286,When did Beyonce start becoming popular?,"in Houston, Texas, she performed in various singing and dancing competitions as a child, and ro...",32,37,True
2,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",False,269,in the late 1990s,286,When did Beyonce start becoming popular?,group became one of the world's best-selling girl groups of all time. Their hiatus saw the rele...,0,0,False
3,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}",False,207,singing and dancing,226,What areas did Beyonce compete in when she was growing up?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",77,80,True


In [None]:
# Spot-check a sample of the processed rows (seeded so the check is reproducible): for answerable
# rows the token span must decode back to the answer text, and unanswerable rows must point at (0, 0)
sampled_df = proc_df.sample(n=10, random_state=42)
for row_idx, test_example in sampled_df.iterrows():
    inputs = hf_tokenizer(test_example.proc_question, test_example.proc_context)

    if test_example.is_answerable:
        test_eq(
            test_example.answer_text,
            hf_tokenizer.decode(
                inputs["input_ids"][
                    test_example.ans_start_token_idx : test_example.ans_end_token_idx
                ]
            ).strip(),
        )
    else:
        test_eq(test_example.ans_start_token_idx, 0)
        test_eq(test_example.ans_end_token_idx, 0)

If you want to remove texts longer than your model will hold (and include only answerable contexts)

In [None]:
# No chunking here: each example is truncated to `max_seq_len` tokens
preprocessor = QAPreprocessor(
    hf_tokenizer,
    tok_kwargs={"return_overflowing_tokens": False, "max_length": max_seq_len},
)

# keep only the rows whose answer is found and ends inside the truncated sequence
proc2_df = preprocessor.process_df(squad_df).loc[
    lambda df: (df.ans_end_token_idx < max_seq_len) & (df.is_answerable)
]

print(len(proc2_df))
proc2_df.head(2)

763


Unnamed: 0,id,title,context,question,answers,is_valid,ans_start_char_idx,answer_text,ans_end_char_idx,proc_question,proc_context,ans_start_token_idx,ans_end_token_idx,is_answerable
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",False,269,in the late 1990s,286,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",84,89,True
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}",False,207,singing and dancing,226,What areas did Beyonce compete in when she was growing up?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",77,80,True


## Mid-level API

### `QATextInput` -

In [None]:
# |export
class QATextInput(TextInput):
    """`TextInput` subclass used as the input type for question answering, so that type-dispatched
    functions such as `show_batch` can be specialized for these inputs"""

    pass

### `QABatchTokenizeTransform` -

In [None]:
# |export
class QABatchTokenizeTransform(BatchTokenizeTransform):
    """Batch-time tokenization transform for extractive question answering.

    Extends `BatchTokenizeTransform` by always requesting the special tokens mask and offset mapping
    from the tokenizer, and by adding the `cls_index`, `p_mask`, and (when labels are included)
    `start_positions`/`end_positions` entries to each sample's inputs.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.b., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = "only_second",
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs.
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `BatchTokenizeTransform`
        **kwargs
    ):

        # "return_special_tokens_mask" and "return_offsets_mapping" are mandatory for extractive QA in blurr
        # (a new dict is built, so the caller's/default `tok_kwargs` is never mutated)
        tok_kwargs = {
            **tok_kwargs,
            **{"return_special_tokens_mask": True, "return_offsets_mapping": True},
        }

        super().__init__(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            include_labels=include_labels,
            ignore_token_id=ignore_token_id,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=is_split_into_words,
            tok_kwargs=tok_kwargs,
            **kwargs
        )

    def encodes(self, samples, return_batch_encoding=False):
        """Tokenize `samples` and add the question answering specific inputs to each one.

        Returns the updated samples, or an `(updated_samples, batch_encoding)` tuple when
        `return_batch_encoding` is True.
        """
        updated_samples, batch_encoding = super().encodes(
            samples, return_batch_encoding=True
        )

        for s in updated_samples:
            # cls_index: location of CLS token (used by xlnet and xlm); is a list.index(value) for pytorch tensor's
            s[0]["cls_index"] = (
                s[0]["input_ids"] == self.hf_tokenizer.cls_token_id
            ).nonzero()[0]
            # p_mask: mask with 1 for token than cannot be in the answer, else 0 (used by xlnet and xlm)
            s[0]["p_mask"] = s[0]["special_tokens_mask"]

            trgs = s[1:]
            if self.include_labels and len(trgs) > 0:
                # "labels" is added by the base class, but is not needed for extractive QA
                # (tolerate its absence rather than raising a KeyError)
                s[0].pop("labels", None)
                s[0]["start_positions"] = trgs[0]
                s[0]["end_positions"] = trgs[1]

        if return_batch_encoding:
            # return the encoding produced by the base class (the previous code referenced an
            # undefined name here and raised a NameError)
            return updated_samples, batch_encoding

        return updated_samples

## Examples

The following examples demonstrate several approaches to construct your `DataBlock` for question answering tasks using the mid-level API

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects

In [None]:
hf_logging.set_verbosity_error()

# Load the architecture name, config, tokenizer, and model for this example
pretrained_model_name = "distilroberta-base"
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=AutoModelForQuestionAnswering
)

# Targets are token positions, so the category vocab is the identity mapping over 0..max_seq_len-1
max_seq_len = 128
vocab = dict(enumerate(range(max_seq_len)))

#####  Step 2: Preprocess dataset

In [None]:
# Chunk long contexts into multiple examples of at most `max_seq_len` tokens.
# NOTE(review): per the Hugging Face tokenizer docs, "stride" is the number of tokens that
# consecutive chunks overlap by - confirm against your transformers version
tok_kwargs = {
    "return_overflowing_tokens": True,
    "max_length": max_seq_len,
    "stride": 24,
}
# the token indices depend on the tokenizer, so preprocessing is redone with the tokenizer loaded above
preprocessor = QAPreprocessor(hf_tokenizer, id_attr="id", tok_kwargs=tok_kwargs)
proc_df = preprocessor.process_df(squad_df)

proc_df.head(1)

Unnamed: 0,id,title,context,question,answers,is_valid,ans_start_char_idx,answer_text,ans_end_char_idx,proc_question,proc_context,ans_start_token_idx,ans_end_token_idx,is_answerable
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",False,269,in the late 1990s,286,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",84,89,True


#####  Step 3: Create your `DataBlock`

In [None]:
# Tokenization happens at batch time via this transform
before_batch_tfm = QABatchTokenizeTransform(
    hf_arch, hf_config, hf_tokenizer, hf_model, max_length=max_seq_len
)

# One text input (typed as `QATextInput` so the QA-specific `show_batch` is dispatched) and two
# categorical targets: the answer's start and end token indices
blocks = (
    TextBlock(batch_tokenize_tfm=before_batch_tfm, input_return_type=QATextInput),
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab),
)

dblock = DataBlock(
    blocks=blocks,
    # the preprocessed (possibly chunked) question and context are passed as a text pair
    get_x=lambda x: (x.proc_question, x.proc_context),
    get_y=[ColReader("ans_start_token_idx"), ColReader("ans_end_token_idx")],
    # NOTE(review): `ColSplitter()` is called with its defaults - presumably it splits on the
    # `is_valid` column; confirm
    splitter=ColSplitter(),
    # only the first block is an input; the remaining two are targets
    n_inp=1,
)

##### Step 4: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_df, bs=4)
len(dls.train), len(dls.valid)

(590, 94)

In [None]:
b = dls.one_batch()
len(b), len(b[0]), len(b[1]), len(b[2])

(3, 8, 4, 4)

In [None]:
b[0]["input_ids"].shape, b[0]["attention_mask"].shape, b[1].shape, b[2].shape

(torch.Size([4, 128]), torch.Size([4, 128]), torch.Size([4]), torch.Size([4]))

In [None]:
b[0]["start_positions"], b[0]["end_positions"]

(TensorCategory([ 0,  0, 85,  0], device='cuda:1'),
 TensorCategory([ 0,  0, 87,  0], device='cuda:1'))

In [None]:
# |export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `QATextInput` typed inputs
    x: QATextInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs
):
    """Display a table with the decoded text, whether an answer was found, the start/end token
    indices, and the decoded answer for up to `max_n` items of a question answering batch."""
    # the tokenizer lives on the blurr batch transform attached to the `DataLoaders`
    tokenizer = first_blurr_tfm(
        dataloaders, tfms=[QABatchTokenizeTransform]
    ).hf_tokenizer

    rows = L()
    for sample, ids, start_tok, end_tok in zip(samples, x, *y):
        start_pos, end_pos = start_tok.item(), end_tok.item()

        decoded_text = tokenizer.decode(sample[0], skip_special_tokens=True)
        decoded_text = decoded_text[:trunc_at]

        # a (0, 0) span (or any span touching index 0) marks an example without an answer
        has_answer = not (start_pos == 0 or end_pos == 0)
        answer = tokenizer.decode(ids[start_tok:end_tok], skip_special_tokens=True)

        rows.append((decoded_text, has_answer, (start_pos, end_pos), answer))

    batch_df = pd.DataFrame(rows, columns=["text", "found", "start/end", "answer"])
    display_df(batch_df[:max_n])
    return ctxs

The `show_batch` method above allows us to create a more interpretable view of our question/answer data.

In [None]:
dls.show_batch(dataloaders=dls, max_n=4)

Unnamed: 0,text,found,start/end,answer
0,"Beyonce has been awarded how many Grammy nominations? ously in Love, B'Day and I Am... Sasha Fierce have all won Best Contemporary R&B Album. Beyoncé set the record for the most Grammy awards won by a female artist in one night in 2010 when she won six awards, breaking the tie she previously held with Alicia Keys, Norah Jones, Alison Krauss, and Amy Winehouse, with Adele equaling this in 2012. Following her role in Dreamgirls she was nominated for Best Original Song for ""Listen"" and Best Actress at the Golden Globe Awards, and Outstanding Actress",False,"(0, 0)",
1,"Who did Beyonce record the lead single with in the movie ""The Fighting Temptations""? cé starred opposite Cuba Gooding, Jr., in the musical comedy The Fighting Temptations as Lilly, a single mother whom Gooding's character falls in love with. The film received mixed reviews from critics but grossed $30 million in the U.S. Beyoncé released ""Fighting Temptation"" as the lead single from the film's soundtrack album, with Missy Elliott, MC Lyte, and Free which was also used to promote the film. Another of Beyoncé's contributions to the soundtrack, """,True,"(97, 100)",Missy Elliott
2,"What did Bryan Lessard name after Beyoncé?'s ""Say My Name"" and discussed his relationship with women. In January 2012, research scientist Bryan Lessard named Scaptia beyonceae, a species of horse fly found in Northern Queensland, Australia after Beyoncé due to the fly's unique golden hairs on its abdomen. In July 2014, a Beyoncé exhibit was introduced into the ""Legends of Rock"" section of the Rock and Roll Hall of Fame. The black leotard from the ""Single Ladies"" video and her outfit from the Super Bowl half time performance are among several pieces housed at",True,"(45, 50)",a species of horse fly
3,"How many awards did Beyonce take home with her at the 57th Grammy Awards? ogue magazine was unveiled online, Beyoncé as the cover star, becoming the first African-American artist and third African-American woman in general to cover the September issue. She headlined the 2015 Made in America festival in early September and also the Global Citizen Festival later that month. Beyoncé made an uncredited featured appearance on the track ""Hymn for the Weekend"" by British rock band Coldplay, on their seventh studio album A Head Full of Dreams (2015), which saw release in December. On January 7, 2016,",False,"(0, 0)",


#### Passing extra information

As mentioned in the `data.core` module documentation, BLURR now also allows you to pass extra information alongside your inputs in the form of a dictionary.  If we are splitting long documents into chunks but want to predict/aggregate by example (rather than by chunk), we'll need to include a unique identifier for each example. When we look at the `modeling.question_answer` module, we'll see how the question answering bits can use such an Id for this purpose.


##### Step 1: Get your Hugging Face objects

In [None]:
hf_logging.set_verbosity_error()

# Load the architecture name, config, tokenizer, and model for this example
pretrained_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name, model_cls=AutoModelForQuestionAnswering
)

# Targets are token positions, so the category vocab is the identity mapping over 0..max_seq_len-1
max_seq_len = 128
vocab = dict(enumerate(range(max_seq_len)))

#####  Step 2: Preprocess dataset

In [None]:
# Chunk long contexts; `id_attr="id"` reuses the dataset's identifier so that each chunk keeps
# the Id of the example it came from
preprocessor = QAPreprocessor(
    hf_tokenizer,
    id_attr="id",
    tok_kwargs={
        "return_overflowing_tokens": True,
        "max_length": max_seq_len,
        "stride": 64,
    },
)

# the token indices depend on the tokenizer, so preprocessing is redone with the tokenizer loaded above
proc_df = preprocessor.process_df(squad_df)
proc_df.head(1)

Unnamed: 0,id,title,context,question,answers,is_valid,ans_start_char_idx,answer_text,ans_end_char_idx,proc_question,proc_context,ans_start_token_idx,ans_end_token_idx,is_answerable
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",False,269,in the late 1990s,286,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",75,79,True


##### Step 3: Create your `DataBlock`

In [None]:
# Tokenization happens at batch time via this transform
before_batch_tfm = QABatchTokenizeTransform(
    hf_arch, hf_config, hf_tokenizer, hf_model, max_length=max_seq_len
)

# One text input (typed as `QATextInput` so the QA-specific `show_batch` is dispatched) and two
# categorical targets: the answer's start and end token indices
blocks = (
    TextBlock(batch_tokenize_tfm=before_batch_tfm, input_return_type=QATextInput),
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab),
)

# Since the data is preprocessed, we return a dictionary with a "text" key holding our question and
# context; any other key (here "id") is extra information that is passed along with the inputs
def get_x(item):
    """Return the (question, context) pair under "text" plus the example's "id"."""
    return {"text": (item.proc_question, item.proc_context), "id": item.id}


dblock = DataBlock(
    blocks=blocks,
    get_x=get_x,
    get_y=[ItemGetter("ans_start_token_idx"), ItemGetter("ans_end_token_idx")],
    # NOTE(review): `ColSplitter()` is called with its defaults - presumably it splits on the
    # `is_valid` column; confirm
    splitter=ColSplitter(),
    # only the first block is an input; the remaining two are targets
    n_inp=1,
)

##### Step 4: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_df, bs=4)
len(dls.train), len(dls.valid)

(733, 108)

In [None]:
b = dls.one_batch()
len(b), len(b[0]), len(b[1]), len(b[2])

(3, 10, 4, 4)

In [None]:
b[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'offset_mapping', 'id', 'cls_index', 'p_mask', 'start_positions', 'end_positions'])

In [None]:
b[0]["input_ids"].shape, b[0]["attention_mask"].shape, b[1].shape, b[2].shape

(torch.Size([4, 128]), torch.Size([4, 128]), torch.Size([4]), torch.Size([4]))

We can see that any additional data is now located in the inputs dictionary

In [None]:
b[0]["id"]

['56be8bab3aeaaa14008c90a1',
 '56d4cde92ccc5a1400d83239',
 '56bea8463aeaaa14008c91ac',
 '56becc903aeaaa14008c94a1']

In [None]:
dls.show_batch(dataloaders=dls, max_n=4)

Unnamed: 0,text,found,start/end,answer
0,"who was the first record label to give the girls a record deal? ped and danced on the talent show circuit in houston. after seeing the group, r & b producer arne frager brought them to his northern california studio and placed them in star search, the largest talent show on national tv at the time. girl's tyme failed to win, and beyonce later said the song they performed was not good. in 1995 beyonce's father resigned from his job to manage the group. the move reduced beyonce's family's income by half, and her parents were forced to move into separated apartments. mathew",False,"(0, 0)",
1,"who said that chopin set out "" into the wide world, with no very clearly defined aim, forever? "" cki, "" into the wide world, with no very clearly defined aim, forever. "" with woyciechowski, he headed for austria, intending to go on to italy. later that month, in warsaw, the november 1830 uprising broke out, and woyciechowski returned to poland to enlist. chopin, now alone in vienna, was nostalgic for his homeland, and wrote to a friend, "" i curse the moment of my departure. "" when in september 1831 he learned, while",False,"(0, 0)",
2,"what short poem spoke of frederic's popularity as a child? yk and his family moved to a building, which still survives, adjacent to the kazimierz palace. during this period, fryderyk was sometimes invited to the belweder palace as playmate to the son of the ruler of russian poland, grand duke constantine ; he played the piano for the duke and composed a march for him. julian ursyn niemcewicz, in his dramatic eclogue, "" nasze przebiegi "" ( "" our discourses "", 1818 ), attested to "" little chopin's "" popularity",True,"(101, 107)",nasze przebiegi
3,"which national event caused beyonce to produce "" demand a plan? "" in a campaign video released on 15 may 2013, where she, along with cameron diaz, john legend and kylie minogue, described inspiration from their mothers, while a number of other artists celebrated personal inspiration from other women, leading to a call for submission of photos of women of viewers'inspiration from which a selection was shown at the concert. beyonce said about her mother tina knowles that her gift was "" finding the best qualities in every human being. "" with help of the crowdfunding platform catapult, visitors of the concert could choose between several projects promoting education",False,"(0, 0)",


## Export -

In [None]:
# |hide
nbdev_export()