## Install and Load libraries

In [1]:
! pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 7.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 72.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 77.2 MB/s 
[?25hCollecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.0-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 79.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.5 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyyaml>=5.1
  Downloading P

In [2]:
from datasets import load_dataset, load_metric

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import transformers

print(transformers.__version__)

4.21.0


In [5]:
from datasets import load_dataset
from datasets import Dataset
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from google.colab import files
import re

In [56]:
# This flag is the difference between SQUAD v1 or 2 (if you're using another dataset, it indicates if impossible
# answers are allowed or not).
squad_v2 = True
model_checkpoint = model_name = "drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-45500"
batch_size = 16

In [6]:
!ls drive/MyDrive/w266/data/TriviaQA

test.json  train.json  validation.json


In [15]:
train_path = "drive/MyDrive/w266/data/TriviaQA/train.json"
train_data = load_dataset('json', data_files=train_path)
train = train_data['train']

test_path = "drive/MyDrive/w266/data/TriviaQA/test.json"
test_data = load_dataset('json', data_files=test_path)
test = test_data['train']

Using custom data configuration default-060d5ca45111dd04
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-060d5ca45111dd04/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-7d16357ee71199de
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-7d16357ee71199de/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
train

Dataset({
    features: ['question', 'id', 'context', 'answers'],
    num_rows: 19965
})

In [None]:
test

Dataset({
    features: ['question', 'id', 'context', 'answers'],
    num_rows: 2496
})

In [None]:
#dataset_train_test

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 117287
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 13032
    })
})

In [None]:
# train = dataset_train_val['train']
# validation = dataset_train_val['validation']
# test = dataset_train_test['test']

In [None]:
train.save_to_disk('drive/MyDrive/w266/data/SQuAD/train_master.json')
validation.save_to_disk('drive/MyDrive/w266/data/SQuAD/validation_master.json')
test.save_to_disk('drive/MyDrive/w266/data/SQuAD/test_master.json')

Flattening the indices:   0%|          | 0/14 [00:00<?, ?ba/s]

## Preprocessing the training data

We will use TokenizerFast and leverage a few properties for QA task, such as return offset mapping etc. 

In [16]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [18]:
# tokenizer("What is your name?", "My name is Sylvain.")

In [19]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [20]:
pad_on_right = tokenizer.padding_side == "right"

In [21]:
def prepare_train_features(examples):
    # remove white space in question section 
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # maps feature back to sample
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # map from token to character position in the original context. 
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [22]:
tokenized_train = train.map(prepare_train_features, batched=True, remove_columns=train.column_names)
tokenized_test = test.map(prepare_train_features, batched=True, remove_columns=test.column_names)

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

## Fine-tuning the model

In [64]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

loading configuration file drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-45500/config.json
Model config DistilBertConfig {
  "_name_or_path": "drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-45500",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.0",
  "vocab_size": 28996
}

loading weights file drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-45500/pytorch_model.bin
All model checkpoint weights were used when initializing D

In [69]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    output_dir = 'drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained',
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01
)

    # push_to_hub=True,

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [48]:
from transformers import default_data_collator

data_collator = default_data_collator

In [70]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


    # eval_dataset=tokenized_datasets["validation"],

In [71]:
trainer.train(resume_from_checkpoint=True)

Loading model from drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-45500.
***** Running training *****
  Num examples = 1253082
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 78318
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 45500
  Will skip the first 0 epochs then the first 45500 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/45500 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss
1,0.095,0.08062


Saving model checkpoint to drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-46000
Configuration saved in drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-46000/config.json
Model weights saved in drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-46000/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-46000/tokenizer_config.json
Special tokens file saved in drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-46000/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-46500
Configuration saved in drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-46500/config.json
Model weights saved in drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-tra

TrainOutput(global_step=78318, training_loss=0.03360069165504631, metrics={'train_runtime': 11776.0695, 'train_samples_per_second': 106.409, 'train_steps_per_second': 6.651, 'total_flos': 1.2278928592072397e+17, 'train_loss': 0.03360069165504631, 'epoch': 1.0})

In [72]:
trainer.save_model("drive/MyDrive/w266/model_checkpoints/DistilBert/distilBert-NQ-Trivia-SQuAD-cased-trained")

Saving model checkpoint to drive/MyDrive/w266/model_checkpoints/DistilBert/distilBert-NQ-Trivia-SQuAD-cased-trained
Configuration saved in drive/MyDrive/w266/model_checkpoints/DistilBert/distilBert-NQ-Trivia-SQuAD-cased-trained/config.json
Model weights saved in drive/MyDrive/w266/model_checkpoints/DistilBert/distilBert-NQ-Trivia-SQuAD-cased-trained/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/w266/model_checkpoints/DistilBert/distilBert-NQ-Trivia-SQuAD-cased-trained/tokenizer_config.json
Special tokens file saved in drive/MyDrive/w266/model_checkpoints/DistilBert/distilBert-NQ-Trivia-SQuAD-cased-trained/special_tokens_map.json


In [None]:
#!cp -R distilbert-base-cased-finetuned-NQ drive/MyDrive/w266/model_checkpoints/DistilBert/

In [None]:
# !cp -R test-NQ-trained drive/MyDrive/w266/model_checkpoints/DistilBert/

In [None]:
# trainer.save_model("drive/MyDrive/w266/model_checkpoints/DistilBert/test-NQ-trained/")

In [None]:
# !ls drive/MyDrive/w266/model_checkpoints/DistilBert/test-NQ-trained

In [73]:
!ls drive/MyDrive/w266/model_checkpoints/DistilBert/distilBert-NQ-Trivia-SQuAD-cased-trained

checkpoint-45500   special_tokens_map.json  training_args.bin
config.json	   tokenizer_config.json    vocab.txt
pytorch_model.bin  tokenizer.json


## Evaluation

In [None]:
!ls drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained

checkpoint-1000   checkpoint-20000  checkpoint-30500  checkpoint-41000
checkpoint-10000  checkpoint-20500  checkpoint-31000  checkpoint-41500
checkpoint-10500  checkpoint-21000  checkpoint-31500  checkpoint-42000
checkpoint-11000  checkpoint-21500  checkpoint-32000  checkpoint-42500
checkpoint-11500  checkpoint-22000  checkpoint-32500  checkpoint-43000
checkpoint-12000  checkpoint-22500  checkpoint-33000  checkpoint-43500
checkpoint-12500  checkpoint-23000  checkpoint-33500  checkpoint-44000
checkpoint-13000  checkpoint-23500  checkpoint-34000  checkpoint-44500
checkpoint-13500  checkpoint-24000  checkpoint-34500  checkpoint-4500
checkpoint-14000  checkpoint-24500  checkpoint-3500   checkpoint-45000
checkpoint-14500  checkpoint-2500   checkpoint-35000  checkpoint-45500
checkpoint-1500   checkpoint-25000  checkpoint-35500  checkpoint-500
checkpoint-15000  checkpoint-25500  checkpoint-36000  checkpoint-5000
checkpoint-15500  checkpoint-26000  checkpoint-36500  checkpoint-5500
checkpoint-

In [10]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DistilBertForQuestionAnswering, Trainer

model_name = "drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-45500"

# Download pytorch model
model = DistilBertForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

squad_v2 = True
model_checkpoint = model_name = "drive/MyDrive/w266/model_checkpoints/DistilBert-NQ-TriviaQA-cased-trained/checkpoint-45500"
batch_size = 16

In [74]:
import torch

for batch in trainer.get_eval_dataloader():
    break
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
with torch.no_grad():
    output = trainer.model(**batch)
output.keys()

odict_keys(['loss', 'start_logits', 'end_logits'])

In [75]:
output.start_logits.shape, output.end_logits.shape

(torch.Size([16, 384]), torch.Size([16, 384]))

In [76]:
output.start_logits.argmax(dim=-1), output.end_logits.argmax(dim=-1)

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0'),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0'))

Consider the following sceanarios: 
- the start position could be greater than the end position, or point to a span of text in the question instead of the answer. In that case, we might want to look at the second best prediction to see if it gives a possible answer and select that instead.

By:

- score: adding the start and end logits
- hyper-parameter we call n_best_size

In [77]:
n_best_size = 20

In [78]:
import numpy as np

start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
# Gather the indices the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        if start_index <= end_index: # We need to refine that test to check the answer is inside the context
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": "" # We need to find a way to get back the original substring corresponding to the answer in the context
                }
            )

### process validation set:
- will need example id and offset mapping needed

In [79]:
def prepare_validation_features(examples):

    # similar process as train
    examples["question"] = [q.lstrip() for q in examples["question"]]
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # need example_id for matching
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

Map the validation set

In [80]:
val_path = "drive/MyDrive/w266/data/TriviaQA/validation.json"
val_data = load_dataset('json', data_files=val_path)
validation = val_data['train']

Using custom data configuration default-7583913932156a30
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-7583913932156a30/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/1 [00:00<?, ?it/s]

In [81]:
validation_features = validation.map(
    prepare_validation_features,
    batched=True,
    remove_columns=validation.column_names
)

  0%|          | 0/3 [00:00<?, ?ba/s]

Now we can grab the predictions for all features by using the `Trainer.predict` method:

In [82]:
raw_predictions = trainer.predict(validation_features)

The following columns in the test set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 159187
  Batch size = 16


The `Trainer` *hides* the columns that are not used by the model (here `example_id` and `offset_mapping` which we will need for our post-processing).

In [83]:
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

Upper limit to avoid very long answers from our considerations.

In [84]:
max_answer_length = 30

In [86]:
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
offset_mapping = validation_features[0]["offset_mapping"]
# The first feature comes from the first example. For the more general case, we will need to be match the example_id to
# an example index
context = validation[0]["context"]

# Gather the indices the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
        # to part of the input_ids that are not in the context.
        if (
            start_index >= len(offset_mapping)
            or end_index >= len(offset_mapping)
            or offset_mapping[start_index] is None
            or offset_mapping[end_index] is None
        ):
            continue
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
            continue
        if start_index <= end_index: # We need to refine that test to check the answer is inside the context
            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": context[start_char: end_char]
                }
            )

valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
valid_answers

[{'score': 3.526806, 'text': 'is brackish'},
 {'score': 0.17349815,
  'text': 'is brackish water and what effect does a high concentration of ammonia have    The Laboratory'},
 {'score': -1.1391573, 'text': 'organisms which'},
 {'score': -1.4267871, 'text': 'is'},
 {'score': -1.6588345, 'text': 'is high'},
 {'score': -2.5726087,
  'text': 'it forms an important habitat for some unique animal species  However  it can cause environmental damage  since it is harmful for organisms which'},
 {'score': -3.7877288,
  'text': 'monia have    The Laboratory People Lucy Cook Leave a comment Brackish or brack water is water with a level of'},
 {'score': -3.9822733, 'text': 'monia have    The Laboratory'},
 {'score': -4.4170084, 'text': 'with a level of'},
 {'score': -4.7031345, 'text': 'it forms an important'},
 {'score': -4.7435255, 'text': 'is needed'},
 {'score': -4.819682,
  'text': 'with a level of salinity  between freshwater and seawater  In many places around the world  brackish water appe

Compare to ground-truth answer:

In [87]:
validation[0]["answers"]

{'answer_start': [4136], 'text': ['Salt']}

Our model picked the right as the most likely answer!

As we mentioned in the code above, this was easy on the first feature because we knew it comes from the first example. For the other features, we will need a map between examples and their corresponding features. Also, since one example can give several features, we will need to gather together all the answers in all the features generated by a given example, then pick the best one. The following code builds a map from example index to its corresponding features indices:

In [88]:
import collections

examples = validation
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

## Post-processing function
- predict the impossible answer when that score is greater than the score of the best non-impossible answer

In [89]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

And we can apply our post-processing function to our raw predictions:

In [90]:
final_predictions = postprocess_qa_predictions(validation, validation_features, raw_predictions.predictions)

Post-processing 2496 example predictions split into 159187 features.


  0%|          | 0/2496 [00:00<?, ?it/s]

Use the squad v2 metric for evluation

In [91]:
metric = load_metric("squad_v2")

Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

## Final Evaluation

In [94]:
if squad_v2:
    formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
else:
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in dataset_train_val["validation"]]
metric.compute(predictions=formatted_predictions, references=references)

{'HasAns_exact': 0.4807692307692308,
 'HasAns_f1': 0.4807692307692308,
 'HasAns_total': 2496,
 'best_exact': 0.4807692307692308,
 'best_exact_thresh': 0.0,
 'best_f1': 0.4807692307692308,
 'best_f1_thresh': 0.0,
 'exact': 0.4807692307692308,
 'f1': 0.4807692307692308,
 'total': 2496}