# NLP Project Part 1: Train a Q&A model

## Setup environment

In [1]:
import random
import collections
import numpy as np
import pandas as pd

import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

from datasets import load_dataset, load_metric
from datasets import ClassLabel, Sequence

import torch

from tqdm.auto import tqdm

from IPython.display import display, HTML

In [2]:
# Execute the companion notebook inside this kernel. The helpers that are called below but
# never defined in this notebook (prepare_train_features, prepare_validation_features,
# postprocess_qa_predictions) are presumably defined there — TODO confirm.
%run ./utils.ipynb

## Load the SQuAD dataset (v2 only when `squad_v2` is enabled)

In [3]:
# Experiment configuration shared by the cells below.
squad_v2 = False  # False -> use the "squad" dataset and metric; True -> "squad_v2"
model_checkpoint = "distilbert-base-uncased"  # checkpoint for both the tokenizer and the model to fine-tune
batch_size = 16  # per-device batch size, used for training and for evaluation

In [4]:
# Resolve the dataset flavour once, then load a small slice of each split
# (first 2% of train, first 10% of validation).
dataset_name = "squad_v2" if squad_v2 else "squad"
train_dataset = load_dataset(dataset_name, split='train[:2%]')
valid_dataset = load_dataset(dataset_name, split='validation[:10%]')

Reusing dataset squad (/home/cloud441/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Reusing dataset squad (/home/cloud441/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [5]:
# Number of examples in each split: train first, validation second (one per line).
print(len(train_dataset), len(valid_dataset), sep="\n")

1752
1057


### Pre-processing the training dataset:

In [6]:
# Load the tokenizer that belongs to the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Fail early unless a "fast" tokenizer was returned. The offset_mapping column used during
# post-processing is presumably produced by it inside the utils helpers — TODO confirm.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [7]:
# NOTE(review): these three names are not referenced again in this notebook; they are presumably
# read as globals by the feature-preparation helpers from utils.ipynb — TODO confirm.
max_length = 384 # Maximum length of a feature (question and context together)
doc_stride = 128 # Allowed overlap between two consecutive chunks of a context that has to be split
pad_on_right = tokenizer.padding_side == "right" # True when the tokenizer pads on the right-hand side

In [8]:
# Convert both splits into model features with prepare_train_features, processing examples in
# batches and dropping the original columns. The tokenized validation split is later handed to
# the Trainer as its eval_dataset.
tokenized_train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_valid_dataset = valid_dataset.map(prepare_train_features, batched=True, remove_columns=valid_dataset.column_names)

Loading cached processed dataset at /home/cloud441/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-36782f9e999e54af.arrow
Loading cached processed dataset at /home/cloud441/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-8976bd0fd1bbef81.arrow


## Load pre-trained model:

In [9]:
# Seed every RNG *before* instantiating the model: the question-answering head (qa_outputs) is
# not part of the checkpoint and is randomly initialised, so without a seed each Restart & Run All
# starts from different weights. 42 matches the default seed of TrainingArguments.
transformers.set_seed(42)

# Load the checkpoint with a question-answering head on top.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

## Fine-tune the model

In [10]:
# Name the output directory after the checkpoint, dropping any "organisation/" prefix.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Hyper-parameters of the fine-tuning run; evaluation happens once per epoch.
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-squad",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
)

In [11]:
# Batch features with the library's default collator; it is passed to the Trainer below.
data_collator = default_data_collator

In [12]:
# Bundle model, hyper-parameters, tokenized splits, collator and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
)

In [13]:
# Fine-tune the model. Because evaluation_strategy is "epoch", the validation loss on
# eval_dataset is reported after each of the training epochs.
trainer.train()

***** Running training *****
  Num examples = 1793
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 339


Epoch,Training Loss,Validation Loss
1,No log,3.490108
2,No log,3.081948
3,No log,2.84462


***** Running Evaluation *****
  Num examples = 1077
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1077
  Batch size = 16
***** Running Evaluation *****
  Num examples = 1077
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=339, training_loss=3.4318977288440267, metrics={'train_runtime': 374.4792, 'train_samples_per_second': 14.364, 'train_steps_per_second': 0.905, 'total_flos': 527087268804096.0, 'train_loss': 3.4318977288440267, 'epoch': 3.0})

## Validation of the model:

In [14]:
# Make inference mode explicit instead of relying on whatever mode the training loop left the
# model in: dropout must be disabled for the logits inspected below to be deterministic.
trainer.model.eval()

# Grab only the first batch of the evaluation dataloader.
batch = next(iter(trainer.get_eval_dataloader()))

# Move every tensor of the batch to the device the Trainer runs on.
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}

# Forward pass without gradient tracking; `output` exposes start_logits and end_logits.
with torch.no_grad():
    output = trainer.model(**batch)

In [15]:
# How many of the highest-scoring start and end positions are considered when building
# candidate answers, and how many candidates are kept after sorting.
n_best_size = 20

In [16]:
# Logits of the first feature in the batch, converted to NumPy arrays.
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()

# Positions of the n_best_size highest logits, best first.
start_indexes = np.argsort(start_logits)[::-1][:n_best_size].tolist()
end_indexes = np.argsort(end_logits)[::-1][:n_best_size].tolist()

# First, naive version: every (start, end) pair with start <= end is a candidate, scored by the
# sum of its two logits. The test still has to be refined to check that the answer lies inside
# the context, and the text is left empty until the original substring can be recovered.
valid_answers = [
    {
        "score": start_logits[start_index] + end_logits[end_index],
        "text": "",
    }
    for start_index in start_indexes
    for end_index in end_indexes
    if start_index <= end_index
]

In [17]:
# Build the features used for prediction with prepare_validation_features. The prediction output
# below shows that they carry offset_mapping and example_id columns in addition to the model
# inputs. The original dataset columns are dropped.
validation_features = valid_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=valid_dataset.column_names
)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [18]:
# Run the fine-tuned model over every validation feature; raw_predictions.predictions is what
# gets post-processed into answer strings further down.
raw_predictions = trainer.predict(validation_features)

The following columns in the test set  don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id.
***** Running Prediction *****
  Num examples = 1077
  Batch size = 16


In [19]:
# trainer.predict ignored the columns the model does not accept (offset_mapping, example_id).
# Re-expose every column, keeping the current format type, because post-processing needs them.
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [20]:
# Longest answer span accepted, measured as end_index - start_index + 1.
max_answer_length = 30

In [21]:
# Logits of the first feature of the batch computed earlier.
# NOTE(review): `output` was computed on the Trainer's eval dataloader while the offsets come from
# validation_features; this assumes both describe the same first feature — TODO confirm.
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
offset_mapping = validation_features[0]["offset_mapping"]
# The first feature comes from the first example. In the general case the feature's example_id
# has to be matched to an example index.
context = valid_dataset[0]["context"]

# Gather the indices of the best start/end logits, best first.
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider out-of-scope answers, either because the indices are out of bounds or
        # because they correspond to a part of the input_ids that is not in the context.
        if (
            start_index >= len(offset_mapping)
            or end_index >= len(offset_mapping)
            or offset_mapping[start_index] is None
            or offset_mapping[end_index] is None
        ):
            continue
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
            continue
        # Past this point start_index <= end_index is guaranteed, so no further test is needed.
        # Map the token span back to character positions in the original context.
        start_char = offset_mapping[start_index][0]
        end_char = offset_mapping[end_index][1]
        valid_answers.append(
            {
                "score": start_logits[start_index] + end_logits[end_index],
                "text": context[start_char: end_char]
            }
        )

# Keep the n_best_size highest-scoring candidates, best first.
valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]

In [22]:
# Convert the raw logits of all features into one predicted answer per example. The result is
# consumed below through .items() as (example id, predicted text) pairs. The exact selection rules
# live in postprocess_qa_predictions, which is not defined in this notebook.
final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, raw_predictions.predictions)

Post-processing 1057 example predictions split into 1077 features.


  0%|          | 0/1057 [00:00<?, ?it/s]

In [23]:
# Load the metric that matches the dataset flavour selected by squad_v2.
# NOTE(review): load_metric is deprecated in newer `datasets` releases in favour of the separate
# `evaluate` package — confirm against the installed version before upgrading.
metric = load_metric("squad_v2" if squad_v2 else "squad")

In [24]:
# The squad_v2 flavour of each prediction entry carries an additional no-answer probability
# (fixed to 0.0 here); the plain squad flavour has no extra field.
extra_fields = {"no_answer_probability": 0.0} if squad_v2 else {}
formatted_predictions = [
    {"id": example_id, "prediction_text": answer_text, **extra_fields}
    for example_id, answer_text in final_predictions.items()
]

# Ground-truth answers, keyed by the same example ids.
references = [{"id": example["id"], "answers": example["answers"]} for example in valid_dataset]

In [25]:
# Score the fine-tuned model's predictions against the references (exact match and F1).
metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 26.20624408703879, 'f1': 34.634637829859194}

## Compare to a model already fine-tuned on SQuAD v2:

In [26]:
# Hub checkpoint used as the comparison baseline. Its configuration, printed when it is loaded
# below, identifies it as a RoBERTa question-answering model, i.e. a different architecture from
# the DistilBERT checkpoint fine-tuned above.
pre_trained_model_checkpoint = "mvonwyl/roberta-base-finetuned-squad2"

In [27]:
# Load the baseline model with its question-answering head; no further training is done on it
# in this notebook.
pre_trained_model = AutoModelForQuestionAnswering.from_pretrained(pre_trained_model_checkpoint)

loading configuration file https://huggingface.co/mvonwyl/roberta-base-finetuned-squad2/resolve/main/config.json from cache at /home/cloud441/.cache/huggingface/transformers/a46e82aadc0a6002126badf2b5ba957d71f1e3f1835afd294b79a5bdea73c39d.ca856a3487731f84249815edc5b3ab0f29937c0a7e2acf4d4d629aba6979c3af
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_siz

In [28]:
# Output directory name derived from the baseline checkpoint (text after the last "/").
pre_trained_model_name = pre_trained_model_checkpoint.split("/")[-1]
# NOTE: this rebinds `args`, replacing the TrainingArguments used for the fine-tuning run.
# In the cells shown, the Trainer built from these arguments is only used for predict(), never
# for train(), so the training-specific values (learning rate, epochs, weight decay) are
# presumably unused — verify before relying on them.
args = TrainingArguments(
    f"{pre_trained_model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [29]:
# Same default collator as for the fine-tuning run; re-assigned here so this section is explicit.
data_collator = default_data_collator

In [30]:
# A Trainer must carry the tokenizer that matches its model. `tokenizer` was loaded from the
# DistilBERT checkpoint, whereas this model is a RoBERTa checkpoint with a different vocabulary,
# so load the tokenizer published with the baseline checkpoint instead.
pre_trained_tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint)

# NOTE(review): tokenized_train_dataset, tokenized_valid_dataset and validation_features were all
# built with the DistilBERT tokenizer. Feeding those token ids to a RoBERTa model is almost
# certainly why the scores reported below are close to zero. A meaningful comparison requires
# re-tokenizing the validation data with pre_trained_tokenizer and post-processing with the
# resulting features. That depends on the helpers from utils.ipynb, so it is not done here.
# NOTE: this rebinds `trainer`, replacing the Trainer of the fine-tuned DistilBERT model.
trainer = Trainer(
    pre_trained_model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    data_collator=data_collator,
    tokenizer=pre_trained_tokenizer,
)

In [31]:
# Run the baseline model over the validation features.
# NOTE(review): validation_features were tokenized earlier, when `tokenizer` was the DistilBERT
# tokenizer, while this model is RoBERTa. The input ids therefore presumably do not match the
# model's vocabulary — verify, and re-tokenize with the baseline's own tokenizer if so.
pre_trained_raw_predictions = trainer.predict(validation_features)

The following columns in the test set  don't have a corresponding argument in `RobertaForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id.
***** Running Prediction *****
  Num examples = 1077
  Batch size = 16


In [32]:
# Post-process the baseline's raw logits into one predicted answer per example, using the same
# examples and features as for the fine-tuned model.
pre_trained_final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, pre_trained_raw_predictions.predictions)

Post-processing 1057 example predictions split into 1077 features.


  0%|          | 0/1057 [00:00<?, ?it/s]

In [33]:
# Put the baseline's predictions into the layout expected by the metric. This rebinds
# formatted_predictions and references, replacing the fine-tuned model's values.
if not squad_v2:
    formatted_predictions = [
        {"id": example_id, "prediction_text": answer_text}
        for example_id, answer_text in pre_trained_final_predictions.items()
    ]
else:
    # The squad_v2 layout additionally carries a no-answer probability, fixed to 0.0 here.
    formatted_predictions = [
        {"id": example_id, "prediction_text": answer_text, "no_answer_probability": 0.0}
        for example_id, answer_text in pre_trained_final_predictions.items()
    ]
references = [dict(id=example["id"], answers=example["answers"]) for example in valid_dataset]

In [34]:
# Score the baseline's predictions with the same metric object.
# NOTE(review): an exact match of 0.0 is implausible for a checkpoint whose name says it was
# fine-tuned on SQuAD. The inputs were tokenized with the DistilBERT tokenizer, so this number
# most likely reflects a tokenizer/model mismatch rather than model quality — verify.
metric.compute(predictions=formatted_predictions, references=references)

{'exact_match': 0.0, 'f1': 0.38548752834467115}

## Sample of incorrectly predicted examples:

In [35]:
# Print the first few non-empty predictions that match none of the reference answers.
# At this point formatted_predictions holds the predictions of the pre-trained baseline model.
# Predictions and references are paired by position; the ids printed for each pair make any
# misalignment visible.
max_samples = 2
count = 0

for prediction, reference in zip(formatted_predictions, references):
    if count >= max_samples:
        break
    predicted_text = prediction["prediction_text"]
    # reference["answers"] is a dict ({"text": [...], "answer_start": [...]}), so membership must
    # be tested against its "text" list; testing the dict itself would only compare the
    # prediction with the key names and flag every non-empty prediction as wrong.
    if predicted_text != "" and predicted_text not in reference["answers"]["text"]:
        print(f"prediction is:\n{prediction}")
        print(f"reference is:\n{reference}\n")
        count += 1
        

prediction is:
{'id': '56bf1ae93aeaaa14008c951f', 'prediction_text': 'second commercial during'}
reference is:
{'id': '56bf1ae93aeaaa14008c951f', 'answers': {'text': ['third', 'third', 'third'], 'answer_start': [355, 355, 355]}}

prediction is:
{'id': '56beafca3aeaaa14008c9208', 'prediction_text': 'planned to'}
reference is:
{'id': '56beafca3aeaaa14008c9208', 'answers': {'text': ['early 2012', 'In early 2012', '2012'], 'answer_start': [3, 0, 9]}}

