# Fine Tuning BERT Based Model on SQuAD 2.0 dataset

This notebook handles model training.

## Set Up Notebook

In [None]:
# Installs for virtual environment
!pip install transformers datasets evaluate
!pip install tensorflow

In [None]:
# Import libraries

import json
from datasets import load_dataset
import tensorflow as tf

In [None]:
# share the model to the hub requires setting this up
# asks for an access token upon login
# sharing the model to the hub allows for saving checkpoints during training
# and retrieving the model and sharing it publically after training.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# must connect to git as well for sharing to the hub

!apt install git-lfs
!git config --global user.email "elainekfrench@gmail.com"
!git config --global user.name "ekfrench17"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [None]:
import transformers

# Transformers must be at least 4.16.0 to function with pushing the model the hub community
print(transformers.__version__)

4.51.3


## Load the Data

### Attempted manual data loading from JSON

In [None]:
# load in training and validation squad 2.0 datasets manually converted to JSON (v. the preloaded ones in huggingface)
# file paths for google drive
json_filepath_train = './Data/train.json' # paths would need to adjusted locally if reproducing
json_filepath_validate = './Data/validation.json'

# Read in data from json
with open(json_filepath_train, 'r') as f:
    train_data = json.load(f)
with open(json_filepath_validate, 'r') as f:
    validation_data = json.load(f)

print("Train data length:", len(train_data["data"]))
print("Validation data length:", len(validation_data["data"]))

Train data length: 130317
Validation data length: 9238


In [None]:
# Loading in the json to Dataset structure - this is what is expected for training

dataset = load_dataset("json", data_files=json_filepath_train, field="data")

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

### Load in dataset from HuggingFace

In [None]:
# For faster trianing load in a subset of the data (split parameter)
# I used the first 10,000 datapoints for V1 of my model

squadv2 = load_dataset("squad_v2") #, split="train[:10000]")

# use train_test_split to split the train_data
# used for V1 - limited training data
# squadv2 = squadv2.train_test_split(test_size=0.2) # dont need for full dataset training

In [None]:
# Check the structure

squadv2

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

## Preprocess the training data

In [13]:
# Load a distilBERT tokenizer to process the question and context fields

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [14]:
# account for the case where the model expects padding on the left - in which case the order of the question and the context should switch
# this allows the notebook to work with any kind of model
pad_on_right = tokenizer.padding_side == "right"

In [10]:
# Some notes for preprocessing:
# to deal with text sequences longer than the maximum input length set truncation = "only_second"
# map the start and end positions of the answer by setting return_offset_mapping = True
# with this mapping, it is now possible to find the start and end tokens of the answer --
# use the sequence_ids method to find which part of the offset corresponds to the question
# and which corresponds to the context

def preprocess_train(examples):
  """ Function to truncate and map the start and end tokens of the answer to the context"""
  # strip questions and contexts to clean them up
  questions = [q.strip() for q in examples["question"]]
  contexts = [c.strip() for c in examples["context"]]

  inputs = tokenizer(
      questions if pad_on_right else contexts,
      contexts if pad_on_right else questions,
      max_length=384, # truncate if input exceeds this length
      stride=128, # overlap between chunks whn splitting long contexts
      truncation="only_second" if pad_on_right else "only_first", # truncate only contexts
      return_overflowing_tokens=True, # return multiple examples if context is split
      return_offsets_mapping=True, # map tokens back to character positions
      padding="max_length" # pad all to fixed length for batching
  )

  # Keep track of which original example each tokenized chunk came from
  # a map from a feature to its corresponding example
  sample_mapping = inputs.pop("overflow_to_sample_mapping")

  # a map from token to character position in the original context
  # helps compute start and end positions to align answers
  offset_mapping = inputs.pop("offset_mapping")

  # populate start and end token positions of answers
  #answers = examples["answers"] #old version before impossible answer fix
  inputs["start_positions"] = []
  inputs["end_positions"] = []

  # iterate over each tokenized chunk
  for i, offsets in enumerate(offset_mapping):
    # label impossible answers with the index of the CLS token
    input_ids = inputs["input_ids"][i]
    cls_index = input_ids.index(tokenizer.cls_token_id)

    # get the sequence corresponding to thta example - to know what is the context & what is the question
    sequence_ids = inputs.sequence_ids(i)

    # A single example can give many spans, this is the index of the example containing this span of text
    sample_index = sample_mapping[i]
    answers = examples["answers"][sample_index]

    # If no answers are given, set the cls_index as answer
    if len(answers["answer_start"]) == 0:
      inputs["start_positions"].append(cls_index)
      inputs["end_positions"].append(cls_index)
    else:
      # Start/end character index of the answer in the text
      start_char = answers["answer_start"][0]
      end_char = start_char + len(answers["text"][0])

      # Start token index of the current span in the text
      token_start_index = 0
      while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
        token_start_index += 1

      # End token index of the current span in the text
      token_end_index = len(input_ids) - 1
      while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
        token_end_index -= 1

      # Detect if the answer is out of the span - in this case the feature is labeled w/ CLS index
      if not (
          offsets[token_start_index][0] <= start_char
          and offsets[token_end_index][1] >= end_char
      ):
        inputs["start_positions"].append(cls_index)
        inputs["end_positions"].append(cls_index)
      else:
        # Otherwise move the token_start_index and token_end_index to the two ends of the answer
        # Note: we could go after the last offset if the answer is the last word (edge case)
        while (
            token_start_index < len(offsets)
            and offsets[token_start_index][0] <= start_char
        ):
          token_start_index += 1
        inputs["start_positions"].append(token_start_index - 1)
        while offsets[token_end_index][1] >= end_char:
          token_end_index -= 1
        inputs["end_positions"].append(token_end_index + 1)

  return inputs

In [15]:
# use dataset map function to apply the preprocessing function over the entire dataset
# batched = True speeds up the process

tokenized_squad = squadv2.map(preprocess_train, batched = True, remove_columns=squadv2["train"].column_names)

## Fine tuning

In [16]:
from transformers import TFAutoModelForQuestionAnswering

# Download the pretrained model distilbert base uncased and fine tune it

model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [17]:
# Definitions for training the model
learning_rate = 2e-5
num_train_epochs = 2
weight_decay = 0.01
batch_size = 16

In [19]:
# convert the datasets to tf.data.Dataset
# Use Model.prepare_tf_dataset():
# - can inspect the model to determine which column names it can use as input ( do not have maually specify)
# - supplies a default data collator (samples are already padded to the same length)

train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    batch_size=batch_size
)

validation_set = model.prepare_tf_dataset(
    #tokenized_squad["test"] #used for v1
    tokenized_squad["validation"],
    shuffle=False,
    batch_size=batch_size
)

In [25]:
# Create an optimizer and specify a loss function
# create_optimizer gives AdamW optimizer with weight decay & learnign rate schedule
# needs manual comoputation for the number of training steps to build that schedule

from transformers import create_optimizer

total_train_steps = len(train_set) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=learning_rate,
    num_warmup_steps=0,
    num_train_steps=total_train_steps
)

In [None]:
# Transformers models compute loss internally - does not have to be specified explicitly
# the built-in loss will correctly handle masking the loss on padding toeksn, or unlableled tokens
# use built-in Keras metrics
# Use jit_compile to compile the model with XLA  - all examples are padded to the same length

model.compile(optimizer=optimizer, jit_compile=True, metrics=["accuracy"])

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# saving the model to the hub
model_name = "distilbert-finetuned-squad"
push_to_hub_model_id = f"ekfrench/{model_name}"

push_to_hub_callback = PushToHubCallback(
    output_dir=f"./{model_name}",
    tokenizer=tokenizer,
    hub_model_id=push_to_hub_model_id,
)

callbacks = [push_to_hub_callback]

# train the model
'''model.fit(train_set,
          validation_data=validation_set,
          epochs=num_train_epochs,
          callbacks=callbacks
          )'''

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/ekfrench/distilbert-finetuned-squad into local empty directory.


Download file tf_model.h5:   0%|          | 26.2k/253M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/253M [00:00<?, ?B/s]

'model.fit(train_set,\n          validation_data=validation_set,\n          epochs=num_train_epochs,\n          callbacks=callbacks\n          )'

In [None]:
# load in model and tokenizer from last checkpoint (last epoch that completed)
model = TFAutoModelForQuestionAnswering.from_pretrained("ekfrench/distilbert-finetuned-squad")
tokenizer = AutoTokenizer.from_pretrained("ekfrench/distilbert-finetuned-squad")

All model checkpoint layers were used when initializing TFDistilBertForQuestionAnswering.

All the layers of TFDistilBertForQuestionAnswering were initialized from the model checkpoint at ekfrench/distilbert-finetuned-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


In [27]:
# Resume training from where it left off
model.fit(
    train_set,
    validation_data=validation_set,
    initial_epoch=1, # last completed epoch
    epochs=2,
    callbacks=[push_to_hub_callback]
)

Epoch 2/2


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
   6/8234 [..............................] - ETA: 1:23:49 - loss: 2.1033 - end_logits_accuracy: 0.3854 - start_logits_accuracy: 0.3542





<tf_keras.src.callbacks.History at 0x7fbcab2fa690>

In [28]:
# Save model
model.save_pretrained("distilbert-finetuned-squad")

# Save tokenizer
tokenizer.save_pretrained("distilbert-finetuned-squad")


('distilbert-finetuned-squad/tokenizer_config.json',
 'distilbert-finetuned-squad/special_tokens_map.json',
 'distilbert-finetuned-squad/vocab.txt',
 'distilbert-finetuned-squad/added_tokens.json',
 'distilbert-finetuned-squad/tokenizer.json')

In [29]:
model.push_to_hub("ekfrench/distilbert-finetuned-squad")
tokenizer.push_to_hub("ekfrench/distilbert-finetuned-squad")


README.md:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/ekfrench/distilbert-finetuned-squad/commit/fd633beaa3a9a3adc5b27d3856b06820348f2965', commit_message='Upload tokenizer', commit_description='', oid='fd633beaa3a9a3adc5b27d3856b06820348f2965', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ekfrench/distilbert-finetuned-squad', endpoint='https://huggingface.co', repo_type='model', repo_id='ekfrench/distilbert-finetuned-squad'), pr_revision=None, pr_num=None)

## Evaluation

In [None]:
from transformers import TFAutoModelForQuestionAnswering, AutoTokenizer

# load in if needed / coming back during another session
model = TFAutoModelForQuestionAnswering.from_pretrained("ekfrench/distilbert-finetuned-squad")
tokenizer = AutoTokenizer.from_pretrained("ekfrench/distilbert-finetuned-squad")

# account for the case where the model expects padding on the left - in which case the order of the question and the context should switch
# this allows the notebook to work with any kind of model
pad_on_right = tokenizer.padding_side == "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
All model checkpoint layers were used when initializing TFDistilBertForQuestionAnswering.

All the layers of TFDistilBertForQuestionAnswering were initialized from the model checkpoint at ekfrench/distilbert-finetuned-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


In [None]:
# Reload the dataset if needed 
squadv2 = load_dataset("squad_v2")

In [5]:
# Preprocess the validation dataset
# Add to the validation features the following in order to check a given span is inside the context:
# add the ID of the example that generated the feature
# the offset mappin thta gives a map from token indices to character positions in the context

def preprocess_validation(examples):
  # Tokenize examples with truncation and padding, but keep the overflows using a stride.
  # This results in one example possible giving several features whena context is long, each of
  # those features having a context that overlaps a bit the context of the previous feature.
  questions = [q.strip() for q in examples["question"]]
  contexts = [c.strip() for c in examples["context"]]

  tokenized_examples = tokenizer(
      questions if pad_on_right else contexts,
      contexts if pad_on_right else questions,
      truncation = "only_second" if pad_on_right else "only_first",
      max_length = 384,
      stride = 128,
      return_overflowing_tokens = True,
      return_offsets_mapping = True,
      padding = "max_length"
  )

  # One example might give several features if it has a long context,
  # need a map from a feature to its corresponding example
  sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

  # keep the example_id that gives this feature and store the offset mappings
  tokenized_examples["example_id"] = []

  for i in range(len(tokenized_examples["input_ids"])):
    # Get the sequence that correspons to that example to know what is the context and what is the question
    sequence_ids = tokenized_examples.sequence_ids(i)
    context_index = 1 if pad_on_right else 0

    # An example can give several spans, this is the index of the example containing this span of text
    sample_index = sample_mapping[i]
    tokenized_examples['example_id'].append(examples["id"][sample_index])

    # Set to None the offset_mapping that are not part of the context
    # so it's easy to determine if a token position is part of the context or not
    tokenized_examples["offset_mapping"][i] = [
        (offset if sequence_ids[key] == context_index else None)
        for key,offset in enumerate(tokenized_examples["offset_mapping"][i])
    ]

  return tokenized_examples

In [6]:
# Apply the preprocess_validation function to the testing data, similar to for the training data
"""validation_features = squadv2["test"].map(
    preprocess_validation,
    batched = True,
    remove_columns = squadv2["test"].column_names,
)"""

# Use this for v2 - full dataset
validation_features = squadv2["validation"].map(
    preprocess_validation,
    batched = True,
    remove_columns = squadv2["validation"].column_names,
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [7]:
# Turn the dataset into a tf.data.Dataset format
validation_dataset = model.prepare_tf_dataset(
    validation_features,
    shuffle=False,
    batch_size=16
)

In [8]:
# Get the predictions for all features using the model.predict method
raw_predictions = model.predict(validation_dataset)



In [None]:
from tqdm.auto import tqdm
import collections

def postprocess_qa_prediction(examples, features, all_start_logits, all_end_logits, n_best_size = 20, max_answer_length = 30):
  # Build a map between examples and their corresponding features
  example_id_to_index = {k: i for i,k in enumerate(examples["id"])}
  features_per_example = collections.defaultdict(list)
  for i,feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

  # initialize predictions dictionary to be filled
  predictions = collections.OrderedDict()

  # Log to the console
  print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

  # Loop over all the examples to gather all the answers in all the features generated
  # by a given example and then pick the best one
  for example_index, example in enumerate(tqdm(examples)):
    # Get the indices of the features associated to the current example
    feature_indices = features_per_example[example_index]

    min_null_score = None
    valid_answers = []

    context = example["context"]
    # Loop through all the featues associated to the current example
    for feature_index in feature_indices:
      # get the predictions of the model for this feature
      start_logits = all_start_logits[feature_index]
      end_logits = all_end_logits[feature_index]

      # map the positions in the logits to the span of texts in the original context
      offset_mapping = features[feature_index]["offset_mapping"]

      # Update minimum null prediction
      cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)

      #
      feature_null_score = start_logits[cls_index] + end_logits[cls_index]
      if min_null_score is None or min_null_score < feature_null_score:
        min_null_score = feature_null_score

      # Go through all possibilities for the n_best_size greater start and end logits
      start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1: -1].tolist()
      end_indexes = np.argsort(end_logits[-1 : -n_best_size -1 : -1].tolist())
      for start_index in start_indexes:
        for end_index in end_indexes:
          # Don't consider out-of-scope answers
          # either bc the indices are out of bounds or
          # correspond to part of the input_ids that are not in the context
          if( start_index >= len(offset_mapping)
            or end_index >= len(offset_mapping)
            or not offset_mapping[start_index]
            or not offset_mapping[end_index]
          ):
            continue
          # Do not consider answers w/ a length that is either <0 or > max_answer_length
          if(
              end_index < start_index
              or end_index - start_index + 1 > max_answer_length
          ):
            continue

          start_char = offset_mapping[start_index][0]
          end_char = offset_mapping[end_index][1]
          valid_answers.append(
              {
                  "score": start_logits[start_index] + end_logits[end_index],
                  "text": context[start_char:end_char],
              }
          )

          if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
          else:
            # edge case: not a sing non-null prediction
            # create a fake prediction to avoid failure
            best_answer = {"text": "", "score": 0.0}

          # pick the final answer - the best answer or the null answer
          predictions[example["id"]] = best_answer["text"] if best_answer["score"] > min_null_score else ""

  return predictions

In [11]:
# apply the post processing function to the raw predictions
final_predictions = postprocess_qa_prediction(
    squadv2["validation"], #squadv2["test"], used for v1
    validation_features,
    raw_predictions["start_logits"],
    raw_predictions["end_logits"]
)

Post-processing 11873 example predictions split into 12134 features.


  0%|          | 0/11873 [00:00<?, ?it/s]

In [15]:
def compute_metrics(p,validation_features):
    predictions = postprocess_qa_prediction(
        examples=squadv2["validation"],
        features=validation_features,
        raw_predictions=p.predictions,
        tokenizer=tokenizer
    )

    # raw string answer the model predicts
    formatted_predictions = [
        {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
        for k, v in predictions.items()
    ]

    # Ground truth
    references = [
        {"id": ex["id"], "answers": ex["answers"]}
        for ex in squadv2["test"]
    ]

    return metric.compute(predictions=formatted_predictions, references=references)


In [13]:
import evaluate
metric = evaluate.load("squad_v2")

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [16]:
predicted_ids = set(final_predictions.keys())

references = [
    {"id": ex["id"], "answers": ex["answers"]}
    for ex in squadv2["validation"] #squadv2["test"]
    if ex["id"] in predicted_ids
]

formatted_predictions = [
    {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
    for k, v in final_predictions.items()
]


In [17]:
metric.compute(predictions=formatted_predictions, references=references)

{'exact': 54.66534784042202,
 'f1': 55.193867443969246,
 'total': 6066,
 'HasAns_exact': 8.424391963341558,
 'HasAns_f1': 9.554458905575451,
 'HasAns_total': 2837,
 'NoAns_exact': 95.29266026633633,
 'NoAns_f1': 95.29266026633633,
 'NoAns_total': 3229,
 'best_exact': 54.66534784042202,
 'best_exact_thresh': 0.0,
 'best_f1': 55.19386744396922,
 'best_f1_thresh': 0.0}

# Inference

In [18]:
context = """The dominant sequence transduction models are based on complex recurrent or convolutional
neural networks in an encoder-decoder configuration. The best performing models also connect the encoder
and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on
two machine translation tasks show these models to be superior in quality while being more parallelizable
and requiring significantly less time to train."""
question = "What kind of mechanisms is Transformer based on?"

inputs = tokenizer(context, question, return_tensors="tf")
# Pass the inputs to the model and return the logits
outputs = model(**inputs)

In [19]:
# Get the highest probability from the model output for the start and end positions
answer_start_index = tf.argmax(outputs.start_logits, axis=-1)[0].numpy()
answer_end_index = tf.argmax(outputs.end_logits, axis=-1)[0].numpy()

In [20]:
# Decode the predicted tokens to get the answer
predict_answer_tokens = inputs["input_ids"][0][answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'[CLS]'

## inference using Pipeline API

In [21]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="ekfrench/distilbert-finetuned-squad")
question_answerer(question=question, context=context)

Some layers from the model checkpoint at ekfrench/distilbert-finetuned-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at ekfrench/distilbert-finetuned-squad and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use 0


{'score': 0.37380099296569824,
 'start': 318,
 'end': 338,
 'answer': 'attention mechanisms'}