## Purpose:
- this notebook illustrates how fine-tuning is done for question answering tasks
- source: https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb

### 1. Declare initial variables

In [1]:
# Name of the pretrained checkpoint that the tokenizer and the QA model are loaded from.
model_checkpoint = "distilbert-base-uncased"
# Batch size used for both training and evaluation.
batch_size = 1
# True selects SQuAD v2, False selects SQuAD v1. If you are using another dataset, this flag
# indicates whether impossible (unanswerable) questions are allowed or not.
squad_v2 = False

### 2. Load dataset
- In this case SQuAD v1 (the `squad` dataset) is used - it contains no unanswerable questions, i.e. every question has an answer in its context

In [2]:
from datasets import load_dataset, load_metric

In [3]:
# load the SQuAD dataset; the squad_v2 flag selects between the "squad_v2" and "squad" dataset names
datasets = load_dataset("squad_v2" if squad_v2 else "squad")

Reusing dataset squad (C:\Users\tanch\.cache\huggingface\datasets\squad\plain_text\1.0.0\1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


In [4]:
# the dataset has already been split into training and validation sets
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

### 3. Instantiate tokenizers

In [5]:
# instantiate the tokenizer that belongs to the chosen checkpoint
# note that different models require different tokenizers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
# check that the tokenizer we instantiated is a fast tokenizer, because the preprocessing
# needs its special features (e.g. the offset mappings requested in prepare_train_features)
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

- The models listed in the table linked below have fast tokenizers
    - https://huggingface.co/transformers/index.html#bigtable

In [7]:
# max_length: maximum length of one tokenized feature, counted in TOKENS - not characters.
# doc_stride: number of tokens by which consecutive features of a split context overlap,
#             which reduces the chance that a split falls inside an answer.
max_length, doc_stride = 384, 128

### 4. prepare_train_features
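- This function splits long contexts into overlapping features and labels each feature with the start/end token positions of the answer (or with the CLS token index when the answer is not fully contained in that feature)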

In [8]:
def prepare_train_features(examples):
    """Tokenize a batch of SQuAD-style examples and label every feature with answer positions.

    A long context is split into several overlapping features (at most ``max_length`` tokens
    each, overlapping by ``doc_stride`` tokens), so one example may yield more than one feature.

    Args:
        examples: batch with list-valued "question", "context" and "answers" columns. Each
            entry of "answers" is a dict with the lists "answer_start" (character offsets into
            the context) and "text". Only the first answer of each example is used.

    Returns:
        The tokenizer output, without "overflow_to_sample_mapping" and "offset_mapping", plus
        two extra keys "start_positions" and "end_positions" holding one token index per
        feature. Both are set to the index of the CLS token when the example has no answer or
        when the answer is not fully contained in that feature.

    Relies on the notebook-level ``tokenizer``, ``max_length`` and ``doc_stride``.
    """
    pad_on_right = tokenizer.padding_side == "right"
    # Value that sequence_ids() reports for context tokens: the context is the second sequence
    # of the pair when padding on the right, the first one otherwise.
    context_id = 1 if pad_on_right else 0

    # Leading whitespace in a question carries no information, but depending on the tokenizer
    # it can inflate the tokenized question and leave too little room for the context, making
    # the context truncation fail. Strip it on a copy so the caller's batch is not modified.
    questions = [question.lstrip() for question in examples["question"]]
    contexts = examples["context"]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        questions if pad_on_right else contexts,
        contexts if pad_on_right else questions,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_id:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_id:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                # Each loop overshoots by one token, hence the -1 / +1 corrections when appending.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [9]:
# sanity check: run the preprocessing on a batch that contains only the first training example
prepare_train_features(datasets['train'][0:1])

{'input_ids': [[101, 2000, 3183, 2106, 1996, 6261, 2984, 9382, 3711, 1999, 8517, 1999, 10223, 26371, 2605, 1029, 102, 6549, 2135, 1010, 1996, 2082, 2038, 1037, 3234, 2839, 1012, 10234, 1996, 2364, 2311, 1005, 1055, 2751, 8514, 2003, 1037, 3585, 6231, 1997, 1996, 6261, 2984, 1012, 3202, 1999, 2392, 1997, 1996, 2364, 2311, 1998, 5307, 2009, 1010, 2003, 1037, 6967, 6231, 1997, 4828, 2007, 2608, 2039, 14995, 6924, 2007, 1996, 5722, 1000, 2310, 3490, 2618, 4748, 2033, 18168, 5267, 1000, 1012, 2279, 2000, 1996, 2364, 2311, 2003, 1996, 13546, 1997, 1996, 6730, 2540, 1012, 3202, 2369, 1996, 13546, 2003, 1996, 24665, 23052, 1010, 1037, 14042, 2173, 1997, 7083, 1998, 9185, 1012, 2009, 2003, 1037, 15059, 1997, 1996, 24665, 23052, 2012, 10223, 26371, 1010, 2605, 2073, 1996, 6261, 2984, 22353, 2135, 2596, 2000, 3002, 16595, 9648, 4674, 2061, 12083, 9711, 2271, 1999, 8517, 1012, 2012, 1996, 2203, 1997, 1996, 2364, 3298, 1006, 1998, 1999, 1037, 3622, 2240, 2008, 8539, 2083, 1017, 11342, 1998, 1996, 2

In [10]:
# more samples have been produced due to the splitting done in prepare_train_features
# the datasets library caches the result of map() - the preprocessing is computed only once and subsequent runs reuse the cached data
tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Loading cached processed dataset at C:\Users\tanch\.cache\huggingface\datasets\squad\plain_text\1.0.0\1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41\cache-8e21f5a34da7220b.arrow
Loading cached processed dataset at C:\Users\tanch\.cache\huggingface\datasets\squad\plain_text\1.0.0\1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41\cache-2d9c358a11c9b795.arrow


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
        num_rows: 88524
    })
    validation: Dataset({
        features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
        num_rows: 10784
    })
})

In [11]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML
def show_random_elements(dataset, num_examples = 10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Columns whose feature type is a ``ClassLabel`` (or a ``Sequence`` of ``ClassLabel``) are
    shown with their label names instead of the integer class ids.

    Args:
        dataset: dataset supporting ``len()``, indexing with a list of row indices and a
            ``features`` mapping from column name to feature type.
        num_examples: number of rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: if ``num_examples`` is larger than the dataset.
        ValueError: if ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique by construction and
    # no retry loop with list membership checks is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Map integer class ids to their label names.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # Same mapping, applied element-wise to every list in the column.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [12]:
# the resulting output after applying prepare_train_features()
# the most important features are
# 1. tokenized ids (input_ids)
# 2. start position (start_positions)
# 3. end position (end_positions)
show_random_elements(tokenized_datasets["train"],3)

Unnamed: 0,attention_mask,end_positions,input_ids,start_positions
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]",184,"[101, 2029, 4743, 3631, 1037, 2501, 1999, 2129, 2521, 2009, 5520, 2058, 2009, 1005, 1055, 2166, 1029, 102, 1996, 2087, 21877, 17802, 2594, 2427, 1010, 3701, 1999, 1996, 1005, 7270, 15460, 2063, 1005, 2344, 4013, 29109, 8017, 6137, 14192, 2229, 1010, 2024, 2307, 14237, 1010, 1998, 1996, 18255, 13181, 11393, 2015, 1997, 1996, 2670, 17401, 2089, 4418, 1996, 7595, 2004, 2027, 4536, 1996, 1000, 17197, 3481, 3111, 1000, 2648, 1996, 8119, 2161, 1012, 1996, 7270, 15460, 2229, 3659, 4235, 2058, 2312, 2752, 1997, 2330, 4153, 1010, 2021, 26478, 2890, 5867, 2043, 2833, 4150, 2800, 1012, 2116, 2024, 2036, 2426, 1996, ...]",181
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]",32,"[101, 2054, 2679, 4989, 2106, 1996, 2883, 4879, 5136, 2893, 9436, 1997, 2077, 1996, 2456, 2883, 1029, 102, 2348, 2109, 1999, 1996, 2883, 1998, 1996, 2137, 2451, 5002, 1010, 1000, 2070, 2060, 2679, 1000, 2003, 2025, 2019, 2880, 2679, 1010, 1998, 1996, 4879, 2641, 15349, 2009, 3188, 2000, 1996, 2456, 2883, 1012, 2004, 1996, 2230, 2883, 2433, 2106, 2025, 5383, 1996, 3160, 4159, 1000, 11377, 1000, 2179, 1999, 3188, 2883, 2229, 1010, 2045, 2020, 8008, 2000, 2131, 2512, 1011, 6696, 2225, 2796, 4841, 1010, 5037, 4841, 1010, 7508, 4841, 1010, 5424, 4841, 1998, 7726, 4841, 2000, 5769, 2037, 5636, 2030, ...]",30
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]",0,"[101, 2073, 2064, 1996, 2087, 21688, 2742, 1997, 8529, 4710, 25152, 16061, 2723, 5997, 2075, 1029, 102, 14263, 2015, 1010, 1037, 16240, 1010, 7689, 1997, 26079, 2015, 1010, 3869, 1010, 12065, 1010, 9808, 12412, 15689, 1010, 20403, 1010, 13456, 1010, 17977, 1010, 7212, 1998, 1037, 7488, 1012, 2012, 1053, 14083, 2389, 1010, 2379, 25703, 1010, 14018, 1999, 2456, 14486, 1996, 5700, 2124, 8529, 4710, 25152, 16061, 2015, 1999, 2556, 1011, 2154, 5207, 1010, 5306, 2763, 2013, 1996, 28034, 1997, 19935, 2632, 1011, 14360, 7839, 9388, 7447, 1006, 6273, 2629, 1516, 3963, 2629, 1007, 1012, 2027, 3104, 2172, 1997, 1996, 2723, ...]",0


# Fine-tuning
- with the data prepared as shown above, we can proceed to fine tune
- DOCs: https://huggingface.co/transformers/main_classes/trainer.html#

### 5. Instantiate the model
- using from_pretrained() method we can load the QA model by name or path

In [13]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
# Initialise the QA model from the pretrained checkpoint
# the warning below tells us that the QA output layer (qa_outputs) is newly initialised -
# the model has not been fine-tuned yet and cannot answer questions until it is trained
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

### 6. TrainingArguments
- required to customise our "Trainer" to do the training


In [14]:
# Hyper-parameters and evaluation schedule that are handed to the Trainer.
args = TrainingArguments(
    output_dir="test-squad",
    # "steps" evaluates every eval_steps training steps;
    # "epoch" would instead evaluate at the end of each epoch.
    evaluation_strategy="steps",
    # One step is one batch, so steps per epoch = num samples / batch size.
    eval_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
)

### 7. default_data_collator
- batches our processed training examples together

In [15]:
from transformers import default_data_collator
# collator that batches the processed (already padded to max_length) features for the Trainer
data_collator = default_data_collator

### 8. use GPU
- training on a GPU is much faster than training on a CPU

In [16]:
import torch
import gc

In [17]:
# run Python garbage collection first, then release the unused GPU memory cached by PyTorch
gc.collect()
torch.cuda.empty_cache()                   # frees cached, currently unused CUDA memory

In [18]:
torch.cuda.is_available()     # True when PyTorch can use a CUDA GPU

True

In [19]:
torch.cuda.current_device()    # index of the currently selected GPU

0

In [20]:
# Move the model to GPU number 0 when CUDA is available; otherwise keep it on the CPU so that
# this cell does not raise on a machine without a GPU.
model.to(torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

### 9. Trainer 
- performs the training / fine-tuning of the model (DistilBERT in this notebook)

In [21]:
# Bundle the model, the training arguments, both tokenized splits, the collator and the
# tokenizer into a single Trainer object.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Optional: uncomment the two lines below to end a previous MLflow run before training.
# import mlflow
# mlflow.end_run()                         # ends previous run

In [22]:
# .train() starts the fine-tuning run
trainer.train()


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Step,Training Loss,Validation Loss


KeyboardInterrupt: 