# BERT fine-tuned with SQuAD 2.0

In [1]:
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,

In [3]:
from datasets import load_dataset
from transformers import glue_convert_examples_to_features, glue_processors, glue_output_modes
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader

# Load only the first 20,000 examples of the SQuAD 2.0 training split.
squadv2 = load_dataset("squad_v2", split="train[:20000]")

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


In [4]:
# Hold out 20% of the loaded examples as an evaluation ("test") split.
# A fixed seed keeps the shuffle, and therefore the split, identical across
# kernel restarts.
squadv2_train = squadv2.train_test_split(test_size=0.2, seed=42)

In [5]:
# Inspect one raw training example (fields: id, title, context, question, answers).
# Unanswerable questions carry empty 'text' and 'answer_start' lists.
squadv2_train["train"][0]

{'id': '5a10d9ce06e79900185c341f',
 'title': 'Internet_service_provider',
 'context': 'For customers with more demanding requirements (such as medium-to-large businesses, or other ISPs) can use higher-speed DSL (such as single-pair high-speed digital subscriber line), Ethernet, metropolitan Ethernet, gigabit Ethernet, Frame Relay, ISDN Primary Rate Interface, ATM (Asynchronous Transfer Mode) and synchronous optical networking (SONET).',
 'question': 'What is available to customers with less demanding requirements?',
 'answers': {'text': [], 'answer_start': []}}

In [6]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "bert-base-uncased" checkpoint; the
# preprocessing function reads it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Convert a character-level answer span into token-level labels.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for one example.
        sequence_ids: Per-token sequence id for the same example; context
            tokens are the ones marked ``1``.
        start_char: Character index in the context where the answer begins.
        end_char: Character index in the context just past the answer.

    Returns:
        ``(start_token, end_token)`` indices of the answer, or ``(0, 0)`` when
        the answer is not fully contained in the tokenized context window.
    """
    # Without any context token there is nothing to point at.
    if 1 not in sequence_ids:
        return 0, 0

    # Find the first and last token that belong to the context.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must start at or after the first context character and end at
    # or before the last one. Checking start against start and end against end
    # also rejects answers that were cut off by truncation; otherwise the end
    # label would land one token past the context.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token whose start offset is <= start_char.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token whose end offset is >= end_char.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and attach answer labels.

    Questions are stripped of surrounding whitespace and encoded together with
    their contexts by the module-level ``tokenizer`` (max length 384, only the
    context is truncated, padded to the max length).

    Args:
        examples: A batch with ``"question"``, ``"context"`` and ``"answers"``
            columns; each answer holds ``"text"`` and ``"answer_start"`` lists.

    Returns:
        The tokenizer output with ``"offset_mapping"`` removed and
        ``"start_positions"`` / ``"end_positions"`` added. Examples with no
        answer, or whose first answer is not fully inside the tokenized
        context, are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # The offsets are only needed to build labels; they are not model inputs.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]
        if not answer["answer_start"]:
            # Unanswerable question: point both labels at token 0.
            start_token, end_token = 0, 0
        else:
            # Only the first listed answer is used.
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            start_token, end_token = _answer_token_span(
                offsets, inputs.sequence_ids(i), start_char, end_char
            )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [8]:
# Apply the preprocessing in batches and drop the original raw columns so that
# only the tokenizer outputs and the start/end labels remain.
tokenized_squad = squadv2_train.map(
    preprocess_function,
    batched=True,
    remove_columns=squadv2_train["train"].column_names,
)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [9]:
from transformers import DefaultDataCollator

# Preprocessing already pads every example (padding="max_length"), so the
# default collator is created with no extra options.
data_collator = DefaultDataCollator()

In [10]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load "bert-base-uncased" as a question-answering model. The load log below
# reports that some BertForQuestionAnswering weights are not initialized from
# the checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [11]:
# Fine-tuning configuration: 3 epochs, batch size 16 per device, evaluation at
# the end of every epoch.
training_args = TrainingArguments(
    output_dir="qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # NOTE(review): pushing requires being authenticated with the Hugging Face
    # Hub, and no login step is visible in this notebook — confirm before
    # running on a fresh kernel.
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

train_result = trainer.train()

# Save the final model into output_dir explicitly. The inference cell loads
# model="qa_model", which otherwise only works if a checkpoint happened to be
# written on the very last step.
# NOTE(review): with push_to_hub=True this save presumably also triggers an
# upload of the final model — confirm that is acceptable.
trainer.save_model()

# Keep the training summary as the cell's displayed output.
train_result

Cloning https://huggingface.co/dennischan/my_awesome_qa_model into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.6k/415M [00:00<?, ?B/s]

Download file runs/Apr11_22-27-08_7c630c5e513e/events.out.tfevents.1681252134.7c630c5e513e.2544.0: 100%|######…

Download file runs/Apr17_15-01-59_fb7d1df264ae/1681743830.9223688/events.out.tfevents.1681743830.fb7d1df264ae.…

Download file runs/Apr17_21-35-37_0a8d0bb7071c/1681767445.5179896/events.out.tfevents.1681767445.0a8d0bb7071c.…

Clean file runs/Apr11_22-27-08_7c630c5e513e/events.out.tfevents.1681252134.7c630c5e513e.2544.0:  17%|#7       …

Download file runs/Apr17_21-35-37_0a8d0bb7071c/events.out.tfevents.1681767445.0a8d0bb7071c.2447.0: 100%|######…

Download file runs/Apr07_01-54-02_949227d5caa0/1680832568.2259943/events.out.tfevents.1680832568.949227d5caa0.…

Clean file runs/Apr17_21-35-37_0a8d0bb7071c/1681767445.5179896/events.out.tfevents.1681767445.0a8d0bb7071c.244…

Download file runs/Apr11_09-40-49_0f131312c099/events.out.tfevents.1681206164.0f131312c099.203.0: 100%|#######…

Clean file runs/Apr17_15-01-59_fb7d1df264ae/1681743830.9223688/events.out.tfevents.1681743830.fb7d1df264ae.100…

Clean file runs/Apr17_21-35-37_0a8d0bb7071c/events.out.tfevents.1681767445.0a8d0bb7071c.2447.0:  14%|#3       …

Download file runs/Apr11_03-57-19_f79449a68ec5/1681185444.294855/events.out.tfevents.1681185444.f79449a68ec5.1…

Clean file runs/Apr07_01-54-02_949227d5caa0/1680832568.2259943/events.out.tfevents.1680832568.949227d5caa0.294…

Clean file runs/Apr11_09-40-49_0f131312c099/events.out.tfevents.1681206164.0f131312c099.203.0:  10%|9         …

Clean file runs/Apr11_03-57-19_f79449a68ec5/1681185444.294855/events.out.tfevents.1681185444.f79449a68ec5.1827…

Download file runs/Apr07_05-34-46_4a69bf2302d6/1680845781.66234/events.out.tfevents.1680845781.4a69bf2302d6.95…

Download file runs/Apr11_09-40-49_0f131312c099/1681206164.6751606/events.out.tfevents.1681206164.0f131312c099.…

Clean file runs/Apr07_05-34-46_4a69bf2302d6/1680845781.66234/events.out.tfevents.1680845781.4a69bf2302d6.950.1…

Download file runs/Apr09_06-34-03_5d3dc5e25849/1681022153.8240514/events.out.tfevents.1681022153.5d3dc5e25849.…

Clean file runs/Apr11_09-40-49_0f131312c099/1681206164.6751606/events.out.tfevents.1681206164.0f131312c099.203…

Download file runs/Apr11_03-59-42_f79449a68ec5/1681185588.1563518/events.out.tfevents.1681185588.f79449a68ec5.…

Clean file runs/Apr09_06-34-03_5d3dc5e25849/1681022153.8240514/events.out.tfevents.1681022153.5d3dc5e25849.438…

Clean file runs/Apr11_03-59-42_f79449a68ec5/1681185588.1563518/events.out.tfevents.1681185588.f79449a68ec5.196…

Download file runs/Apr05_11-52-25_5b4fb9393a0f/1680695550.959254/events.out.tfevents.1680695550.5b4fb9393a0f.1…

Download file runs/Apr11_03-18-39_f79449a68ec5/1681183237.3815334/events.out.tfevents.1681183237.f79449a68ec5.…

Clean file runs/Apr05_11-52-25_5b4fb9393a0f/1680695550.959254/events.out.tfevents.1680695550.5b4fb9393a0f.1699…

Clean file runs/Apr11_03-18-39_f79449a68ec5/1681183237.3815334/events.out.tfevents.1681183237.f79449a68ec5.618…

Download file runs/Apr05_12-41-10_5f36a7cd36a8/1680698475.5919452/events.out.tfevents.1680698475.5f36a7cd36a8.…

Clean file runs/Apr05_12-41-10_5f36a7cd36a8/1680698475.5919452/events.out.tfevents.1680698475.5f36a7cd36a8.315…

Download file runs/Apr11_03-59-42_f79449a68ec5/events.out.tfevents.1681185588.f79449a68ec5.19632.0: 100%|#####…

Clean file runs/Apr11_03-59-42_f79449a68ec5/events.out.tfevents.1681185588.f79449a68ec5.19632.0:  19%|#8      …

Download file runs/Apr05_12-24-12_5f36a7cd36a8/events.out.tfevents.1680697527.5f36a7cd36a8.315.0: 100%|#######…

Clean file runs/Apr05_12-24-12_5f36a7cd36a8/events.out.tfevents.1680697527.5f36a7cd36a8.315.0:  20%|#9        …

Download file runs/Apr09_06-34-03_5d3dc5e25849/events.out.tfevents.1681022153.5d3dc5e25849.4383.0: 100%|######…

Download file runs/Apr07_05-34-46_4a69bf2302d6/events.out.tfevents.1680845781.4a69bf2302d6.950.0: 100%|#######…

Clean file runs/Apr09_06-34-03_5d3dc5e25849/events.out.tfevents.1681022153.5d3dc5e25849.4383.0:  19%|#8       …

Download file runs/Apr17_15-01-59_fb7d1df264ae/events.out.tfevents.1681743830.fb7d1df264ae.1005.0: 100%|######…

Clean file runs/Apr07_05-34-46_4a69bf2302d6/events.out.tfevents.1680845781.4a69bf2302d6.950.0:  23%|##3       …

Clean file runs/Apr17_15-01-59_fb7d1df264ae/events.out.tfevents.1681743830.fb7d1df264ae.1005.0:  19%|#9       …

Download file runs/Apr11_22-27-08_7c630c5e513e/1681252134.864408/events.out.tfevents.1681252134.7c630c5e513e.2…

Clean file runs/Apr11_22-27-08_7c630c5e513e/1681252134.864408/events.out.tfevents.1681252134.7c630c5e513e.2544…

Download file runs/Apr05_12-24-12_5f36a7cd36a8/1680697527.1217728/events.out.tfevents.1680697527.5f36a7cd36a8.…

Clean file runs/Apr05_12-24-12_5f36a7cd36a8/1680697527.1217728/events.out.tfevents.1680697527.5f36a7cd36a8.315…

Download file runs/Apr05_11-52-25_5b4fb9393a0f/events.out.tfevents.1680695550.5b4fb9393a0f.1699.0: 100%|######…

Download file runs/Apr11_03-18-39_f79449a68ec5/events.out.tfevents.1681183237.f79449a68ec5.6180.0: 100%|######…

Clean file runs/Apr05_11-52-25_5b4fb9393a0f/events.out.tfevents.1680695550.5b4fb9393a0f.1699.0:  24%|##3      …

Clean file runs/Apr11_03-18-39_f79449a68ec5/events.out.tfevents.1681183237.f79449a68ec5.6180.0:  20%|##       …

Download file runs/Apr07_01-54-02_949227d5caa0/events.out.tfevents.1680832568.949227d5caa0.2945.0: 100%|######…

Clean file runs/Apr07_01-54-02_949227d5caa0/events.out.tfevents.1680832568.949227d5caa0.2945.0:  23%|##3      …

Download file runs/Apr05_12-41-10_5f36a7cd36a8/events.out.tfevents.1680698475.5f36a7cd36a8.315.2: 100%|#######…

Clean file runs/Apr05_12-41-10_5f36a7cd36a8/events.out.tfevents.1680698475.5f36a7cd36a8.315.2:  23%|##3       …

Download file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Download file runs/Apr11_03-57-19_f79449a68ec5/events.out.tfevents.1681185444.f79449a68ec5.18277.0: 100%|#####…

Clean file training_args.bin:  29%|##8       | 1.00k/3.50k [00:00<?, ?B/s]

Clean file runs/Apr11_03-57-19_f79449a68ec5/events.out.tfevents.1681185444.f79449a68ec5.18277.0: 100%|########…

Clean file pytorch_model.bin:   0%|          | 1.00k/415M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,1.4815,1.257535
2,1.0012,1.187109
3,0.7339,1.224427


TrainOutput(global_step=3000, training_loss=1.2157563171386718, metrics={'train_runtime': 4026.337, 'train_samples_per_second': 11.922, 'train_steps_per_second': 0.745, 'total_flos': 9406683242496000.0, 'train_loss': 1.2157563171386718, 'epoch': 3.0})

In [None]:
# Example 1: a symptoms question paired with a passage that reports symptom frequencies.
question1 = "What are the symptoms of infecting covid?"
context1 = "Cough (68%), fever/chills (58%), and shortness of breath (37%) reported most often. Median number of days from symptom onset to laboratory confirmed COVID-19 diagnosis was 4 days (range 0-26 days)"

In [None]:
from transformers import pipeline

# Build a question-answering pipeline from the model stored under "qa_model".
question_answerer = pipeline("question-answering", model="qa_model")

# Run inference once and reuse the result instead of calling the pipeline
# twice on the same input.
result1 = question_answerer(question=question1, context=context1)
answer1 = result1['answer']
result1

{'score': 0.2413320541381836,
 'start': 0,
 'end': 25,
 'answer': 'Cough (68%), fever/chills'}

In [None]:
# Example 2: a cause-of-death question paired with a passage about severe cases needing ICU care.
question2 = "What causes death from Covid-19?"
context2 = "Since the end of 2019 the Coronavirus Disease 2019 (COVID-19), caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), has spread globally affecting people worldwide. Patients with severe CO VID-19 require intensive care unit (ICU) admission for acute respiratory failure and over 10% need noninvasive and invasive mechanical ventilation."


In [None]:
# Run inference once and reuse the result instead of calling the pipeline
# twice on the same input.
result2 = question_answerer(question=question2, context=context2)
answer2 = result2['answer']
result2

{'score': 0.2504980266094208,
 'start': 73,
 'end': 106,
 'answer': 'severe acute respiratory syndrome'}

In [None]:
# Example 3: a "where" question paired with a passage that names the city of the first detected case.
question3 = "Where is the origin of covid?"
context3 = "On December 31, 2019, the first case of what would later be known as SARS-CoV-2 was detected in the city of Wuhan, China. By January 10, 2022, more than 305 million people had been infected with COVID- 19, leading to more than 5.4 million deaths."


In [None]:
# Run inference once and reuse the result instead of calling the pipeline
# twice on the same input.
result3 = question_answerer(question=question3, context=context3)
answer3 = result3['answer']
result3

{'score': 0.6742015480995178,
 'start': 108,
 'end': 120,
 'answer': 'Wuhan, China'}

In [None]:
# Example 4: an origin question paired with a passage that names the virus strain but no location.
question4 = "What is the origin of covid?"
context4 = "The coronavirus disease 2019 (COVID-19) is caused by a novel strain of SARS-CoV-2. More than 305 million people had been infected with the virus by January 10, 2022, leading to 5.4 million deaths."


In [None]:
# Run inference once and reuse the result instead of calling the pipeline
# twice on the same input.
result4 = question_answerer(question=question4, context=context4)
answer4 = result4['answer']
result4

{'score': 0.20548632740974426, 'start': 71, 'end': 81, 'answer': 'SARS-CoV-2'}

In [None]:
# Example 5: a "when" question paired with a passage that dates the outbreak.
question5 = "When was covid first discovered?"
context5 = "In December 2019, a new coronavirus disease named COVID-19 by the World Health Organization broke out in Wuhan, China. There is currently no evidence to support that these drugs may be effective in discouraging Covid-19."

In [None]:
# Run inference once and reuse the result instead of calling the pipeline
# twice on the same input.
result5 = question_answerer(question=question5, context=context5)
answer5 = result5['answer']
result5

{'score': 0.35809004306793213,
 'start': 3,
 'end': 16,
 'answer': 'December 2019'}

# Reference

Hugging Face. Question answering — Transformers documentation.
https://huggingface.co/docs/transformers/tasks/question_answering,
n.d. Accessed: Apr. 18, 2023.