# BERT SQuAD 1.0 Model

In [None]:
# Pin the versions this notebook was last run with (see the install log below) so that a
# fresh runtime reproduces the same behavior. %pip targets the running kernel's environment.
%pip install -q transformers==4.28.1 datasets==2.11.0 evaluate==0.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading token

In [None]:
from datasets import load_dataset

# Load only the first 20,000 examples of the SQuAD `train` split
# (the download log reports 87,599 training examples in total).
squad = load_dataset("squad", split="train[:20000]")

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


In [None]:
# Hold out 20% of the loaded examples for evaluation. The fixed seed makes the
# shuffle-and-split reproducible across kernel restarts.
# This rebinds `squad`, so re-run the loading cell before re-running this one.
squad = squad.train_test_split(test_size=0.2, seed=42)

In [None]:
# Peek at one training record to see its fields: id, title, context, question, answers
# (answers holds parallel `text` and `answer_start` lists).
squad["train"][0]

{'id': '56d1179117492d1400aab90c',
 'title': 'New_York_City',
 'context': "New York City's public bus fleet is the largest in North America, and the Port Authority Bus Terminal, the main intercity bus terminal of the city, serves 7,000 buses and 200,000 commuters daily, making it the busiest bus station in the world.",
 'question': 'How many buses visit the Port Authority Bus Terminal each day?',
 'answers': {'text': ['7,000'], 'answer_start': [155]}}

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer belonging to the same `bert-base-uncased` checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Define a function to extract the inputs and labels for fine-tuning bert from squad dataset
def preprocess_function(examples):
    """Tokenize a batch of SQuAD examples and attach answer-span token labels.

    Each question is paired with its context and encoded to a fixed length of
    384 tokens (only the context may be truncated). The character-level answer
    span is then converted into start/end token indices.

    Args:
        examples: A batch (dict of lists) with "question", "context" and
            "answers" entries. Each item of "answers" holds parallel "text"
            and "answer_start" lists; only the first answer is used.

    Returns:
        The tokenizer output with "offset_mapping" removed and two lists
        added, "start_positions" and "end_positions" (one int per example).
        Examples whose answer is not fully contained in the (possibly
        truncated) context are labeled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]  # Drop stray leading/trailing whitespace
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",  # Truncate the context (second sequence), never the question
        return_offsets_mapping=True,
        padding="max_length",
    )

    # For every example: one (start_char, end_char) pair per token, giving the
    # token's character positions in the text it came from.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both answer boundaries must lie within the context's character span:
        # rejecting only answers that are entirely outside would let an answer
        # cut off by truncation through, with an end position pointing at the
        # token after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions.
            # Walk forward to the last token starting at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # Walk backward to the first token ending at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Apply preprocess_function to both splits in batches. The original columns are removed,
# so only the tokenizer outputs plus start_positions/end_positions remain.
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
# Collate the preprocessed examples into batches. Every example was already padded to a
# fixed length in preprocess_function (padding="max_length"), so a default collator is used.
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

In [None]:
# Load the pre-trained BERT checkpoint into a question-answering model. As the warning in
# the cell output notes, some weights of the QA model are not initialized from the
# checkpoint; they are trained during the fine-tuning below.
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [None]:
# Set up the training hyperparameters, build the Trainer, and run fine-tuning.
training_args = TrainingArguments(
    output_dir="qa_model",  # the inference cells later load the model from "qa_model"
    evaluation_strategy="epoch",  # yields the per-epoch validation loss in the output table
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # NOTE(review): this uploads to the Hugging Face Hub (see the "Cloning https://huggingface.co/..."
    # line in the output); presumably it needs a prior login, which this notebook does not show.
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],  # the 20% hold-out created by train_test_split
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# NOTE(review): in the recorded run the validation loss is lowest at epoch 2 (1.1025) and rises
# at epoch 3 (1.1313) while the training loss keeps falling — possibly overfitting; verify.
trainer.train()

Cloning https://huggingface.co/dennischan/qa_model into local empty directory.


Epoch,Training Loss,Validation Loss
1,1.3537,1.161598
2,0.9266,1.102535
3,0.6741,1.131263


TrainOutput(global_step=3000, training_loss=1.1336800842285155, metrics={'train_runtime': 3800.1971, 'train_samples_per_second': 12.631, 'train_steps_per_second': 0.789, 'total_flos': 9406683242496000.0, 'train_loss': 1.1336800842285155, 'epoch': 3.0})

In [None]:
# Spot-check 1: a COVID-19 question/context pair used to try out the fine-tuned model.
question1 = "What are the symptoms of infecting covid?"
context1 = "Cough (68%), fever/chills (58%), and shortness of breath (37%) reported most often. Median number of days from symptom onset to laboratory confirmed COVID-19 diagnosis was 4 days (range 0-26 days)"

In [None]:
from transformers import pipeline

# Build a question-answering pipeline from "qa_model", the output_dir used for training above.
question_answerer = pipeline("question-answering", model="qa_model")

# Run inference once: keep the full prediction for display and take the answer text from it.
result1 = question_answerer(question=question1, context=context1)
answer1 = result1['answer']
result1

{'score': 0.33816319704055786,
 'start': 0,
 'end': 25,
 'answer': 'Cough (68%), fever/chills'}

In [None]:
# Spot-check 2.
# NOTE(review): the context contains "CO VID-19" with a stray space (looks like a copy/paste
# artifact). It is left as-is because it is the exact input behind the recorded output.
question2 = "What causes death from Covid-19?"
context2 = "Since the end of 2019 the Coronavirus Disease 2019 (COVID-19), caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), has spread globally affecting people worldwide. Patients with severe CO VID-19 require intensive care unit (ICU) admission for acute respiratory failure and over 10% need noninvasive and invasive mechanical ventilation."


In [None]:
# Run inference once: keep the full prediction for display and take the answer text from it.
result2 = question_answerer(question=question2, context=context2)
answer2 = result2['answer']
result2

{'score': 0.3249039053916931,
 'start': 73,
 'end': 120,
 'answer': 'severe acute respiratory syndrome coronavirus 2'}

In [None]:
# Spot-check 3.
# NOTE(review): the context contains "COVID- 19" with a stray space (looks like a copy/paste
# artifact). It is left as-is because it is the exact input behind the recorded output.
question3 = "Where is the origin of covid?"
context3 = "On December 31, 2019, the first case of what would later be known as SARS-CoV-2 was detected in the city of Wuhan, China. By January 10, 2022, more than 305 million people had been infected with COVID- 19, leading to more than 5.4 million deaths."


In [None]:
# Run inference once: keep the full prediction for display and take the answer text from it.
result3 = question_answerer(question=question3, context=context3)
answer3 = result3['answer']
result3

{'score': 0.48505425453186035,
 'start': 108,
 'end': 120,
 'answer': 'Wuhan, China'}

In [None]:
# Spot-check 4: a variant of the previous question ("What" instead of "Where") with a context
# that names no place; the recorded prediction has a low score (about 0.09).
question4 = "What is the origin of covid?"
context4 = "The coronavirus disease 2019 (COVID-19) is caused by a novel strain of SARS-CoV-2. More than 305 million people had been infected with the virus by January 10, 2022, leading to 5.4 million deaths."


In [None]:
# Run inference once: keep the full prediction for display and take the answer text from it.
result4 = question_answerer(question=question4, context=context4)
answer4 = result4['answer']
result4

{'score': 0.08649849891662598, 'start': 71, 'end': 81, 'answer': 'SARS-CoV-2'}

In [None]:
# Spot-check 5: a "when" question whose answer appears at the start of the context.
question5 = "When was covid first discovered?"
context5 = "In December 2019, a new coronavirus disease named COVID-19 by the World Health Organization broke out in Wuhan, China. There is currently no evidence to support that these drugs may be effective in discouraging Covid-19."

In [None]:
# Run inference once: keep the full prediction for display and take the answer text from it.
result5 = question_answerer(question=question5, context=context5)
answer5 = result5['answer']
result5

{'score': 0.81943678855896, 'start': 3, 'end': 16, 'answer': 'December 2019'}

# Reference

Hugging Face. Question answering — Transformers documentation.
https://huggingface.co/docs/transformers/tasks/question_answering,
n.d. Accessed: Apr. 18, 2023.