# **Full finetuning PubMedBERT with Covid_qa_deepset dataset**
>Author: Quang Thang Dinh - Team 4 | Option C project


Install the necessary libraries and dependencies

In [None]:
!pip install transformers datasets
!pip install evaluate
!pip install accelerate
!pip install evaluate

In [3]:
from transformers import DistilBertTokenizer, BertTokenizerFast, Trainer, TrainingArguments
from transformers import AutoModelForQuestionAnswering
from transformers import DistilBertModel, BertModel
from transformers import QuestionAnsweringPipeline
from transformers import BertForQuestionAnswering
from transformers import default_data_collator
from huggingface_hub import notebook_login
from transformers import TrainingArguments
from datasets import load_dataset
from evaluate import evaluator
from evaluate import load
import pandas as pd
import numpy as np
import evaluate
import torch

The model I'm going to use is PubMedBERT from Microsoft https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext.

The dataset used for fine-tuning is covid_qa_deepset, with the last 50 examples reserved for evaluation. This leaves a total of 1969 examples for training, which are divided with a train_test_split of 0.2 (80% training, 20% validation).

In [7]:
# general constants
dataset_name = "covid_qa_deepset"
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

# loading the dataset and tokenizer
# Only the first 1969 examples of the "train" split are used for fine-tuning.
dataset = load_dataset(dataset_name, split="train[:1969]")
# A fixed seed keeps the 80/20 train/validation split identical across runs,
# so reported training and validation numbers are reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)

Load the model for question answering task

In [None]:
# Load the checkpoint named by model_name through the question-answering Auto class.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Preprocessing function that formats the input dataset. A detailed explanation of this process is covered in the team's report.

In [9]:
# data preprocess function
def preprocess_data(examples):
    """Tokenize a batch of QA examples and label each feature with its answer span.

    Questions are stripped and tokenized together with their contexts. Only
    the context is truncated (max_length=512, stride=256) and overflowing
    tokens are returned, so one example can yield several features.

    For every feature the start and end token indices of the first answer are
    stored. When the answer's character span is not fully inside the feature's
    context window, both positions are set to 0.

    Args:
        examples: batch with "question", "context" and "answers" columns;
            each answer provides "answer_start" and "text" lists.

    Returns:
        The tokenizer output without "offset_mapping" and
        "overflow_to_sample_mapping", extended with "start_positions" and
        "end_positions".
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        stride=256,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for feature_idx, offsets in enumerate(offset_mapping):
        # Map the feature back to the example it was cut from.
        answer = answers[sample_map[feature_idx]]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(feature_idx)

        # Find the first and last token whose sequence id is 1 (the context).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = context_start
        while sequence_ids[context_end] == 1:
            context_end += 1
        context_end -= 1

        answer_in_window = (
            offsets[context_start][0] <= start_char
            and offsets[context_end][1] >= end_char
        )
        if not answer_in_window:
            # The answer is not fully contained in this window: label (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token that starts at or before the answer start.
        token_idx = context_start
        while token_idx <= context_end and offsets[token_idx][0] <= start_char:
            token_idx += 1
        start_positions.append(token_idx - 1)

        # Walk backward to the first token that ends at or after the answer end.
        token_idx = context_end
        while token_idx >= context_start and offsets[token_idx][1] >= end_char:
            token_idx -= 1
        end_positions.append(token_idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Run preprocess_data over both splits in batches. The original columns are
# removed so that only the features returned by preprocess_data remain.
training_dataset = dataset["train"].map(
    preprocess_data,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
eval_dataset = dataset["test"].map(
    preprocess_data,
    batched=True,
    remove_columns=dataset["test"].column_names,
)

After preprocessing, the dataset has the following format:

In [18]:
# Display the preprocessed training set (its features and number of rows).
training_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 43578
})

Below are the training arguments and the data collator.

In [13]:
# Where the run is written and whether it is pushed to the Hub.
output_dir = "qthang-finetuned"
push_to_hub = False

# Training schedule: evaluation runs once per epoch.
evaluation_strategy = "epoch"
num_train_epochs = 3
per_device_train_batch_size = 4
per_device_eval_batch_size = 4

# Optimizer settings.
learning_rate = 2e-5
weight_decay = 0.01

args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy=evaluation_strategy,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    push_to_hub=push_to_hub,
)

# The features are already padded to a fixed length by the preprocessing
# step, so the default collator is used as-is.
data_collator = default_data_collator


In [14]:
# Wire the model, training arguments, preprocessed splits, collator and
# tokenizer into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=training_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Start fine-tuning with the configured Trainer.
trainer.train()

Loading the finetuned model and dataset for evaluation

In [None]:
# Identifier of the fine-tuned checkpoint to evaluate.
model_finetuned = "ThangDinh/qthang-finetuned"

# BertForQuestionAnswering is a built-in architecture, so no custom code from
# the checkpoint repository is required. `trust_remote_code` only applies to
# the Auto classes and is therefore not passed here.
eval_model = BertForQuestionAnswering.from_pretrained(model_finetuned)

In [19]:
dataset_name = "covid_qa_deepset"

# Examples from index 1969 onward were excluded from fine-tuning and form the
# test set. The shuffle is seeded so the example order, and therefore any
# index-based sample such as test_dataset[9], is the same on every run.
test_dataset = load_dataset(dataset_name, split="train[1969:]").shuffle(seed=42)

Calculating the BLEU and exact match scores of the model. To calculate exact match, replace the metric string with "exact_match".

In [None]:
metric = load("bleu")

# Build the QA pipeline in this cell so it does not rely on a later cell
# having been executed first.
pipeline = QuestionAnsweringPipeline(model=eval_model, tokenizer=tokenizer)

for example in test_dataset:
    # Plain strings are passed so the pipeline returns a single answer dict.
    question = example["question"]
    context = example["context"]
    references = example["answers"]["text"][0]

    model_predictions = pipeline(question=question, context=context, max_answer_len=50, max_question_len=300)
    # One prediction/reference pair is accumulated per test example.
    metric.add_batch(predictions=[model_predictions["answer"]], references=[references])
final_score = metric.compute()
# Show the aggregated score; without this the result is computed but never reported.
print(final_score)


Function to calculate the f1 and accuracy score based on HuggingFace documentation https://huggingface.co/spaces/evaluate-metric/f1 by team member Quoc Bao Pham.

In [None]:
from collections import Counter

pipeline = QuestionAnsweringPipeline(model=eval_model, tokenizer=tokenizer)

total_f1 = 0
total_accuracy = 0

for ins in test_dataset:
    ans = pipeline(question=ins['question'], context=ins['context'], max_answer_len=50, max_question_len=300)
    # Special tokens are excluded: they would appear in both sequences, count
    # as shared tokens and inflate every score.
    ref_tokens = tokenizer(" " + ins["answers"]["text"][0], add_special_tokens=False)["input_ids"]
    ans_tokens = tokenizer(ans["answer"], add_special_tokens=False)["input_ids"]
    # Multiset intersection so repeated tokens are counted consistently with
    # the list lengths used as denominators.
    common_tokens = Counter(ans_tokens) & Counter(ref_tokens)
    num_common = sum(common_tokens.values())
    # Guard against empty token lists to avoid a division by zero.
    precision = num_common / len(ans_tokens) if ans_tokens else 0.0
    recall = num_common / len(ref_tokens) if ref_tokens else 0.0
    # NOTE: the value reported as "accuracy" is the token-level precision.
    total_accuracy += precision
    print(tokenizer.decode(ans_tokens), "|", tokenizer.decode(ref_tokens), "|")
    if num_common == 0:
        total_f1 += 0
        print(0)
    else:
        f1 = 2 * precision * recall / (precision + recall)
        total_f1 += f1
        print(f1)

# Average over the actual number of evaluated examples rather than a
# hard-coded count.
num_examples = max(len(test_dataset), 1)
print("F1 average score:", total_f1 / num_examples)
print("Accuracy average score: ", total_accuracy / num_examples)

Finally, run a sample inference on the example at index 9 of the test dataset

In [None]:
# Take one held-out example; the question must come from the "question"
# field, not from the context.
question = test_dataset[9]["question"]
context = test_dataset[9]["context"]
answer = test_dataset[9]["answers"]["text"][0]

result = pipeline(question=question, context=context, max_answer_len=100, max_question_len=300)

# Print the model's answer next to the first reference answer for comparison.
print("predicted:" + result["answer"])
print("reference:" + answer)