<a href="https://colab.research.google.com/github/dinh-thang/COS30018-Project-C/blob/main/Falcon_1b_with_Biomed_qa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Full-finetuning Distilled Bert on Covid_qa_deepset dataset**

Load and install necessary libraries and dependancies

In [None]:
!pip install -q transformers einops
!pip install transformers[torch]
!pip install datasets wandb
!pip install --upgrade transformers
!pip install evaluate numpy

# Load the dataset

Start by loading the Covid_qa_deepset dataset from Huggingface. 70% of the dataset is extracted for training. 303 rows (not overlap with the training set) will be used for validation set

In [None]:
from datasets import load_dataset

dataset_name = "covid_qa_deepset"
dataset = load_dataset(dataset_name)
dataset = dataset["train"].train_test_split(train_size=1413,test_size=303)

dataset = dataset.filter(lambda example: not "answer_category" in example['answers'])

Load the tokenizer from the model

In [None]:
from transformers import AutoTokenizer

# load distilled bert tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")

Preprocess the dataset based on the guideline provided by Huggingface

In [None]:
def preprocess_training(example):
    inputs = tokenizer(
        example['question'],
        example["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = example["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_eval(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

training_dataset = dataset['train']
training_dataset = training_dataset.map(preprocess_training, batched=True, remove_columns=dataset['train'].column_names)

eval_dataset = dataset['test']
eval_dataset = eval_dataset.map(preprocess_training, batched=True, remove_columns=dataset['test'].column_names)


The last 50 rows of the dataset will be used for testing, which is unseen by model. This is to give unbiased evaluation for the model's performance

In [75]:
test_dataset = load_dataset(dataset_name, split="train[1969:]") #test dataset is extracted from last 50 rows of CovidQA dataset

# Check the format of datasets after preprocess



In [None]:
training_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 1413
})

In [None]:
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 303
})

In [76]:
test_dataset

Dataset({
    features: ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers'],
    num_rows: 50
})

# Load the model

The model used is a variant of Bert, with 40% less parameters than typical Bert. This model is also finetuned with SQuAD dataset before.

In [None]:
import torch
import transformers
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

model_name = "distilbert-base-uncased-distilled-squad"


model = DistilBertForQuestionAnswering.from_pretrained(
    model_name,
)
model.config.use_cache = False


# Load evaluation metrics

Load evaluation metric for the validation set. It can be either accuracy or F1 score

In [42]:
import evaluate
import numpy as np

metric = evaluate.load("f1")
#metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions[0], references=labels[0], average="macro")

# Start the training

Set up data collator for data enumeration

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

Configure the training arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="Distilled_bert_CovidQA_model",
    evaluation_strategy = "steps",
    save_steps = 10,
    logging_steps = 10,
    max_steps = 500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)


In [46]:
from transformers import Trainer

max_seq_length = 2048

trainer = Trainer(
    model=model,
    train_dataset=training_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    data_collator = data_collator
)

Log in to wandb to configure training loss visualisation and save the monitor

In [None]:
import wandb
wandb.login(key="2ba90e109c0da4331467290174e54e3483f0d494")
wandb.init(project="QLoRA covidQA")

In [None]:
trainer.train()


Step,Training Loss,Validation Loss,Accuracy
10,1.1265,0.906048,0.80198
20,0.6093,0.718573,0.841584
30,0.6581,0.718396,0.838284
40,0.8716,0.726818,0.828383
50,0.8199,0.686123,0.864686
60,0.9202,0.678616,0.851485
70,0.6646,0.650183,0.851485
80,0.5494,0.663274,0.848185
90,0.763,0.688111,0.844884
100,0.5091,0.664566,0.854785


TrainOutput(global_step=500, training_loss=0.38069887399673463, metrics={'train_runtime': 9669.35, 'train_samples_per_second': 0.827, 'train_steps_per_second': 0.052, 'total_flos': 1038038879754240.0, 'train_loss': 0.38069887399673463, 'epoch': 5.62})

#Run evaluation

This is evaluation on validation dataset, which is somewhat biased after finetuning as it has already been seen by model

In [81]:
#evaluation with validation dataset
trainer.evaluate()

{'eval_loss': 0.8438284397125244,
 'eval_f1': 0.294705621956618,
 'eval_runtime': 96.3285,
 'eval_samples_per_second': 3.145,
 'eval_steps_per_second': 0.197}

This is evaluation on test dataset, which has not been seen by the model

In [None]:
#evaluation with test dataset (unseen by model)
from transformers import QuestionAnsweringPipeline

pipeline = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)

total_f1 = 0
total_accuracy = 0

for ins in test_dataset:
  ans = pipeline(question=ins['question'], context=ins['context'], max_answer_len=50, max_question_len=300)
  ref_tokens = tokenizer(" " + ins["answers"]["text"][0])["input_ids"]
  ans_tokens = tokenizer(ans["answer"])["input_ids"]
  common_tokens = set(ans_tokens) & set(ref_tokens)
  precision = len(common_tokens) / len(ans_tokens)
  recall = len(common_tokens) / len(ref_tokens)
  total_accuracy += precision
  print(tokenizer.decode(ans_tokens), "|", tokenizer.decode(ref_tokens), "|")
  if (len(common_tokens) == 0):
    total_f1 += 0
    print(0)
  else:
    f1 = 2 * precision * recall / (precision + recall)
    total_f1 += f1
    print(f1)

In [83]:
print("F1 average score:", total_f1 / 50)
print("Accuracy average score: ", total_accuracy / 50)

F1 average score: 0.633776704091313
Accuracy average score:  0.7655031439244173


# Inference

Test inference for the model before and after finetuning

In [52]:
# Before training
from transformers import pipeline, QuestionAnsweringPipeline

question = "Where did Covid-19 originated"
context = 'According to epidemiological studies, the Huanan Market in Wuhan was the early and main epicentre of SARS‐CoV‐2 infection'
print(question)

pipe = pipeline(model="distilbert-base-uncased-distilled-squad")
pipe(question=question, context=context, max_answer_len=300, max_question_len=300)

Where did Covid-19 originated


{'score': 0.35626348853111267,
 'start': 42,
 'end': 64,
 'answer': 'Huanan Market in Wuhan'}

In [53]:
#After training
from transformers import QuestionAnsweringPipeline
question = "Where did Covid-19 originated"
context = "According to epidemiological studies, the Huanan Market in Wuhan was the early and main epicentre of SARS‐CoV‐2 infection"
print(question)
#print(context)

pipe2 = QuestionAnsweringPipeline(model=model, tokenizer = tokenizer)
pipe2(question=question, context=context, max_answer_len=300, max_question_len=300)

Where did Covid-19 originated


{'score': 0.4766525328159332,
 'start': 42,
 'end': 64,
 'answer': 'Huanan Market in Wuhan'}