<a href="https://colab.research.google.com/github/dinh-thang/COS30018-Project-C/blob/main/PubMedBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets

In [None]:
!pip install evaluate

In [None]:
!pip install accelerate

In [None]:
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import load_dataset
from transformers import DistilBertModel, BertModel
from transformers import DistilBertTokenizer, BertTokenizerFast, Trainer, TrainingArguments
from transformers import AutoModelForQuestionAnswering
from huggingface_hub import notebook_login
from transformers import default_data_collator


In [None]:
# Load the "train" split of the dataset identified by `dataset_name`.
dataset_name = "covid_qa_deepset"
dataset = load_dataset(path=dataset_name, split="train")

In [None]:
# Hold out 20% of the examples for evaluation. The seed is fixed so that the
# split (and therefore every downstream number) is reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# The tokenizer is loaded from the same checkpoint as the model.
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer_name = model_name

tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)

In [None]:
# Tokenization window: each feature holds at most `max_length` tokens, and
# consecutive windows over a long context overlap by `stride` tokens.
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and attach answer token labels.

    Each (question, context) pair is tokenized with only the context being
    truncated; a context that does not fit in one window overflows into
    several features. For every feature the start/end token indices of the
    first answer are recorded, or (0, 0) when that answer does not lie fully
    inside the feature's context window.

    Args:
        examples: a batch with "question", "context" and "answers" columns,
            where each answer entry has "text" and "answer_start" lists.

    Returns:
        The tokenizer output (without "offset_mapping" and
        "overflow_to_sample_mapping") extended with "start_positions" and
        "end_positions".
    """
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offsets_per_feature = encoded.pop("offset_mapping")
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    gold_answers = examples["answers"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offsets_per_feature):
        # Several features may originate from the same example.
        gold = gold_answers[feature_to_example[feature_idx]]
        answer_start = gold["answer_start"][0]
        answer_end = answer_start + len(gold["text"][0])
        segment_ids = encoded.sequence_ids(feature_idx)

        # Locate the first and last token belonging to the context (segment 1).
        context_start = 0
        while segment_ids[context_start] != 1:
            context_start += 1
        cursor = context_start
        while segment_ids[cursor] == 1:
            cursor += 1
        context_end = cursor - 1

        answer_inside_window = (
            offsets[context_start][0] <= answer_start
            and offsets[context_end][1] >= answer_end
        )
        if not answer_inside_window:
            # Answer is not fully contained in this window: label it (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token that starts at or before the answer.
        token_start = context_start
        while token_start <= context_end and offsets[token_start][0] <= answer_start:
            token_start += 1
        start_positions.append(token_start - 1)

        # Walk backward to the first token that ends at or after the answer.
        token_end = context_end
        while token_end >= context_start and offsets[token_end][1] >= answer_end:
            token_end -= 1
        end_positions.append(token_end + 1)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

In [None]:
# Tokenize the training split in batches and drop the raw columns so that only
# the features produced by the preprocessing function remain.
train_split = dataset["train"]
training_dataset = train_split.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_split.column_names,
)


In [None]:
# Tokenize the held-out split with the same preprocessing as the training
# split, so it carries start/end labels and can be scored by loss.
test_split = dataset["test"]
eval_dataset = test_split.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=test_split.column_names,
)


In [None]:
# Load the checkpoint named by `model_name` through the question-answering
# auto class; this is the model that gets fine-tuned below.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


In [None]:
# Authenticate with the Hugging Face Hub; the training arguments in this
# notebook set push_to_hub=True.
notebook_login()

In [None]:
# Fine-tuning configuration. `TrainingArguments` is already imported in the
# notebook's import cell, so it is not re-imported here.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    output_dir="qthang-finetuned",
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,  # requires the Hub login performed earlier
)


In [None]:
# Preprocessing tokenizes with padding="max_length", so the default collator
# is used to batch the features.
data_collator = default_data_collator


In [None]:
# Build the Trainer. No `compute_metrics` callback is passed: none is defined
# anywhere in this notebook (referencing one raises NameError on a fresh
# kernel), so evaluation reports the loss on `eval_dataset` only.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=training_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Display the tokenized evaluation split (feature names and row count).
eval_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 11617
})

In [None]:
from transformers import BertForQuestionAnswering

# Load the fine-tuned checkpoint from the Hub for inference.
# `trust_remote_code` is not needed for a built-in architecture and is left
# off so that no repository-supplied code can be executed.
# NOTE(review): this cell runs before `trainer.train()`, so the Hub repository
# must already contain a fine-tuned checkpoint from a previous run.
eval_model = BertForQuestionAnswering.from_pretrained("ThangDinh/qthang-finetuned")


In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

In [None]:
# Run evaluation on the eval_dataset that was passed to the Trainer.
trainer.evaluate()

{'eval_loss': 0.2763604521751404,
 'eval_runtime': 319.0849,
 'eval_samples_per_second': 36.407,
 'eval_steps_per_second': 2.278,
 'epoch': 3.0}

In [None]:
# Small sample (first 10 rows of the "train" split) used for the qualitative
# checks and metric experiments below.
# NOTE(review): these rows come from the same split the training data was
# drawn from, so some of them were probably seen during fine-tuning -- scores
# computed on this sample may be optimistic.
dataset_name = "covid_qa_deepset"

test_dataset = load_dataset(dataset_name, split="train[:10]")

In [None]:
from collections import Counter

from transformers import QuestionAnsweringPipeline


def _token_overlap_scores(pred_tokens, ref_tokens):
    """Return (precision, recall, f1) for two token-id sequences.

    Overlap is counted as a multiset intersection, so a token repeated in one
    sequence is only matched as often as it occurs in the other. Any empty
    sequence, or no overlap at all, yields (0.0, 0.0, 0.0).
    """
    if not pred_tokens or not ref_tokens:
        return 0.0, 0.0, 0.0
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0, 0.0, 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return precision, recall, 2 * precision * recall / (precision + recall)


pipeline = QuestionAnsweringPipeline(model=eval_model, tokenizer=tokenizer)

total_f1 = 0
total_accuracy = 0  # accumulates token-level precision of the predictions

for ins in test_dataset:
    ans = pipeline(question=ins['question'], context=ins['context'], max_answer_len=50, max_question_len=300)
    # Special tokens ([CLS]/[SEP]) are excluded: they appear in every encoding
    # and would otherwise count as shared tokens and inflate every score.
    ref_tokens = tokenizer(" " + ins["answers"]["text"][0], add_special_tokens=False)["input_ids"]
    ans_tokens = tokenizer(ans["answer"], add_special_tokens=False)["input_ids"]
    precision, recall, f1 = _token_overlap_scores(ans_tokens, ref_tokens)
    total_accuracy += precision
    total_f1 += f1
    print(tokenizer.decode(ans_tokens), "|", tokenizer.decode(ref_tokens), "|")
    print(f1)

# Average over the number of examples actually scored rather than a fixed 100.
n_examples = max(len(test_dataset), 1)
print("F1 average score:", total_f1 / n_examples)
print("Accuracy average score: ", total_accuracy / n_examples)



In [None]:
from transformers import QuestionAnsweringPipeline

pipeline = QuestionAnsweringPipeline(model=eval_model, tokenizer=tokenizer)

# Exact-match is a string-level metric, so it can compare answer texts
# directly. The `f1` metric in `evaluate` is a classification metric over
# integer labels and cannot score free-text answers.
exact_match_metric = evaluate.load("exact_match")

for example in test_dataset:
    question = example["question"]
    context = example["context"]
    references = example["answers"]["text"][0]  # first gold answer only

    model_predictions = pipeline(question=question, context=context, max_answer_len=50, max_question_len=300)
    exact_match_metric.add_batch(predictions=[model_predictions["answer"]], references=[references])
final_score = exact_match_metric.compute()

In [None]:
# Display the aggregated metric computed in the previous cell.
final_score

{'exact_match': 0.5}

In [None]:
# Read the question straight from the dataset instead of relying on a
# variable left over from an earlier loop.
test_dataset[9]["question"]  # original question

'How does Mannanose Binding Lectin (MBL) affect elimination of HIV-1 pathogen?'

In [None]:
# Ask a paraphrased version of example 9's question against the same context
# and compare the predicted span with the first gold answer.
sample = test_dataset[9]
question = "what is the effect of Mannanose Binding Lectin in the elimination of HIV-1 pathogen? "
context = sample["context"]
answer = sample["answers"]["text"][0]

result = pipeline(
    question=question,
    context=context,
    max_answer_len=100,
    max_question_len=300,
)

print(f"predicted:{result['answer']}")
print(f"reference:{answer}")



predicted:the C-type lectin receptor, dendritic cell-specific ICAM-grabbing non-integrin-related (DC-SIGNR, also known as CD209L or liver/lymph node–specific ICAM-grabbing non-integrin (L-SIGN)), can interact with pathogens including HIV-1 and is expressed at the maternal-fetal interface, we hypothesized that it could influence MTCT of HIV-1
reference:Mannose-binding lectin (MBL) is an innate immune receptor synthesised in the liver and secreted in the bloodstream in response to inflammation signal. MBL promotes pathogen elimination by opsonization and phagocytosis,


In [None]:
# Show the full pipeline output for the paraphrased question, not just the
# answer text.
print(result)

{'score': 3.246519997279762e-10, 'start': 477, 'end': 810, 'answer': 'the C-type lectin receptor, dendritic cell-specific ICAM-grabbing non-integrin-related (DC-SIGNR, also known as CD209L or liver/lymph node–specific ICAM-grabbing non-integrin (L-SIGN)), can interact with pathogens including HIV-1 and is expressed at the maternal-fetal interface, we hypothesized that it could influence MTCT of HIV-1'}


In [None]:
# Score the single prediction with the SQuAD metric, which reports token-level
# F1 and exact match for free-text answers. The `f1` metric in `evaluate` is a
# classification metric over integer labels and cannot take answer strings.
squad_metric = evaluate.load("squad")

f1_score = squad_metric.compute(
    predictions=[{"id": "0", "prediction_text": result["answer"]}],
    references=[{"id": "0", "answers": test_dataset[9]["answers"]}],
)

print(f1_score)


In [None]:
# BLEU over the predicted answers. "glue" cannot be loaded without a task
# configuration and scores classification labels, not generated text.
bleu = evaluate.load("bleu")

for example in test_dataset:
    question = example["question"]
    context = example["context"]
    references = example["answers"]["text"][0]  # first gold answer only

    model_predictions = pipeline(question=question, context=context, max_answer_len=50, max_question_len=300)
    # BLEU takes a list of reference texts per prediction.
    bleu.add_batch(predictions=[model_predictions["answer"]], references=[[references]])
final_score = bleu.compute()
