In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb
!pip install --upgrade transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:

import torch
import torch.nn as nn
from transformers import AutoModelForQuestionAnswering

# Load PubMedBERT with an extractive question-answering head.
# The checkpoint does not contain the QA head (`qa_outputs.weight` / `qa_outputs.bias`),
# so those weights start out random and the model must be fine-tuned before use.
# `trust_remote_code=True` was removed: it allows code from the Hub repository to run
# locally, and it is not needed because the checkpoint loads as the built-in
# `BertForQuestionAnswering` class.
model = AutoModelForQuestionAnswering.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
)



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from datasets import load_dataset

# Take the first 5,000 SQuAD training examples and hold out 20% of them for evaluation.
# The raw subset gets its own name so that `squad` is only ever bound to the split
# DatasetDict, and the split is seeded so that a kernel restart reproduces the same
# train/test partition.
squad_subset = load_dataset("squad", split="train[:5000]")
squad = squad_subset.train_test_split(test_size=0.2, seed=42)

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the same PubMedBERT checkpoint the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
)

In [None]:
# Sanity check: show the available splits with their columns and row counts.
print(squad)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1000
    })
})


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of SQuAD-style examples and attach answer-span labels.

    Every question/context pair is encoded to 384 tokens: only the context may be
    truncated, and shorter inputs are padded. The character span of the first
    listed answer is then translated into token indices.

    Args:
        examples: Batch dict with "question", "context" and "answers" lists. Each
            entry of "answers" holds parallel "answer_start" and "text" lists.

    Returns:
        The tokenizer output (with "offset_mapping" removed) extended by
        "start_positions" and "end_positions". Examples that have no answer, or
        whose answer is not fully contained in the (possibly truncated) context,
        are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Unanswerable example (empty answer lists): use the (0, 0) label
        # instead of failing on the `[0]` lookups below.
        if not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The context must start at or before the answer start AND end at or after
        # the answer end. The previous check compared against the opposite bounds,
        # so an answer cut off by truncation fell through to the else-branch and
        # received a span ending on the token after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
# Run the preprocessing batch-wise over both splits. The raw columns are dropped
# so that only the tokenizer outputs and the span labels remain.
raw_columns = squad["train"].column_names
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)






Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from transformers import DefaultDataCollator

# Collator passed to the Trainer for building batches. Preprocessing already pads
# every example to the same length (padding="max_length"), so no padding-aware
# collator is requested here.
data_collator = DefaultDataCollator()

In [None]:
# Import the TrainingArguments class from the transformers library
from transformers import TrainingArguments

# Create an instance of TrainingArguments with the following parameters:
training_args = TrainingArguments(
    "bert-squadv2",  # Output directory for predictions and checkpoints. NOTE(review): the training data is the "squad" dataset, so the "v2" in the name may be misleading.

    per_device_train_batch_size=16,  # Batch size for training.
    per_device_eval_batch_size=16,  # Batch size for evaluation.

    evaluation_strategy="steps",  # Evaluation is done (and logged) every `eval_steps`.
    save_strategy="epoch",  # The model checkpoint is saved at the end of each epoch.

    learning_rate=3e-5,  # Learning rate for the optimizer.

    do_eval=True,  # Whether to run evaluation during training.

    eval_steps=5,  # Number of update steps between two evaluations. NOTE(review): this runs a full pass over the test split every 5 steps; consider a larger value.

    num_train_epochs=3,  # Total number of training epochs to perform.

    logging_steps=5,  # Number of update steps between two logs.

    # Mixed precision is only enabled when a CUDA device is present; a hard-coded
    # `fp16=True` makes this cell fail on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
)


In [None]:
from transformers import Trainer
import torch


# Bundle the model, hyper-parameters, tokenized splits, tokenizer and collator
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
)

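[REDACTED: a 40-character hexadecimal string was stored here. It looks like a pasted API key (for example the W&B key requested by the next cell). Revoke/rotate it and supply credentials via the login prompt or an environment variable instead of keeping them in the notebook.]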

In [None]:
# Run the fine-tuning loop. On the first run this prompts for a Weights & Biases
# API key; the returned TrainOutput carries the final step count and training metrics.
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
5,5.9623,5.808389
10,5.6934,5.437664
15,5.2457,4.854823
20,4.5796,4.285094
25,4.1507,3.991126
30,4.1134,3.744383
35,3.8076,3.501874
40,3.8445,3.071477
45,3.0969,2.647505
50,2.8899,2.566236


TrainOutput(global_step=750, training_loss=1.3882247740427653, metrics={'train_runtime': 673.0291, 'train_samples_per_second': 17.83, 'train_steps_per_second': 1.114, 'total_flos': 2351670810624000.0, 'train_loss': 1.3882247740427653, 'epoch': 3.0})

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; authenticate here before pushing to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Push the fine-tuned model to the Hugging Face Hub.
# The first positional argument of `Trainer.push_to_hub` is the commit message,
# not the repository name, so the old call `push_to_hub("bert-squad")` only set a
# commit message. The repository follows the output directory ("bert-squadv2").
# The message is now passed by keyword and describes the upload.
trainer.push_to_hub(commit_message="Fine-tune PubMedBERT on a 5k-example SQuAD subset")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/hung200504/bert-squadv2/tree/main/'

In [None]:

# Import the factory under an alias so that it is not overwritten by its own result.
# `pipeline = pipeline(...)` replaced the imported function with the pipeline
# instance, leaving one name to mean two different things.
from transformers import pipeline as build_pipeline

# Question-answering pipeline backed by the fine-tuned checkpoint on the Hub.
qa_pipeline = build_pipeline(
    "question-answering",
    model='hung200504/bert-squadv2',
    tokenizer='hung200504/bert-squadv2',
)

# Later cells call `pipeline(question=..., context=...)`, so keep that name bound
# to the QA pipeline instance.
pipeline = qa_pipeline


Downloading (…)lve/main/config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer


# Evaluation examples from the pre-processed CovidQA set (the dataset only ships a "train" split).
eval_dataset = load_dataset("pbaoo2705/covidqa_processed_eval", split="train")

# Tokenizer of the fine-tuned checkpoint, used below to split answers into tokens.
# `trust_remote_code=True` was removed: it is not needed for this checkpoint's
# standard tokenizer files and would allow code from the Hub repository to run locally.
tokenizer = AutoTokenizer.from_pretrained("hung200504/bert-squadv2")

Downloading readme:   0%|          | 0.00/805 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/730k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# Token-level precision / F1 of the QA pipeline on the evaluation set.
from collections import Counter


def token_overlap_scores(pred_tokens, ref_tokens):
    """Return (precision, recall, f1) for two token sequences.

    The overlap is counted as a multiset intersection, so a token repeated in
    both sequences is credited once per matching occurrence. All three scores
    are 0.0 when either sequence is empty or when nothing overlaps.
    """
    if not pred_tokens or not ref_tokens:
        return 0.0, 0.0, 0.0
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0, 0.0, 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return precision, recall, 2 * precision * recall / (precision + recall)


# Running sums over the evaluation set.
total_f1 = 0
total_precision = 0

# Loop through each instance in the evaluation dataset
for ins in eval_dataset:
  # Generate an answer from the question and the first context chunk of the instance
  ans = pipeline(question=ins['question'], context=ins['context_chunks'][0], max_answer_len=50, max_question_len=300)

  # Tokenize the reference and the generated answer WITHOUT special tokens.
  # With the default settings both sequences contain [CLS] and [SEP], which always
  # "match" and gave unrelated answers a non-zero score.
  ref_tokens = tokenizer(ins["answer"], add_special_tokens=False)["input_ids"]
  ans_tokens = tokenizer(ans["answer"], add_special_tokens=False)["input_ids"]

  precision, recall, f1 = token_overlap_scores(ans_tokens, ref_tokens)
  total_precision += precision
  total_f1 += f1

  # Print the decoded tokens of the generated answer and reference answer, then the F1 score
  print(tokenizer.decode(ans_tokens), "|", tokenizer.decode(ref_tokens), "|")
  print(f1)

# The second metric is token precision; the old code accumulated the same quantity
# under the name "accuracy". Keep that name available in case later cells read it.
total_accuracy = total_precision

# Print the average F1 score and precision
print("F1 average score:", total_f1 / eval_dataset.num_rows)
print("Precision average score: ", total_precision / eval_dataset.num_rows)


[CLS] avian influenza a ( h7n9 ) virus [SEP] | [CLS] some sporadic cases seemed to be a result of human to human transmissions [SEP] |
0.23076923076923075
[CLS] 1993 – 94 [SEP] | [CLS] appears to take a long time [SEP] |
0.3076923076923077
[CLS] the number of infections and the impact of non - pharmaceutical interventions on covid - 19 in 11 european countries [SEP] | [CLS] 3. 2 % [ 1. 3 % - 7. 6 % ] [SEP] |
0.15
[CLS] increases airway inflammation [SEP] | [CLS] oxidative stress which will further increase the local inflammation in the airway. [SEP] |
0.4
[CLS] pedv ) spreads by fecal – oral contact and can be prevented by oral immunization. therefore, it is necessary to develop an effective oral vaccine against pedv infection. [SEP] | [CLS] pro - inflammatory cytokines [SEP] |
0.09756097560975609
[CLS] pathogenicity [SEP] | [CLS] to fundamental requirements of replication [SEP] |
0.4
[CLS] real - time reverse transcription polymerase chain reaction [SEP] | [CLS] ( a ) reverse transcri

In [None]:
# Spot-check the pipeline on a hand-written clinical passage.
question = "When to continue opioid therapy?"
context = "Opioids are not first-line or routine therapy for chronic pain. Establish treatment goals before starting opioid therapy and a plan if therapy is discontinued. Only continue opioid if there is clinically meaningful improvement in pain and function. Discuss risks, benefits and responsibilities for managing therapy before starting and during treatment."

demo_prediction = pipeline(
    question=question,
    context=context,
    max_answer_len=300,
    max_question_len=300,
)
print(demo_prediction)

{'score': 0.7781141996383667, 'start': 181, 'end': 247, 'answer': 'if there is clinically meaningful improvement in pain and function'}


In [None]:
# Average sentence-level BLEU of the predicted answers against the reference answers.
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Short answers often have no matching 3-/4-grams; smoothing keeps their BLEU
# from collapsing to zero (and avoids the corresponding NLTK warnings).
smoothing = SmoothingFunction().method1

total_bleu = 0
for ins in eval_dataset:
  # Read the same fields as the F1 evaluation cell ('context_chunks' / 'answer').
  # The previous keys ('context' / 'answer_text') made this cell fail with a KeyError.
  ans = pipeline(question=ins['question'], context=ins['context_chunks'][0], max_answer_len=350, max_question_len=50)

  # `sentence_bleu` expects token lists; passing raw strings would score
  # character n-grams instead of word n-grams.
  reference_tokens = ins["answer"].split()
  candidate_tokens = ans["answer"].split()

  bleu = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)
  total_bleu += bleu

print("BLEU average score:", total_bleu / eval_dataset.num_rows)

KeyError: ignored