In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb
!pip install --upgrade transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained("hung200504/bert-covid-10", trust_remote_code=True)

#Freeze all layers
for param in model.parameters():
    param.requires_grad = False

#Unfreeze qa_ouputs layers
for param in model.qa_outputs.parameters():
    param.requires_grad = True



In [None]:
from datasets import load_dataset

dataset = load_dataset("minh21/cpgQA-v1.0-unique-context", split = "train")
eval_dataset = load_dataset("minh21/cpgQA-v1.0-unique-context", split = "test")



Downloading readme:   0%|          | 0.00/732 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/147k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/871 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/226 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hung200504/bert-covid-10", trust_remote_code=True)




In [None]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answer_text"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = examples["answer_start"][0]
        end_char = examples["answer_start"][0] + len(examples["answer_text"])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)



Map:   0%|          | 0/871 [00:00<?, ? examples/s]

Map:   0%|          | 0/226 [00:00<?, ? examples/s]

In [None]:
print(dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 871
})


In [None]:
# Import the TrainingArguments class from the transformers library
from transformers import TrainingArguments

# Create an instance of TrainingArguments with the following parameters:
training_args = TrainingArguments(
    "bert-covid-10",  # The output directory where the model predictions and checkpoints will be written.

    per_device_train_batch_size = 16,  # Batch size for training.
    per_device_eval_batch_size = 16,  # Batch size for evaluation.

    evaluation_strategy="steps",  # Evaluation is done (and logged) every `eval_steps`.
    save_strategy="epoch",  # The model checkpoint is saved at the end of each epoch.

    learning_rate=3e-5,  # Learning rate for the optimizer.

    do_eval=True,  # Whether to run evaluation during training.

    eval_steps = 5,  # Number of update steps between two evaluations.

    num_train_epochs=1,  # Total number of training epochs to perform.

    logging_steps=5,  # Number of update steps between two logs.

    fp16=True,  # Whether to use 16-bit (mixed) precision training instead of 32-bit training.
)


In [None]:
from transformers import Trainer
import torch


trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=eval_dataset,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

4311af5440e688d6054079f35d07feb6669756be

In [None]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
5,10.2916,11.552251
10,10.1994,11.49424
15,9.7621,11.44247
20,10.0828,11.39693
25,9.9806,11.356741
30,10.0913,11.322533
35,9.4998,11.294442
40,9.4188,11.272215
45,9.7955,11.256088
50,9.6201,11.24546


TrainOutput(global_step=55, training_loss=9.876051191850141, metrics={'train_runtime': 30.9171, 'train_samples_per_second': 28.172, 'train_steps_per_second': 1.779, 'total_flos': 170692106337792.0, 'train_loss': 9.876051191850141, 'epoch': 1.0})

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#Push model to hub
trainer.push_to_hub("bert-cased")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

'https://huggingface.co/hung200504/test-bert-4/tree/main/'

In [None]:

from transformers import  pipeline

pipeline = pipeline("question-answering", model="bioformers/bioformer-litcovid", tokenizer="bioformers/bioformer-litcovid")


Downloading (…)lve/main/config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

In [None]:
# Initialize total F1 score and accuracy to 0
total_f1 = 0
total_accuracy = 0

# Loop through each instance in the evaluation dataset
for ins in eval_dataset:
  # Generate an answer using a pipeline with the question and context from the instance
  ans = pipeline(question=ins['question'], context=ins['context'], max_answer_len=50, max_question_len=300)

  # Tokenize the reference answer and the generated answer
  ref_tokens = tokenizer(" " + ins["answer_text"])["input_ids"]
  ans_tokens = tokenizer(ans["answer"])["input_ids"]

  # Find the common tokens between the reference answer and the generated answer
  common_tokens = set(ans_tokens) & set(ref_tokens)

  # Calculate precision and recall
  precision = len(common_tokens) / len(ans_tokens)
  recall = len(common_tokens) / len(ref_tokens)

  # Add the precision to the total accuracy
  total_accuracy += precision

  # Print the decoded tokens of the generated answer and reference answer
  print(tokenizer.decode(ans_tokens), "|", tokenizer.decode(ref_tokens), "|")

  # If there are no common tokens, add 0 to the total F1 score, else calculate F1 score and add it to the total F1 score
  if (len(common_tokens) == 0):
    total_f1 += 0
    print(0)
  else:
    f1 = 2 * precision * recall / (precision + recall)
    total_f1 += f1
    print(f1)

# Print the average F1 score and accuracy
print("F1 average score:", total_f1 / eval_dataset.num_rows)
print("Accuracy average score: ", total_accuracy / eval_dataset.num_rows)


[CLS] are lung disease, sleep apnea, liver disease, renal disease [SEP] | [CLS] lung disease, sleep apnea, liver disease, renal disease, fall risk, advanced age [SEP] |
0.5454545454545454
[CLS] consider tapering opioids [SEP] | [CLS] consider tapering opioids [SEP] |
1.0
[CLS] ptsd, depression, anxiety [SEP] | [CLS] ptsd, depression, anxiety [SEP] |
0.8571428571428571
[CLS] discuss the risks of continued use [SEP] | [CLS] the risks of continued use, along with possible benefits [SEP] |
0.7000000000000001
[CLS] alcohol use disorder ( aud ), opioid use disorder ( oud ), and / or a use disorder involving other substances [SEP] | [CLS] alcohol use disorder ( aud ), opioid use disorder ( oud ), and / or a use disorder involving other substances [SEP] |
0.7307692307692306
[CLS] benzodiazepines [SEP] | [CLS] benzodiazepines [SEP] |
1.0
[CLS] special attention [SEP] | [CLS] special attention must be given to ensure that the veteran does not feel abandoned [SEP] |
0.38095238095238093
[CLS] when

In [None]:
context = "Opioids are not first-line or routine therapy for chronic pain. Establish treatment goals before starting opioid therapy and a plan if therapy is discontinued. Only continue opioid if there is clinically meaningful improvement in pain and function. Discuss risks, benefits and responsibilities for managing therapy before starting and during treatment."
question = "When to continue opioid therapy?"
print(pipeline(question=question, context=context, max_answer_len=300, max_question_len=300))

{'score': 0.8287140727043152, 'start': 181, 'end': 247, 'answer': 'if there is clinically meaningful improvement in pain and function'}


In [None]:
#Get BLEU score
from nltk.translate.bleu_score import sentence_bleu

total_bleu = 0
for ins in eval_dataset:
  ans = pipeline(question=ins['question'], context=ins['context'], max_answer_len=350, max_question_len=50)
  bleu = sentence_bleu([ins["answer_text"]], ans["answer"])
  total_bleu += bleu

print("BLEU average score:", total_bleu / eval_dataset.num_rows)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU average score: 0.576891102218069
