# BioBERT based model

#### Written by Carlos Cuevas Villarmín

Last update: 29/01/2024

**0. Load the Dataset.**

In [2]:
from datasets import load_dataset

dataset = load_dataset("cuevascarlos/PICO-breast-cancer")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 808
    })
    valid: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 101
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 102
    })
})

In [4]:
example = dataset['train'][0]
example

{'id': 0,
 'tokens': ['A',
  'multicenter',
  'randomized',
  'trial',
  'of',
  'the',
  'effects',
  'of',
  'exercise',
  'dose',
  'and',
  'type',
  'on',
  'psychosocial',
  'distress',
  'in',
  'breast',
  'cancer',
  'patients',
  'undergoing',
  'chemotherapy',
  '.',
  'Exercise',
  'may',
  'improve',
  'psychosocial',
  'distress',
  'in',
  'patients',
  'with',
  'cancer',
  ';',
  'however',
  ',',
  'few',
  'studies',
  'have',
  'examined',
  'the',
  'effects',
  'of',
  'different',
  'types',
  'or',
  'doses',
  'of',
  'exercise',
  ',',
  'or',
  'whether',
  'exercise',
  'effects',
  'are',
  'related',
  'to',
  'baseline',
  'depression',
  'levels',
  '.',
  'In',
  'a',
  'multicenter',
  'trial',
  'in',
  'Canada',
  ',',
  'we',
  'randomized',
  '301',
  'patients',
  'with',
  'breast',
  'cancer',
  'initiating',
  'chemotherapy',
  'to',
  'thrice',
  'weekly',
  ',',
  'supervised',
  'exercise',
  'consisting',
  'of',
  'either',
  'a',
  'stand

In [5]:
#Same dictionaries when Dataset was created
tag2idx  = {'I-iv-bin-percent': 0, 'I-control-participants': 1, 'B-cv-bin-percent': 2, 'B-eligibility': 3, 'B-age': 4, 'I-age': 5, 'I-intervention-participants': 6, 'B-cv-cont-q1': 7, 'I-cv-cont-q3': 8, 'B-iv-cont-mean': 9, 'B-location': 10, 'I-condition': 11, 'I-cv-bin-percent': 12, 'B-iv-cont-q1': 13, 'I-control': 14, 'O': 15, 'B-cv-cont-sd': 16, 'I-intervention': 17, 'B-total-participants': 18, 'I-iv-cont-mean': 19, 'B-iv-cont-sd': 20, 'I-cv-bin-abs': 21, 'I-outcome': 22, 'B-outcome-Measure': 23, 'I-iv-cont-q3': 24, 'B-cv-cont-median': 25, 'B-intervention': 26, 'B-cv-bin-abs': 27, 'B-condition': 28, 'I-cv-cont-sd': 29, 'B-control-participants': 30, 'B-iv-cont-median': 31, 'B-intervention-participants': 32, 'I-iv-cont-median': 33, 'B-iv-cont-q3': 34, 'I-outcome-Measure': 35, 'B-cv-cont-mean': 36, 'I-iv-bin-abs': 37, 'I-iv-cont-sd': 38, 'B-control': 39, 'B-ethinicity': 40, 'I-cv-cont-median': 41, 'I-location': 42, 'Entity': 43, 'I-ethinicity': 44, 'I-cv-cont-mean': 45, 'B-iv-bin-percent': 46, 'B-iv-bin-abs': 47, 'I-total-participants': 48, 'B-outcome': 49, 'I-eligibility': 50, 'B-cv-cont-q3': 51}
idx2tag = {0: 'I-iv-bin-percent', 1: 'I-control-participants', 2: 'B-cv-bin-percent', 3: 'B-eligibility', 4: 'B-age', 5: 'I-age', 6: 'I-intervention-participants', 7: 'B-cv-cont-q1', 8: 'I-cv-cont-q3', 9: 'B-iv-cont-mean', 10: 'B-location', 11: 'I-condition', 12: 'I-cv-bin-percent', 13: 'B-iv-cont-q1', 14: 'I-control', 15: 'O', 16: 'B-cv-cont-sd', 17: 'I-intervention', 18: 'B-total-participants', 19: 'I-iv-cont-mean', 20: 'B-iv-cont-sd', 21: 'I-cv-bin-abs', 22: 'I-outcome', 23: 'B-outcome-Measure', 24: 'I-iv-cont-q3', 25: 'B-cv-cont-median', 26: 'B-intervention', 27: 'B-cv-bin-abs', 28: 'B-condition', 29: 'I-cv-cont-sd', 30: 'B-control-participants', 31: 'B-iv-cont-median', 32: 'B-intervention-participants', 33: 'I-iv-cont-median', 34: 'B-iv-cont-q3', 35: 'I-outcome-Measure', 36: 'B-cv-cont-mean', 37: 'I-iv-bin-abs', 38: 'I-iv-cont-sd', 39: 'B-control', 40: 'B-ethinicity', 41: 'I-cv-cont-median', 42: 'I-location', 43: 'Entity', 44: 'I-ethinicity', 45: 'I-cv-cont-mean', 46: 'B-iv-bin-percent', 47: 'B-iv-bin-abs', 48: 'I-total-participants', 49: 'B-outcome', 50: 'I-eligibility', 51: 'B-cv-cont-q3'}

In [6]:
#Load tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
tokenizer

BertTokenizerFast(name_or_path='dmis-lab/biobert-base-cased-v1.2', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

tokenized_input
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'a', 'multi', '##cent', '##er', 'random', '##ized', 'trial', 'of', 'the', 'effects', 'of', 'exercise', 'dose', 'and', 'type', 'on', 'ps', '##ych', '##oso', '##cial', 'distress', 'in', 'breast', 'cancer', 'patients', 'undergoing', 'ch', '##em', '##otherapy', '.', 'exercise', 'may', 'improve', 'ps', '##ych', '##oso', '##cial', 'distress', 'in', 'patients', 'with', 'cancer', ';', 'however', ',', 'few', 'studies', 'have', 'examined', 'the', 'effects', 'of', 'different', 'types', 'or', 'doses', 'of', 'exercise', ',', 'or', 'whether', 'exercise', 'effects', 'are', 'related', 'to', 'base', '##line', 'depression', 'levels', '.', 'in', 'a', 'multi', '##cent', '##er', 'trial', 'in', 'can', '##ada', ',', 'we', 'random', '##ized', '301', 'patients', 'with', 'breast', 'cancer', 'in', '##iti', '##ating', 'ch', '##em', '##otherapy', 'to', 'th', '##rice', 'weekly', ',', 'supervised', 'exercise', 'consisting', 'of', 'either', 'a', 'standard', 'dose', 'of', '25', 'to', '30', 'minutes', 'of', '

In [8]:
def tokenize_and_align_labels(data):
    tokenized_inputs = tokenizer(data["tokens"], padding='max_length', truncation = True, max_length=512, is_split_into_words=True)

    labels = []
    for i, label in enumerate(data["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
tokenized_train_dataset = dataset['train'].map(tokenize_and_align_labels, batched=True)
tokenized_valid_dataset = dataset['valid'].map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [10]:
tokenized_train_dataset

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 808
})

In [11]:
import evaluate
import numpy as np

label_list = list(tag2idx.keys())
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return results

In [12]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

model = AutoModelForMaskedLM.from_pretrained("dmis-lab/biobert-base-cased-v1.2", num_labels=len(tag2idx), id2label=idx2tag, label2id=tag2idx)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    adam_epsilon=1e-8,
    adam_beta1=0.9,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/153 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


: 