# BioBERT based model

#### Written by Carlos Cuevas Villarmín

Last update: 29/01/2024

In [103]:
#Load data from csv files

path_train = "./train_full_text.txt"
path_valid = "./valid_full_text.txt"

import pandas as pd

def load_data(path):
    #Read csv file specifying that the first row is the header
    df = pd.read_csv(path, sep=" ", header=0)
    return df

train = load_data(path_train)
valid = load_data(path_valid)


In [104]:
train

Unnamed: 0,fileId,sentence,label_sentence
0,10459028,"A randomized , prospective study of endometria...",O O O O O O B-intervention I-intervention O O ...
1,11136837,Cardiovascular effects of tamoxifen in women w...,O O O B-intervention O O O O O O O O O O O O O...
2,11283119,Tamoxifen for the prevention of breast cancer ...,B-intervention O O O O O O O O O O O O O O O O...
3,12377957,"Tamoxifen , radiation therapy , or both for pr...",B-intervention I-intervention I-intervention I...
4,12393819,Twenty - year follow - up of a randomized stud...,O O O O O O O O O O O B-intervention I-interve...
...,...,...,...
803,8523049,Adequate locoregional treatment for early brea...,O O O O O O O O O O O O O O O O O O O O O O O ...
804,9060533,Bisphosphonate risedronate prevents bone loss ...,B-intervention I-intervention O B-condition I-...
805,9672274,Interim analysis of the incidence of breast ca...,O O O O O O O O O O O O O B-intervention O O O...
806,9678620,Tamoxifen as adjuvant after surgery for breast...,B-intervention O O O O O O O O O O B-control O...


In [105]:
#Types of the columns
train.dtypes

fileId             int64
sentence          object
label_sentence    object
dtype: object

I split the elements of each sentence (or text) and save it in a list.

In [106]:
def SplitData(df):
    '''
    Function that splits the data into words and labels
    Args:   
        df: pandas dataframe
    Returns:
        words: list of lists of words
        words_labels: list of lists of labels
    '''
    words   = [sentence.split() for sentence in df['sentence']]
    words_labels = [label.split() for label in df['label_sentence']]
    print("Number of sentences: ", len(words))
    print("Number of labels: ", len(words_labels))

    return words, words_labels

In [107]:
words_train, words_labels_train = SplitData(train)
words_valid, words_labels_valid = SplitData(valid)

Number of sentences:  808
Number of labels:  808
Number of sentences:  101
Number of labels:  101


Map the words_labels into int values.

In [130]:
#Define the tags
data = pd.read_csv("./dataBIO.txt", sep=" ", header=None, names = ['words', 'fileId', 'start', 'end', 'label'])
tag_values = list(set(data["label"].values))

tag2idx = {t: i for i, t in enumerate(tag_values)}
idx2tag = {i: t for i, t in enumerate(tag_values)}

print(tag2idx)
print(idx2tag)

{'B-total_participants': 0, 'I-cv_bin_percent': 1, 'B-outcome_Measure': 2, 'I-condition': 3, 'B-cv_cont_mean': 4, 'I-location': 5, 'B-iv_cont_q3': 6, 'I-cv_cont_mean': 7, 'B-iv_cont_median': 8, 'B-eligibility': 9, 'I-iv_cont_q3': 10, 'I-age': 11, 'I-outcome': 12, 'B-intervention_participants': 13, 'B-control': 14, 'I-cv_cont_sd': 15, 'I-eligibility': 16, 'I-ethinicity': 17, 'I-outcome_Measure': 18, 'B-ethinicity': 19, 'I-iv_cont_mean': 20, 'B-iv_cont_sd': 21, 'I-iv_cont_sd': 22, 'B-iv_bin_percent': 23, 'O': 24, 'I-cv_cont_median': 25, 'B-condition': 26, 'B-age': 27, 'B-cv_bin_percent': 28, 'I-total_participants': 29, 'B-iv_cont_q1': 30, 'B-outcome': 31, 'B-cv_cont_q3': 32, 'I-cv_cont_q3': 33, 'I-control': 34, 'B-cv_cont_median': 35, 'B-control_participants': 36, 'I-intervention_participants': 37, 'B-cv_cont_q1': 38, 'I-iv_cont_median': 39, 'I-control_participants': 40, 'B-intervention': 41, 'I-intervention': 42, 'I-cv_bin_abs': 43, 'B-iv_cont_mean': 44, 'I-iv_bin_percent': 45, 'I-iv_bi

In [131]:
#Map the labels to the tags
def MapLabels(words_labels, tag2idx):
    '''
    Function that maps the labels to the tags
    Args:
        words_labels: list of lists of labels
        tag2idx: dictionary that maps the labels to the tags
    Returns:
        labels: list of lists of tags
    '''
    labels = [[tag2idx.get(l) for l in lab] for lab in words_labels]
    return labels

In [132]:
labels_train = MapLabels(words_labels_train, tag2idx)
labels_valid = MapLabels(words_labels_valid, tag2idx)

Now I am going to follow the tutorial of HuggingFace [https://huggingface.co/docs/transformers/tasks/token_classification]

In [133]:
#Create a dataset for the train set
from datasets import Dataset

def CreateDataset(words, words_labels):
    '''
    Function that creates a dataset with id, words and labels
    Args:
        words: list of lists of words
        words_labels: list of lists of labels
    Returns:
        dataset: dataset with id, words and labels
    '''
    dataset = Dataset.from_dict({"id": range(len(words)), "tokens": words, "ner_tags": words_labels})
    return dataset



In [134]:
train_dataset = CreateDataset(words_train, labels_train)
valid_dataset = CreateDataset(words_valid, labels_valid)

train_dataset[0]

{'id': 0,
 'tokens': ['A',
  'randomized',
  ',',
  'prospective',
  'study',
  'of',
  'endometrial',
  'resection',
  'to',
  'prevent',
  'recurrent',
  'endometrial',
  'polyps',
  'in',
  'women',
  'with',
  'breast',
  'cancer',
  'receiving',
  'tamoxifen',
  '.',
  'To',
  'assess',
  'the',
  'role',
  'of',
  'endometrial',
  'resection',
  'in',
  'preventing',
  'recurrence',
  'of',
  'tamoxifen',
  '-',
  'associated',
  'endometrial',
  'polyps',
  'in',
  'women',
  'with',
  'breast',
  'cancer',
  '.',
  'Randomized',
  ',',
  'prospective',
  'study',
  '(',
  'Canadian',
  'Task',
  'Force',
  'classification',
  'I',
  ')',
  '.',
  'Tertiary',
  'university',
  '-',
  'affiliated',
  'medical',
  'center',
  '.',
  'Twenty',
  'consecutive',
  'women',
  '(',
  'age',
  'range',
  '43',
  '-',
  '61',
  'yrs',
  ')',
  '.',
  'Hysteroscopic',
  'removal',
  'of',
  'tamoxifen',
  '-',
  'associated',
  'endometrial',
  'polyps',
  'with',
  'or',
  'without',
  '

In [135]:
#Load tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
tokenizer

BertTokenizerFast(name_or_path='dmis-lab/biobert-base-cased-v1.2', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [136]:
example = train_dataset[0]

tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

tokenized_input
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

tokenized_input.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [137]:
def tokenize_and_align_labels(data):
    tokenized_inputs = tokenizer(data["tokens"], padding='max_length', truncation = True, max_length=42, is_split_into_words=True)

    labels = []
    for i, label in enumerate(data["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [138]:
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_valid_dataset = valid_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/808 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [147]:
import evaluate
import numpy as np

label_list = list(tag2idx.keys())
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return results

In [141]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

model = AutoModelForMaskedLM.from_pretrained("dmis-lab/biobert-base-cased-v1.2", num_labels=len(tag2idx), id2label=idx2tag, label2id=tag2idx)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [149]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

from transformers import TrainingArguments, Trainer

BB_training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    adam_epsilon=1e-8,
    adam_beta1=0.9,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=BB_training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory model/checkpoint-51 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.28153184056282043, 'eval_precision': 0.5314285714285715, 'eval_recall': 0.5406976744186046, 'eval_f1': 0.5360230547550432, 'eval_accuracy': 0.91524835012157, 'eval_runtime': 11.4244, 'eval_samples_per_second': 8.841, 'eval_steps_per_second': 0.613, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory model/checkpoint-102 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.31829413771629333, 'eval_precision': 0.495, 'eval_recall': 0.5755813953488372, 'eval_f1': 0.5322580645161291, 'eval_accuracy': 0.8996179228898923, 'eval_runtime': 12.5051, 'eval_samples_per_second': 8.077, 'eval_steps_per_second': 0.56, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory model/checkpoint-153 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.30592969059944153, 'eval_precision': 0.518324607329843, 'eval_recall': 0.5755813953488372, 'eval_f1': 0.5454545454545455, 'eval_accuracy': 0.9058700937825633, 'eval_runtime': 11.5804, 'eval_samples_per_second': 8.722, 'eval_steps_per_second': 0.604, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


{'train_runtime': 1068.7449, 'train_samples_per_second': 2.268, 'train_steps_per_second': 0.143, 'train_loss': 0.16100337458591835, 'epoch': 3.0}


TrainOutput(global_step=153, training_loss=0.16100337458591835, metrics={'train_runtime': 1068.7449, 'train_samples_per_second': 2.268, 'train_steps_per_second': 0.143, 'train_loss': 0.16100337458591835, 'epoch': 3.0})