# Fine-tuning ClinicalBERT (Model training) 

In [1]:
import torch
import pandas as pd
from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

### 1. Read the dataset

In [2]:
data_files = {
    "train" : "final_dataset_chunkedtrain.jsonl",
    "test" : "final_dataset_chunkedtest.jsonl"    
}

chunkedDataset = load_dataset("json", data_files = data_files)
chunkedDataset

Using custom data configuration default-5b7ef2585281ad5b
Reusing dataset json (/media/SharedUsers/dlc19/home/.cache/huggingface/datasets/json/default-5b7ef2585281ad5b/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 206331
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3345
    })
})

In [3]:
example = chunkedDataset["train"]["input_ids"][0]
print("Type: ", type(example))
print("Length: ", len(example))
print(example)

Type:  <class 'list'>
Length:  128
[101, 1509, 2592, 8179, 2229, 185, 1161, 1105, 2495, 1204, 12754, 175, 1114, 1207, 15415, 1112, 14375, 1116, 174, 7501, 1111, 8974, 5531, 2229, 185, 1161, 1105, 11937, 7577, 3839, 9505, 1175, 1110, 1185, 17811, 20994, 185, 1513, 12602, 174, 3101, 17268, 1137, 185, 1673, 1818, 12858, 25632, 20557, 6873, 5552, 11769, 7409, 4233, 1115, 1211, 2620, 4248, 15070, 6719, 1103, 3621, 2660, 16418, 2050, 14196, 27316, 1110, 2999, 16973, 1933, 1166, 1103, 1286, 13093, 9046, 1439, 1103, 7209, 1103, 3077, 1181, 3105, 14701, 1110, 8362, 16996, 23822, 1895, 13306, 19353, 24211, 1785, 1104, 1103, 16530, 1286, 3971, 1105, 5001, 10346, 1132, 2382, 8351, 1185, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 1965, 102, 101, 1509, 2592, 8179, 2229, 185, 1161, 1105, 2495, 1204, 12754, 1607, 175, 1114]


### 2. Prepare dataset batches

In [4]:
# Whole-word-masking function should return our own default data collator 
# and then will use that data collator instead!!! 
# for now using the default data collator first 
from transformers import DataCollatorForLanguageModeling

# replace this data_collator with whole-word-masking-data-collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [5]:
example_batch = [chunkedDataset["train"][i] for i in range(2)]
for sample in example_batch:
    _ = sample.pop("word_ids")  # limit it to only the required columns

for chunk in data_collator(example_batch)["input_ids"]:
    print(f"\n{tokenizer.decode(chunk)}")


[CLS] final [MASK] examination chest pa [MASK] lat [MASK] [MASK] with [MASK] onset ascites eval for infectionzong chest pa and lateral [MASK] none findings there is no focal consolidation ple [MASK] effusion or pneumothorax bilateral nodular opac [MASK] that most likely represent nipple shadows the cardiomediastinal silhouette is [MASK] clips project over the left lung potentially within the breast [MASK] image [MASK] upper abdomen [MASK] unremark [MASK] chronic deformity of the posterior left sixth and seventh ribs are noted impression no acute cardiopu [MASK]onary process [SEP] [CLS] final report [MASK] chest [MASK]a and lat indication history f with

[MASK]ness of breath [MASK] chest pa and lateral comparison findings the cardiac mediastinal and hilar con [MASK]s are normal pulmonary vascula [MASK] [MASK] [MASK] lungs are clear no pleural eff [MASK] or pneumothorax is present [MASK] clips are again seen projecting over the left breast remote left [MASK] rib fractures are also re de

In [6]:
# Eliminate this source of randomness is to apply the masking once on the whole test set,
# and then use the default data collator in 🤗 Transformers to collect the batches during evaluation

# replace data_collator here with the whole-word-masking ones
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [9]:
chunkedDataset = chunkedDataset.remove_columns(["word_ids"])
chunkedDataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 206331
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3345
    })
})

In [10]:
# Apply this function to our test set and drop the unmasked columns so we can replace them with the masked ones
eval_chunkedDataset = chunkedDataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=chunkedDataset["test"].column_names,
)
eval_chunkedDataset = eval_chunkedDataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_token_type_ids": "token_type_ids", 
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Loading cached processed dataset at /media/SharedUsers/dlc19/home/.cache/huggingface/datasets/json/default-5b7ef2585281ad5b/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-64fc3915ad5805f3.arrow


### 3. Set up Dataloaders

In [11]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64

train_dataloader = DataLoader(
    chunkedDataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator, # replace self-defined whole-word-masking-data-collator
)

# Use the default_data_collator from Transformers for the evaluation set
eval_dataloader = DataLoader(
    eval_chunkedDataset, 
    batch_size=batch_size, 
    collate_fn=default_data_collator
)

### 4. Steps for training with Accelerate

In [12]:
# Is it correct lolllll?
model = AutoModelForMaskedLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# https://huggingface.co/docs/transformers/model_doc/auto

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Adam optimizer 
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [14]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, 
    optimizer, 
    train_dataloader, 
    eval_dataloader
)

In [27]:
# Learning rate scheduler:
from transformers import get_scheduler

num_train_epochs = 2 # change this later

num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [28]:
######### Saving onto Huggingface hub ###########
from huggingface_hub import get_full_repo_name

model_name = "CXR_BioClinicalBERT_v1"
repo_name = get_full_repo_name(model_name)
repo_name

'dorltcheng/CXR_BioClinicalBERT_v1'

In [29]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

/media/SharedUsers/dlc19/home/codes/nlp-fineTuningBERT/CXR_BioClinicalBERT_v1 is already a clone of https://huggingface.co/dorltcheng/CXR_BioClinicalBERT_v1. Make sure you pull the latest changes with `repo.git_pull()`.


### 5. Full Training Loop

In [30]:
torch.cuda.device(0)

<torch.cuda.device at 0x7fdfca3475c0>

In [33]:
from tqdm.auto import tqdm
import math

progress_bar = tqdm(range(num_training_steps))
perplexities = []

for epoch in range(num_train_epochs): # for now try 2 epochs and see what happen
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_chunkedDataset)]
    
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")
    perplexities.append(perplexity)

    
    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/6448 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 1.7190158058929503
>>> Epoch 1: Perplexity: 1.7190158058929503


In [34]:
print(perplexities)
print(losses)

[1.7190158058929503, 1.7190158058929503]
tensor([0.4909, 0.4909, 0.4909,  ..., 0.6640, 0.6640, 0.6640], device='cuda:0')


### 6. Using the fine-tuned model

In [39]:
from transformers import pipeline

mask_filler_ft = pipeline(
    "fill-mask", model="dorltcheng/CXR_BioClinicalBERT_v1")

mask_filler_original = pipeline(
    "fill-mask", model="emilyalsentzer/Bio_ClinicalBERT")

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
text1 = "There are no signs of [MASK]."

preds1_ft = mask_filler_ft(text1)
preds1_org = mask_filler_original(text1)

print("Predictions for text1:")
print("1. Pre-trained Bio_ClinicalBERT")
for pred in preds1_org:
    print(f">>> {pred['sequence']}")
print()
print("2. Fine-tuned CXR_Bio_ClinicalBERT_v1")
for pred in preds1_ft:
    print(f">>> {pred['sequence']}")


Predictions for text1:
1. Pre-trained Bio_ClinicalBERT
>>> there are no signs of bleeding.
>>> there are no signs of infection.
>>> there are no signs of withdrawal.
>>> there are no signs of change.
>>> there are no signs of distress.

2. Fine-tuned CXR_Bio_ClinicalBERT_v1
>>> there are no signs of pneumonia.
>>> there are no signs of complications.
>>> there are no signs of failure.
>>> there are no signs of tension.
>>> there are no signs of congestion.


In [43]:
text2 = "The patient suffered from [MASK]. "
preds2_ft = mask_filler_ft(text2)
preds2_org = mask_filler_original(text2)

print("Predictions for text2:")
print("1. Pre-trained Bio_ClinicalBERT")
for pred in preds2_org:
    print(f">>> {pred['sequence']}")
print()
print("2. Fine-tuned CXR_Bio_ClinicalBERT_v1")
for pred in preds2_ft:
    print(f">>> {pred['sequence']}")

Predictions for text2:
1. Pre-trained Bio_ClinicalBERT
>>> the patient suffered from?
>>> the patient suffered from a
>>> the patient suffered from pneumonia
>>> the patient suffered from anxiety
>>> the patient suffered from radiation

2. Fine-tuned CXR_Bio_ClinicalBERT_v1
>>> the patient suffered from pneumonia
>>> the patient suffered from fall
>>> the patient suffered from trauma
>>> the patient suffered from diabetes
>>> the patient suffered from surgery


### For Trainer API (not in use) 

In [None]:
######### Trainer API (not used I guess) ###########
batch_size = 64

# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = "clinicalBERT"

training_args = TrainingArguments(
    output_dir=f"{model_name}-FT-mimicCXR-chunked",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",  #do_eval = True
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)
