# Fine-tuning BioClinicalBERT: CXR_BioClinicalBERT_SW 
#### (pre-version of CXR_BioClinicalBERT_MLM) 
- Version: padded
### Access the full dataset here:
https://imperiallondon-my.sharepoint.com/:f:/g/personal/dlc19_ic_ac_uk/EgobSIgJFitCuMdL0Sg6KmABP7qqtibuOz1R1jIZDEX22Q?e=U5CfiK

In [1]:
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

# data manipulation libs
import pandas as pd
import numpy as np
#from sklearn.model_selection import train_test_split

#from pandarallel import pandarallel

# Initialization
#pandarallel.initialize()

# string manipulation libs
import re
import string
from string import digits
import spacy
import nltk
from nltk.corpus import stopwords 

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from datasets import Dataset, load_dataset
import torch.nn as nn
import torch.optim as optim

import transformers

from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments

# data manipulations
from pathlib import Path
import uuid
import pydicom

from PIL import Image

import cv2
import matplotlib.pyplot as plt

import collections

# Tokenizer of the Bio_ClinicalBERT checkpoint. Later cells use it to encode
# the reports, to supply the [MASK] token id and to decode ids back to text.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

from transformers import default_data_collator

In [2]:
# Select the compute device: the default CUDA device when one is available,
# otherwise the CPU.
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    # Report the device that was actually selected. A hard-coded index (1)
    # raises on single-GPU machines and may name a different card than the one
    # `device` points at. To pin a specific card, build the device with an
    # explicit index instead, e.g. torch.device("cuda:1").
    print('We will use the GPU:', torch.cuda.get_device_name(device))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 3 GPU(s) available.
We will use the GPU: GeForce RTX 3090


### Data Preprocessing - done separately from data loader

In [3]:
# df_train_temp = pd.read_csv('train_raw_reports.csv')
# df_test_temp = pd.read_csv('test_raw_reports.csv')

# # converting study_id to string as it doesn't work as an int
# df_train_temp['study_id']=df_train_temp['study_id'].astype(str)
# df_test_temp['study_id']=df_test_temp['study_id'].astype(str)

In [4]:
# # display raw reports in a dataframe
# df_train_temp.head()

In [5]:
# # preprocessing of the train dataset
# # stop_words = set(stopwords.words('english'))
# def preprocessing(text):
#         cleanedReport = re.sub(r'[^\w\s]','',text)            # remove punctuation (not word characters and whitespace)
#         cleanedReport = re.sub('_', '', cleanedReport)        # remove __ in the report
#         cleanedReport = re.sub(r'[\d-]', '', cleanedReport)   # remove numbers in the report 
#         cleanedReport = re.sub('\n', '', cleanedReport)
        
#         return cleanedReport   
    

# def preprocessDataframe(df):
    
#     i = 0
    
#     for i in range(len(df)):
        
#         preprocessedText = preprocessing(df.at[i, "raw_report"])
    
#         df.at[i,'raw_report'] = preprocessedText
#         i = i + 1 
#     return df

In [6]:
# # preprocess the raw reports in the dataframe 
# df_train_preprocessed = preprocessDataframe(df_train_temp)
# df_test_preprocessed = preprocessDataframe(df_test_temp)

In [7]:
# df_train_preprocessed.head() # check if that worked

### Removing Stopwords
It is not clear whether stopwords should be removed at all. If we do remove them, we need to build our own stopword list: a generic list strips negations such as "no", which changes the meaning of a report.
To use this step, uncomment the cells below and write "df_train_preprocessed_nostopwords" / "df_test_preprocessed_nostopwords" to the CSV files instead.


In [8]:
# # Example. Notice the reports have different meanings because of removal of words like 'no'
# print(df_train_preprocessed['raw_report'][0])
# words = [word for word in df_train_preprocessed['raw_report'][0].split() if word.lower() not in stop_words ]
# new_text = " ".join(words)

# print()
# print(new_text)

#### Uncomment to remove stopwords

In [9]:
# df_train_preprocessed_nostopwords = df_train_preprocessed.copy()
# l = len(df_train_preprocessed['raw_report'])
# for i in range(0,l):
#     if i%10000 == 0:
#         print(i) # just to check progress - should take 10-15mins, 220,000 reports
#     words = [word for word in df_train_preprocessed['raw_report'][i].split() if word.lower() not in stop_words]
#     new_report = " ".join(words)
#     df_train_preprocessed_nostopwords['raw_report'][i] = new_report

# df_test_preprocessed_nostopwords = df_test_preprocessed.copy()
# l = len(df_test_preprocessed['raw_report'])
# for i in range(0,l):
#     words = [word for word in df_test_preprocessed['raw_report'][i].split() if word.lower() not in stop_words]
#     new_report = " ".join(words)
#     df_test_preprocessed_nostopwords['raw_report'][i] = new_report
    
    

#### Checking:

In [10]:
# print(df_train_preprocessed['raw_report'][10030])
# print()
# print(df_train_preprocessed_nostopwords['raw_report'][10030])
# print()
# print(df_test_preprocessed['raw_report'][2000])
# print()
# print(df_test_preprocessed_nostopwords['raw_report'][2000])

In [11]:
# # if removing stopwords, change these to _nostopwords versions.
# df_train_preprocessed.to_csv('train_preprocessed.csv', index=False)
# df_test_preprocessed.to_csv('test_preprocessed.csv', index=False)

### Loading data in

In [12]:
# Split name -> preprocessed CSV file holding that split's reports.
data_files = dict(train="train_preprocessed.csv", test="test_preprocessed.csv")

# Read both CSV files into one dataset object keyed by split name.
reports_dataset = load_dataset("csv", data_files=data_files)

reports_dataset

Using custom data configuration default-26f909f52d14cf3e
Reusing dataset csv (/media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['study_id', 'raw_report'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['study_id', 'raw_report'],
        num_rows: 3269
    })
})

In [13]:
# Index the row first: `split[0]["raw_report"]` reads a single record, whereas
# `split["raw_report"][0]` pulls the whole column before taking one element
# (and the original did that twice).
first_report = reports_dataset["train"][0]["raw_report"]
print(first_report)
print(type(first_report))

                                 FINAL REPORT EXAMINATION  CHEST PA AND LAT  INDICATION  F with new onset ascites   eval for infection  TECHNIQUE  Chest PA and lateral  COMPARISON  None  FINDINGS   There is no focal consolidation pleural effusion or pneumothorax  Bilateral nodular opacities that most likely represent nipple shadows The cardiomediastinal silhouette is normal  Clips project over the left lung potentially within the breast The imaged upper abdomen is unremarkable Chronic deformity of the posterior left sixth and seventh ribs are noted  IMPRESSION   No acute cardiopulmonary process
<class 'str'>


### Data Loader - Padding
- Include: loading data, text preprocessing, words frequency check, tokenization, tokens-IDs-conversion 

In [14]:
# Fixed sequence length in tokens: shorter reports are padded up to this
# length and longer ones are filtered out in a later cell.
padsize = 128

In [15]:
def tokenize_function_padding(dataset):
    """Tokenize the ``raw_report`` entries of a batch.

    Despite its name this function does not pad; it only runs the tokenizer.
    When the tokenizer is a "fast" one, a ``word_ids`` entry is added holding,
    for every sequence in the batch, the word index of each token.
    """
    encoded = tokenizer(dataset["raw_report"])
    if not tokenizer.is_fast:
        return encoded

    n_sequences = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(n_sequences)]
    return encoded

# Tokenize every split in batches, dropping the raw text and the study id,
# then discard `token_type_ids`, which the rest of this notebook never reads.
tokenized_datasets = reports_dataset.map(
    tokenize_function_padding,
    batched=True,
    remove_columns=["raw_report", "study_id"],
).remove_columns(['token_type_ids'])
tokenized_datasets

Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-82f9e6d11c675f24.arrow
Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-73472fbea581181d.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 3269
    })
})

In [16]:
def padding(dataset, pad_to=None):
    """Right-pad one tokenized example to a fixed length and add MLM labels.

    Args:
        dataset: A single example (the function is mapped with
            ``batched=False``) holding the lists ``input_ids``,
            ``attention_mask`` and ``word_ids``. The lists are extended in place.
        pad_to: Target length in tokens. Defaults to the notebook-level
            ``padsize``.

    Returns:
        The same example with ``labels`` set to a copy of the padded
        ``input_ids``. Examples already at or above the target length are
        returned unpadded and untruncated.
    """
    if pad_to is None:
        pad_to = padsize

    missing = max(0, pad_to - len(dataset['input_ids']))

    dataset['input_ids'].extend([0] * missing)        # id 0 decodes to [PAD]
    dataset['attention_mask'].extend([0] * missing)   # padding is not attended to
    # Padding belongs to no word. Using None (as for special tokens) instead of
    # 0 keeps the whole-word-masking collator from grouping the padding run
    # into a "word" that it could then select for masking.
    dataset['word_ids'].extend([None] * missing)

    dataset['labels'] = dataset['input_ids'].copy()
    return dataset

# Pad example by example (batched=False): `padding` works on the token lists
# of a single report at a time.
padded_dataset = tokenized_datasets.map(padding, batched=False)

Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-d42473ced2a9368b.arrow
Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-675d6922af97fee1.arrow


In [17]:
# Length of the first padded test example; row-first indexing reads one record
# instead of the whole `input_ids` column.
len(padded_dataset["test"][0]["input_ids"])

128

In [18]:
# Keep only the examples that fit in `padsize` tokens. `padding` leaves longer
# examples at their original length, so this drops them rather than truncating.
smaller_dataset = padded_dataset.filter(lambda example: len(example['input_ids'])<=padsize)

Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-75c72beb3acc1572.arrow
Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-cc29a4d4868ddcdf.arrow


In [19]:
# From here on `padded_dataset` refers to the length-filtered dataset.
padded_dataset = smaller_dataset


def check_function(dataset):
    """Sanity check: print 'yes' for an example longer than ``padsize``.

    The example is returned unchanged so the function can be used with ``map``.
    """
    if len(dataset['input_ids']) > padsize:
        print('yes')
    return dataset


# No 'yes' printed means every remaining example fits in `padsize` tokens.
padded_dataset.map(check_function, batched = False)

Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-936497202e421b13.arrow
Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-c451da5a7399117d.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 148704
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1775
    })
})

In [20]:
# NOTE(review): none of the cells above moves a tensor or model to a CUDA
# device, so this presumably has nothing to release here - confirm it is needed.
torch.cuda.empty_cache()

### If loading from JSON files, use this:
- The JSON files are not loading correctly at the moment, hence the data-loader pipeline above is used instead.

In [21]:
# data_files = {
#     "train" : "final_dataset_chunkedtrain.jsonl",
#     "test" : "final_dataset_chunkedtest.jsonl"    
# }

# chunkedDataset = load_dataset("json", data_files = data_files)
# chunkedDataset


# #testing
# data_files_padded = {
#     "train" : "final_dataset_padded_train.jsonl",
#     "test" : "final_dataset_padded_test.jsonl"    
# }
# paddedDataset = load_dataset("json", data_files = data_files_padded)
# chunkedDataset
    

# MASKING

### Report example

In [22]:
# # same report, padded
# example = padded_dataset["train"]["input_ids"][0]
# print("Type: ", type(example))
# print("Length: ", len(example))
# print(example)
# tokenizer.decode(example)

In [23]:
# whole word masking: want to mask all the tokens that correspond to a single word. 

import collections
import numpy as np

from transformers import default_data_collator

def whole_word_masking_data_collator(features, wwm_prob=0.15):
    """Collate examples into a batch after applying random whole-word masking.

    Each word is selected independently with probability ``wwm_prob``. All
    tokens of a selected word are replaced by the tokenizer's mask token, and
    only those positions keep their original id in ``labels``; every other
    position is labelled -100.

    Args:
        features: List of example dicts with ``input_ids``, ``labels`` and
            ``word_ids`` (optionally ``attention_mask``). The dicts are
            modified in place: ``word_ids`` is removed, ``input_ids`` is
            masked and ``labels`` is replaced.
        wwm_prob: Probability of masking each whole word.

    Returns:
        The result of ``default_data_collator`` on the masked features.
    """
    for feature in features:
        # `word_ids` is only needed to group tokens into words; remove it so
        # the remaining keys fit default_data_collator.
        word_ids = feature.pop("word_ids")
        attention_mask = feature.get("attention_mask")

        # mapping[k] -> token positions that make up the k-th word.
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                continue  # special token: belongs to no word
            if attention_mask is not None and not attention_mask[idx]:
                # Padding position: never a masking candidate, whatever
                # placeholder value its word id carries.
                continue
            if word_id != current_word:  # first token of a new word
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # One draw per whole word, so every word (not every token) has the
        # same chance of being selected.
        mask = np.random.binomial(1, wwm_prob, (len(mapping),))

        input_ids = feature['input_ids']
        labels = feature['labels']
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            for idx in mapping[word_id.item()]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

        # Write the labels back. Without this assignment the original,
        # fully populated labels are kept and the loss is computed on every
        # position (unmasked tokens and padding included) instead of only on
        # the masked ones.
        feature['labels'] = new_labels

    return default_data_collator(features)

## Masking Padded Example:

In [24]:
# Demonstrate whole-word masking on the first padded training report.
example_samples = [padded_dataset["train"][i] for i in range(1)]
for sample in example_samples:
    # The collator reads `labels`. The padded dataset already has that column
    # (added by `padding`), so this is only a safeguard for data loaded some
    # other way.
    sample["labels"] = sample["input_ids"].copy()
example_batch = whole_word_masking_data_collator(example_samples)

for chunk in example_batch["input_ids"]:
    # Convert once and reuse the result.
    tokens = tokenizer.convert_ids_to_tokens(chunk)
    print(f"\n'>>> {tokens}'")


'>>> ['[CLS]', 'final', '[MASK]', 'examination', 'chest', '[MASK]', '[MASK]', 'and', '[MASK]', '[MASK]', 'indication', 'f', 'with', 'new', 'onset', 'as', '##cite', '##s', '[MASK]', '[MASK]', 'for', 'infection', 'technique', '[MASK]', 'p', '##a', 'and', '[MASK]', 'comparison', 'none', 'findings', 'there', 'is', 'no', 'focal', 'consolidation', '[MASK]', '[MASK]', '[MASK]', 'e', '##ff', '##usion', '[MASK]', 'p', '##ne', '##um', '##oth', '##orax', 'bilateral', 'nod', '##ular', 'op', '##ac', '##ities', 'that', 'most', 'likely', 'represent', 'nipple', 'shadows', 'the', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', 'silhouette', 'is', 'normal', 'clips', 'project', 'over', 'the', 'left', '[MASK]', 'potentially', '[MASK]', 'the', 'breast', 'the', '[MASK]', '[MASK]', 'upper', 'abdomen', 'is', 'un', '##rem', '##ark', '##able', 'chronic', 'def', '##orm', '##ity', 'of', 'the', 'posterior', 'left', 'sixth', 'and', 'seventh', '[MASK]', 'are', 'noted', 'impression', 'no', 'acute', 'card', '##io',

# Model Training (Padding)
### 1. Prepare the datasets batches (with whole-word-masking) 

In [25]:
# To eliminate this source of randomness during evaluation, apply the masking once to the whole test set
# and then use the default data collator in 🤗 Transformers to collect the batches during evaluation.


# replace data_collator here with the whole-word-masking ones
def insert_random_mask(batch):
    """Apply whole-word masking to a batch given as a dict of columns.

    The column-oriented batch is turned into one dict per example, masked with
    ``whole_word_masking_data_collator``, and returned with every resulting
    column renamed to ``masked_<column>`` and converted to a NumPy array.
    """
    column_names = list(batch)
    features = []
    for row in zip(*batch.values()):
        features.append(dict(zip(column_names, row)))

    masked_inputs = whole_word_masking_data_collator(features)

    masked_columns = {}
    for name, tensor in masked_inputs.items():
        masked_columns["masked_" + name] = tensor.numpy()
    return masked_columns

In [26]:
# Copy of the padded dataset without the `word_ids` column.
# NOTE(review): `paddedDataset` is not referenced by any later cell visible in
# this notebook - the eval set and the train dataloader are both built from
# `padded_dataset`, which still carries `word_ids` for the collator.
paddedDataset = padded_dataset.remove_columns(['word_ids'])
paddedDataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 148704
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1775
    })
})

In [27]:
# Apply this function to our test set and drop the unmasked columns so we can replace them with the masked ones
# Mask the test split once, replace its columns by the masked ones, and give
# those the names the model expects. There is no `masked_token_type_ids`
# column to rename because `token_type_ids` was dropped after tokenization.
eval_paddedDataset = padded_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=padded_dataset["test"].column_names,
).rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

# Decode the first masked evaluation example as a visual check.
tokenizer.decode(eval_paddedDataset["input_ids"][0])

Loading cached processed dataset at /media/SharedUsers/elh19/home/.cache/huggingface/datasets/csv/default-26f909f52d14cf3e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-e8cea59ba2d243b1.arrow


'[CLS] final report ap chest [MASK] history intubated yearold woman check tube [MASK] impression ap chest compared to tip of the endotracheal tube at [MASK] upper margin of the clavicles is no less than mm from the carina care should be taken that the tube does not withdraw any further lungs [MASK] clear cardiomediastinal [MASK] hilar silhouettes [MASK] pleural surfaces are normal [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [28]:
# NOTE(review): the masking cells above do not place anything on a CUDA device
# explicitly, so this presumably has nothing to release - confirm it is needed.
torch.cuda.empty_cache()

### 2. Set up dataloaders

In [29]:
batch_size = 32

# Training batches go through the whole-word-masking collator, so the masking
# is drawn afresh each time a batch is assembled.
train_dataloader = DataLoader(
    padded_dataset["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=whole_word_masking_data_collator,
)

# The evaluation split was masked once up front; plain collation is enough.
eval_dataloader = DataLoader(
    eval_paddedDataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
)

In [30]:
# for batch_idx, data in enumerate(train_dataloader):
#     print(batch_idx)
#     print(data["input_ids"])

### 3. Steps for training with Accelerate

In [31]:
# Load the Bio_ClinicalBERT checkpoint (the same one the tokenizer was loaded
# from) through the masked-language-modelling auto class.
model = AutoModelForMaskedLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# AutoModel reference: https://huggingface.co/docs/transformers/model_doc/auto

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# AdamW optimizer over all model parameters, with a base learning rate of 5e-5
# (the scheduler defined in a later cell adjusts it during training).
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [33]:
from accelerate import Accelerator

# Let Accelerate choose the device. `cpu=True` forced the whole run onto the
# CPU even when GPUs are available, which is impractical for this many steps.
# Pass cpu=True again only to debug on CPU deliberately.
accelerator = Accelerator()

# Hand the training objects to Accelerate; it returns versions placed on (and
# feeding batches to) the selected device.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model,
    optimizer,
    train_dataloader,
    eval_dataloader
)

In [34]:
# Learning rate scheduler:
from transformers import get_scheduler

num_train_epochs = 10 # change this later

# One optimizer update per batch, so updates per epoch == number of batches.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warm-up steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [35]:
######### Saving onto Huggingface hub ###########
from huggingface_hub import get_full_repo_name

# Short model name; also reused as the local output directory in the next cell.
model_name = "CXR_BioClinicalBERT_SW"
# Full Hub repository id for that model name (see the recorded output below).
repo_name = get_full_repo_name(model_name)
repo_name

'ICLbioengNLP/CXR_BioClinicalBERT_SW'

In [36]:
from huggingface_hub import Repository

# Clone the Hub repository into a local directory named after the model. The
# training loop saves the model and tokenizer there and pushes from it.
output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

Cloning https://huggingface.co/ICLbioengNLP/CXR_BioClinicalBERT_SW into local empty directory.


### 4. Full Training Loop

In [None]:
from tqdm.auto import tqdm
import math

# One progress-bar tick per optimizer update across all epochs.
progress_bar = tqdm(range(num_training_steps))
# Evaluation perplexity recorded after each epoch.
perplexities = []

for epoch in range(num_train_epochs): 
    # Training
    model.train()
    
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        # Update weights, advance the LR schedule, then clear the gradients
        # before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    print("Finishing epoch ", epoch)
    
    # Evaluation
    model.eval()
    # Reset every epoch, so after the loop this holds the last epoch's values.
    losses = []
    
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # The model returns one loss per batch; repeat it batch_size times so
        # that, after gathering, there is one entry per example.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Drop the surplus entries contributed by a final batch that holds fewer
    # than batch_size examples.
    losses = losses[: len(eval_paddedDataset)]
    
    # Perplexity = exp(mean loss); report infinity if exp overflows.
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")
    perplexities.append(perplexity)

    
    # Save and upload
    accelerator.wait_for_everyone()
    # Save the underlying model (without the accelerator's wrapper) into the
    # local clone of the Hub repository.
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # blocking=False: the push is started without waiting for it to finish.
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/46470 [00:00<?, ?it/s]

############# Epoch 0: Training Start   #############


In [None]:
# Perplexity after each epoch.
print(perplexities)
# `losses` is rebuilt every epoch in the training loop, so this shows the
# evaluation losses of the last epoch only.
print(losses)

# Comparing pre-trained and fine-tuned models
#### Can also do BLEU score evaluation?

In [None]:
from transformers import pipeline

# Fill-mask pipeline for the fine-tuned model; the id matches the Hub
# repository the training loop pushes to.
mask_filler_ft = pipeline(
    "fill-mask", model="ICLbioengNLP/CXR_BioClinicalBERT_SW")

# Fill-mask pipeline for the original checkpoint, used as the baseline.
mask_filler_original = pipeline(
    "fill-mask", model="emilyalsentzer/Bio_ClinicalBERT")

In [None]:
# Probe sentence 1: compare what each model proposes for the masked token.
text1 = "There are no signs of [MASK]."

preds1_ft = mask_filler_ft(text1)
preds1_org = mask_filler_original(text1)

sections = (
    ("1. Pre-trained Bio_ClinicalBERT", preds1_org),
    ("2. Fine-tuned CXR_BioClinicalBERT_SW", preds1_ft),
)

print("Predictions for text1:")
for position, (heading, preds) in enumerate(sections):
    if position:
        print()  # blank line between the two models' outputs
    print(heading)
    for pred in preds:
        print(f">>> {pred['sequence']}")

In [None]:
# Probe sentence 2: compare what each model proposes for the masked token.
text2 = "The patient suffered from [MASK]. "
preds2_ft = mask_filler_ft(text2)
preds2_org = mask_filler_original(text2)

sections = (
    ("1. Pre-trained Bio_ClinicalBERT", preds2_org),
    ("2. Fine-tuned CXR_BioClinicalBERT_SW", preds2_ft),
)

print("Predictions for text2:")
for position, (heading, preds) in enumerate(sections):
    if position:
        print()  # blank line between the two models' outputs
    print(heading)
    for pred in preds:
        print(f">>> {pred['sequence']}")