# Data Loading and Pre-processing (chunking, padding, masking)
### Access all dataset here: 
https://imperiallondon-my.sharepoint.com/:f:/g/personal/dlc19_ic_ac_uk/EgobSIgJFitCuMdL0Sg6KmABP7qqtibuOz1R1jIZDEX22Q?e=U5CfiK

In [5]:
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

#data manupulation libs
import pandas as pd
import numpy as np
#from sklearn.model_selection import train_test_split

#from pandarallel import pandarallel

# Initialization
#pandarallel.initialize()

#string manupulation libs
#string manupulation libs
import re
import string
from string import digits
import spacy
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from datasets import Dataset, load_dataset
import torch.nn as nn
import torch.optim as optim

import transformers

from transformers import BertTokenizer, AutoTokenizer

# data manipulations
from pathlib import Path
import uuid
import pydicom

from PIL import Image

import cv2
import matplotlib.pyplot as plt

import collections

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

from transformers import default_data_collator

import torch
import pandas as pd
from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

[nltk_data] Downloading package stopwords to
[nltk_data]     /media/SharedUsers/dlc19/home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Preprocessing - done separately from data loader

In [7]:
df_train_temp = pd.read_csv('train_raw_reports.csv')
df_test_temp = pd.read_csv('test_raw_reports.csv')

#convetring study id to string as it doesn't work as an int
df_train_temp['study_id']=df_train_temp['study_id'].astype(str)
df_test_temp['study_id']=df_test_temp['study_id'].astype(str)

In [8]:
# display raw reports in a dataframe
df_train_temp.head()

Unnamed: 0,study_id,raw_report
0,50414267,FINAL REPORT\...
1,53189527,FINAL REPORT\...
2,53911762,FINAL REPORT\...
3,56699142,FINAL REPORT\...
4,57375967,FINAL REPORT\...


In [9]:
# preprocessing of the train dataset
def preprocessing(text):
        cleanedReport = re.sub(r'[^\w\s]','',text)            # remove punctuation (not word characters and whitespace)
        cleanedReport = re.sub('_', '', cleanedReport)        # remove __ in the report
        cleanedReport = re.sub(r'[\d-]', '', cleanedReport)   # remove numbers in the report 
        cleanedReport = re.sub('\n', '', cleanedReport)

        return cleanedReport   
    

def preprocessDataframe(df):
    
    i = 0
    
    for i in range(len(df)):
        
        preprocessedText = preprocessing(df.at[i, "raw_report"])
    
        df.at[i,'raw_report'] = preprocessedText
        i = i + 1 
    return df

In [10]:
# preprocess the raw reports in the dataframe 
df_train_preprocessed = preprocessDataframe(df_train_temp)
df_test_preprocessed = preprocessDataframe(df_test_temp)

In [11]:
df_train_preprocessed.head() # check if that worked

Unnamed: 0,study_id,raw_report
0,50414267,FINAL REPORT ...
1,53189527,FINAL REPORT ...
2,53911762,FINAL REPORT ...
3,56699142,FINAL REPORT ...
4,57375967,FINAL REPORT ...


In [12]:
df_train_preprocessed.to_csv('train_preprocessed.csv', index=False)
df_test_preprocessed.to_csv('test_preprocessed.csv', index=False)

### Data Loader - Chunking
- Include: loading data, text preprocessing, words frequency check, tokenization, tokens-IDs-conversion 

In [13]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
stop_words = set(stopwords.words('english'))

In [14]:
data_files = {"train": "train_preprocessed.csv", "test": "test_preprocessed.csv"}
reports_dataset = load_dataset("csv", data_files=data_files)

reports_dataset

Using custom data configuration default-fc03e2e28cbbeb1b


Downloading and preparing dataset csv/default to /media/SharedUsers/dlc19/home/.cache/huggingface/datasets/csv/default-fc03e2e28cbbeb1b/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /media/SharedUsers/dlc19/home/.cache/huggingface/datasets/csv/default-fc03e2e28cbbeb1b/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['study_id', 'raw_report'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['study_id', 'raw_report'],
        num_rows: 3269
    })
})

In [15]:
#example of how to acess one report as a dataset
reports_dataset["train"]
#example of how to acess only written report 
#reports_dataset["train"]['raw_report'][0]

Dataset({
    features: ['study_id', 'raw_report'],
    num_rows: 222337
})

In [16]:
def tokenize_function(dataset):
    result = tokenizer(dataset["raw_report"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = reports_dataset.map(
    tokenize_function, batched=True, remove_columns=["raw_report", "study_id"]
)


chunk_size = 128

def group_texts(dataset):
    # Concatenate all texts
    concatenated_text = {k: sum(dataset[k], []) for k in dataset.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_text[list(dataset.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_text.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


chunked_dataset = tokenized_datasets.map(group_texts, batched=True)
chunked_dataset

  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 206331
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3345
    })
})

In [34]:
chunkedDataset = chunked_dataset.copy()
print(type(chunked_dataset))
print(type(chunkedDataset))

<class 'datasets.dataset_dict.DatasetDict'>
<class 'dict'>


### Data Loader - Padding
- Include: loading data, text preprocessing, words frequency check, tokenization, tokens-IDs-conversion 

In [18]:
def tokenize_function_padding(dataset):
    result = tokenizer(dataset["raw_report"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
        
    return result

# Use batched=True to activate fast multithreading!
tokenized_datasets = reports_dataset.map(
    tokenize_function_padding, batched=True, remove_columns=["raw_report", "study_id"])

tokenized_datasets

  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 3269
    })
})

In [19]:
def padding(dataset):
    
    #for train dataset

    
    num_items = len(dataset['input_ids']) # to get number of all items in train dataset
        
        
    if (len(dataset['input_ids'])) > 300:
        
        while(len(dataset['input_ids']) > 300):
            dataset['input_ids'].pop()
            dataset['token_type_ids'].pop()
            dataset['attention_mask'].pop()
            dataset['word_ids'].pop()

    while(len(dataset['input_ids']) < 301):

        dataset['input_ids'].append(0)
        dataset['token_type_ids'].append(0)
        dataset['attention_mask'].append(0)
        dataset['word_ids'].append(0)
 
    dataset['labels'] = dataset['input_ids'].copy()
    return dataset

padded_dataset = tokenized_datasets.map(padding, batched=False)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [20]:
len(padded_dataset["test"]["input_ids"][10])

301

In [21]:
def check_function(dataset):

    lenReport = len(dataset['input_ids'])
    
    if (lenReport > 301):
        print('yes')
    return dataset

padded_dataset.map(check_function, batched = False)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 3269
    })
})

### If loading from JSON files, use this:
- JSON aren't working rn, hence dataloader

In [22]:
# data_files = {
#     "train" : "final_dataset_chunkedtrain.jsonl",
#     "test" : "final_dataset_chunkedtest.jsonl"    
# }

# chunkedDataset = load_dataset("json", data_files = data_files)
# chunkedDataset


# #testing
# data_files_padded = {
#     "train" : "final_dataset_padded_train.jsonl",
#     "test" : "final_dataset_padded_test.jsonl"    
# }
# paddedDataset = load_dataset("json", data_files = data_files_padded)
# chunkedDataset
    

# MASKING

### Report example

In [35]:
# first chunked report
example = chunked_dataset["train"]["input_ids"][0]
print("Type: ", type(example))
print("Length: ", len(example))
print(example)
tokenizer.decode(example)

Type:  <class 'list'>
Length:  128
[101, 1509, 2592, 8179, 2229, 185, 1161, 1105, 2495, 1204, 12754, 175, 1114, 1207, 15415, 1112, 14375, 1116, 174, 7501, 1111, 8974, 5531, 2229, 185, 1161, 1105, 11937, 7577, 3839, 9505, 1175, 1110, 1185, 17811, 20994, 185, 1513, 12602, 174, 3101, 17268, 1137, 185, 1673, 1818, 12858, 25632, 20557, 6873, 5552, 11769, 7409, 4233, 1115, 1211, 2620, 4248, 15070, 6719, 1103, 3621, 2660, 16418, 2050, 14196, 27316, 1110, 2999, 16973, 1933, 1166, 1103, 1286, 13093, 9046, 1439, 1103, 7209, 1103, 3077, 1181, 3105, 14701, 1110, 8362, 16996, 23822, 1895, 13306, 19353, 24211, 1785, 1104, 1103, 16530, 1286, 3971, 1105, 5001, 10346, 1132, 2382, 8351, 1185, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 1965, 102, 101, 1509, 2592, 8179, 2229, 185, 1161, 1105, 2495, 1204, 12754, 1607, 175, 1114]


'[CLS] final report examination chest pa and lat indication f with new onset ascites eval for infection technique chest pa and lateral comparison none findings there is no focal consolidation pleural effusion or pneumothorax bilateral nodular opacities that most likely represent nipple shadows the cardiomediastinal silhouette is normal clips project over the left lung potentially within the breast the imaged upper abdomen is unremarkable chronic deformity of the posterior left sixth and seventh ribs are noted impression no acute cardiopulmonary process [SEP] [CLS] final report examination chest pa and lat indication history f with'

In [36]:
# same report, padded
example = padded_dataset["train"]["input_ids"][0]
print("Type: ", type(example))
print("Length: ", len(example))
print(example)
tokenizer.decode(example)

Type:  <class 'list'>
Length:  301
[101, 1509, 2592, 8179, 2229, 185, 1161, 1105, 2495, 1204, 12754, 175, 1114, 1207, 15415, 1112, 14375, 1116, 174, 7501, 1111, 8974, 5531, 2229, 185, 1161, 1105, 11937, 7577, 3839, 9505, 1175, 1110, 1185, 17811, 20994, 185, 1513, 12602, 174, 3101, 17268, 1137, 185, 1673, 1818, 12858, 25632, 20557, 6873, 5552, 11769, 7409, 4233, 1115, 1211, 2620, 4248, 15070, 6719, 1103, 3621, 2660, 16418, 2050, 14196, 27316, 1110, 2999, 16973, 1933, 1166, 1103, 1286, 13093, 9046, 1439, 1103, 7209, 1103, 3077, 1181, 3105, 14701, 1110, 8362, 16996, 23822, 1895, 13306, 19353, 24211, 1785, 1104, 1103, 16530, 1286, 3971, 1105, 5001, 10346, 1132, 2382, 8351, 1185, 12104, 3621, 2660, 16091, 13505, 7637, 1616, 1965, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

'[CLS] final report examination chest pa and lat indication f with new onset ascites eval for infection technique chest pa and lateral comparison none findings there is no focal consolidation pleural effusion or pneumothorax bilateral nodular opacities that most likely represent nipple shadows the cardiomediastinal silhouette is normal clips project over the left lung potentially within the breast the imaged upper abdomen is unremarkable chronic deformity of the posterior left sixth and seventh ribs are noted impression no acute cardiopulmonary process [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [57]:
# whole word masking: want to mask all the tokens that correspond to a single word. 

import collections
import numpy as np

from transformers import default_data_collator

def whole_word_masking_data_collator(features, wwm_prob=0.15):
    
    for feature in features:
        word_ids = feature.pop("word_ids") #to fit into default_data_collator
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word: # removing repeat word_ids 
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)
            #created a list where each index is a whole word, as a list of the indices of it's tokens
            
        mask = np.random.binomial(1, wwm_prob, (len(mapping),))#each whole word rather than each token has equal chance of selection for masking

        input_ids = feature['input_ids']
        #print(f'Length of input_ids is  {len(input_ids)}')
        labels = feature['labels']
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id    
                
    return default_data_collator(features)

## Masking Chunked Example:

In [58]:
example_samples = [chunked_dataset["train"][i] for i in range(1)]

example_batch = whole_word_masking_data_collator(example_samples)
print(type(example_batch))

for chunk in example_batch["input_ids"]:
    a = tokenizer.convert_ids_to_tokens(chunk)
    print(f"\n'>>> {tokenizer.convert_ids_to_tokens(chunk)}'")

<class 'dict'>

'>>> ['[CLS]', 'final', 'report', 'examination', 'chest', 'p', '##a', '[MASK]', 'la', '##t', '[MASK]', 'f', 'with', 'new', 'onset', 'as', '##cite', '##s', 'e', '##val', 'for', '[MASK]', 'technique', '[MASK]', 'p', '##a', 'and', 'lateral', 'comparison', 'none', 'findings', '[MASK]', 'is', 'no', 'focal', 'consolidation', 'p', '##le', '##ural', 'e', '##ff', '##usion', 'or', 'p', '##ne', '##um', '##oth', '##orax', 'bilateral', 'nod', '##ular', 'op', '##ac', '##ities', '[MASK]', 'most', '[MASK]', 'represent', 'nipple', 'shadows', 'the', 'card', '##io', '##media', '##st', '##inal', 'silhouette', 'is', 'normal', 'clips', 'project', 'over', 'the', 'left', 'lung', 'potentially', '[MASK]', 'the', 'breast', 'the', 'image', '##d', '[MASK]', 'abdomen', 'is', 'un', '##rem', '##ark', '##able', 'chronic', 'def', '##orm', '##ity', 'of', '[MASK]', 'posterior', 'left', 'sixth', 'and', '[MASK]', 'ribs', 'are', 'noted', 'impression', 'no', 'acute', 'card', '##io', '##pu', '##lm', '##ona', '

## Masking Padded Example:

In [60]:
example_samples = [padded_dataset["train"][i] for i in range(1)]
for sample in example_samples:
    # THIS IS A FIX - THE PADDED DATA IS MISSING LABELS
    sample["labels"] = sample["input_ids"].copy()
example_batch = whole_word_masking_data_collator(example_samples)

for chunk in example_batch["input_ids"]:
    a = tokenizer.convert_ids_to_tokens(chunk)
    print(f"\n'>>> {tokenizer.convert_ids_to_tokens(chunk)}'")


'>>> ['[CLS]', 'final', 'report', 'examination', 'chest', 'p', '##a', 'and', 'la', '##t', 'indication', '[MASK]', 'with', '[MASK]', 'onset', 'as', '##cite', '##s', 'e', '##val', 'for', 'infection', 'technique', 'chest', '[MASK]', '[MASK]', '[MASK]', 'lateral', 'comparison', 'none', '[MASK]', 'there', 'is', 'no', 'focal', '[MASK]', 'p', '##le', '##ural', 'e', '##ff', '##usion', 'or', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', 'bilateral', 'nod', '##ular', 'op', '##ac', '##ities', 'that', 'most', '[MASK]', 'represent', 'nipple', 'shadows', '[MASK]', 'card', '##io', '##media', '##st', '##inal', '[MASK]', 'is', 'normal', 'clips', 'project', '[MASK]', '[MASK]', 'left', '[MASK]', 'potentially', 'within', 'the', 'breast', 'the', 'image', '##d', 'upper', 'abdomen', 'is', 'un', '##rem', '##ark', '##able', 'chronic', 'def', '##orm', '##ity', 'of', 'the', '[MASK]', 'left', '[MASK]', 'and', 'seventh', 'ribs', 'are', 'noted', 'impression', 'no', 'acute', 'card', '##io', '##pu', '##lm', '##o

# Model Training (chunking)
### 1. Prepare the datasets batches (with whole-word-masking) 

In [61]:
# Eliminate this source of randomness is to apply the masking once on the whole test set,
# and then use the default data collator in 🤗 Transformers to collect the batches during evaluation


# replace data_collator here with the whole-word-masking ones
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = whole_word_masking_data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [62]:
chunkedDataset = chunked_dataset.remove_columns(['word_ids'])
chunkedDataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 206331
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3345
    })
})

In [63]:
# Apply this function to our test set and drop the unmasked columns so we can replace them with the masked ones
eval_chunkedDataset = chunked_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=chunked_dataset["test"].column_names,
)
eval_chunkedDataset = eval_chunkedDataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_token_type_ids": "token_type_ids", 
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)
tokenizer.decode(eval_chunkedDataset["input_ids"][0])

  0%|          | 0/4 [00:00<?, ?ba/s]

'[CLS] final report ap chest am history [MASK] [MASK] [MASK] [MASK] [MASK] woman check tube placement [MASK] ap chest compared [MASK] tip [MASK] [MASK] endotracheal tube at the upper margin of the clavicles is [MASK] [MASK] than [MASK] from the [MASK] [MASK] care should be taken that the tube does not withdraw any further lungs are clear cardiomediastinal and [MASK] [MASK] silhouettes and pleural [MASK] are [MASK] [SEP] [CLS] final report ap chest [MASK] [MASK] history et tube advanced impression et tube [MASK] standard placement the nasogastric tube ends in the stomach the lungs [MASK] fully expanded and clear the heart size is normal adenopathy at least'

### 2. Set up dataloaders

In [66]:
batch_size = 64

train_dataloader = DataLoader(
    chunked_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=whole_word_masking_data_collator, # replace self-defined whole-word-masking-data-collator
)

# Use the default_data_collator from Transformers for the evaluation set
eval_dataloader = DataLoader(
    eval_chunkedDataset, 
    batch_size=batch_size, 
    collate_fn=default_data_collator
)

In [67]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fdf2c4c8d68>

### 3. Steps for training with Accelerate

In [69]:
# Is it correct lolllll?
model = AutoModelForMaskedLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# https://huggingface.co/docs/transformers/model_doc/auto

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [70]:
# Adam optimizer 
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [71]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, 
    optimizer, 
    train_dataloader, 
    eval_dataloader
)

In [76]:
# Learning rate scheduler:
from transformers import get_scheduler

num_train_epochs = 10 # change this later

num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [86]:
######### Saving onto Huggingface hub ###########
from huggingface_hub import get_full_repo_name

model_name = "CXR_BioClinicalBERT_chunkedv1"
repo_name = get_full_repo_name(model_name)
repo_name

'ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1'

In [87]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

Cloning https://huggingface.co/ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1 into local empty directory.


### 5. Full Training Loop

In [88]:
torch.cuda.device(0)

<torch.cuda.device at 0x7fdedab419e8>

In [89]:
from tqdm.auto import tqdm
import math

progress_bar = tqdm(range(num_training_steps))
perplexities = []

for epoch in range(num_train_epochs): # for now try 2 epochs and see what happen
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    print("Finishing epoch ", epoch)
    
    # Evaluation
    model.eval()
    losses = []
    
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_chunkedDataset)]
    
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")
    perplexities.append(perplexity)

    
    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/32240 [00:00<?, ?it/s]

Finishing epoch  0
>>> Epoch 0: Perplexity: 1.1576959602969485
Finishing epoch  1
>>> Epoch 1: Perplexity: 1.1498089768602706


Several commits (2) will be pushed upstream.


Finishing epoch  2
>>> Epoch 2: Perplexity: 1.1442581128031095


Several commits (2) will be pushed upstream.


Finishing epoch  3
>>> Epoch 3: Perplexity: 1.1400363561367657


Several commits (3) will be pushed upstream.


Finishing epoch  4
>>> Epoch 4: Perplexity: 1.1363503719807606


Several commits (2) will be pushed upstream.


Finishing epoch  5
>>> Epoch 5: Perplexity: 1.13433012291388


Several commits (3) will be pushed upstream.


Finishing epoch  6
>>> Epoch 6: Perplexity: 1.132400135905862


Several commits (4) will be pushed upstream.


Finishing epoch  7
>>> Epoch 7: Perplexity: 1.1313966997247378


Several commits (3) will be pushed upstream.


Finishing epoch  8
>>> Epoch 8: Perplexity: 1.1313809281236062


Several commits (4) will be pushed upstream.


Finishing epoch  9
>>> Epoch 9: Perplexity: 1.1313809281236062


In [90]:
print(perplexities)
print(losses)

[1.1576959602969485, 1.1498089768602706, 1.1442581128031095, 1.1400363561367657, 1.1363503719807606, 1.13433012291388, 1.132400135905862, 1.1313966997247378, 1.1313809281236062, 1.1313809281236062]
tensor([0.1343, 0.1343, 0.1343,  ..., 0.1029, 0.1029, 0.1029], device='cuda:0')


# Comparing pre-trained and fine-tuned models
#### Can also do BLEU score evaluation?

In [91]:
from transformers import pipeline

mask_filler_ft = pipeline(
    "fill-mask", model="ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1")

mask_filler_original = pipeline(
    "fill-mask", model="emilyalsentzer/Bio_ClinicalBERT")

Downloading:   0%|          | 0.00/643 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/358 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [92]:
text1 = "There are no signs of [MASK]."

preds1_ft = mask_filler_ft(text1)
preds1_org = mask_filler_original(text1)

print("Predictions for text1:")
print("1. Pre-trained Bio_ClinicalBERT")
for pred in preds1_org:
    print(f">>> {pred['sequence']}")
print()
print("2. Fine-tuned CXR_Bio_ClinicalBERT_v1")
for pred in preds1_ft:
    print(f">>> {pred['sequence']}")

Predictions for text1:
1. Pre-trained Bio_ClinicalBERT
>>> there are no signs of bleeding.
>>> there are no signs of infection.
>>> there are no signs of withdrawal.
>>> there are no signs of change.
>>> there are no signs of distress.

2. Fine-tuned CXR_Bio_ClinicalBERT_v1
>>> there are no signs of pneumonia.
>>> there are no signs of failure.
>>> there are no signs of consolidation.
>>> there are no signs of complications.
>>> there are no signs of congestion.


In [93]:
text2 = "The patient suffered from [MASK]. "
preds2_ft = mask_filler_ft(text2)
preds2_org = mask_filler_original(text2)

print("Predictions for text2:")
print("1. Pre-trained Bio_ClinicalBERT")
for pred in preds2_org:
    print(f">>> {pred['sequence']}")
print()
print("2. Fine-tuned CXR_Bio_ClinicalBERT_v1")
for pred in preds2_ft:
    print(f">>> {pred['sequence']}")

Predictions for text2:
1. Pre-trained Bio_ClinicalBERT
>>> the patient suffered from pneumonia.
>>> the patient suffered from anxiety.
>>> the patient suffered from fatigue.
>>> the patient suffered from fall.
>>> the patient suffered from this.

2. Fine-tuned CXR_Bio_ClinicalBERT_v1
>>> the patient suffered from trauma.
>>> the patient suffered from pneumonia.
>>> the patient suffered from bleeding.
>>> the patient suffered from seizure.
>>> the patient suffered from surgery.
