# Text classification (multi-label)
From tutorial: 
- https://curiousily.com/posts/multi-label-text-classification-with-bert-and-pytorch-lightning/
- https://kyawkhaung.medium.com/multi-label-text-classification-with-bert-using-pytorch-47011a7313b9

In [1]:
import os
import sys
import random
import warnings
warnings.filterwarnings("ignore")

#data manupulation libs
import pandas as pd
import numpy as np
import pickle
#from sklearn.model_selection import train_test_split

#from pandarallel import pandarallel

# Initialization
#pandarallel.initialize()

#string manupulation libs
#string manupulation libs
import re
import string
from string import digits
import spacy
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

#torch libs
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from datasets import Dataset, load_dataset, DatasetDict
import torch.nn as nn
import torch.optim as optim

import transformers

from transformers import AutoTokenizer, AutoModel, TrainingArguments

# data manipulations
from pathlib import Path
import uuid
import pydicom

from PIL import Image

import cv2
import matplotlib.pyplot as plt

import collections

from transformers import default_data_collator

[nltk_data] Downloading package stopwords to
[nltk_data]     /media/SharedUsers/dlc19/home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# replace with the padded one
tokenizer = AutoTokenizer.from_pretrained("ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1")
model = AutoModel.from_pretrained("ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1") 

Some weights of the model checkpoint at ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1 and are newly initialized: ['bert.poole

In [16]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(2))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 3 GPU(s) available.
We will use the GPU: GeForce RTX 3090


In [4]:
# load in test pickle
with open('test.pkl', 'rb') as f:
    df_test = pickle.load(f)

#load in train pickle
with open('train.pkl', 'rb') as f:
    df_train = pickle.load(f)

In [5]:
# load dataframes as datasets
tds = Dataset.from_pandas(df_train)
vds = Dataset.from_pandas(df_test)


# put both train and test set into one DatasetDict
reports_dataset = DatasetDict()

reports_dataset['train'] = tds
reports_dataset['test'] = vds

print(reports_dataset)

DatasetDict({
    train: Dataset({
        features: ['study_id', 'raw_report', 'diagnosis', '__index_level_0__'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['study_id', 'raw_report', 'diagnosis', '__index_level_0__'],
        num_rows: 3269
    })
})


In [6]:
reports_dataset = reports_dataset.remove_columns(["__index_level_0__"])
reports_dataset = reports_dataset.rename_column("diagnosis", "labels")
reports_dataset

DatasetDict({
    train: Dataset({
        features: ['study_id', 'raw_report', 'labels'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['study_id', 'raw_report', 'labels'],
        num_rows: 3269
    })
})

In [7]:
# Example
print(reports_dataset["train"]["study_id"][0])
print(reports_dataset["train"]["raw_report"][0])
print(reports_dataset["train"]["labels"][0])
print(type(reports_dataset["train"]["labels"][0]))

50414267
                                 FINAL REPORT EXAMINATION  CHEST PA AND LAT  INDICATION  F with new onset ascites   eval for infection  TECHNIQUE  Chest PA and lateral  COMPARISON  None  FINDINGS   There is no focal consolidation pleural effusion or pneumothorax  Bilateral nodular opacities that most likely represent nipple shadows The cardiomediastinal silhouette is normal  Clips project over the left lung potentially within the breast The imaged upper abdomen is unremarkable Chronic deformity of the posterior left sixth and seventh ribs are noted  IMPRESSION   No acute cardiopulmonary process
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
<class 'list'>


### Tokenization and Padding
- same as before: returning input_ids, attention_mask, labels (which is the array [0, 1, 1, 0, 0] 

In [8]:
def tokenize_function_padding(dataset):
    result = tokenizer(dataset["raw_report"])
    return result

# Use batched=True to activate fast multithreading!
tokenized_datasets = reports_dataset.map(
    tokenize_function_padding, batched=True, remove_columns=["raw_report", "study_id"])

tokenized_datasets

  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 222337
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3269
    })
})

In [9]:
def padding(dataset):
    
    num_items = len(dataset['input_ids']) # to get number of all items in train dataset
        
    while(len(dataset['input_ids']) < 300):
        dataset['input_ids'].append(0)
        dataset['token_type_ids'].append(0)
        dataset['attention_mask'].append(0)
        
    return dataset

padded_dataset = tokenized_datasets.map(padding, batched=False)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [10]:
len(padded_dataset["test"]["input_ids"][0])

300

In [11]:
smaller_dataset = padded_dataset.filter(lambda example: len(example['input_ids'])<=300)

  0%|          | 0/223 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [12]:
padded_dataset = smaller_dataset

def check_function(dataset):

    lenReport = len(dataset['input_ids'])
    
    if (lenReport > 300):
        print('yes')
    return dataset

padded_dataset.map(check_function, batched = False)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 221006
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3250
    })
})

In [15]:
# class customDataset(Dataset):
#     def __init__(self, dataset, types):
#         self.dataset = dataset[types]
    
#     def __len__(self):
#         return len(self.dataset["input_ids"])
    
#     def __getitem__(self, idx):
#         input_ids = self.dataset["input_ids"][idx]
#         token_type_ids = self.dataset["token_type_ids"][idx]
#         attention_mask = self.dataset["attention_mask"][idx]
#         labels = self.dataset["labels"][idx]
        
#         return dict(
#             input_ids = torch.tensor(input_ids, dtype=torch.long), 
#             token_type_ids = torch.tensor(token_type_ids, dtype=torch.long), 
#             attention_mask = torch.tensor(attention_mask, dtype=torch.long), 
#             labels = torch.tensor(labels, dtype=torch.float)
#         )

In [16]:
# train_dataset = customDataset(padded_dataset, "train")
# test_dataset = customDataset(padded_dataset, "test")

In [13]:
# print(train_dataset[20000]["input_ids"])
# print(train_dataset[20000]["input_ids"].size)
# print(train_dataset[20000]["token_type_ids"])
# print(train_dataset[20000]["token_type_ids"].size)
# print(train_dataset[20000]["attention_mask"])
# print(train_dataset[20000]["attention_mask"].size)
# print(train_dataset[20000]["labels"])

### Adding linear layer to BERTmodel

In [18]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert = AutoModel.from_pretrained("ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1") 
#         self.dropout = torch.nn.Dropout(0.3) # dunno if this work, forget about it now
        self.classifier = torch.nn.Linear(768, 13) # 768 and 13 are fixed in our case
    
    def forward(self, input_ids, mask, token_type_ids):
        _, output = self.bert(input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids) # dun need word_id I think
#         output = self.dropout(output)
        output = self.classifier(output.pooler_output)
        return output

model = BERTClass()

Some weights of the model checkpoint at ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1 and are newly initialized: ['bert.poole

### Train with Accelerate
- Try to train it the same way as before, with a linear layer added to BERTmodel
- Define new evaluation score: BCEwithLogits
- Add labels into training

#### 1. Dataloaders

In [19]:
# from transformers import default_data_collator
# data_collator = default_data_collator

In [20]:
batch_size = 64

train_dataloader = DataLoader(
    padded_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
#     collate_fn=default_data_collator,
)

# Use the default_data_collator from Transformers for the evaluation set
eval_dataloader = DataLoader(
    padded_dataset["test"], 
    batch_size=batch_size, 
#     collate_fn=default_data_collator,
)

In [25]:
# # not yet return as a tensor?
# for batch_idx, data in enumerate(train_dataloader):
#     print(batch_idx)
#     print(data)

In [26]:
print(len(train_dataloader))
print(len(eval_dataloader))

3454
51


#### 2. Train with Accelerate 

In [27]:
# Adam optimizer 
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [28]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, 
    optimizer, 
    train_dataloader, 
    eval_dataloader
)

RuntimeError: CUDA out of memory. Tried to allocate 86.00 MiB (GPU 0; 23.69 GiB total capacity; 249.20 MiB already allocated; 24.81 MiB free; 268.00 MiB reserved in total by PyTorch)

In [30]:
# Learning rate scheduler:
from transformers import get_scheduler

num_train_epochs = 10 # change this later

num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [31]:
######### Saving onto Huggingface hub ###########
from huggingface_hub import get_full_repo_name

model_name = "CXR_BioClinicalBERT_Class"
repo_name = get_full_repo_name(model_name)
repo_name

'ICLbioengNLP/CXR_BioClinicalBERT_Class'

In [32]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

/media/SharedUsers/dlc19/home/codes/nlp-fineTuningBERT/TextClassification/CXR_BioClinicalBERT_Class is already a clone of https://huggingface.co/ICLbioengNLP/CXR_BioClinicalBERT_Class. Make sure you pull the latest changes with `repo.git_pull()`.


#### 4. Training Loop

In [33]:
# Loss function - BCEwithLogitsLoss
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [34]:
val_targets = []
val_outputs = []

In [72]:
model.to(device)

from tqdm.auto import tqdm
import math

progress_bar = tqdm(range(num_training_steps))


for epoch in range(num_train_epochs): 
    
    train_loss = 0
    valid_loss = 0
    
    # Training
    model.train()
    print('############# Epoch {}: Training Start   #############'.format(epoch))
    
    for batch_idx, data in enumerate(train_dataloader):
        # optional - see Accelerate documentation 
        ids = data["input_ids"].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.float)
        
        outputs = model(ids, mask, token_type_ids)
        
        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        
#         if batch_idx%5000==0:
#             print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        
        progress_bar.update(1)
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    
    
    # Evaluation
    model.eval()
    
    with torch.no_grad():
        for batch_idx, data in enumerate(eval_dataloader):
            ids = data["input_ids"].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.float)
            
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            losses.append(accelerator.gather(loss.repeat(batch_size)))

            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

    losses = torch.cat(losses)
    losses = losses[: len(eval_chunkedDataset)]
    
    print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))
    
    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/34540 [00:00<?, ?it/s]

############# Epoch 0: Training Start   #############


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

### Prediction (without using pipeline)


In [None]:
# Sth like this?

THRESHOLD = 0.5
test_comment = "You are such a loser! You'll regret everything you've done to me!"

encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=False,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

_, test_prediction = trained_model(encoding["input_ids"], encoding["attention_mask"])
test_prediction = test_prediction.flatten().numpy()

for label, prediction in zip(LABEL_COLUMNS, test_prediction):
  if prediction < THRESHOLD:
    continue
  print(f"{label}: {prediction}")

### Prediction - use Pipeline: zero-shot-classification
**Supports multi_label!**
- Source: https://discuss.huggingface.co/t/new-pipeline-for-zero-shot-text-classification/681
- Source: https://huggingface.co/facebook/bart-large-mnli

In [16]:
# Example: 
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [17]:
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing', 'exploration']
classifier(sequence_to_classify, candidate_labels, multi_class=True)

The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.


{'sequence': 'one day I will see the world',
 'labels': ['travel', 'exploration', 'dancing', 'cooking'],
 'scores': [0.9945111274719238,
  0.9383889436721802,
  0.005706179421395063,
  0.0018193130381405354]}

In [11]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1")

Some weights of the model checkpoint at ICLbioengNLP/CXR_BioClinicalBERT_chunkedv1 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at IC

In [14]:
sequence_to_classify = " FINAL REPORT EXAMINATION  CHEST PA AND LAT  INDICATION  F with new onset ascites   eval for infection  TECHNIQUE  Chest PA and lateral  COMPARISON  None  FINDINGS   There is no focal consolidation pleural effusion or pneumothorax  Bilateral nodular opacities that most likely represent nipple shadows The cardiomediastinal silhouette is normal  Clips project over the left lung potentially within the breast The imaged upper abdomen is unremarkable Chronic deformity of the posterior left sixth and seventh ribs are noted  IMPRESSION   No acute cardiopulmonary process"
candidate_labels = ["Atelectasis", "Cardiomegaly", "Consolidation", "Edema", "Enlarged Cardiomediastinum", "Fracture", "Lung Lesion", "Lung Opacity", "No Finding", "Pleural Effusion", "Pleural Other", "Pneumonia", "Pneumothorax"]
classifier(sequence_to_classify, candidate_labels, multi_label=True)

{'sequence': ' FINAL REPORT EXAMINATION  CHEST PA AND LAT  INDICATION  F with new onset ascites   eval for infection  TECHNIQUE  Chest PA and lateral  COMPARISON  None  FINDINGS   There is no focal consolidation pleural effusion or pneumothorax  Bilateral nodular opacities that most likely represent nipple shadows The cardiomediastinal silhouette is normal  Clips project over the left lung potentially within the breast The imaged upper abdomen is unremarkable Chronic deformity of the posterior left sixth and seventh ribs are noted  IMPRESSION   No acute cardiopulmonary process',
 'labels': ['No Finding',
  'Enlarged Cardiomediastinum',
  'Pneumothorax',
  'Atelectasis',
  'Pleural Effusion',
  'Edema',
  'Pleural Other',
  'Cardiomegaly',
  'Pneumonia',
  'Lung Opacity',
  'Lung Lesion',
  'Consolidation',
  'Fracture'],
 'scores': [0.447907954454422,
  0.4478684961795807,
  0.44755780696868896,
  0.4475393295288086,
  0.4475173354148865,
  0.4472104012966156,
  0.44697681069374084,
  