<a href="https://colab.research.google.com/github/diya0603/NLP-Project/blob/main/Named_Enity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NER task using ClinicalBERT

In [None]:
import numpy as np
import pandas as pd

In [None]:
import os
import json
import csv

In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
from tqdm import tqdm, trange
from google.colab import drive
drive.mount('/content/drive')
data=pd.read_csv('/content/drive/MyDrive/train.csv').fillna(method='ffill')
data.head(50)
# data=data[:70006]

In [None]:
class SentenceGetter(object):

  def __init__(self, data):
    self.n_sent = 1
    self.data = data
    self.empty = False

    agg_func = lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(), s['tag'].values.tolist())]
    self.grouped = self.data.groupby('Sent_ID').apply(agg_func)
    self.sentences = [s for s in self.grouped]

  def get_next(self):
    try:
      s = self.grouped['{}'.format(self.n_sent)]
      self.n_sent +=1
      return s
    except:
      return None


In [None]:
getter = SentenceGetter(data)

In [None]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
print(sentences[0])

['Obesity', 'in', 'Low-', 'and', 'Middle-Income', 'Countries', ':', 'Burden', ',', 'Drivers', ',', 'and', 'Emerging', 'Challenges', '.']


In [None]:
labels = [[s[1] for s in sentence] for sentence in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
tag_values = list(set(data["tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}
#Padding is addded end of each sentence
tag2idx

{'B-indications': 0, 'O': 1, 'I-indications': 2, 'PAD': 3}

Preprocess the sentence nd labels. Prepare to use with pytorch and bert

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForTokenClassification
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split


In [None]:
torch.__version__

'2.0.1+cu118'

In [None]:
MAX_LEN = 75
bs = 32
#batch size = bs
# sentence length fixed to 75 i.e. 75 tokens
# but bert supports up to 512 tokens

Here we fix some configurations. We will limit our sequence length to 75 tokens and we will use a batch size of 32 as suggested by the Bert paper. Note, that Bert supports sequences of up to 512 tokens.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [None]:
torch.cuda.get_device_name(0)

'Tesla T4'

In [None]:
tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT', do_lower_case=False)

In [None]:

def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [None]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [None]:

input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [None]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
tags

array([[2, 2, 2, ..., 3, 3, 3],
       [2, 2, 2, ..., 3, 3, 3],
       [2, 2, 2, ..., 3, 3, 3],
       ...,
       [2, 2, 2, ..., 3, 3, 3],
       [2, 2, 2, ..., 3, 3, 3],
       [2, 2, 2, ..., 3, 3, 3]])

In [None]:
#attenation mask to ignore PAD token
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [None]:

tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [None]:
# convert to torch tenors
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [None]:
#training time shuffling of the data and testing time we pass them sequentially
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

Fine Tune

In [None]:
import transformers
from transformers import AdamW


In [None]:
model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",num_labels=len(tag2idx),output_attentions = False, output_hidden_states=False)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 

In [None]:
model.cuda();

AdamW optimizer for finetuning

FULL_FINETUNING = True: This is a flag that determines whether full fine-tuning is enabled. If FULL_FINETUNING is True, the entire model will be fine-tuned. If it is False, only the parameters in the model's classifier (typically the last layer(s)) will be fine-tuned.

param_optimizer = list(model.named_parameters()): This line retrieves the named parameters of the model. It obtains a list of tuples, where each tuple contains the name of a parameter and its corresponding value.

no_decay = ['bias', 'gamma', 'beta']: This is a list of parameter names that will not undergo weight decay during optimization. Typically, bias terms (bias) and normalization parameters (gamma, beta) are excluded from weight decay to prevent undesired effects.

optimizer_grouped_parameters: This variable is a list that will hold dictionaries defining the parameters and weight decay rates for the optimizer. It will be used to group the parameters based on whether they should undergo weight decay or not.

If FULL_FINETUNING is True: The parameters are split into two groups. The first group includes parameters for which none of the names in no_decay appear (i.e., regular parameters), and they will have a weight decay rate of 0.01. The second group includes parameters for which any of the names in no_decay appear (i.e., parameters without weight decay), and they will have a weight decay rate of 0.0. This grouping facilitates applying different weight decay rates to different parameter groups.

If FULL_FINETUNING is False: Only the parameters of the model's classifier are considered, and they are added to optimizer_grouped_parameters as a single group.

optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8): This line initializes the AdamW optimizer. It takes optimizer_grouped_parameters as the parameters to optimize. The learning rate (lr) is set to 3e-5, and eps represents a small value to prevent division by zero in the optimizer's calculations.

In [None]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)



In [None]:
#schduler to reduce learning rate linearly throughout the epochs
from transformers import get_linear_schedule_with_warmup

epochs = 3
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Firt bert for ner

In [None]:

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    #Training
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    #               Validation
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        eval_accuracy += flat_accuracy(logits, label_ids)
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print()

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Average train loss: 0.025805184785694275
Validation loss: 0.016787767390762206
Validation Accuracy: 0.9944383221850583


Epoch:  33%|███▎      | 1/3 [42:09<1:24:19, 2529.69s/it]


Average train loss: 0.011735703979407763
Validation loss: 0.01548701220188219
Validation Accuracy: 0.9951318004459268


Epoch:  67%|██████▋   | 2/3 [1:24:19<42:09, 2529.71s/it]


Average train loss: 0.006418607395349668
Validation loss: 0.015464913262411357
Validation Accuracy: 0.9955615105908555


Epoch: 100%|██████████| 3/3 [2:06:28<00:00, 2529.59s/it]







##LIME

In [None]:
!pip install eli5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import scipy
import numpy as np
def monkeypath_itemfreq(sampler_indices):
   return zip(*np.unique(sampler_indices, return_counts=True))

scipy.stats.itemfreq=monkeypath_itemfreq

In [None]:
from eli5.lime import TextExplainer
from eli5.lime.samplers import MaskingTextSampler

In [None]:
import torch.nn.functional as F

In [None]:
! pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
class NERExplainerGenerator1(object):
    def __init__(self, model):
        self.model = model

    def dict2vec(self, pred):
        vectors = []
        for sent in pred:
            sent_res = []
            for word_probs in sent:
                vector = [word_probs[class_idx] for class_idx in range(4)]
                sent_res.append(vector)
            sent_res = np.array(sent_res)
            vectors.append(sent_res)
        vectors = np.array(vectors)
        return vectors

    def get_predict_function(self, word_index):
        def predict_func(texts):
            token_probabilities_list = []
            for sentence in texts:
                tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=True)
                input_ids = torch.tensor([tokenized_sentence]).cuda()
                with torch.no_grad():
                    output = model(input_ids)
                logits = output[0]
                class_probabilities = F.softmax(logits, dim=2)
                token_probabilities = class_probabilities[0]
                token_probabilities = token_probabilities.to('cpu').numpy()
                token_probabilities_list.append(token_probabilities)
            pred = np.array(token_probabilities_list, dtype=object)
            # Filter out the "PAD" class probabilities
            max_words = max(len(subarray) for subarray in pred)  # Determine the maximum number of words in a subarray

            # Pad the subarrays with zeros to make them the same size
            padded_pred = [np.pad(subarray, [(0, max_words - len(subarray)), (0, 0)], mode='constant') for subarray in pred]

            pred_3d = np.array(padded_pred)
            # Filter out the "PAD" class probabilities
            pred_3d=np.delete(pred_3d,tag2idx['PAD'],axis=2)
            pred_3d = pred_3d / np.sum(pred_3d, axis=2, keepdims=True)
            return pred_3d[:, word_index, :]
        return predict_func


In [None]:
text = 'Patient presented with severe abdominal pain and vomiting. CT scan revealed a 5 cm mass in the liver. Liver biopsy confirmed hepatocellular carcinoma.'
explainer1= NERExplainerGenerator1(model)
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

for index,word in enumerate(word_tokenize(text)):
  print(index,word)

0 Patient
1 presented
2 with
3 severe
4 abdominal
5 pain
6 and
7 vomiting
8 .
9 CT
10 scan
11 revealed
12 a
13 5
14 cm
15 mass
16 in
17 the
18 liver
19 .
20 Liver
21 biopsy
22 confirmed
23 hepatocellular
24 carcinoma
25 .


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
word_index = 7 #explain vomitting

func1 = explainer1.get_predict_function(word_index)
sampler = MaskingTextSampler( replacement="UNK",max_replace=0.7,token_pattern=None,bow=False)
samples, similarity = sampler.sample_near(text, n_samples=4)
print(samples)

('Patient UNK with severe UNK pain and UNK. CT scan UNK UNK 5 cm mass in the UNK. Liver UNK UNK hepatocellular carcinoma.', 'Patient UNK UNK UNK UNK UNK UNK vomiting. CT UNK revealed a UNK cm UNK in the liver. Liver biopsy confirmed hepatocellular carcinoma.', 'Patient presented with severe abdominal UNK UNK vomiting. CT scan revealed UNK 5 cm UNK in the liver. Liver biopsy confirmed hepatocellular carcinoma.', 'Patient UNK UNK UNK UNK UNK UNK UNK. CT UNK UNK a UNK UNK UNK in UNK liver. Liver biopsy UNK UNK UNK.')


In [None]:
te = TextExplainer(sampler=sampler, position_dependent=True, random_state=42)
te.fit(text, func1)

# The explainer needs just the one instance text from texts list
explain = te.explain_prediction(target_names=list(tag2idx.keys()), top_targets=3)
print("WORD TO EXPLAIN", word_tokenize(text)[word_index])
explain

  pred_3d = pred_3d / np.sum(pred_3d, axis=2, keepdims=True)


WORD TO EXPLAIN vomiting


Contribution?,Feature
7.575,<BIAS>
-6.196,Highlighted in text (sum)

Contribution?,Feature
6.898,Highlighted in text (sum)
-9.112,<BIAS>

Contribution?,Feature
3.901,Highlighted in text (sum)
-6.53,<BIAS>
