In [1]:
!pip install seqeval



In [2]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [5]:
# verify GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


## Convert to DataFrame for EDA

In [6]:
def convert_data(filepath):

  # Read the data from the text file
  with open(filepath, "r") as file:
      lines = file.readlines()

  # Define an empty list to store the data
  data = []

  # Iterate over each line in the file
  for line in lines:
      # Split the line by spaces
      parts = line.strip().split()

      # Check if the line has the expected number of elements
      if len(parts) == 9:
          # Extract the values from the line
          text_file_name = parts[0]
          sentence_line_number = int(parts[1])
          sentence_word_index = int(parts[2])
          sentence_seq = parts[3]
          start_token = int(parts[4])
          end_token = int(parts[5])
          original_word = parts[6]
          word = parts[7]
          label = parts[8]

          # Append the values as a tuple to the data list
          data.append((text_file_name, sentence_line_number, sentence_word_index, sentence_seq,
                      start_token, end_token, original_word, word, label))

  # Create a DataFrame from the data list with appropriate column names
  df = pd.DataFrame(data, columns=['text_file_name', 'sentence_line_number', 'sentence_word_index',
                                  'sentence_seq', 'start_token', 'end_token', 'original_word',
                                  'word', 'label'])

  return df


In [7]:
train_data_path = "/content/drive/MyDrive/266_final/data/Original_text/dataset1_train.txt"
test_data_path = "/content/drive/MyDrive/266_final/data/Original_text/dataset1_test.txt"

train = convert_data(train_data_path)
test = convert_data(test_data_path)

print(f"Length of train: {len(train)}")
print(f"Length of test: {len(test)}")


Length of train: 895141
Length of test: 585761


In [8]:
df = train.copy()

In [9]:
df['sentence_line_number'].nunique()

1053

In [10]:
df.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
0,data/training_20180910/110727.txt,1,0,,0,9,Admission,Admission,O
1,data/training_20180910/110727.txt,1,1,,10,14,Date,Date,O
2,data/training_20180910/110727.txt,1,2,,14,15,:,:,O
3,data/training_20180910/110727.txt,1,3,,17,18,[,[,O
4,data/training_20180910/110727.txt,1,4,,18,19,*,*,O


In [11]:
df['label'].value_counts()

label
O              802045
B-Drug          16222
I-Frequency     13023
I-Dosage         8779
B-Strength       6691
B-Form           6647
I-Strength       6617
B-Frequency      6279
B-Route          5475
I-Drug           4298
B-Dosage         4221
I-Form           4173
B-Reason         3791
I-Reason         3125
I-Duration       1034
B-ADE             956
I-ADE             776
B-Duration        592
I-Route           397
Name: count, dtype: int64

In [12]:
df[df['sentence_line_number'] == 1]

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
0,data/training_20180910/110727.txt,1,0,,0,9,Admission,Admission,O
1,data/training_20180910/110727.txt,1,1,,10,14,Date,Date,O
2,data/training_20180910/110727.txt,1,2,,14,15,:,:,O
3,data/training_20180910/110727.txt,1,3,,17,18,[,[,O
4,data/training_20180910/110727.txt,1,4,,18,19,*,*,O
...,...,...,...,...,...,...,...,...,...
893314,data/training_20180910/100883.txt,1,23,,64,65,-,-,O
893315,data/training_20180910/100883.txt,1,24,,65,66,3,ORDINAL,O
893316,data/training_20180910/100883.txt,1,25,,66,67,*,*,O
893317,data/training_20180910/100883.txt,1,26,,67,68,*,*,O


### Label labels

In [13]:
# Split labels based on whitespace and turn them into a list
labels = [i.split() for i in df['label'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]

print(unique_labels)

{'I-Reason', 'I-Frequency', 'I-Drug', 'I-Strength', 'B-Strength', 'B-Duration', 'B-Drug', 'B-ADE', 'B-Route', 'B-Dosage', 'I-Form', 'I-Route', 'B-Reason', 'I-ADE', 'I-Duration', 'I-Dosage', 'B-Form', 'B-Frequency', 'O'}


In [14]:
# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(labels_to_ids)

{'B-ADE': 0, 'B-Dosage': 1, 'B-Drug': 2, 'B-Duration': 3, 'B-Form': 4, 'B-Frequency': 5, 'B-Reason': 6, 'B-Route': 7, 'B-Strength': 8, 'I-ADE': 9, 'I-Dosage': 10, 'I-Drug': 11, 'I-Duration': 12, 'I-Form': 13, 'I-Frequency': 14, 'I-Reason': 15, 'I-Route': 16, 'I-Strength': 17, 'O': 18}


In [15]:
def formatted_df_cl(df):
  df['sentence'] = df[[
      'text_file_name',
      'original_word',
      'label']].groupby(
          ['text_file_name'])['original_word'].transform(lambda x: ' '.join(x))

  df['word_labels'] = df[[
      'text_file_name',
      'original_word',
      'label']].groupby(
          ['text_file_name'])['label'].transform(lambda x: ','.join(x))

  return df

In [16]:
df = formatted_df_cl(df)
test = formatted_df_cl(test)

In [17]:
df.iloc[0]

text_file_name                          data/training_20180910/110727.txt
sentence_line_number                                                    1
sentence_word_index                                                     0
sentence_seq                                                           NA
start_token                                                             0
end_token                                                               9
original_word                                                   Admission
word                                                            Admission
label                                                                   O
sentence                Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...
word_labels             O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...
Name: 0, dtype: object

In [18]:
def sentence_level_data(df):
  sentence_level_data = df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
  return sentence_level_data

In [19]:
# sentence_level_data = df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
# sentence_level_data.head()

In [20]:
sentence_level_train = sentence_level_data(df)
sentence_level_test = sentence_level_data(test)

In [21]:
sentence_level_train.iloc[85].sentence



In [22]:
sentence_level_train.iloc[85].word_labels

'O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Drug,O,B-Drug,O,B-Drug,O,O,O,O,B-Drug,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Strength,I-Strength,B-Drug,O,B-Strength,I-Strength,B-Drug,O,B-Strength,I-Strength,B-Route,B-Drug,O,B-Strength,I-Strength,B-

In [23]:
data_train = sentence_level_train.copy()
data_test = sentence_level_test.copy()

In [24]:
data_train.head()

Unnamed: 0,sentence,word_labels
0,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Admission Date : [ * * 2130 - 10 - 2 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,Admission Date : [ * * 2151 - 1 - 25 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Admission Date : [ * * 2193 - 12 - 23 * * ] Di...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,Admission Date : [ * * 2133 - 3 - 28 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [25]:
# data_test.rename(columns={'sentence': 'text', 'word_labels': 'labes'}, inplace=True)

In [26]:

# data_train.to_csv('/content/drive/MyDrive/266_final/data/lf_train.csv')
# # data_test.to_csv('/content/drive/MyDrive/266_final/data/lf_test.csv')


# data_train = pd.read_csv()

### It looks reasonable, let's proceed with training

In [27]:
# from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
# import os




In [28]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("yikuan8/Clinical-Longformer", add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(
    "yikuan8/Clinical-Longformer", num_labels=len(labels_to_ids), gradient_checkpointing=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at yikuan8/Clinical-Longformer and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# # from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
# import os
# from transformers import AutoTokenizer, AutoModelForTokenClassification

# model_name = "emilyalsentzer/Bio_ClinicalBERT"  # Bio_ClinicalBERT model identifier
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(labels_to_ids))

In [30]:
MAX_LEN = 2048
TRAIN_BATCH_SIZE = 1
VALID_BATCH_SIZE = 1
TEST_BATCH_SIZE = 2
EPOCHS = 10
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


In [31]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             return_offsets_mapping=True,
                             padding='max_length',
                             is_split_into_words=True,
                             truncation=True,
                             max_length=self.max_len,
                            #  add_prefix_space=True
                            )

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

  def __len__(self):
        return self.len

In [32]:
train_size = 0.8
train_dataset = data_train.sample(frac=train_size,random_state=200)
val_dataset = data_train.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data_train.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("Val Dataset: {}".format(val_dataset.shape))
print("Test Dataset: {}".format(data_test.shape))


training_set = dataset(train_dataset, tokenizer, MAX_LEN)
val_set = dataset(val_dataset, tokenizer, MAX_LEN)
test_set = dataset(data_test.reset_index(drop=True), tokenizer, MAX_LEN)


FULL Dataset: (303, 2)
TRAIN Dataset: (242, 2)
Val Dataset: (61, 2)
Test Dataset: (202, 2)


In [33]:
train_dataset.to_csv('/content/drive/MyDrive/266_final/data/train.csv', index=False)
val_dataset.to_csv('/content/drive/MyDrive/266_final/data/val.csv', index=False)


In [34]:
training_set[0]

{'input_ids': tensor([    0, 18032, 10566,  ..., 17844, 26056,     2]),
 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]),
 'offset_mapping': tensor([[0, 0],
         [0, 9],
         [0, 4],
         ...,
         [0, 2],
         [0, 4],
         [0, 0]]),
 'labels': tensor([-100,   18,   18,  ...,   17,    4, -100])}

In [35]:
test_set[0]

{'input_ids': tensor([    0, 18032, 10566,  ...,    79,  2226,     2]),
 'attention_mask': tensor([1, 1, 1,  ..., 1, 1, 1]),
 'offset_mapping': tensor([[0, 0],
         [0, 9],
         [0, 4],
         ...,
         [0, 3],
         [0, 9],
         [0, 0]]),
 'labels': tensor([-100,   18,   18,  ...,   18,   18, -100])}

In [36]:
# for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
#   print('{0:10}  {1}'.format(token, label))

In [37]:
# for token, label in zip(tokenizer.convert_ids_to_tokens(test_set[0]["input_ids"]), test_set[0]["labels"]):
#   print('{0:10}  {1}'.format(token, label))

In [38]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }


training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)
testing_loader = DataLoader(test_set, **test_params)



In [39]:
# model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

LongformerForTokenClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
             

In [40]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.7948, device='cuda:0', grad_fn=<NllLossBackward0>)

In [41]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 2048, 19])

In [42]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE, )

In [43]:
import torch.nn.functional as F


def focal_loss(logits, labels, alpha=0.25, gamma=2.0, ignore_index=-100):
    """
    logits: [batch_size, seq_len, num_labels] - model predictions
    labels: [batch_size, seq_len] - ground truth labels
    """
    # Calculate Cross Entropy Loss without reduction
    ce_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), reduction='none', ignore_index=ignore_index)

    # Get the predictions
    pred_probs = F.softmax(logits.view(-1, logits.size(-1)), dim=-1)
    pred_class = labels.view(-1)

    # focusing parameter
    gamma = gamma

    # Filter out 'ignore_index' labels
    filtered = labels.view(-1) != ignore_index

    # Calculate focal loss
    ce_loss_filtered = ce_loss[filtered]
    pred_probs_filtered = pred_probs[filtered]
    pred_class_filtered = pred_class[filtered]

    # Construct the loss
    pt = pred_probs_filtered.gather(1, pred_class_filtered.unsqueeze(-1)).squeeze()
    loss = ((1 - pt) ** gamma * ce_loss_filtered).mean()  # mean over the batch

    return loss



In [44]:
def create_global_attention_mask_for_ade(tokens, labels):
    """
    Create a global attention mask for tokens identified as related to ADEs.

    Args:
        tokens (List[str]): List of tokens in the sequence.
        labels (List[str]): List of labels for each token, with ADE-related labels clearly marked.

    Returns:
        torch.Tensor: The global attention mask.
    """
    global_attention_mask = torch.zeros(len(tokens))

    for idx, label in enumerate(labels):
        if label.startswith('B-ADE') or label.startswith('I-ADE'):  # Assuming ADE labels are marked as such
            global_attention_mask[idx] = 1

    return global_attention_mask

In [45]:
# def train(epoch):
#     tr_loss, tr_accuracy = 0, 0
#     nb_tr_steps = 0
#     tr_preds, tr_labels = [], []

#     model.train()

#     for idx, (batch, labels) in enumerate(training_loader):
#         ids = batch['input_ids'].to(device, dtype=torch.long)
#         mask = batch['attention_mask'].to(device, dtype=torch.long)
#         labels = labels.to(device, dtype=torch.long)

#         # Create a global attention mask with 1s for first token and ADE entity locations
#         global_attention_mask = torch.zeros_like(ids)
#         global_attention_mask[:, 0] = 1  # Global attention on the first token ([CLS])

#         for batch_idx in range(ids.size(0)):  # Loop over batch dimension
#             for token_idx in range(ids.size(1)):  # Loop over sequence length
#                 # Check if the current token is B-ADE or I-ADE and set global attention if so
#                 if labels[batch_idx, token_idx] == labels_to_ids['B-ADE'] or labels[batch_idx, token_idx] == labels_to_ids['I-ADE']:
#                     global_attention_mask[batch_idx, token_idx] = 1

#         outputs = model(input_ids=ids, attention_mask=mask, global_attention_mask=global_attention_mask.to(device))
#         logits = outputs.logits
#         loss = focal_loss(logits, labels)  # Implement your focal_loss function

#         tr_loss += loss.item()
#         nb_tr_steps += 1

#         if idx % 100 == 0:
#             loss_step = tr_loss / nb_tr_steps
#             print(f"Training loss per 100 training steps: {loss_step}")

#         # Assuming implementation for gradient clipping and optimizer step
#         optimizer.zero_grad()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
#         optimizer.step()

#     print(f"Training loss epoch: {tr_loss / nb_tr_steps}")

In [46]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []

    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        # Create a global attention mask with 1s for first token and ADE entity locations
        global_attention_mask = torch.zeros_like(ids)
        global_attention_mask[:, 0] = 1  # Global attention on the first token ([CLS])

        # # Assuming 'labels_to_ids' is a dictionary mapping label strings to their encoded IDs
        # b_ade_id = labels_to_ids['B-Drug']
        # i_ade_id = labels_to_ids['I-Drug']

        # # Find indices where labels are B-ADE or I-ADE
        # b_ade_indices = (labels == b_ade_id).nonzero(as_tuple=True)
        # i_ade_indices = (labels == i_ade_id).nonzero(as_tuple=True)

        # # Apply global attention to B-ADE and I-ADE locations
        # # Assuming a 2D structure for `labels`, adjust dimensions as necessary
        # for idx in b_ade_indices[0]:
        #     global_attention_mask[idx] = 1
        # for idx in i_ade_indices[0]:
        #     global_attention_mask[idx] = 1

        # Perform a forward pass to get the logits
        outputs = model(input_ids=ids, attention_mask=mask, global_attention_mask=global_attention_mask)
        logits = outputs.logits
        # Calculate the loss using focal_loss function directly
        loss = focal_loss(logits, labels)  # Assuming focal_loss is correctly implemented elsewhere

        tr_loss += loss.item()
        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # Decoding the logits to compute accuracy
        active_logits = logits.view(-1, model.num_labels)  # Adjust according to your model's output shape
        flattened_predictions = torch.argmax(active_logits, axis=1)

        # Ignoring the predictions of the padding tokens
        active_accuracy = labels.view(-1) != -100
        active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(active_labels, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels.cpu().numpy())
        tr_preds.extend(predictions.cpu().numpy())

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [47]:
torch.cuda.empty_cache()

In [48]:
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

In [49]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.475255012512207
Training loss per 100 training steps: 0.3614272003311037
Training loss per 100 training steps: 0.24854684799483315
Training loss epoch: 0.21998569088668585
Training accuracy epoch: 0.9418938144051487
Training epoch: 2
Training loss per 100 training steps: 0.1692674458026886
Training loss per 100 training steps: 0.07292473481191175
Training loss per 100 training steps: 0.06451280263225914
Training loss epoch: 0.06167368947967025
Training accuracy epoch: 0.9778579885027718
Training epoch: 3
Training loss per 100 training steps: 0.07445260137319565
Training loss per 100 training steps: 0.039080188472497064
Training loss per 100 training steps: 0.037626489910467374
Training loss epoch: 0.03667426920096171
Training accuracy epoch: 0.98333298312044
Training epoch: 4
Training loss per 100 training steps: 0.0032598497346043587
Training loss per 100 training steps: 0.030899977374886447
Training loss per 100 training steps

In [50]:
def valid(model, testing_loader):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)
            global_attention_mask = torch.zeros_like(ids)
            global_attention_mask[:, 0] = 1  # Apply global attention to the first token


            outputs = model(input_ids=ids, attention_mask=mask, global_attention_mask=global_attention_mask)
            loss = focal_loss(outputs.logits, labels)

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100 == 0:
                print(f"Validation loss per 100 evaluation steps: {eval_loss / nb_eval_steps}")

            active_logits = outputs.logits.view(-1, model.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)
            active_accuracy = labels.view(-1) != -100
            active_labels = torch.masked_select(labels.view(-1), active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(active_labels.cpu().numpy())
            eval_preds.extend(predictions.cpu().numpy())

    # Calculate and print final evaluation metrics
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss / nb_eval_steps}")
    print(f"Validation Accuracy: {eval_accuracy}")

    # Optionally return detailed evaluation metrics
    return eval_labels, eval_preds


In [51]:
# def valid(model, testing_loader, labels_to_ids):
#     model.eval()
#     eval_loss, eval_accuracy = 0, 0
#     nb_eval_steps = 0
#     eval_preds, eval_labels = [], []

#     with torch.no_grad():
#         for idx, batch in enumerate(testing_loader):
#             ids = batch['input_ids'].to(device, dtype=torch.long)
#             mask = batch['attention_mask'].to(device, dtype=torch.long)
#             labels = batch['labels'].to(device, dtype=torch.long)


#             # Initialize global attention mask: all zeros, then set dynamically for ADE tokens
#             global_attention_mask = torch.zeros_like(ids)
#             global_attention_mask[:, 0] = 1

#             # # Dynamically set global attention for B-ADE and I-ADE locations
#             # for batch_idx in range(ids.size(0)):  # Loop over batch dimension
#             #     for token_idx in range(ids.size(1)):  # Loop over sequence length
#             #         label_id = labels[batch_idx, token_idx].item()  # Get the numerical label ID
#             #         # Check if the current token is B-ADE or I-ADE and set global attention if so
#             #         if label_id == labels_to_ids['B-Drug'] or label_id == labels_to_ids['I-Drug']:
#             #             global_attention_mask[batch_idx, token_idx] = 1

#             # Perform a forward pass to get the logits
#             outputs = model(input_ids=ids, attention_mask=mask, global_attention_mask=global_attention_mask)
#             logits = outputs.logits
#             loss = focal_loss(logits, labels)  # Assuming focal_loss is correctly implemented elsewhere

#             eval_loss += loss.item()
#             nb_eval_steps += 1

#             if idx % 100 == 0:
#                 print(f"Validation loss per 100 evaluation steps: {eval_loss / nb_eval_steps}")

#             # Decoding the logits to compute accuracy
#             active_logits = logits.view(-1, model.num_labels)
#             flattened_predictions = torch.argmax(active_logits, axis=1)
#             active_accuracy = labels.view(-1) != -100
#             active_labels = torch.masked_select(labels.view(-1), active_accuracy)
#             predictions = torch.masked_select(flattened_predictions, active_accuracy)

#             eval_labels.extend(active_labels.cpu().numpy())
#             eval_preds.extend(predictions.cpu().numpy())

#     eval_accuracy = accuracy_score(eval_labels, eval_preds)
#     print(f"Validation Loss: {eval_loss / nb_eval_steps}")
#     print(f"Validation Accuracy: {eval_accuracy}")

#     return eval_labels, eval_preds

In [53]:
labels, predictions = valid(model, val_loader)


Validation loss per 100 evaluation steps: 0.0052735209465026855
Validation Loss: 0.020369224226878493
Validation Accuracy: 0.9864398564355864


In [54]:
# Convert numerical labels and predictions to their string equivalents
string_labels = [ids_to_labels[id] for id in labels]  # Convert every label ID in the list
string_predictions = [ids_to_labels[id] for id in predictions]  # Same for predictions

# Since `seqeval` expects a list of lists (one per sentence), adjust accordingly
# This step assumes each label/prediction is already grouped by sentences
string_labels = [string_labels]  # Wrap in another list if you have flat lists
string_predictions = [string_predictions]

from seqeval.metrics import classification_report

print(classification_report(string_labels, string_predictions))

              precision    recall  f1-score   support

         ADE       0.44      0.65      0.53        80
      Dosage       0.84      0.84      0.84       190
        Drug       0.90      0.92      0.91      1184
    Duration       0.70      0.79      0.74        42
        Form       0.88      0.86      0.87       197
   Frequency       0.70      0.77      0.74       299
      Reason       0.48      0.62      0.54       252
       Route       0.81      0.79      0.80       255
    Strength       0.93      0.96      0.95       358

   micro avg       0.80      0.85      0.83      2857
   macro avg       0.74      0.80      0.77      2857
weighted avg       0.82      0.85      0.83      2857



In [55]:
test_labels, test_predictions = valid(model, testing_loader)


Validation loss per 100 evaluation steps: 0.030953148379921913
Validation loss per 100 evaluation steps: 0.01939068721966947
Validation Loss: 0.01939068721966947
Validation Accuracy: 0.9871641601114471


In [None]:
!PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [None]:
# test_labels, test_preds = valid(model, testing_loader)

In [61]:
# Convert numerical labels and predictions to their string equivalents
string_labels = [ids_to_labels[id] for id in test_labels]  # Convert every label ID in the list
string_predictions = [ids_to_labels[id] for id in test_predictions]  # Same for predictions

# Since `seqeval` expects a list of lists (one per sentence), adjust accordingly
# This step assumes each label/prediction is already grouped by sentences
string_labels = [string_labels]  # Wrap in another list if you have flat lists
string_predictions = [string_predictions]

from seqeval.metrics import classification_report

print(classification_report(string_labels, string_predictions))

              precision    recall  f1-score   support

         ADE       0.48      0.58      0.53       266
      Dosage       0.83      0.84      0.83       528
        Drug       0.90      0.94      0.92      3467
    Duration       0.50      0.55      0.52       110
        Form       0.87      0.84      0.86       553
   Frequency       0.65      0.72      0.68       633
      Reason       0.43      0.58      0.50       871
       Route       0.88      0.88      0.88       716
    Strength       0.91      0.95      0.93       849

   micro avg       0.79      0.85      0.82      7993
   macro avg       0.72      0.76      0.74      7993
weighted avg       0.80      0.85      0.82      7993



In [None]:
# labels_to_ids

In [None]:
# labels_to_ids = {v: k for k, v in ids_to_labels.items()}

# # Assuming ADE has a specific numerical ID or label name in `ids_to_labels`
# ade_id = labels_to_ids['B-ADE']

# # Convert labels and predictions back to IDs if they are not already
# true_ids = [labels_to_ids[label] for label in labels]
# predicted_ids = [labels_to_ids[prediction] for prediction in predictions]

# # Find indices where ADE was the true label but was predicted incorrectly,
# # and where ADE was predicted but not the true label
# misclassified_as_ade = []
# misclassified_not_ade = []
# for i, (true, pred) in enumerate(zip(true_ids, predicted_ids)):
#     if true == ade_id and true != pred:
#         misclassified_not_ade.append(i)
#     elif pred == ade_id and true != pred:
#         misclassified_as_ade.append(i)


In [None]:
# misclassified_not_ade

In [None]:
# def retrieve_original_sentence(index, dataframe):
#     """Fetch the original sentence and its word labels based on the index."""
#     sentence = dataframe.iloc[index]['sentence']
#     word_labels = dataframe.iloc[index]['word_labels']
#     return sentence, word_labels.split(',')


In [None]:
# val_dataset

In [None]:
# # Assuming `misclassified_not_ade` and `misclassified_as_ade` are lists of indices from previous steps
# for index in misclassified_not_ade[:10]:  # Adjust the range as needed
#     sentence, labels = retrieve_original_sentence(index, val_dataset)
#     print(f"Index {index} - Misclassified (Not ADE):")
#     print("Sentence:", sentence)
#     print("Labels:", labels)
#     print("\n")

# for index in misclassified_as_ade[:10]:  # Adjust the range as needed
#     sentence, labels = retrieve_original_sentence(index, val_dataset)
#     print(f"Index {index} - Misclassified (False ADE):")
#     print("Sentence:", sentence)
#     print("Labels:", labels)
#     print("\n")


In [None]:
# Convert numerical labels and predictions to their string equivalents
string_labels = [ids_to_labels[id] for id in labels]  # Convert every label ID in the list
string_predictions = [ids_to_labels[id] for id in predictions]  # Same for predictions

# Since `seqeval` expects a list of lists (one per sentence), adjust accordingly
# This step assumes each label/prediction is already grouped by sentences
string_labels = [string_labels]  # Wrap in another list if you have flat lists
string_predictions = [string_predictions]



In [62]:
import os

directory = "/content/drive/MyDrive/266_final/clinical_longformer_final"

if not os.path.exists(directory):
    os.makedirs(directory)

tokenizer.save_vocabulary(directory)
model.save_pretrained(directory, safe_serialization=True)

In [None]:
from seqeval.metrics import classification_report

print(classification_report([test_labels], [test_preds]))

Resources

### https://github.com/lcampillos/Medical-NER/blob/master/bert_ner.ipynb
### https://medium.com/analytics-vidhya/bio-tagged-text-to-original-text-99b05da6664