In [1]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=af3d0105e3b0ba8d25333373a105c7c65d5ca6b2a8240da50fd10682ad0dce1f
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
import pandas as pd
import numpy as np

import os

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:

# Select the compute device: use the GPU when CUDA is available, otherwise fall back to CPU.
# `device` is read later when the model and each batch's tensors are moved with `.to(device)`.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


## Convert to DataFrame for EDA

In [5]:
def convert_data(filepath):
  """Parse a whitespace-delimited token annotation file into a DataFrame.

  Every line that splits into exactly nine whitespace-separated fields becomes
  one row; lines with any other field count (blank or malformed lines) are
  skipped. The file is streamed line by line rather than loaded in full.

  Args:
    filepath: Path of the text file to read (decoded as UTF-8).

  Returns:
    pandas.DataFrame with the columns text_file_name, sentence_line_number,
    sentence_word_index, sentence_seq, start_token, end_token, original_word,
    word and label. sentence_line_number, sentence_word_index, start_token and
    end_token are converted to int; the remaining columns stay strings.

  Raises:
    ValueError: If a nine-field line holds a non-integer value in one of the
      integer columns.
  """
  columns = ['text_file_name', 'sentence_line_number', 'sentence_word_index',
             'sentence_seq', 'start_token', 'end_token', 'original_word',
             'word', 'label']
  # Positions (within a split line) of the fields that are stored as integers.
  int_positions = (1, 2, 4, 5)

  rows = []
  with open(filepath, "r", encoding="utf-8") as file:
    for line in file:
      # str.split() with no argument already discards surrounding whitespace.
      parts = line.split()

      # Keep only lines that have the expected number of fields.
      if len(parts) != len(columns):
        continue

      for pos in int_positions:
        parts[pos] = int(parts[pos])
      rows.append(tuple(parts))

  return pd.DataFrame(rows, columns=columns)


In [6]:
# Token-level annotation files stored on the mounted Google Drive.
train_data_path = "/content/drive/MyDrive/266_final/data/Original_text/dataset1_train.txt"
test_data_path = "/content/drive/MyDrive/266_final/data/Original_text/dataset1_test.txt"

# One row per token.
# NOTE(review): the name `train` is rebound further down by `def train(epoch)`,
# so this cell must run before that definition and `train` no longer refers to
# the DataFrame afterwards.
train = convert_data(train_data_path)
test = convert_data(test_data_path)

# Row counts are token counts, not sentence counts.
print(f"Length of train: {len(train)}")
print(f"Length of test: {len(test)}")


Length of train: 895141
Length of test: 585761


In [7]:
df = train.copy()
df['sentence_line_number'].nunique()

1053

In [8]:
df['label'].value_counts().sort_values()

label
I-Route           397
B-Duration        592
I-ADE             776
B-ADE             956
I-Duration       1034
I-Reason         3125
B-Reason         3791
I-Form           4173
B-Dosage         4221
I-Drug           4298
B-Route          5475
B-Frequency      6279
I-Strength       6617
B-Form           6647
B-Strength       6691
I-Dosage         8779
I-Frequency     13023
B-Drug          16222
O              802045
Name: count, dtype: int64

### Collect and encode labels

In [9]:
# Split labels based on whitespace and turn them into a list
labels = [i.split() for i in df['label'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]

print(unique_labels)

{'B-Strength', 'I-ADE', 'I-Form', 'B-Route', 'I-Drug', 'I-Duration', 'B-Dosage', 'B-Frequency', 'B-ADE', 'B-Form', 'B-Reason', 'I-Strength', 'I-Route', 'B-Duration', 'I-Reason', 'I-Frequency', 'O', 'I-Dosage', 'B-Drug'}


In [10]:
# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(labels_to_ids)

{'B-ADE': 0, 'B-Dosage': 1, 'B-Drug': 2, 'B-Duration': 3, 'B-Form': 4, 'B-Frequency': 5, 'B-Reason': 6, 'B-Route': 7, 'B-Strength': 8, 'I-ADE': 9, 'I-Dosage': 10, 'I-Drug': 11, 'I-Duration': 12, 'I-Form': 13, 'I-Frequency': 14, 'I-Reason': 15, 'I-Route': 16, 'I-Strength': 17, 'O': 18}


In [11]:
def formatted_df(df):
  """Attach sentence-level text and tag strings to every token row.

  Rows are grouped by (text_file_name, sentence_line_number). Within each
  group the `original_word` values are joined with single spaces into
  `sentence`, and the `label` values are joined with commas into
  `word_labels`. Both strings are broadcast back onto every row of the group.

  The input frame is not modified; a copy carrying the two extra columns is
  returned.

  Args:
    df: Token-level DataFrame with the columns text_file_name,
      sentence_line_number, original_word and label.

  Returns:
    A new DataFrame with all columns of `df` plus `sentence` and `word_labels`.
  """
  sentence_keys = ['text_file_name', 'sentence_line_number']

  # Copy first so the caller's frame keeps its original columns.
  out = df.copy()
  grouped = out.groupby(sentence_keys)

  out['sentence'] = grouped['original_word'].transform(lambda words: ' '.join(words))
  out['word_labels'] = grouped['label'].transform(lambda tags: ','.join(tags))

  return out

In [12]:
df = formatted_df(df)
df_test = formatted_df(test)

In [13]:
def sentence_level_data_fn(df):
  """Reduce a token-level frame to one row per distinct sentence record.

  Keeps only the file name, line number, sentence text and tag string,
  removes duplicate rows, and renumbers the index from zero.
  """
  wanted_columns = ["text_file_name", "sentence_line_number", "sentence", "word_labels"]
  projected = df[wanted_columns]
  unique_rows = projected.drop_duplicates()
  return unique_rows.reset_index(drop=True)

sentence_level_train = sentence_level_data_fn(df)
sentence_level_test = sentence_level_data_fn(df_test)

In [14]:
# Select the four sentence-level columns, drop duplicate rows and reset the index.
# NOTE(review): `sentence_level_train` / `sentence_level_test` come from
# `sentence_level_data_fn`, which applies the same selection and de-duplication,
# so this step looks redundant apart from introducing the name
# `sentence_level_data` — confirm before simplifying.
sentence_level_data = sentence_level_train[["text_file_name", "sentence_line_number", "sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
sentence_level_test = sentence_level_test[["text_file_name","sentence_line_number",  "sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)

# Preview the first sentence-level rows.
sentence_level_data.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence,word_labels
0,data/training_20180910/110727.txt,1,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/110727.txt,3,Date of Birth : [ * * 2163 - 9 - 18 * * ] Sex : M,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,data/training_20180910/110727.txt,5,Service : MEDICINE,"O,O,O"
3,data/training_20180910/110727.txt,7,Allergies :,"O,O"
4,data/training_20180910/110727.txt,8,Keflex / Orencia / Remicade,"B-Drug,O,B-Drug,O,B-Drug"


#### Some sentences are really short. *Try* concatenating consecutive sentences to get longer inputs

In [15]:
# Order sentences by file and then by line number so that `cs` below joins
# neighbouring sentences in document order. Note that `df` / `df_test` are
# rebound here to the sentence-level frames.
df = sentence_level_data.sort_values(by=['text_file_name', 'sentence_line_number'])
df_test = sentence_level_test.sort_values(by=['text_file_name', 'sentence_line_number'])

def cs(dataframe, chunk_size=5):
    """Concatenate consecutive sentences of each file into longer rows.

    Rows are grouped by `text_file_name`. Within each file, every run of up to
    `chunk_size` consecutive rows (in the order they appear in `dataframe`) is
    merged: the `sentence` strings are joined with single spaces and the
    `word_labels` strings are joined with commas. The last chunk of a file may
    hold fewer than `chunk_size` sentences.

    Args:
        dataframe: Sentence-level DataFrame with the columns text_file_name,
            sentence and word_labels.
        chunk_size: Maximum number of sentences merged into one output row.
            Defaults to 5.

    Returns:
        pandas.DataFrame with the columns text_file_name, sentences and labels,
        one row per chunk. An empty input yields an empty frame that still has
        these three columns.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    output_columns = ["text_file_name", "sentences", "labels"]
    processed_data_list = []

    for name, group in dataframe.groupby("text_file_name"):
        sentences = group['sentence'].tolist()
        # Labels are already comma-separated strings, so they are joined as is.
        labels = group['word_labels'].tolist()

        for start in range(0, len(sentences), chunk_size):
            # Slicing clamps at the end of the list, so the final chunk may be short.
            stop = start + chunk_size
            processed_data_list.append({
                "text_file_name": name,
                "sentences": " ".join(sentences[start:stop]),
                "labels": ",".join(labels[start:stop])
            })

    # Passing the column names keeps the schema stable even when no rows exist.
    return pd.DataFrame(processed_data_list, columns=output_columns)


# Build the concatenated (multi-sentence) rows for the train and test frames.
concatenated_df = cs(df)
concatenated_test= cs(df_test)

# Display the concatenated training frame.
concatenated_df



Unnamed: 0,text_file_name,sentences,labels
0,data/training_20180910/100035.txt,Admission Date : [ * * 2115 - 2 - 22 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/100035.txt,Attending : [ * * First Name3 ( LF ) 4891 * * ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,data/training_20180910/100035.txt,Removal of chest tubes placed at an outside ho...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,data/training_20180910/100035.txt,to an OSH with dyspnea now admitted to the MIC...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,data/training_20180910/100035.txt,"and was intubated . He received epinephrine , ...","O,O,O,O,O,O,B-Drug,O,B-Drug,O,B-Drug,O,O,B-Dru..."
...,...,...,...
16783,data/training_20180910/198406.txt,# Contact : [ * * Name ( NI ) * * ] [ * * Tele...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
16784,data/training_20180910/198406.txt,"4 . Ergocalciferol 50 , 000 unit weekly 5 . Ri...","O,O,B-Drug,B-Dosage,I-Dosage,I-Dosage,I-Dosage..."
16785,data/training_20180910/198406.txt,9 . Acetaminophen 1000 mg PO Q6H 10 . Mirtazap...,"O,O,B-Drug,B-Strength,I-Strength,B-Route,B-Fre..."
16786,data/training_20180910/198406.txt,Expired Discharge Diagnosis : Chief cause of d...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [16]:
# Spot-check the text of one concatenated row.
# NOTE(review): this inspects the training frame (`concatenated_df`), not
# `concatenated_test` — confirm which frame was meant to be checked.
concatenated_df.iloc[301].sentences

"could be due to patient's liver dysfunction/third spacing from CHF . If cholecystitis is of clinical concern , HIDA scan can be performed provided the total bilirubin is not elevated . 3 ) Hyperdense renal cortex in left lower quadrant transplanted kidney . Findings are most likely due to chronic rejection or"

In [17]:
def contains_ade(labels):
    """Return True if any comma-separated tag in `labels` contains the substring 'ADE'."""
    for tag in labels.split(','):
        if 'ADE' in tag:
            return True
    return False

ade_mask = concatenated_df['labels'].apply(contains_ade)

# Use the mask to filter the DataFrame
ade_sentences_df = concatenated_df[ade_mask]

# Display the filtered DataFrame
ade_sentences_df

Unnamed: 0,text_file_name,sentences,labels
60,data/training_20180910/100035.txt,being treated for infection . Since no new inf...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
84,data/training_20180910/100039.txt,Right heart catheterization IR guided paracent...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
92,data/training_20180910/100039.txt,OTHER MEDICAL HISTORY : - Embolic stroke in [ ...,"O,O,O,O,O,B-Reason,I-Reason,O,O,O,O,O,O,O,O,O,..."
145,data/training_20180910/100039.txt,anthracycline - induced cardiomyopathy ( EF 15...,"B-Drug,O,O,B-ADE,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."
149,data/training_20180910/100039.txt,She continued to have mild - moderate abdomina...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
...,...,...,...
16647,data/training_20180910/196798.txt,. 3 ) Anuric renal failure : ATN likely from T...,"O,O,O,O,O,O,O,B-ADE,O,O,B-Drug,O,B-Drug,O,O,O,..."
16667,data/training_20180910/196798.txt,. 11 ) Rash : Patient noted to have morbillifo...,"O,O,O,B-ADE,O,O,O,O,O,B-ADE,I-ADE,O,O,O,O,O,O,..."
16699,data/training_20180910/197869.txt,flagyl and zosyn ) his WBC continues to rise w...,"B-Drug,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
16765,data/training_20180910/198406.txt,evidence of acute focal pneumonia . Brief Hosp...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [18]:
# Work on copies so columns added later do not alter the frames displayed above.
# No renaming is needed: the frames produced by `cs` already carry the column
# names `sentences` and `labels`, which are the attributes the `dataset` class
# reads. (A rename keyed on `concatenated_sentence` / `concatenated_labels`
# would match no column and silently do nothing.)
data = concatenated_df.copy()
data_test = concatenated_test.copy()

### It looks reasonable, let's proceed with training

In [19]:
from transformers import BertTokenizerFast, BertForTokenClassification

# Load the pretrained cased BERT tokenizer and a token-classification model
# whose output layer has one unit per tag in `labels_to_ids`.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(labels_to_ids))
# Move the model to the device selected earlier.
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [20]:
# Training configuration.
MAX_LEN = 512  # pad/truncate every example to this many tokens
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
TEST_BATCH_SIZE = 32
EPOCHS = 3 # NOTE(review): an earlier note here said "train for 5" but the value is 3 — confirm the intended epoch count
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10  # max_norm passed to gradient clipping in the training loop


In [21]:
class dataset(Dataset):
    """Torch Dataset that tokenizes concatenated sentences and aligns word-level tags.

    Each row of `dataframe` must provide a `sentences` column (whitespace
    separated words) and a `labels` column (comma-separated tags, one per word).
    Tags are converted to ids through the module-level `labels_to_ids` mapping.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the source frame, the (fast) tokenizer and the fixed sequence length."""
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the tensors for the example at position `index`.

        The result holds every field produced by the tokenizer (including
        `offset_mapping`) plus `labels`, where only the first word piece of
        each word carries that word's tag id and every other position is -100.
        """
        # step 1: get the sentence and word labels
        # Positional lookup, so the frame does not need a 0..n-1 index.
        sentence = self.data["sentences"].iloc[index].strip().split()
        word_labels = self.data["labels"].iloc[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        encoding = self.tokenizer(sentence,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  is_split_into_words=True,
                                  truncation=True,
                                  max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # Start with -100 everywhere (special tokens, padding, continuation pieces).
        encoded_labels = np.full(len(encoding["input_ids"]), -100, dtype=int)

        # word_ids() gives, per token, the index of the source word (None for
        # special/padding tokens). Indexing labels by that word index keeps the
        # alignment correct even if some word produces no token at all, which a
        # running counter over first word pieces would not.
        previous_word = None
        for idx, word_idx in enumerate(encoding.word_ids()):
            if word_idx is not None and word_idx != previous_word:
                encoded_labels[idx] = labels[word_idx]
            previous_word = word_idx

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        """Number of examples, fixed at construction time."""
        return self.len

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. With shuffle=False the split keeps
# row order, so the validation set is the trailing part of `data`.
# NOTE(review): rows are produced file by file, so a single file's chunks may
# presumably fall on both sides of the split boundary — confirm this is acceptable.
train_dataset, val_dataset = train_test_split(data, test_size=0.2, shuffle=False)


In [23]:
train_dataset.reset_index(drop=True, inplace=True)
val_dataset.reset_index(drop=True, inplace=True)

In [24]:
train_dataset['sent_id']  = train_dataset.index
val_dataset['sent_id']= val_dataset.index
data_test['sent_id'] = data_test.index


In [25]:
print("FULL Dataet: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(val_dataset.shape))
print("TEST Dataset: {}".format(data_test.shape))

data_test.reset_index(drop=True, inplace=True)

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
val_set = dataset(val_dataset, tokenizer, MAX_LEN)
test_set = dataset(data_test, tokenizer, MAX_LEN)

FULL Dataet: (16788, 3)
TRAIN Dataset: (13430, 4)
VAL Dataset: (3358, 4)
TEST Dataset: (11146, 4)


In [26]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[3]["input_ids"]), training_set[3]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
to          18
an          18
OS          18
##H         -100
with        18
d           18
##ys        -100
##p         -100
##nea       -100
now         18
admitted    18
to          18
the         18
MI          18
##CU        -100
after       18
P           18
##EA        -100
arrest      18
x           18
##2         -100
.           18
The         18
patient     18
initially   18
presented   18
to          18
L           18
##G         -100
##H         -100
E           18
##D         -100
with        18
h           18
##y         -100
##pox       -100
##em        -100
##ic        -100
respiratory  18
distress    18
.           18
While       18
at          18
the         18
OS          18
##H         -100
,           18
he          18
received    18
CT          2
##X         -100
,           18
a           2
##zi        -100
##th        -100
##rom       -100
##y         -100
##cin       -100
,           18
SC          7
e           2
##pine      -100
##ph        

In [27]:
# Only the training loader shuffles. Validation and test loaders keep the
# dataset order so evaluation runs are deterministic and predictions line up
# with the rows of the source frames.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }


training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)
test_loader = DataLoader(test_set, **test_params)


In [28]:
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [29]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [30]:
def train(epoch):
    """Run one training epoch over `training_loader`, updating `model` in place.

    Uses the module-level `model`, `training_loader`, `optimizer`, `device` and
    `MAX_GRAD_NORM`. Prints the running mean loss every 100 steps and, at the
    end, the mean loss and the mean per-batch accuracy over positions whose
    label is not -100.

    Note: defining this function rebinds the module-level name `train`, which
    earlier in the notebook refers to the training DataFrame.

    Args:
        epoch: Epoch index supplied by the caller; not used inside the function.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy (metrics only, so no graph is needed)
        with torch.no_grad():
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)

            # only score positions that carry a real label
            active_positions = flattened_targets != -100
            active_targets = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping must come after backward() (so it sees this step's
        # gradients) and before step() (so the clipped values are applied)
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

## Train

In [31]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.6972134113311768
Training loss per 100 training steps: 0.6938525337246385
Training loss per 100 training steps: 0.5113698519877533
Training loss per 100 training steps: 0.4168917549343972
Training loss per 100 training steps: 0.3591719093079913
Training loss per 100 training steps: 0.32088597634706845
Training loss per 100 training steps: 0.2955557961588704
Training loss per 100 training steps: 0.27267015248951576
Training loss per 100 training steps: 0.25535703801663917
Training loss per 100 training steps: 0.2410037577287661
Training loss per 100 training steps: 0.2286284691281989
Training loss per 100 training steps: 0.21756808253911938
Training loss per 100 training steps: 0.20670914868622564
Training loss per 100 training steps: 0.1972364641764136
Training loss per 100 training steps: 0.18945931411127417
Training loss per 100 training steps: 0.18424523335703613
Training loss per 100 training steps: 0.17854508690676069
Train

## Validate

In [37]:
def valid(model, testing_loader):
    """Evaluate `model` on `testing_loader` and return gold and predicted tags.

    Runs without gradient tracking, prints the running mean loss every 100
    batches, then the mean loss and the mean per-batch accuracy. Only
    positions whose label is not -100 are scored and returned. Uses the
    module-level `device` and `ids_to_labels`.

    Args:
        model: Token-classification model; called with `return_dict=False` and
            expected to return `(loss, logits)`.
        testing_loader: Iterable of batches providing `input_ids`,
            `attention_mask` and `labels` tensors.

    Returns:
        Tuple `(labels, predictions)`: two flat lists of tag strings covering
        all scored positions of all batches, in loader order.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)

            # only score positions that carry a real label
            active_positions = flattened_targets != -100

            # Move each batch to the CPU once, instead of reading elements one
            # at a time from device tensors later.
            active_targets = torch.masked_select(flattened_targets, active_positions).cpu()
            active_predictions = torch.masked_select(flattened_predictions, active_positions).cpu()

            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_targets.numpy(), active_predictions.numpy())

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [38]:
labels, predictions = valid(model, val_loader)

Validation loss per 100 evaluation steps: 0.0009855637326836586
Validation loss per 100 evaluation steps: 0.07453529008864171
Validation loss per 100 evaluation steps: 0.06829191104050515
Validation loss per 100 evaluation steps: 0.06791674635928832
Validation loss per 100 evaluation steps: 0.06626828216797451
Validation loss per 100 evaluation steps: 0.06474324461272112
Validation loss per 100 evaluation steps: 0.06920493651371191
Validation loss per 100 evaluation steps: 0.07021010329611994
Validation loss per 100 evaluation steps: 0.07150066968177625
Validation loss per 100 evaluation steps: 0.07305849279653599
Validation loss per 100 evaluation steps: 0.07343592080768532
Validation loss per 100 evaluation steps: 0.07375368195585431
Validation loss per 100 evaluation steps: 0.07592790762123343
Validation loss per 100 evaluation steps: 0.07635740672238306
Validation loss per 100 evaluation steps: 0.07511526842302851
Validation loss per 100 evaluation steps: 0.0745546699625018
Validat

In [39]:
from seqeval.metrics import classification_report

# Entity-level precision/recall/F1 on the validation predictions.
# `labels` and `predictions` are flat lists over the whole validation set and
# are wrapped here as a single sequence.
# NOTE(review): with one long sequence, entity spans can presumably continue
# across example boundaries — consider passing one tag list per example.
print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         ADE       0.42      0.33      0.37       165
      Dosage       0.83      0.90      0.86       782
        Drug       0.89      0.91      0.90      3216
    Duration       0.68      0.82      0.75       120
        Form       0.89      0.89      0.89      1269
   Frequency       0.78      0.83      0.81      1270
      Reason       0.66      0.55      0.60       698
       Route       0.92      0.91      0.91      1143
    Strength       0.92      0.93      0.93      1341

   micro avg       0.85      0.87      0.86     10004
   macro avg       0.78      0.79      0.78     10004
weighted avg       0.85      0.87      0.86     10004



### Test Set

In [40]:
test_labels, test_predictions = valid(model, test_loader)

Validation loss per 100 evaluation steps: 0.08192465454339981
Validation loss per 100 evaluation steps: 0.06567942334505
Validation loss per 100 evaluation steps: 0.06710636445937392
Validation loss per 100 evaluation steps: 0.06792395626445397
Validation Loss: 0.06754587140910989
Validation Accuracy: 0.9815674658492675


In [41]:
from seqeval.metrics import classification_report

print(classification_report([test_labels], [test_predictions]))

              precision    recall  f1-score   support

         ADE       0.49      0.31      0.38       634
      Dosage       0.86      0.93      0.89      2704
        Drug       0.91      0.93      0.92     10593
    Duration       0.66      0.72      0.69       385
        Form       0.91      0.93      0.92      4373
   Frequency       0.82      0.86      0.84      4149
      Reason       0.65      0.51      0.57      2560
       Route       0.95      0.93      0.94      3513
    Strength       0.95      0.95      0.95      4237

   micro avg       0.88      0.88      0.88     33148
   macro avg       0.80      0.78      0.79     33148
weighted avg       0.87      0.88      0.87     33148



### Save

In [None]:
import os

# Target folder on the mounted Drive for the fine-tuned model and vocabulary.
directory = "/content/drive/MyDrive/266_final/microsoft_bbb_concat7"

# exist_ok=True creates the folder when missing and does nothing when it is
# already there, without a separate existence check.
os.makedirs(directory, exist_ok=True)

tokenizer.save_vocabulary(directory)
model.save_pretrained(directory)

### Resources

- https://github.com/lcampillos/Medical-NER/blob/master/bert_ner.ipynb
- https://medium.com/analytics-vidhya/bio-tagged-text-to-original-text-99b05da6664