In [1]:
!pip install seqeval



In [2]:
import pandas as pd
import numpy as np

import os

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score


In [3]:
# Colab only: mount Google Drive so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:

# verify GPU
# `device` is a plain string ('cuda' or 'cpu'); later cells pass it to `.to(device)`.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


## Convert to DataFrame for EDA

In [5]:
def convert_data(filepath):
  """Parse a whitespace-delimited token annotation file into a DataFrame.

  A usable line splits into exactly nine whitespace-separated fields, in this
  order: text_file_name, sentence_line_number, sentence_word_index,
  sentence_seq, start_token, end_token, original_word, word, label.
  Lines with any other number of fields (blank lines included) are skipped.

  Args:
    filepath: Path of the text file to read (decoded as UTF-8).

  Returns:
    pandas.DataFrame with one row per usable line and the nine columns listed
    above. sentence_line_number, sentence_word_index, start_token and
    end_token are converted to int; every other column stays a string.

  Raises:
    ValueError: If a nine-field line holds a non-integer value in one of the
      four integer columns.
  """
  columns = ['text_file_name', 'sentence_line_number', 'sentence_word_index',
             'sentence_seq', 'start_token', 'end_token', 'original_word',
             'word', 'label']
  # Positions (within a split line) of the fields that are stored as integers
  int_positions = (1, 2, 4, 5)

  data = []

  # Iterate over the file object directly instead of readlines(), so the raw
  # text is never held in memory alongside the parsed rows. The encoding is
  # spelled out so the result does not depend on the platform default.
  with open(filepath, "r", encoding="utf-8") as file:
      for line in file:
          parts = line.split()

          # Skip anything that does not have the expected number of elements
          if len(parts) != len(columns):
              continue

          for position in int_positions:
              parts[position] = int(parts[position])

          data.append(tuple(parts))

  return pd.DataFrame(data, columns=columns)


In [6]:
# Token-level annotation files for the two splits (Google Drive paths, Colab only).
train_data_path = "/content/drive/MyDrive/266_final/data/Original_text/dataset1_train.txt"
test_data_path = "/content/drive/MyDrive/266_final/data/Original_text/dataset1_test.txt"

# One DataFrame row per annotated token.
# NOTE(review): the name `train` is rebound to a function by `def train(epoch)` further down,
# so this cell and the training cell cannot both be re-run out of order.
train = convert_data(train_data_path)
test = convert_data(test_data_path)

print(f"Length of train: {len(train)}")
print(f"Length of test: {len(test)}")


Length of train: 895141
Length of test: 585761


In [7]:
# Explore on a copy so the parsed `train` frame stays untouched.
df = train.copy()
# Distinct line-number values over the whole column (not counted per file).
df['sentence_line_number'].nunique()

1053

In [8]:
# Label distribution, rarest label first.
df['label'].value_counts().sort_values()

label
I-Route           397
B-Duration        592
I-ADE             776
B-ADE             956
I-Duration       1034
I-Reason         3125
B-Reason         3791
I-Form           4173
B-Dosage         4221
I-Drug           4298
B-Route          5475
B-Frequency      6279
I-Strength       6617
B-Form           6647
B-Strength       6691
I-Dosage         8779
I-Frequency     13023
B-Drug          16222
O              802045
Name: count, dtype: int64

In [9]:
# Filter on sentence_seq, which convert_data keeps as a string (hence the quoted '530').
df[df['sentence_seq'] == '530']

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label


In [10]:
# Rows whose `word` column is the literal 'ORDINAL' and whose label is 'B-Dosage'.
df[(df['word'] == 'ORDINAL') & (df['label'] == 'B-Dosage')]


Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
2580,data/training_20180910/110727.txt,277,9,T396,12227,12228,3,ORDINAL,B-Dosage
3038,data/training_20180910/110727.txt,331,2,T167,14315,14316,1,ORDINAL,B-Dosage
3070,data/training_20180910/110727.txt,335,2,T87,14463,14464,1,ORDINAL,B-Dosage
3143,data/training_20180910/110727.txt,347,12,T264,14825,14828,One,ORDINAL,B-Dosage
3166,data/training_20180910/110727.txt,349,8,T269,14897,14902,Three,ORDINAL,B-Dosage
...,...,...,...,...,...,...,...,...,...
894405,data/training_20180910/100883.txt,125,0,T38,5767,5770,one,ORDINAL,B-Dosage
894903,data/training_20180910/100883.txt,188,3,T72,8349,8350,1,ORDINAL,B-Dosage
894959,data/training_20180910/100883.txt,194,3,T90,8571,8572,2,ORDINAL,B-Dosage
894968,data/training_20180910/100883.txt,195,3,T94,8607,8608,2,ORDINAL,B-Dosage


### Label vocabulary

In [11]:
# Every label cell is split on whitespace, giving one list of tags per row
labels = [cell.split() for cell in df['label'].values.tolist()]

# Gather each distinct tag that occurs anywhere in the dataset
unique_labels = set()

for lb in labels:
  unique_labels.update(lb)

print(unique_labels)

{'O', 'I-Drug', 'I-Strength', 'I-Duration', 'I-Route', 'B-Reason', 'B-Strength', 'B-Frequency', 'I-Dosage', 'B-Route', 'I-Frequency', 'I-ADE', 'I-Reason', 'B-Dosage', 'B-Form', 'I-Form', 'B-ADE', 'B-Drug', 'B-Duration'}


In [12]:
# Number the tags in sorted order so the numbering is the same on every run,
# then build the lookup in both directions (tag -> id and id -> tag)
labels_to_ids = {tag: position for position, tag in enumerate(sorted(unique_labels))}
ids_to_labels = dict(enumerate(sorted(unique_labels)))
print(labels_to_ids)

{'B-ADE': 0, 'B-Dosage': 1, 'B-Drug': 2, 'B-Duration': 3, 'B-Form': 4, 'B-Frequency': 5, 'B-Reason': 6, 'B-Route': 7, 'B-Strength': 8, 'I-ADE': 9, 'I-Dosage': 10, 'I-Drug': 11, 'I-Duration': 12, 'I-Form': 13, 'I-Frequency': 14, 'I-Reason': 15, 'I-Route': 16, 'I-Strength': 17, 'O': 18}


In [13]:
# All rows labelled 'B-ADE'.
df[df['label'] == 'B-ADE']

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
637,data/training_20180910/110727.txt,68,3,T208,3011,3025,polyneuropathy,polyneuropathy,B-ADE
645,data/training_20180910/110727.txt,69,0,T377,3062,3070,infusion,infusion,B-ADE
700,data/training_20180910/110727.txt,74,1,T209,3237,3240,HTN,HTN,B-ADE
706,data/training_20180910/110727.txt,75,1,T210,3262,3265,DM2,DM0,B-ADE
732,data/training_20180910/110727.txt,78,1,T378,3365,3370,NAFLD,NAFLD,B-ADE
...,...,...,...,...,...,...,...,...,...
892895,data/training_20180910/118564.txt,156,6,T40,8092,8106,hallucinations,hallucinations,B-ADE
892897,data/training_20180910/118564.txt,157,1,T41,8111,8120,tachypnea,tachypnea,B-ADE
893013,data/training_20180910/118564.txt,168,10,T44,8731,8735,rash,rash,B-ADE
894424,data/training_20180910/100883.txt,128,0,T41,5871,5882,Hypotension,Hypotension,B-ADE


In [14]:
def formatted_df(df):
  """Attach sentence-level text and label strings to every token row.

  Rows are grouped by (text_file_name, sentence_line_number). Each row gets
  two new columns holding values for its whole group:
    sentence    - the group's original_word values joined with single spaces
    word_labels - the group's label values joined with commas

  The columns are written into `df` itself, and that same object is returned.
  """
  sentence_keys = ['text_file_name', 'sentence_line_number']

  def _joined_per_sentence(column, separator):
    # transform() broadcasts the joined string back onto every row of the group
    per_sentence = df.groupby(sentence_keys)[column]
    return per_sentence.transform(lambda values: separator.join(values))

  df['sentence'] = _joined_per_sentence('original_word', ' ')
  df['word_labels'] = _joined_per_sentence('label', ',')

  return df

In [15]:
# Add the `sentence` / `word_labels` columns to both splits
# (formatted_df writes them into the frame it is given and returns that frame).
df = formatted_df(df)
test = formatted_df(test)

In [16]:
def sentence_level_data_fn(df):
  """Collapse a token-level frame to its distinct sentence rows.

  Keeps only text_file_name, sentence_line_number, sentence and word_labels,
  drops repeated rows, and renumbers the result from 0. `df` is not modified.
  """
  kept_columns = ["text_file_name", "sentence_line_number", "sentence", "word_labels"]
  distinct_rows = df.loc[:, kept_columns].drop_duplicates()
  return distinct_rows.reset_index(drop=True)

# One row per distinct (file, line number, sentence, labels) combination in each split.
sentence_level_train = sentence_level_data_fn(df)
sentence_level_test = sentence_level_data_fn(test)

In [17]:
# Same select / de-duplicate / re-index step as above, routed through the shared helper
# instead of repeating its expression inline.
sentence_level_data = sentence_level_data_fn(sentence_level_train)
sentence_level_test = sentence_level_data_fn(sentence_level_test)

sentence_level_data.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence,word_labels
0,data/training_20180910/110727.txt,1,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/110727.txt,3,Date of Birth : [ * * 2163 - 9 - 18 * * ] Sex : M,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,data/training_20180910/110727.txt,5,Service : MEDICINE,"O,O,O"
3,data/training_20180910/110727.txt,7,Allergies :,"O,O"
4,data/training_20180910/110727.txt,8,Keflex / Orencia / Remicade,"B-Drug,O,B-Drug,O,B-Drug"


In [18]:
# Number of distinct sentence rows in the training data.
len(sentence_level_data)

83321

In [19]:
# Spot-check one row: the sentence text ...
sentence_level_data.iloc[7020].sentence

'your chemistries .'

In [20]:

# ... and the comma-joined labels of that same row.
sentence_level_data.iloc[7020].word_labels

'O,O,O'

In [21]:
# Print a sentence next to its labels to eyeball that there is one label per word.
print(sentence_level_data.iloc[21].sentence)
print(sentence_level_data.iloc[21].word_labels)

fx on x - ray . This was thought to be a psoriatic arthritis flare ,
O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


#### Comment this out if you want the concatenated dataset.

In [22]:
# Working copies of the sentence-level frames; the split and dataset cells below read `data` / `data_test`.
data = sentence_level_data.copy()
data_test = sentence_level_test.copy()

In [23]:
# First rows of the training frame that goes into the train/validation split.
data.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence,word_labels
0,data/training_20180910/110727.txt,1,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/110727.txt,3,Date of Birth : [ * * 2163 - 9 - 18 * * ] Sex : M,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,data/training_20180910/110727.txt,5,Service : MEDICINE,"O,O,O"
3,data/training_20180910/110727.txt,7,Allergies :,"O,O"
4,data/training_20180910/110727.txt,8,Keflex / Orencia / Remicade,"B-Drug,O,B-Drug,O,B-Drug"


### It looks reasonable, let's proceed with training

In [24]:
from transformers import BertTokenizerFast, BertForTokenClassification

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
# Store the tag names in the model config as well as the class count, so a
# checkpoint written with save_pretrained() can translate class ids back to
# tags without this notebook's `ids_to_labels` dict.
model = BertForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(labels_to_ids),
    id2label=ids_to_labels,
    label2id=labels_to_ids,
)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [25]:
# Every example is padded/truncated to this many word pieces: the dataset class passes it to the
# tokenizer as max_length together with padding='max_length'.
MAX_LEN = 512
# Batch sizes for the three DataLoaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
TEST_BATCH_SIZE = 32
# Number of passes over the training loader.
EPOCHS = 1
# Learning rate given to the AdamW optimizer.
LEARNING_RATE = 1e-05
# max_norm handed to clip_grad_norm_ inside train().
MAX_GRAD_NORM = 10

In [26]:
class dataset(Dataset):
  """Sentence-level token-classification dataset.

  Each row of `dataframe` must provide:
    sentence    - the words of one sentence separated by whitespace
    word_labels - one tag per word, separated by commas

  Indexing returns a dict of tensors: every field produced by the tokenizer
  (including `offset_mapping`) plus `labels`. `labels` holds the tag id on the
  first word piece of each word and -100 on every other position (special
  tokens, padding, continuation pieces).
  """

  def __init__(self, dataframe, tokenizer, max_len, label_map=None):
        """
        Args:
          dataframe: Frame with `sentence` and `word_labels` columns.
          tokenizer: Fast tokenizer; its output must support `word_ids()`.
          max_len: Length every example is padded/truncated to.
          label_map: Optional tag -> id mapping. When omitted, the notebook's
            module-level `labels_to_ids` is used (the original behaviour).
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = labels_to_ids if label_map is None else label_map

  def __getitem__(self, index):
        # step 1: get the sentence and word labels.
        # Positional lookup, so the frame does not need a 0..n-1 index.
        sentence = self.data['sentence'].iloc[index].strip().split()
        word_labels = self.data['word_labels'].iloc[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length).
        # return_offsets_mapping is kept so the returned item has the same keys as before.
        encoding = self.tokenizer(sentence,
                             return_offsets_mapping=True,
                             padding='max_length',
                             is_split_into_words=True,
                             truncation=True,
                             max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [self.label_map[label] for label in word_labels]

        # word_ids() gives, per token, the index of the source word (None for
        # special tokens and padding). Indexing `labels` with that word index
        # keeps labels attached to the right word even when some word produces
        # no word pieces at all; counting tokens whose offset starts at 0
        # would shift every later label by one in that case.
        word_ids = encoding.word_ids()
        encoded_labels = np.full(len(word_ids), -100, dtype=int)

        previous_word = None
        for idx, word_id in enumerate(word_ids):
          if word_id is not None and word_id != previous_word:
            encoded_labels[idx] = labels[word_id]
          previous_word = word_id

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

  def __len__(self):
        return self.len

In [27]:
# Random 80/20 split of the sentence rows (fixed seed, so the split is repeatable).
# The split is per sentence row, so sentences from one file can land on both sides.
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
# Validation = every row that was not sampled; both frames are renumbered from 0.
val_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)



In [28]:
# Row/column counts of each frame before wrapping them in datasets.
print("FULL Dataet: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(val_dataset.shape))
print("TEST Dataset: {}".format(data_test.shape))

# Give the test frame a fresh 0..n-1 index (the train/val frames were already reset when they were split).
data_test.reset_index(drop=True, inplace=True)

# All three splits share the same tokenizer and padded length.
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
val_set = dataset(val_dataset, tokenizer, MAX_LEN)
test_set = dataset(data_test, tokenizer, MAX_LEN)

FULL Dataet: (83321, 4)
TRAIN Dataset: (66657, 4)
VAL Dataset: (16664, 4)
TEST Dataset: (55322, 4)


In [29]:
# for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[3]["input_ids"]), training_set[3]["labels"]):
  # print('{0:10}  {1}'.format(token, label))

In [30]:
# Only the training loader shuffles. The evaluation loaders keep the row order
# of val_dataset / data_test, so the flat prediction lists returned by valid()
# come back in a reproducible order that lines up with the source frames.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }


training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)
test_loader = DataLoader(test_set, **test_params)


In [31]:
# Sanity check before training: push a single example through the untrained
# classification head and look at the loss it produces.
inputs = training_set[2]

# Add a batch dimension of 1 to each tensor and move it to the model's device
input_ids, attention_mask, labels = (
    inputs[field].unsqueeze(0).to(device)
    for field in ("input_ids", "attention_mask", "labels")
)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.6933, device='cuda:0', grad_fn=<NllLossBackward0>)

In [32]:
# Second element of the output is the logits tensor; check its shape.
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 512, 19])

In [33]:
# AdamW over every model parameter; train() reads this module-level `optimizer`.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [34]:
def train(epoch):
    """Run one pass over `training_loader`, updating `model` in place.

    Relies on the notebook's module-level `model`, `training_loader`,
    `optimizer`, `device` and `MAX_GRAD_NORM`. Prints the running mean loss
    every 100 steps, then the epoch's mean loss and mean per-batch token
    accuracy. Accuracy only counts positions whose label is not -100.

    Args:
        epoch: Index of the current epoch; not used inside the function.

    Returns:
        None.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy (bookkeeping only, so no graph is needed)
        with torch.no_grad():
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)

            # only compute accuracy at active labels
            active_positions = flattened_targets != -100 # shape (batch_size * seq_len,)
            active_targets = torch.masked_select(flattened_targets, active_positions)
            active_predictions = torch.masked_select(flattened_predictions, active_positions)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must sit between backward() and step() so that it
        # acts on the gradients of this batch; clipping before zero_grad()
        # only touches the previous step's gradients, which are then discarded
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        optimizer.step()

    if nb_tr_steps == 0:
        # Nothing was iterated; avoid dividing by zero below
        print("Training loader produced no batches.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [35]:
# One call to train() per epoch; progress is reported with 1-based epoch numbers.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.9227821826934814
Training loss per 100 training steps: 0.8242798686248831
Training loss per 100 training steps: 0.5787880921421286
Training loss per 100 training steps: 0.46531918944258777
Training loss per 100 training steps: 0.40621812406815866
Training loss per 100 training steps: 0.3584116140423257
Training loss per 100 training steps: 0.3294153750917492
Training loss per 100 training steps: 0.30971031928881365
Training loss per 100 training steps: 0.2894345771884092
Training loss per 100 training steps: 0.2736407599169513
Training loss per 100 training steps: 0.2613859843087292
Training loss per 100 training steps: 0.24980164319890077
Training loss per 100 training steps: 0.2407314328561699
Training loss per 100 training steps: 0.2321621216334285
Training loss per 100 training steps: 0.22441428028220586
Training loss per 100 training steps: 0.21826691423303296
Training loss per 100 training steps: 0.2123059820272389
Trainin

In [36]:
def valid(model, testing_loader):
    """Evaluate `model` on every batch of `testing_loader` without updating it.

    Relies on the notebook's module-level `device` and `ids_to_labels`. Prints
    the running mean loss every 100 steps, then the mean loss and the mean
    per-batch token accuracy. Only positions whose label is not -100 count.

    Args:
        model: Token-classification model returning (loss, logits) when called
            with labels and return_dict=False.
        testing_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.

    Returns:
        (labels, predictions): two flat lists of tag strings, one entry per
        active token, in the order the loader produced them. Sentence
        boundaries are not kept.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)

            # only compute accuracy at active labels
            active_positions = flattened_targets != -100 # shape (batch_size * seq_len,)
            active_targets = torch.masked_select(flattened_targets, active_positions).cpu()
            active_predictions = torch.masked_select(flattened_predictions, active_positions).cpu()

            # Store plain Python ints, converted once per batch, rather than
            # one 0-d device tensor per token that each needs .item() later
            eval_labels.extend(active_targets.tolist())
            eval_preds.extend(active_predictions.tolist())

            eval_accuracy += accuracy_score(active_targets.numpy(), active_predictions.numpy())

    labels = [ids_to_labels[label_id] for label_id in eval_labels]
    predictions = [ids_to_labels[label_id] for label_id in eval_preds]

    if nb_eval_steps == 0:
        # Nothing was iterated; avoid dividing by zero below
        print("Evaluation loader produced no batches.")
        return labels, predictions

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [37]:
# Flat lists of gold and predicted tag strings for every labelled token in the validation split.
labels, predictions = valid(model, val_loader)

Validation loss per 100 evaluation steps: 0.00031351041980087757
Validation loss per 100 evaluation steps: 0.062299639241174
Validation loss per 100 evaluation steps: 0.08158744247469515
Validation loss per 100 evaluation steps: 0.07696731688275484
Validation loss per 100 evaluation steps: 0.07197901777637294
Validation loss per 100 evaluation steps: 0.07085150047947798
Validation loss per 100 evaluation steps: 0.07095418945818786
Validation loss per 100 evaluation steps: 0.06855515643563846
Validation loss per 100 evaluation steps: 0.07002460579117516
Validation loss per 100 evaluation steps: 0.0715624903474785
Validation loss per 100 evaluation steps: 0.06918801385639214
Validation loss per 100 evaluation steps: 0.06938034630480015
Validation loss per 100 evaluation steps: 0.06934783075049744
Validation loss per 100 evaluation steps: 0.06899281545193606
Validation loss per 100 evaluation steps: 0.06847969933337893
Validation loss per 100 evaluation steps: 0.06683508912623938
Validati

In [38]:
from seqeval.metrics import classification_report

# NOTE(review): valid() returns one flat list for the whole split, so wrapping it in [...] hands
# seqeval the entire split as a single sequence; sentence boundaries are not preserved. Confirm
# that entity-level scores are meant to be computed this way.
print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         ADE       0.37      0.15      0.21       205
      Dosage       0.87      0.91      0.89       860
        Drug       0.92      0.91      0.92      3201
    Duration       0.68      0.58      0.63       130
        Form       0.93      0.91      0.92      1325
   Frequency       0.87      0.89      0.88      1439
      Reason       0.64      0.40      0.50       831
       Route       0.93      0.92      0.93      1102
    Strength       0.93      0.94      0.94      1356

   micro avg       0.89      0.85      0.87     10449
   macro avg       0.79      0.74      0.76     10449
weighted avg       0.88      0.85      0.86     10449



In [39]:
# sentence_predictions = []

# # Example of splitting based on your data structure
# # Assuming each 'word_labels' entry in the DataFrame corresponds to a sentence
# start_idx = 0
# for index, row in val_dataset.iterrows():
#     # Number of labels in the sentence
#     num_labels = len(row['word_labels'].split(','))
#     # Segment predictions corresponding to the current sentence
#     sentence_pred = predictions[start_idx:start_idx + num_labels]
#     # Append to our list
#     sentence_predictions.append(','.join(sentence_pred))
#     # Update start index for next iteration
#     start_idx += num_labels

# # Assigning segmented predictions back to DataFrame
# val_dataset['predicted_labels'] = sentence_predictions

# val_dataset.head()

In [40]:
# sentence_predictions = []

# for index, row in val_dataset.iterrows():
#     num_labels = len(row['word_labels'].split(','))
#     sentence_pred = predictions[start_idx:start_idx + num_labels]
#     # Create a tuple of (sentence_id, combined predictions) and append to the list
#     sentence_predictions.append((row['sentence_id'], ','.join(sentence_pred)))
#     start_idx += num_labels

# # Now, sentence_predictions holds tuples of sentence IDs and their predictions
# for item in sentence_predictions:
#     print(f"Sentence ID: {item[0]}, Predictions: {item[1]}")

In [41]:
# Same evaluation routine on the held-out test loader (its progress lines still say "Validation").
test_labels, test_predictions = valid(model, test_loader)

Validation loss per 100 evaluation steps: 0.0799737274646759
Validation loss per 100 evaluation steps: 0.07374508636496445
Validation loss per 100 evaluation steps: 0.07259091833115217
Validation loss per 100 evaluation steps: 0.07190524559140057
Validation loss per 100 evaluation steps: 0.07240229510258596
Validation loss per 100 evaluation steps: 0.0716730250229691
Validation loss per 100 evaluation steps: 0.07132550387623132
Validation loss per 100 evaluation steps: 0.07046859276237846
Validation loss per 100 evaluation steps: 0.06961904898385858
Validation loss per 100 evaluation steps: 0.07030455300844916
Validation loss per 100 evaluation steps: 0.07103393725273045
Validation loss per 100 evaluation steps: 0.0715733087713831
Validation loss per 100 evaluation steps: 0.07131951609990876
Validation loss per 100 evaluation steps: 0.07155711470181832
Validation loss per 100 evaluation steps: 0.07179240813727446
Validation loss per 100 evaluation steps: 0.07165845488070906
Validation 

In [42]:
from seqeval.metrics import classification_report

# NOTE(review): as with the validation report, the whole test split is passed to seqeval as one
# sequence because valid() returns flat lists. Confirm this is intended.
print(classification_report([test_labels], [test_predictions]))

              precision    recall  f1-score   support

         ADE       0.36      0.15      0.21       671
      Dosage       0.86      0.90      0.88      2777
        Drug       0.91      0.91      0.91     10629
    Duration       0.70      0.59      0.64       407
        Form       0.92      0.90      0.91      4403
   Frequency       0.83      0.86      0.84      4728
      Reason       0.62      0.38      0.47      2709
       Route       0.94      0.92      0.93      3532
    Strength       0.94      0.94      0.94      4252

   micro avg       0.88      0.84      0.86     34108
   macro avg       0.79      0.73      0.75     34108
weighted avg       0.87      0.84      0.85     34108



### Save

In [43]:
import os

directory = "/content/drive/MyDrive/266_final/bertbasecased"

# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without a separate exists() check that could race with another writer.
os.makedirs(directory, exist_ok=True)

# save_pretrained() writes the tokenizer's configuration next to its vocabulary,
# so from_pretrained(directory) can rebuild the same tokenizer later;
# save_vocabulary() alone stores only the vocabulary file.
tokenizer.save_pretrained(directory)
model.save_pretrained(directory, safe_serialization=True)

Resources

### https://github.com/lcampillos/Medical-NER/blob/master/bert_ner.ipynb
### https://medium.com/analytics-vidhya/bio-tagged-text-to-original-text-99b05da6664