In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [15]:

# verify GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


## Convert to DataFrame for EDA

In [32]:
def convert_data(filepath):

  # Read the data from the text file
  with open(filepath, "r") as file:
      lines = file.readlines()

  # Define an empty list to store the data
  data = []

  # Iterate over each line in the file
  for line in lines:
      # Split the line by spaces
      parts = line.strip().split()

      # Check if the line has the expected number of elements
      if len(parts) == 9:
          # Extract the values from the line
          text_file_name = parts[0]
          sentence_line_number = int(parts[1])
          sentence_word_index = int(parts[2])
          sentence_seq = parts[3]
          start_token = int(parts[4])
          end_token = int(parts[5])
          original_word = parts[6]
          word = parts[7]
          label = parts[8]

          # Append the values as a tuple to the data list
          data.append((text_file_name, sentence_line_number, sentence_word_index, sentence_seq,
                      start_token, end_token, original_word, word, label))

  # Create a DataFrame from the data list with appropriate column names
  df = pd.DataFrame(data, columns=['text_file_name', 'sentence_line_number', 'sentence_word_index',
                                  'sentence_seq', 'start_token', 'end_token', 'original_word',
                                  'word', 'label'])

  return df


In [33]:
train_data_path = "/content/drive/MyDrive/266_final/data/dataset1_train.txt"
test_data_path = "/content/drive/MyDrive/266_final/data/dataset1_test.txt"

train = convert_data(train_data_path)
test = convert_data(test_data_path)

print(f"Length of train: {len(train)}")
print(f"Length of test: {len(test)}")


Length of train: 895141
Length of test: 585761


In [34]:
df = train.copy()
df['sentence_line_number'].nunique()

1053

In [36]:
df['label'].value_counts().sort_values()

label
I-Route           397
B-Duration        592
I-ADE             776
B-ADE             956
I-Duration       1034
I-Reason         3125
B-Reason         3791
I-Form           4173
B-Dosage         4221
I-Drug           4298
B-Route          5475
B-Frequency      6279
I-Strength       6617
B-Form           6647
B-Strength       6691
I-Dosage         8779
I-Frequency     13023
B-Drug          16222
O              802045
Name: count, dtype: int64

In [37]:
df[df['sentence_seq'] == '530']

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label


In [38]:
df[(df['word'] == 'ORDINAL') & (df['label'] == 'B-Dosage')]


Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
2580,data/training_20180910/110727.txt,277,9,T396,12227,12228,3,ORDINAL,B-Dosage
3038,data/training_20180910/110727.txt,331,2,T167,14315,14316,1,ORDINAL,B-Dosage
3070,data/training_20180910/110727.txt,335,2,T87,14463,14464,1,ORDINAL,B-Dosage
3143,data/training_20180910/110727.txt,347,12,T264,14825,14828,One,ORDINAL,B-Dosage
3166,data/training_20180910/110727.txt,349,8,T269,14897,14902,Three,ORDINAL,B-Dosage
...,...,...,...,...,...,...,...,...,...
894405,data/training_20180910/100883.txt,125,0,T38,5767,5770,one,ORDINAL,B-Dosage
894903,data/training_20180910/100883.txt,188,3,T72,8349,8350,1,ORDINAL,B-Dosage
894959,data/training_20180910/100883.txt,194,3,T90,8571,8572,2,ORDINAL,B-Dosage
894968,data/training_20180910/100883.txt,195,3,T94,8607,8608,2,ORDINAL,B-Dosage


### Label labels

In [39]:
# Split labels based on whitespace and turn them into a list
labels = [i.split() for i in df['label'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]

print(unique_labels)

{'I-Form', 'I-Reason', 'B-Strength', 'I-Frequency', 'I-Strength', 'B-Form', 'I-Dosage', 'B-Reason', 'B-Route', 'I-Route', 'B-Dosage', 'B-Frequency', 'I-Duration', 'B-ADE', 'I-Drug', 'B-Duration', 'O', 'B-Drug', 'I-ADE'}


In [40]:
# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(labels_to_ids)

{'B-ADE': 0, 'B-Dosage': 1, 'B-Drug': 2, 'B-Duration': 3, 'B-Form': 4, 'B-Frequency': 5, 'B-Reason': 6, 'B-Route': 7, 'B-Strength': 8, 'I-ADE': 9, 'I-Dosage': 10, 'I-Drug': 11, 'I-Duration': 12, 'I-Form': 13, 'I-Frequency': 14, 'I-Reason': 15, 'I-Route': 16, 'I-Strength': 17, 'O': 18}


## Experimenting with formats for BERT

In [41]:
train_texts = []
train_labels = []

# Read the text file line by line
with open('/content/drive/MyDrive/266_final/data/dataset1_train.txt', 'r', encoding='utf-8') as file:
    current_text = []  # To store tokens of the current text
    current_labels = []  # To store labels of the current text
    for line in file:
        if line.strip() == '':  # Empty line signifies end of text
            train_texts.append(current_text)
            train_labels.append(current_labels)
            current_text = []
            current_labels = []
        else:
            parts = line.strip().split()
            token = parts[-3]  # Token is second-to-last part
            label = parts[-1]   # Label is last part
            current_text.append(token)
            current_labels.append(label)

# Check the first few samples
print(train_texts[:5])
print(train_labels[:5])


[['Admission', 'Date', ':', '[', '*', '*', '2202', '-', '1', '-', '8', '*', '*', ']', 'Discharge', 'Date', ':', '[', '*', '*', '2202', '-', '2', '-', '1', '*', '*', ']'], ['Date', 'of', 'Birth', ':', '[', '*', '*', '2163', '-', '9', '-', '18', '*', '*', ']', 'Sex', ':', 'M'], ['Service', ':', 'MEDICINE'], ['Allergies', ':', 'Keflex', '/', 'Orencia', '/', 'Remicade'], ['Attending', ':', '[', '*', '*', 'First', 'Name3', '(', 'LF', ')', '2751', '*', '*', ']', 'Chief', 'Complaint', ':', 'L', 'leg', 'pain', 'and', 'erythema']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O'], ['O', 'O', 'B-Drug', 'O', 'B-Drug', 'O', 'B-Drug'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


In [42]:
train_texts[15]

['Pt',
 'was',
 'started',
 'on',
 'gabapentin',
 'for',
 'presumed',
 'fibromyalgia',
 'and',
 'discharged',
 'yesterday',
 'on',
 'a',
 'stable',
 'pain',
 'regimen',
 'of',
 'MS',
 'contin',
 'with',
 'prn',
 'dilaudid',
 '.']

In [43]:
train_labels[15]

['O',
 'O',
 'O',
 'O',
 'B-Drug',
 'O',
 'O',
 'B-Reason',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-Drug',
 'I-Drug',
 'O',
 'B-Drug',
 'I-Drug',
 'O',
 'B-Frequency',
 'B-Drug',
 'O']

### Not sure if this is the correct format. Let's look into another way to turn it into a sentence.

In [29]:
df['sentence'] = df[['text_file_name', 'sentence_line_number', 'original_word', 'label']].groupby(['text_file_name', 'sentence_line_number'])['original_word'].transform(lambda x: ' '.join(x))

In [30]:
df['word_labels'] = df[['text_file_name', 'sentence_line_number', 'original_word', 'label']].groupby(['text_file_name', 'sentence_line_number'])['label'].transform(lambda x: ','.join(x))


In [31]:
sentence_level_data = df[["text_file_name","sentence_line_number", "sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
sentence_level_data.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence,word_labels
0,data/training_20180910/110727.txt,1,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/110727.txt,3,Date of Birth : [ * * 2163 - 9 - 18 * * ] Sex : M,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,data/training_20180910/110727.txt,5,Service : MEDICINE,"O,O,O"
3,data/training_20180910/110727.txt,7,Allergies :,"O,O"
4,data/training_20180910/110727.txt,8,Keflex / Orencia / Remicade,"B-Drug,O,B-Drug,O,B-Drug"


In [45]:
sentence_level_data.iloc[10].sentence

'Incision and drainage'

In [46]:
sentence_level_data.iloc[10].word_labels

'O,O,O'

In [48]:
print(sentence_level_data.iloc[21].sentence)
print(sentence_level_data.iloc[21].word_labels)

fx on x - ray . This was thought to be a psoriatic arthritis flare ,
O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


#### Some sentences are really short. *Try* concatenating sentence to get longer inputs

In [50]:
df = sentence_level_data.sort_values(by=['text_file_name', 'sentence_line_number'])

# Function to concatenate up to 5 consecutive sentences and labels
def concatenate_sentences(group):
    """
    This function takes a group of sentences and labels from a dataframe
    and concatenates up to 5 consecutive sentences and t
    heir corresponding labels into a single string.
    Args:
        group (Datafram): A group of sentences and labels from a
        DataFrame grouped by a column (e.g., text_file_name).
    Returns:
        pandas.DataFrame: A DataFrame containing the
        original text_file_name, a list of
        concatenated sentences, and a
        list of concatenated labels.
    """
    concatenated_sentences = []
    concatenated_labels = []

    for i in range(len(group)):
        # Select up to 5 consecutive sentences starting from the current one
        sentences_to_concat = group['sentence'].iloc[i:i+5].tolist()
        labels_to_concat = group['word_labels'].iloc[i:i+5].tolist()

        # Concatenate selected sentences and labels
        concatenated_sentence = ' '.join(sentences_to_concat)
        concatenated_label = ','.join(labels_to_concat)

        concatenated_sentences.append(concatenated_sentence)
        concatenated_labels.append(concatenated_label)

    # Return a DataFrame for the concatenated sentences and labels
    return pd.DataFrame({
        'text_file_name': group['text_file_name'].iloc[0],
        'concatenated_sentence': concatenated_sentences,
        'concatenated_labels': concatenated_labels
    })

# Step 2: Apply the concatenation function to each group and combine the results
concatenated_df = pd.concat([concatenate_sentences(group) for _, group in df.groupby('text_file_name')])

# Reset index of the resulting DataFrame
concatenated_df = concatenated_df.reset_index(drop=True)

concatenated_df.head()

Unnamed: 0,text_file_name,concatenated_sentence,concatenated_labels
0,data/training_20180910/100035.txt,Admission Date : [ * * 2115 - 2 - 22 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/100035.txt,Date of Birth : [ * * 2078 - 8 - 9 * * ] Sex :...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,data/training_20180910/100035.txt,Service : MEDICINE Allergies : Vicodin Attendi...,"O,O,O,O,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."
3,data/training_20180910/100035.txt,Allergies : Vicodin Attending : [ * * First Na...,"O,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."
4,data/training_20180910/100035.txt,Vicodin Attending : [ * * First Name3 ( LF ) 4...,"B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."


In [23]:
concatenated_df.head()

Unnamed: 0,text_file_name,concatenated_sentence,concatenated_labels
0,data/training_20180910/100035.txt,Admission Date : [ * * 2115 - 2 - 22 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/100035.txt,Date of Birth : [ * * 2078 - 8 - 9 * * ] Sex :...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,data/training_20180910/100035.txt,Service : MEDICINE Allergies : Vicodin Attendi...,"O,O,O,O,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."
3,data/training_20180910/100035.txt,Allergies : Vicodin Attending : [ * * First Na...,"O,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."
4,data/training_20180910/100035.txt,Vicodin Attending : [ * * First Name3 ( LF ) 4...,"B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."


In [54]:
print(concatenated_df['concatenated_sentence'][20])
print(concatenated_df['concatenated_labels'][20])

and was intubated . He received epinephrine , atropine , magnesium , and bicarb . In addition , he had bilateral needle thoracostomies with report of air return on the left , and he subsequently had bilateral chest tubes placed . After approximately 15 - 20 minutes of rescucitation , he had ROSC . He received vecuronium and was
O,O,O,O,O,O,B-Drug,O,B-Drug,O,B-Drug,O,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Drug,O,O


In [55]:
print(concatenated_df['concatenated_sentence'][21])
print(concatenated_df['concatenated_labels'][21])

and bicarb . In addition , he had bilateral needle thoracostomies with report of air return on the left , and he subsequently had bilateral chest tubes placed . After approximately 15 - 20 minutes of rescucitation , he had ROSC . He received vecuronium and was started on an epi gtt for asthma and a cooling protocol , and was
O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Drug,O,O,O,O,O,B-Drug,B-Route,O,B-Reason,O,O,O,O,O,O,O


In [56]:
print(concatenated_df['concatenated_sentence'][22])
print(concatenated_df['concatenated_labels'][22])

with report of air return on the left , and he subsequently had bilateral chest tubes placed . After approximately 15 - 20 minutes of rescucitation , he had ROSC . He received vecuronium and was started on an epi gtt for asthma and a cooling protocol , and was then transferred to [ * * Hospital1 18 * * ] for further evaluation . Of note , the
O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Drug,O,O,O,O,O,B-Drug,B-Route,O,B-Reason,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [24]:
# concatenated_df.to_csv('/content/drive/MyDrive/266_final/data/concatenated_df.csv', index=False)

In [28]:
# data = sentence_level_data.copy()

In [29]:
data = concatenated_df.copy()

In [30]:
data.rename(columns={'concatenated_sentence': 'sentence', 'concatenated_labels': 'word_labels'}, inplace=True)

### It looks reasonable, let's proceed with training

In [31]:
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
import os

In [32]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [33]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             return_offsets_mapping=True,
                             padding='max_length',
                             is_split_into_words=True,
                             truncation=True,
                             max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

  def __len__(self):
        return self.len

In [34]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (83321, 3)
TRAIN Dataset: (66657, 3)
TEST Dataset: (16664, 3)


In [35]:
training_set[0]

{'input_ids': tensor([  101,   189, 19366,  4638, 15540, 18574,   185, 12606,  7159, 12693,
          2892,  1104, 13653,  9190, 21615,  5800,   131,  1109,  5351,   170,
          5691,  1214,  1385,  1590,  1114,   170,  1607,  1104,   177, 24312,
         23826,  1988,   117, 17972,   117,  1105, 26707,  3452,  1465,  1208,
         11124,  1113,  4036,  1121,   164,   115,   115,  3355,  1495,   115,
           115,   166,  3355,  1114,   170,  1415,  1286, 16655,  2180,   118,
         14247, 22331,  1348, 24752,   119,  1109,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [36]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
t           18
##rac       -100
##he        -100
##ost       -100
##omy       -100
p           18
##eg        -100
tube        18
placement   18
History     18
of          18
Present     18
Il          18
##ln        -100
##ess       -100
:           18
The         18
patient     18
a           18
69          18
year        18
old         18
woman       18
with        18
a           18
history     18
of          18
h           18
##yper      -100
##tens      -100
##ion       -100
,           18
diabetes    18
,           18
and         18
dem         18
##ent       -100
##ia        -100
now         18
presenting  18
on          18
transfer    18
from        18
[           18
*           18
*           18
Hospital    18
##3         -100
*           18
*           18
]           18
Hospital    -100
with        18
a           18
large       18
left        18
tempo       18
##ro        -100
-           18
par         18
##iet       -100
##al        -100
bleed       18
.   

In [37]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [38]:
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(labels_to_ids))
model.to(device)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [39]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.7768, device='cuda:0', grad_fn=<NllLossBackward0>)

In [40]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 512, 19])

In [41]:
# !pip install transformers==3.5.1

import torch.nn.functional as F


def focal_loss(logits, labels, alpha=0.25, gamma=2.0, ignore_index=-100):
    """
    logits: [batch_size, seq_len, num_labels] - model predictions
    labels: [batch_size, seq_len] - ground truth labels
    """
    # Calculate Cross Entropy Loss without reduction
    ce_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), reduction='none', ignore_index=ignore_index)

    # Get the predictions
    pred_probs = F.softmax(logits.view(-1, logits.size(-1)), dim=-1)
    pred_class = labels.view(-1)

    # focusing parameter
    gamma = gamma

    # Filter out 'ignore_index' labels
    filtered = labels.view(-1) != ignore_index

    # Calculate focal loss
    ce_loss_filtered = ce_loss[filtered]
    pred_probs_filtered = pred_probs[filtered]
    pred_class_filtered = pred_class[filtered]

    # Construct the loss
    pt = pred_probs_filtered.gather(1, pred_class_filtered.unsqueeze(-1)).squeeze()
    loss = ((1 - pt) ** gamma * ce_loss_filtered).mean()  # mean over the batch

    return loss



In [42]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [43]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []

    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)
    #     global_attention_mask = torch.zeros(
    # input_ids.shape, dtype=torch.long, device=input_ids.device

        # Perform a forward pass to get the logits
        outputs = model(input_ids=ids, attention_mask=mask)

        # Calculate the loss using focal_loss function directly
        loss = focal_loss(outputs.logits, labels)  # Use the focal_loss function here directly

        tr_loss += loss.item()
        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # Decoding the logits to compute accuracy
        active_logits = outputs.logits.view(-1, model.num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)

        # Ignoring the predictions of the padding tokens
        active_accuracy = labels.view(-1) != -100
        active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(active_labels, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels.cpu().numpy())
        tr_preds.extend(predictions.cpu().numpy())

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")


In [44]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.5334863662719727
Training loss per 100 training steps: 0.5297140642023175
Training loss per 100 training steps: 0.36601180720620263
Training loss per 100 training steps: 0.28760187731099707
Training loss per 100 training steps: 0.2425789871509129
Training loss per 100 training steps: 0.2100666295458189
Training loss per 100 training steps: 0.18829669990226588
Training loss per 100 training steps: 0.1719833943876521
Training loss per 100 training steps: 0.15911097100687493
Training loss per 100 training steps: 0.14706569937379907
Training loss per 100 training steps: 0.1381928359821416
Training loss per 100 training steps: 0.12973076432619157
Training loss per 100 training steps: 0.12356813901523167
Training loss per 100 training steps: 0.11839486402882242
Training loss per 100 training steps: 0.1134306820119915
Training loss per 100 training steps: 0.10963561573210619
Training loss per 100 training steps: 0.10570030286457321
Tra

In [47]:
def valid(model, testing_loader):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            outputs = model(input_ids=ids, attention_mask=mask)
            loss = focal_loss(outputs.logits, labels)  # Use the same focal_loss function

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Decoding the logits to compute accuracy
            active_logits = outputs.logits.view(-1, model.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)

            # Ignoring the predictions of the padding tokens
            active_accuracy = labels.view(-1) != -100
            active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

            labels = torch.masked_select(active_labels, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels.cpu().numpy())
            eval_preds.extend(predictions.cpu().numpy())

            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions


In [49]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.005265221465379
Validation loss per 100 evaluation steps: 0.01178696071822501
Validation loss per 100 evaluation steps: 0.013459722983817635
Validation loss per 100 evaluation steps: 0.015663122395955402
Validation loss per 100 evaluation steps: 0.015767546331367246
Validation loss per 100 evaluation steps: 0.01516596553575219
Validation loss per 100 evaluation steps: 0.015739439741765577
Validation loss per 100 evaluation steps: 0.015174651956884729
Validation loss per 100 evaluation steps: 0.014647646670878915
Validation loss per 100 evaluation steps: 0.014108927053954324
Validation loss per 100 evaluation steps: 0.014152945191394086
Validation loss per 100 evaluation steps: 0.014377628110962298
Validation loss per 100 evaluation steps: 0.014385124292269767
Validation loss per 100 evaluation steps: 0.014615389602741531
Validation loss per 100 evaluation steps: 0.01492657696215484
Validation loss per 100 evaluation steps: 0.01489385067213822

In [46]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=f51f4207a1bb991773095200c812b275d1d9115656d04e91d1f9a7433a7e7833
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [50]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         ADE       0.68      0.61      0.64       956
      Dosage       0.90      0.96      0.93      4386
        Drug       0.94      0.96      0.95     16379
    Duration       0.62      0.87      0.72       625
        Form       0.97      0.92      0.95      6796
   Frequency       0.87      0.91      0.89      6672
      Reason       0.65      0.69      0.67      3874
       Route       0.94      0.97      0.96      5694
    Strength       0.96      0.97      0.96      6904

   micro avg       0.90      0.92      0.91     52286
   macro avg       0.84      0.87      0.85     52286
weighted avg       0.90      0.92      0.91     52286



In [52]:
import os

directory = "/content/drive/MyDrive/266_final/bert_base_cased"

if not os.path.exists(directory):
    os.makedirs(directory)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed


Resources

### https://github.com/lcampillos/Medical-NER/blob/master/bert_ner.ipynb
### https://medium.com/analytics-vidhya/bio-tagged-text-to-original-text-99b05da6664