In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# verify GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


## Convert to Train DataFrame for EDA

In [4]:
import pandas as pd

# Read the data from the text file
with open("/content/drive/MyDrive/266_project/data/dataset1_train.txt", "r") as file:
    lines = file.readlines()

# Define an empty list to store the data
data = []

# Iterate over each line in the file
for line in lines:
    # Split the line by spaces
    parts = line.strip().split()

    # Check if the line has the expected number of elements
    if len(parts) == 9:
        # Extract the values from the line
        text_file_name = parts[0]
        sentence_line_number = int(parts[1])
        sentence_word_index = int(parts[2])
        sentence_seq = parts[3]
        start_token = int(parts[4])
        end_token = int(parts[5])
        original_word = parts[6]
        word = parts[7]
        label = parts[8]

        # Append the values as a tuple to the data list
        data.append((text_file_name, sentence_line_number, sentence_word_index, sentence_seq,
                     start_token, end_token, original_word, word, label))

# Create a DataFrame from the data list with appropriate column names
df = pd.DataFrame(data, columns=['text_file_name', 'sentence_line_number', 'sentence_word_index',
                                 'sentence_seq', 'start_token', 'end_token', 'original_word',
                                 'word', 'label'])




In [5]:
df.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
0,data/training_20180910/110727.txt,1,0,,0,9,Admission,Admission,O
1,data/training_20180910/110727.txt,1,1,,10,14,Date,Date,O
2,data/training_20180910/110727.txt,1,2,,14,15,:,:,O
3,data/training_20180910/110727.txt,1,3,,17,18,[,[,O
4,data/training_20180910/110727.txt,1,4,,18,19,*,*,O


In [6]:
len(df)

895141

In [7]:
df['sentence_line_number'].nunique()

1053

In [8]:
df[df['label']=='B-Drug']

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
51,data/training_20180910/110727.txt,8,0,T118,163,169,Keflex,Keflex,B-Drug
53,data/training_20180910/110727.txt,8,2,T124,172,179,Orencia,Orencia,B-Drug
55,data/training_20180910/110727.txt,8,4,T134,182,190,Remicade,Remicade,B-Drug
185,data/training_20180910/110727.txt,25,9,T203,737,741,vanc,vanc,B-Drug
187,data/training_20180910/110727.txt,25,11,T177,746,751,cipro,cipro,B-Drug
...,...,...,...,...,...,...,...,...,...
894942,data/training_20180910/100883.txt,192,2,T84,8482,8492,Prednisone,Prednisone,B-Drug
894950,data/training_20180910/100883.txt,192,10,T87,8520,8527,steroid,steroid,B-Drug
894958,data/training_20180910/100883.txt,194,2,T89,8561,8570,Combivent,Combivent,B-Drug
894967,data/training_20180910/100883.txt,195,2,T93,8599,8606,Flovent,Flovent,B-Drug


In [9]:
df['label'].value_counts()

O              802045
B-Drug          16222
I-Frequency     13023
I-Dosage         8779
B-Strength       6691
B-Form           6647
I-Strength       6617
B-Frequency      6279
B-Route          5475
I-Drug           4298
B-Dosage         4221
I-Form           4173
B-Reason         3791
I-Reason         3125
I-Duration       1034
B-ADE             956
I-ADE             776
B-Duration        592
I-Route           397
Name: label, dtype: int64

In [10]:
df[df['sentence_line_number'] == 1]

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
0,data/training_20180910/110727.txt,1,0,,0,9,Admission,Admission,O
1,data/training_20180910/110727.txt,1,1,,10,14,Date,Date,O
2,data/training_20180910/110727.txt,1,2,,14,15,:,:,O
3,data/training_20180910/110727.txt,1,3,,17,18,[,[,O
4,data/training_20180910/110727.txt,1,4,,18,19,*,*,O
...,...,...,...,...,...,...,...,...,...
893314,data/training_20180910/100883.txt,1,23,,64,65,-,-,O
893315,data/training_20180910/100883.txt,1,24,,65,66,3,ORDINAL,O
893316,data/training_20180910/100883.txt,1,25,,66,67,*,*,O
893317,data/training_20180910/100883.txt,1,26,,67,68,*,*,O


## Convert to Test DataFrame for EDA

In [11]:
# Read the data from the text file
with open("/content/drive/MyDrive/266_project/data/dataset1_test.txt", "r") as file:
    lines = file.readlines()

# Define an empty list to store the data
data = []

# Iterate over each line in the file
for line in lines:
    # Split the line by spaces
    parts = line.strip().split()

    # Check if the line has the expected number of elements
    if len(parts) == 9:
        # Extract the values from the line
        text_file_name = parts[0]
        sentence_line_number = int(parts[1])
        sentence_word_index = int(parts[2])
        sentence_seq = parts[3]
        start_token = int(parts[4])
        end_token = int(parts[5])
        original_word = parts[6]
        word = parts[7]
        label = parts[8]

        # Append the values as a tuple to the data list
        data.append((text_file_name, sentence_line_number, sentence_word_index, sentence_seq,
                     start_token, end_token, original_word, word, label))

# Create a DataFrame from the data list with appropriate column names
test_df = pd.DataFrame(data, columns=['text_file_name', 'sentence_line_number', 'sentence_word_index',
                                 'sentence_seq', 'start_token', 'end_token', 'original_word',
                                 'word', 'label'])


In [12]:
test_df.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
0,data/test_data_Task2/107515.txt,1,0,,0,9,Admission,Admission,O
1,data/test_data_Task2/107515.txt,1,1,,10,14,Date,Date,O
2,data/test_data_Task2/107515.txt,1,2,,14,15,:,:,O
3,data/test_data_Task2/107515.txt,1,3,,17,18,[,[,O
4,data/test_data_Task2/107515.txt,1,4,,18,19,*,*,O


In [13]:
len(test_df)

585761

In [14]:
test_df['sentence_line_number'].nunique()

930

In [15]:
test_df['label'].value_counts()

O              526040
B-Drug          10581
I-Frequency      8145
I-Dosage         5721
B-Form           4358
B-Strength       4230
I-Strength       4065
B-Frequency      4015
B-Route          3513
I-Form           2751
B-Dosage         2681
I-Drug           2608
B-Reason         2515
I-Reason         2140
I-Duration        635
B-ADE             625
I-ADE             488
B-Duration        380
I-Route           270
Name: label, dtype: int64

In [16]:
test_df[test_df['sentence_line_number'] == 1]

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
0,data/test_data_Task2/107515.txt,1,0,,0,9,Admission,Admission,O
1,data/test_data_Task2/107515.txt,1,1,,10,14,Date,Date,O
2,data/test_data_Task2/107515.txt,1,2,,14,15,:,:,O
3,data/test_data_Task2/107515.txt,1,3,,17,18,[,[,O
4,data/test_data_Task2/107515.txt,1,4,,18,19,*,*,O
...,...,...,...,...,...,...,...,...,...
584452,data/test_data_Task2/100511.txt,1,23,,63,64,-,-,O
584453,data/test_data_Task2/100511.txt,1,24,,64,66,29,ORDINAL,O
584454,data/test_data_Task2/100511.txt,1,25,,66,67,*,*,O
584455,data/test_data_Task2/100511.txt,1,26,,67,68,*,*,O


### Label labels

In [17]:
# Split labels based on whitespace and turn them into a list
labels = [i.split() for i in df['label'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]

print(unique_labels)

{'I-Dosage', 'O', 'B-ADE', 'B-Dosage', 'B-Route', 'I-Form', 'I-Strength', 'B-Reason', 'I-Reason', 'B-Strength', 'I-ADE', 'B-Form', 'B-Drug', 'B-Duration', 'I-Drug', 'I-Frequency', 'B-Frequency', 'I-Duration', 'I-Route'}


In [18]:
# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(labels_to_ids)

{'B-ADE': 0, 'B-Dosage': 1, 'B-Drug': 2, 'B-Duration': 3, 'B-Form': 4, 'B-Frequency': 5, 'B-Reason': 6, 'B-Route': 7, 'B-Strength': 8, 'I-ADE': 9, 'I-Dosage': 10, 'I-Drug': 11, 'I-Duration': 12, 'I-Form': 13, 'I-Frequency': 14, 'I-Reason': 15, 'I-Route': 16, 'I-Strength': 17, 'O': 18}


## Structure Train Dataset for BERT

In [19]:
train_texts = []
train_labels = []

# Read the text file line by line
with open('/content/drive/MyDrive/266_project/data/dataset1_train.txt', 'r', encoding='utf-8') as file:
    current_text = []  # To store tokens of the current text
    current_labels = []  # To store labels of the current text
    for line in file:
        if line.strip() == '':  # Empty line signifies end of text
            train_texts.append(current_text)
            train_labels.append(current_labels)
            current_text = []
            current_labels = []
        else:
            parts = line.strip().split()
            token = parts[-3]  # Token is second-to-last part
            label = parts[-1]   # Label is last part
            current_text.append(token)
            current_labels.append(label)

# Check the first few samples
print(train_texts[:5])
print(train_labels[:5])


[['Admission', 'Date', ':', '[', '*', '*', '2202', '-', '1', '-', '8', '*', '*', ']', 'Discharge', 'Date', ':', '[', '*', '*', '2202', '-', '2', '-', '1', '*', '*', ']'], ['Date', 'of', 'Birth', ':', '[', '*', '*', '2163', '-', '9', '-', '18', '*', '*', ']', 'Sex', ':', 'M'], ['Service', ':', 'MEDICINE'], ['Allergies', ':', 'Keflex', '/', 'Orencia', '/', 'Remicade'], ['Attending', ':', '[', '*', '*', 'First', 'Name3', '(', 'LF', ')', '2751', '*', '*', ']', 'Chief', 'Complaint', ':', 'L', 'leg', 'pain', 'and', 'erythema']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O'], ['O', 'O', 'B-Drug', 'O', 'B-Drug', 'O', 'B-Drug'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


In [20]:
len(train_texts)

49877

In [21]:
len(train_labels)

49877

## Structure Test Dataset for BERT

In [22]:
test_texts = []
test_labels = []

# Read the text file line by line
with open('/content/drive/MyDrive/266_project/data/dataset1_test.txt', 'r', encoding='utf-8') as file:
    current_text = []  # To store tokens of the current text
    current_labels = []  # To store labels of the current text
    for line in file:
        if line.strip() == '':  # Empty line signifies end of text
            test_texts.append(current_text)
            test_labels.append(current_labels)
            current_text = []
            current_labels = []
        else:
            parts = line.strip().split()
            token = parts[-3]  # Token is second-to-last part
            label = parts[-1]   # Label is last part
            current_text.append(token)
            current_labels.append(label)

# Check the first few samples
print(test_texts[:5])
print(test_labels[:5])

[['Admission', 'Date', ':', '[', '*', '*', '2122', '-', '1', '-', '14', '*', '*', ']', 'Discharge', 'Date', ':', '[', '*', '*', '2122', '-', '2', '-', '6', '*', '*', ']'], ['Date', 'of', 'Birth', ':', '[', '*', '*', '2057', '-', '2', '-', '17', '*', '*', ']', 'Sex', ':', 'F'], ['Service', ':', 'MEDICINE'], ['Allergies', ':', 'Penicillins', '/', 'Fentanyl', '/', 'Oxycodone', '/', 'Meperidine'], ['Attending', ':', '[', '*', '*', 'First', 'Name3', '(', 'LF', ')', '11040', '*', '*', ']', 'Chief', 'Complaint', ':', 'ARDS', 'seconrday', 'to', 'septic', 'shock']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O'], ['O', 'O', 'B-Drug', 'O', 'B-Drug', 'O', 'B-Drug', 'O', 'B-Drug'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


In [23]:
len(test_texts)

33027

In [24]:
len(test_labels)

33027

## Create Sentence Level Train Dataset

In [25]:
df['sentence'] = df[['text_file_name', 'sentence_line_number', 'original_word', 'label']].groupby(['text_file_name', 'sentence_line_number'])['original_word'].transform(lambda x: ' '.join(x))

In [26]:
df['word_labels'] = df[['text_file_name', 'sentence_line_number', 'original_word', 'label']].groupby(['text_file_name', 'sentence_line_number'])['label'].transform(lambda x: ','.join(x))

In [27]:
sentence_level_data = df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
sentence_level_data.head()

Unnamed: 0,sentence,word_labels
0,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Date of Birth : [ * * 2163 - 9 - 18 * * ] Sex : M,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,Service : MEDICINE,"O,O,O"
3,Allergies :,"O,O"
4,Keflex / Orencia / Remicade,"B-Drug,O,B-Drug,O,B-Drug"


In [28]:
train_data = sentence_level_data.copy()

## Create Sentence Level Test Dataset

In [29]:
test_df['sentence'] = test_df[['text_file_name', 'sentence_line_number', 'original_word', 'label']].groupby(['text_file_name', 'sentence_line_number'])['original_word'].transform(lambda x: ' '.join(x))

In [30]:
test_df['word_labels'] = test_df[['text_file_name', 'sentence_line_number', 'original_word', 'label']].groupby(['text_file_name', 'sentence_line_number'])['label'].transform(lambda x: ','.join(x))

In [31]:
sentence_level_data = test_df[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
sentence_level_data.head()

Unnamed: 0,sentence,word_labels
0,Admission Date : [ * * 2122 - 1 - 14 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,Date of Birth : [ * * 2057 - 2 - 17 * * ] Sex : F,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,Service : MEDICINE,"O,O,O"
3,Allergies :,"O,O"
4,Penicillins / Fentanyl / Oxycodone / Meperidine,"B-Drug,O,B-Drug,O,B-Drug,O,B-Drug"


In [32]:
test_data = sentence_level_data.copy()

## Calculate Class Weights to Solve Class Imbalance

In [33]:
df.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label,sentence,word_labels
0,data/training_20180910/110727.txt,1,0,,0,9,Admission,Admission,O,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/110727.txt,1,1,,10,14,Date,Date,O,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,data/training_20180910/110727.txt,1,2,,14,15,:,:,O,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,data/training_20180910/110727.txt,1,3,,17,18,[,[,O,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,data/training_20180910/110727.txt,1,4,,18,19,*,*,O,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [34]:
class_labels = df['label'].value_counts()
class_labels

O              802045
B-Drug          16222
I-Frequency     13023
I-Dosage         8779
B-Strength       6691
B-Form           6647
I-Strength       6617
B-Frequency      6279
B-Route          5475
I-Drug           4298
B-Dosage         4221
I-Form           4173
B-Reason         3791
I-Reason         3125
I-Duration       1034
B-ADE             956
I-ADE             776
B-Duration        592
I-Route           397
Name: label, dtype: int64

In [35]:
labels_to_ids

{'B-ADE': 0,
 'B-Dosage': 1,
 'B-Drug': 2,
 'B-Duration': 3,
 'B-Form': 4,
 'B-Frequency': 5,
 'B-Reason': 6,
 'B-Route': 7,
 'B-Strength': 8,
 'I-ADE': 9,
 'I-Dosage': 10,
 'I-Drug': 11,
 'I-Duration': 12,
 'I-Form': 13,
 'I-Frequency': 14,
 'I-Reason': 15,
 'I-Route': 16,
 'I-Strength': 17,
 'O': 18}

In [36]:
from sklearn.utils.class_weight import compute_class_weight

class_labels = np.arange(len(labels_to_ids))
class_frequencies = [956, 4221, 16222, 592, 6647, 6279, 3791, 5475, 6691, 776, 8779, 4298, 1034, 4173, 13023, 3125, 397, 6617, 802045]

weights = compute_class_weight('balanced', classes=class_labels, y=np.hstack([[i] * freq for i, freq in enumerate(class_frequencies)]))
class_weights = {i: weight for i, weight in enumerate(weights)}

print("Class weights:", class_weights)

Class weights: {0: 49.28105042942083, 1: 11.1614982730458, 2: 2.9042463451193634, 3: 79.58223684210526, 4: 7.0878116760232155, 5: 7.5032145581344665, 6: 12.427508364686446, 7: 8.605056476808459, 8: 7.041202243390572, 9: 60.71222192078133, 10: 5.366520584408966, 11: 10.961536577600352, 12: 45.5635243815535, 13: 11.289883587473357, 14: 3.617652170047325, 15: 15.076058947368422, 16: 118.67174864112422, 17: 7.119946230999896, 18: 0.05874069935044332}


## Model Training Using BERT

In [37]:
from transformers import BertTokenizerFast, TFBertForTokenClassification
import tensorflow as tf
import numpy as np
import pandas as pd

In [39]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
max_len = 128

In [40]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.split(), text_labels.split(",")):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        if label.startswith("B-") and n_subwords > 1:
            labels.extend([label] + ["I" + label[1:]] * (n_subwords - 1))
        else:
            labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

def encode_sentences(dataframe, tokenizer, max_len):
    input_ids = []
    attention_masks = []
    labels = []

    # Mapping your labels to IDs
    labels_to_ids = {'B-ADE': 0, 'B-Dosage': 1, 'B-Drug': 2, 'B-Duration': 3, 'B-Form': 4, 'B-Frequency': 5, 'B-Reason': 6, 'B-Route': 7, 'B-Strength': 8, 'I-ADE': 9, 'I-Dosage': 10, 'I-Drug': 11, 'I-Duration': 12, 'I-Form': 13, 'I-Frequency': 14, 'I-Reason': 15, 'I-Route': 16, 'I-Strength': 17, 'O': 18}

    for _, row in dataframe.iterrows():
        sentence, text_labels = row['sentence'], row['word_labels']
        tokenized_sentence, label_seq = tokenize_and_preserve_labels(sentence, text_labels)

        # Now, tokenize the sentence to get input_ids, attention_mask
        encoding = tokenizer.encode_plus(tokenized_sentence,
                                         is_split_into_words=True,
                                         max_length=max_len,
                                         padding='max_length',
                                         truncation=True,
                                         return_attention_mask=True,
                                         return_tensors='np')

        # Adjust label_ids creation to properly handle [CLS], [SEP], and padding
        label_ids = [18]  # Start with [CLS] token label
        label_ids += [labels_to_ids.get(label, 18) for label in label_seq]  # Convert labels to ids, defaulting to 'O' (18)
        label_ids += [18] * (max_len - len(label_ids))  # Add padding with 'O' (18) labels

        input_ids.append(encoding['input_ids'][0])
        attention_masks.append(encoding['attention_mask'][0])
        labels.append(np.array(label_ids))

    return np.array(input_ids), np.array(attention_masks), np.array(labels)

In [42]:
input_ids, attention_masks, label_ids = encode_sentences(train_data, tokenizer, max_len)

In [45]:
# Split the data
from sklearn.model_selection import train_test_split

X_train_ids, X_val_ids, X_train_masks, X_val_masks, y_train, y_val = train_test_split(
    input_ids, attention_masks, label_ids, test_size=0.2, random_state=42)

In [46]:
def create_dataset(input_ids, attention_masks, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((
        {"input_ids": input_ids, "attention_mask": attention_masks},
        labels
    ))
    dataset = dataset.shuffle(len(labels)).batch(batch_size)
    return dataset

batch_size = 16
train_dataset = create_dataset(X_train_ids, X_train_masks, y_train, batch_size)
validation_dataset = create_dataset(X_val_ids, X_val_masks, y_val, batch_size)

In [57]:
class_weights_tensor = tf.constant([class_weights[i] for i in range(len(class_weights))], dtype=tf.float32)


def custom_loss(y_true, y_pred):
    # Calculate the cross-entropy loss for each class
    loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    loss = loss_obj(y_true, y_pred)

    # Ensure class_weights_tensor is float32
    class_weights_tensor = tf.constant([class_weights[i] for i in range(len(class_weights))], dtype=tf.float32)

    # Apply class weights
    class_weights_applied = tf.gather(class_weights_tensor, tf.cast(y_true, tf.int32))
    loss *= tf.cast(class_weights_applied, tf.float32)  # Ensure multiplication is with float32

    # Mask out the losses where y_true is 18 (used for padding/ignored tokens)
    mask = tf.cast(tf.not_equal(y_true, 18), tf.float32)
    loss *= mask

    # Return mean loss over non-ignored tokens
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [58]:
from transformers import TFBertForTokenClassification
from tensorflow.keras.optimizers import Adam

num_labels = len(labels_to_ids)  # Update based on your labels
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.0)
model.compile(optimizer=optimizer.name, loss=custom_loss, metrics=['accuracy'])

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
history = model.fit(train_dataset, epochs=1, validation_data=validation_dataset)



KeyboardInterrupt: 

In [None]:
from joblib import dump

model_filename = 'vgg16.joblib'
dump(model, model_filename)

In [None]:
# Define the path to the directory on Google Drive where you want to save the model
model_save_path = '/content/drive/My Drive/207_Project/vgg16_fine_tuned.h5'

# Save the model
model.save(model_save_path)

## Model Evaluation

In [61]:
def encode_test_data(dataframe, tokenizer, max_len):
    input_ids = []
    attention_masks = []
    labels = []

    # Mapping your labels to IDs
    labels_to_ids = {'B-ADE': 0, 'B-Dosage': 1, 'B-Drug': 2, 'B-Duration': 3, 'B-Form': 4, 'B-Frequency': 5, 'B-Reason': 6, 'B-Route': 7, 'B-Strength': 8, 'I-ADE': 9, 'I-Dosage': 10, 'I-Drug': 11, 'I-Duration': 12, 'I-Form': 13, 'I-Frequency': 14, 'I-Reason': 15, 'I-Route': 16, 'I-Strength': 17, 'O': 18}

    for _, row in dataframe.iterrows():
        sentence, text_labels = row['sentence'], row['word_labels']
        tokenized_sentence, label_seq = tokenize_and_preserve_labels(sentence, text_labels)

        # Tokenize the sentence to get input_ids, attention_mask
        encoding = tokenizer.encode_plus(
            tokenized_sentence,
            is_split_into_words=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='np'
        )

        # Ensure label_ids is created with consistent length
        label_ids = [18]  # Start with [CLS] token label
        label_ids += [labels_to_ids.get(label, 18) for label in label_seq]  # Convert labels to ids, defaulting to 'O' (18)
        label_ids += [18]  # Add [SEP] token label
        label_ids += [18] * (max_len - len(label_ids))  # Ensure consistent padding with 'O' (18) labels

        if len(label_ids) > max_len:  # Safety check to ensure label_ids length doesn't exceed max_len
            label_ids = label_ids[:max_len]

        input_ids.append(encoding['input_ids'][0])
        attention_masks.append(encoding['attention_mask'][0])
        labels.append(np.array(label_ids))

    return np.array(input_ids), np.array(attention_masks), np.array(labels)


In [62]:
test_input_ids, test_attention_masks, _ = encode_test_data(test_data, tokenizer, max_len)

In [63]:
raw_predictions = model.predict([test_input_ids, test_attention_masks])



In [64]:
from scipy.special import softmax

probabilities = softmax(raw_predictions.logits, axis=-1)
predictions = np.argmax(probabilities, axis=-1)

In [69]:
from sklearn.metrics import classification_report, confusion_matrix

report = classification_report(_.flatten(), predictions.flatten(), target_names=ids_to_labels.values())
print("Classification Report:\n", report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

       B-ADE       0.00      0.00      0.00       622
    B-Dosage       0.00      0.00      0.00      2567
      B-Drug       0.00      0.00      0.00     10270
  B-Duration       0.00      0.00      0.00       372
      B-Form       0.00      0.00      0.00      4061
 B-Frequency       0.00      0.00      0.00      3663
    B-Reason       0.00      0.00      0.00      2388
     B-Route       0.00      0.00      0.00      3246
  B-Strength       0.00      0.00      0.00      4075
       I-ADE       0.00      0.00      0.00      1780
    I-Dosage       0.00      0.00      0.00      5828
      I-Drug       0.00      0.00      0.00     25466
  I-Duration       0.00      0.00      0.00       674
      I-Form       0.00      0.00      0.00      3356
 I-Frequency       0.00      0.00      0.00      8239
    I-Reason       0.00      0.00      0.00      5841
     I-Route       0.00      0.00      0.00      1230
  I

  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
unique, counts = np.unique(predictions.flatten(), return_counts=True)
print(unique, counts)

[18] [5764352]


In [75]:
unique, counts = np.unique(_.flatten(), return_counts=True)
print(unique, counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18] [    622    2567   10270     372    4061    3663    2388    3246    4075
    1780    5828   25466     674    3356    8239    5841    1230    6971
 5673703]


### It looks reasonable, let's proceed with training

In [19]:
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
import os

In [20]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             return_offsets_mapping=True,
                             padding='max_length',
                             is_split_into_words=True,
                             truncation=True,
                             max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

  def __len__(self):
        return self.len

In [25]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (66944, 2)
TRAIN Dataset: (53555, 2)
TEST Dataset: (13389, 2)


In [22]:
# Function to tokenize and align labels
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count the number of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # The first subtoken retains its label (e.g., B-Drug)
        # Subsequent subtokens should get the I- version of the B- label (e.g., I-Drug)
        if label.startswith("B-") and n_subwords > 1:
            labels.extend([label] + ["I" + label[1:]] * (n_subwords - 1))
        else:
            labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [37]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.iloc[index]['sentence']
        word_labels = self.data.iloc[index]['word_labels'].split(",")  # Assuming labels are stored as comma-separated in the dataframe

        # Apply the tokenization and label preservation
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence.split(), word_labels)

        # Now, use the tokenizer to encode the tokenized sentence
        encoding = self.tokenizer.encode_plus(
            tokenized_sentence,
            is_split_into_words=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Prepare the labels: Convert labels to their respective IDs and handle padding
        labels_to_ids = {label: id for id, label in enumerate(sorted(set(labels)))}
        label_ids = [labels_to_ids[label] for label in labels]  # Convert each label to its ID
        # Padding: Fill with a label ID that is ignored, such as -100
        label_ids += [-100] * (self.max_len - len(label_ids))

        # Ensure everything is in torch tensors
        item = {key: torch.as_tensor(val).squeeze() for key, val in encoding.items()}
        item['word_labels'] = torch.as_tensor(label_ids[:self.max_len])  # Ensure labels tensor is of max_len

        return item

    def __len__(self):
        return len(self.data)

In [38]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (66944, 2)
TRAIN Dataset: (53555, 2)
TEST Dataset: (13389, 2)


In [39]:
training_set[0]

{'input_ids': tensor([  101,  4650,  1012,  2101,  2023,  2154,  2002,  2001,  2057,  1001,
          1001,  2019,  2063,  1001,  1001,  1040,  2013,  7367,  1001,  1001,
         23755,  3258,  1010, 19179,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [None]:
# Tokenize and align labels with tokens
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sentence, label_group)
    for sentence, label_group in zip(sentences, label_groups)
]

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.9766, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 19])

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
training_set[0]

{'input_ids': tensor([  101,  4650,  1012,  2101,  2023,  2154,  2002,  2001,  2057,  7231,
          2094,  2013,  7367, 20207,  1010, 19179,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [None]:
df['label'].value_counts()

O              802045
B-Drug          16222
I-Frequency     13023
I-Dosage         8779
B-Strength       6691
B-Form           6647
I-Strength       6617
B-Frequency      6279
B-Route          5475
I-Drug           4298
B-Dosage         4221
I-Form           4173
B-Reason         3791
I-Reason         3125
I-Duration       1034
B-ADE             956
I-ADE             776
B-Duration        592
I-Route           397
Name: label, dtype: int64

In [None]:
from collections import defaultdict

# Initialize a dictionary to hold label counts
label_counts = defaultdict(int)

# Loop through each example in the training dataset
for i in range(len(training_set)):
    item = training_set[i]  # Get the i-th item
    labels_tensor = item['labels']  # Extract labels tensor assuming this key exists

    # Iterate over each label in the tensor
    for label in labels_tensor:
        label_id = label.item()  # Convert PyTorch tensor to a Python integer
        if label_id != -100:  # Ignore padding or special tokens
            label_counts[label_id] += 1

# Now label_counts dictionary contains the counts of each label
print(label_counts)

# Convert label_counts to a list of counts ordered by label ID for further use, if necessary
class_counts = [label_counts[i] for i in range(len(labels_to_ids))]  # Assuming labels_to_ids maps labels to consecutive integers starting from 0
print(class_counts)

defaultdict(<class 'int'>, {18: 585497, 2: 12584, 11: 3285, 8: 5173, 17: 5076, 4: 4813, 1: 3184, 10: 6432, 6: 2887, 5: 4539, 3: 460, 12: 794, 7: 3999, 14: 7105, 16: 312, 15: 2452, 13: 2351, 0: 743, 9: 596})
[743, 3184, 12584, 460, 4813, 4539, 2887, 3999, 5173, 596, 6432, 3285, 794, 2351, 7105, 2452, 312, 5076, 585497]


In [None]:
class_counts = [956] # replace with your actual counts
num_classes = len(class_counts)
class_weights = compute_class_weight(class_weight='balanced', classes=np.arange(num_classes), y=all_labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)


        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        tr_loss += loss.item()

        # outputs = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        # loss = outputs["loss"]
        # tr_logits = outputs["logits"]

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 3.012042999267578
Training loss per 100 training steps: 0.7837160611801809
Training loss per 100 training steps: 0.6019163135049949


KeyboardInterrupt: 

In [None]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)

            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.004456072114408016
Validation loss per 100 evaluation steps: 0.09815204672435861
Validation loss per 100 evaluation steps: 0.08135277532974712
Validation loss per 100 evaluation steps: 0.0853397470082472
Validation loss per 100 evaluation steps: 0.08781791381699619
Validation loss per 100 evaluation steps: 0.08451544420090513
Validation loss per 100 evaluation steps: 0.08764765564839049
Validation loss per 100 evaluation steps: 0.08455414991717375
Validation loss per 100 evaluation steps: 0.08155191882007544
Validation loss per 100 evaluation steps: 0.08429008465106252
Validation loss per 100 evaluation steps: 0.08312966159922741
Validation loss per 100 evaluation steps: 0.08299840118678509
Validation loss per 100 evaluation steps: 0.0812383369249633
Validation loss per 100 evaluation steps: 0.0802643305518377
Validation loss per 100 evaluation steps: 0.07880768259333863
Validation loss per 100 evaluation steps: 0.0780429023344634
Validation 

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.6 kB[0m [31m948.0 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m808.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=7b893bc642c2f2d98ecfad44b6f8808acb96723134f6c70929eae28198e1e97b
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         ADE       0.47      0.11      0.18       216
      Dosage       0.88      0.91      0.90       832
        Drug       0.92      0.91      0.91      3163
    Duration       0.68      0.77      0.72       134
        Form       0.90      0.91      0.90      1252
   Frequency       0.82      0.89      0.85      1200
      Reason       0.63      0.40      0.49       739
       Route       0.91      0.95      0.93       992
    Strength       0.92      0.94      0.93      1265

   micro avg       0.88      0.86      0.87      9793
   macro avg       0.79      0.75      0.76      9793
weighted avg       0.86      0.86      0.86      9793



In [None]:
import os

directory = "/model"

if not os.path.exists(directory):
    os.makedirs(directory)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed


In [None]:
model.save('/content/drive/MyDrive/266_final/model')

from tensorflow.keras.models import load_model

loaded_model = load_model('/content/drive/MyDrive/266_final/model')

AttributeError: 'BertForTokenClassification' object has no attribute 'save'

Resources

### https://github.com/lcampillos/Medical-NER/blob/master/bert_ner.ipynb
### https://medium.com/analytics-vidhya/bio-tagged-text-to-original-text-99b05da6664