In [1]:
!pip install transformers



In [2]:
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline


In [3]:
import pandas as pd
import numpy as np

import os

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score


In [4]:

# Select the compute device: use CUDA when torch reports it, else the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(device)

cpu


In [6]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset and model paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Read the sentence-level test split from the mounted Drive into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/266_final/data/sentence_level_data/test_dataset.csv",
)

In [8]:
# Tag -> integer id mapping. Ids follow the list order below:
# the nine B- tags (0-8), the nine I- tags (9-17), then 'O' (18).
labels = {
    tag: tag_id
    for tag_id, tag in enumerate([
        'B-ADE', 'B-Dosage', 'B-Drug', 'B-Duration', 'B-Form',
        'B-Frequency', 'B-Reason', 'B-Route', 'B-Strength',
        'I-ADE', 'I-Dosage', 'I-Drug', 'I-Duration', 'I-Form',
        'I-Frequency', 'I-Reason', 'I-Route', 'I-Strength',
        'O',
    ])
}


# Load model

In [9]:
# Directory on the mounted Drive that is passed to from_pretrained() for both
# the tokenizer and the token-classification model.
directory = "/content/drive/MyDrive/266_final/microsoft_bbb_concat7"

In [10]:
# Load the tokenizer and the token-classification model from the same
# saved directory.
tokenizer_bbc = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=directory,
)
model_bbb = BertForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=directory,
)


In [12]:
# Move the model to the device selected earlier. Because this is the cell's
# last expression, the returned module is echoed as the cell output.
model_bbb.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [14]:
# Build a token-classification ("ner") pipeline around the loaded model and
# tokenizer.
# NOTE(review): the recorded predictions carry generic 'LABEL_<id>' names; the
# ids look like the values of the `labels` dict (e.g. 18 -> 'O') — confirm.
ner_pipe = pipeline(
    task="ner",
    model=model_bbb,
    tokenizer=tokenizer_bbc,
)

In [None]:
def all_ade(labels):
    """Return True when every comma-separated tag in ``labels`` is an ADE tag.

    Parameters
    ----------
    labels : str
        Comma-separated tags for one sentence, e.g. ``'B-ADE,I-ADE'``.
        Note that this parameter shadows the module-level ``labels`` dict
        inside the function body.

    Returns
    -------
    bool
        True only if each tag is exactly ``'B-ADE'`` or ``'I-ADE'``.
        Non-string input (for example the float NaN pandas produces for an
        empty CSV cell, or None) yields False.
    """
    # A missing cell is not a str and has no .split(); treat it as "not all
    # ADE" instead of raising AttributeError part-way through Series.apply.
    if not isinstance(labels, str):
        return False
    return all(label in ('B-ADE', 'I-ADE') for label in labels.split(','))

# Keep only the rows whose 'word_labels' value passes all_ade
filtered_data = data.loc[data['word_labels'].map(all_ade)]

# Display the ADE-only rows
filtered_data

Unnamed: 0,text_file_name,sentence_line_number,sentence,word_labels
13624,data/test_data_Task2/112445.txt,457,confusion,B-ADE
14675,data/test_data_Task2/107128.txt,292,Acute blood loss anemia from Psoas muscle hema...,"B-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE"
16717,data/test_data_Task2/113265.txt,714,neutropenia,B-ADE
17420,data/test_data_Task2/113852.txt,25,"lips , tongue and face swelling and it progres...","B-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-AD..."
48988,data/test_data_Task2/122365.txt,313,thrombocytopenia,B-ADE


In [None]:
# Echo the tag -> id mapping for reference.
labels

{'B-ADE': 0,
 'B-Dosage': 1,
 'B-Drug': 2,
 'B-Duration': 3,
 'B-Form': 4,
 'B-Frequency': 5,
 'B-Reason': 6,
 'B-Route': 7,
 'B-Strength': 8,
 'I-ADE': 9,
 'I-Dosage': 10,
 'I-Drug': 11,
 'I-Duration': 12,
 'I-Form': 13,
 'I-Frequency': 14,
 'I-Reason': 15,
 'I-Route': 16,
 'I-Strength': 17,
 'O': 18}

In [None]:
# Sentence text at row position 14675 (select the column, then the position).
data['sentence'].iloc[14675]

'Acute blood loss anemia from Psoas muscle hematoma'

In [None]:
# Sentence text at row position 13623 (select the column, then the position).
data['sentence'].iloc[13623]

'Decreased Elavil from 75mg to 25mg as it may have worsened your'

In [15]:
# Test sentence assembled from two dataset sentences: the text shown for
# data.iloc[13623] followed by the single-word ADE sentence 'confusion'.
sequence_ADE = (
    "Decreased Elavil from 75mg to 25mg as it may have worsened your"
    " confusion"
)

In [16]:
# Run the pipeline once, then print each per-token prediction on its own line.
predictions = ner_pipe(sequence_ADE)
for prediction in predictions:
    print(prediction)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'entity': 'LABEL_18', 'score': 0.9662258, 'index': 1, 'word': 'decreased', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.9653698, 'index': 2, 'word': 'el', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.66539043, 'index': 3, 'word': '##avi', 'start': None, 'end': None}
{'entity': 'LABEL_11', 'score': 0.6296134, 'index': 4, 'word': '##l', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9524498, 'index': 5, 'word': 'from', 'start': None, 'end': None}
{'entity': 'LABEL_8', 'score': 0.97232234, 'index': 6, 'word': '75', 'start': None, 'end': None}
{'entity': 'LABEL_17', 'score': 0.95961416, 'index': 7, 'word': '##mg', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.8957383, 'index': 8, 'word': 'to', 'start': None, 'end': None}
{'entity': 'LABEL_8', 'score': 0.9657996, 'index': 9, 'word': '25', 'start': None, 'end': None}
{'entity': 'LABEL_17', 'score': 0.9600873, 'index': 10, 'word': '##mg', 'start': None, 'end': None}
{'entity': 'L

In [None]:
# Sentence text at row position 17420 (select the column, then the position).
data['sentence'].iloc[17420]

'lips , tongue and face swelling and it progressivly worsened to'

In [None]:
# Comma-separated tag string stored for row position 17420.
data['word_labels'].iloc[17420]


'B-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE,I-ADE'

In [17]:
# Multi-line passage used as pipeline input; its third line is the sentence
# shown for data.iloc[17420]. The text is kept verbatim (including its
# spelling and the embedded newlines) because it is model input.
sequenc_ADE2 = """Patient had been giving script for lisinopril a while back but
only started taking it on Friday [ * * 8 - 24 * * ] AM . He started to feel his
lips , tongue and face swelling and it progressivly worsened to
include his throat . He was admitted to the ICU and was intubated.
"""

In [18]:
# Run the pipeline once, then print each per-token prediction on its own line.
predictions = ner_pipe(sequenc_ADE2)
for prediction in predictions:
    print(prediction)

{'entity': 'LABEL_18', 'score': 0.99225533, 'index': 1, 'word': 'patient', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.98955256, 'index': 2, 'word': 'had', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.981857, 'index': 3, 'word': 'been', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9584423, 'index': 4, 'word': 'giving', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.53978056, 'index': 5, 'word': 'script', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.74679106, 'index': 6, 'word': 'for', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.9538391, 'index': 7, 'word': 'lis', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.5300952, 'index': 8, 'word': '##ino', 'start': None, 'end': None}
{'entity': 'LABEL_11', 'score': 0.6661655, 'index': 9, 'word': '##pr', 'start': None, 'end': None}
{'entity': 'LABEL_11', 'score': 0.7407746, 'index': 10, 'word': '##il', 'start': None, 'end': None}
{'en

In [19]:
# Sample sentence for the NER pipeline, written as one plain string literal
# (no trailing empty literal after the closing quote).
sequence3 = "ADDENDUM TO HOSPITAL COURSE : It was felt that the patient's seizures were caused by the combination of Ritalin and thalidomide"

# Run the pipeline once, then print each per-token prediction on its own line.
predictions = ner_pipe(sequence3)
for prediction in predictions:
    print(prediction)

{'entity': 'LABEL_18', 'score': 0.8933178, 'index': 1, 'word': 'add', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9097873, 'index': 2, 'word': '##end', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.93543833, 'index': 3, 'word': '##um', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9377759, 'index': 4, 'word': 'to', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9440522, 'index': 5, 'word': 'hospital', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9481235, 'index': 6, 'word': 'course', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9730926, 'index': 7, 'word': ':', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.95140886, 'index': 8, 'word': 'it', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9358156, 'index': 9, 'word': 'was', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.94328195, 'index': 10, 'word': 'felt', 'start': None, 'end': None}
{'entity

In [20]:
# Medication-list style sample; print each per-token prediction on its own line.
sequence4 = "2 liters of O2 at home , Xanax .25 q.h.s"
predictions = ner_pipe(sequence4)
for prediction in predictions:
    print(prediction)

{'entity': 'LABEL_1', 'score': 0.8663034, 'index': 1, 'word': '2', 'start': None, 'end': None}
{'entity': 'LABEL_10', 'score': 0.7952166, 'index': 2, 'word': 'liter', 'start': None, 'end': None}
{'entity': 'LABEL_10', 'score': 0.70492196, 'index': 3, 'word': '##s', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.878639, 'index': 4, 'word': 'of', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.8513578, 'index': 5, 'word': 'o2', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.5243555, 'index': 6, 'word': 'at', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.65895677, 'index': 7, 'word': 'home', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.93923736, 'index': 8, 'word': ',', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.97475296, 'index': 9, 'word': 'xa', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.54958236, 'index': 10, 'word': '##na', 'start': None, 'end': None}
{'entity': 'LABEL_11', 

In [21]:
# Hospital-course sample sentence; print each per-token prediction on its own line.
sequence7 = "She remained hemodynamically stable throughout her stay in the CCU , without need for pressors , and her HCT also remained stable following the 3 units of PRBCs ."
predictions = ner_pipe(sequence7)
for prediction in predictions:
    print(prediction)

{'entity': 'LABEL_18', 'score': 0.98314846, 'index': 1, 'word': 'she', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9795289, 'index': 2, 'word': 'remained', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.8641945, 'index': 3, 'word': 'hemodynamic', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9721051, 'index': 4, 'word': '##ally', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.964414, 'index': 5, 'word': 'stable', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9846044, 'index': 6, 'word': 'throughout', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9839125, 'index': 7, 'word': 'her', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9804319, 'index': 8, 'word': 'stay', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9769672, 'index': 9, 'word': 'in', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.97683096, 'index': 10, 'word': 'the', 'start': None, 'end'

In [22]:
# Transfusion sample sentence; print each per-token prediction on its own line.
sequence8 = "The patient received 2 units of FFP after a GI bleed was determined for the low HgB of 6.2."
predictions = ner_pipe(sequence8)
for prediction in predictions:
    print(prediction)

{'entity': 'LABEL_18', 'score': 0.9836209, 'index': 1, 'word': 'the', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9739415, 'index': 2, 'word': 'patient', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.94140506, 'index': 3, 'word': 'received', 'start': None, 'end': None}
{'entity': 'LABEL_1', 'score': 0.8866815, 'index': 4, 'word': '2', 'start': None, 'end': None}
{'entity': 'LABEL_10', 'score': 0.7421901, 'index': 5, 'word': 'units', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.9059259, 'index': 6, 'word': 'of', 'start': None, 'end': None}
{'entity': 'LABEL_2', 'score': 0.8902779, 'index': 7, 'word': 'ffp', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.96031064, 'index': 8, 'word': 'after', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.92863154, 'index': 9, 'word': 'a', 'start': None, 'end': None}
{'entity': 'LABEL_18', 'score': 0.5786429, 'index': 10, 'word': 'gi', 'start': None, 'end': None}
{'entity': 