In [2]:
from typing impPredicort List

def get_labels(path: str) -> List[str]:
    if path:
        with open(path, "r") as f:
            labels = f.read().splitlines()
            labels = [i+'-bio' if i != 'O' else 'O' for i in labels]
        if "O" not in labels:
            labels = ["O"] + labels
        return labels
    else:
        # return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
        return ["O", "B-bio", "I-bio"]

In [5]:
DATA_DIR = '../../datasets/NER/'
ENTITY = 'NCBI-disease/'

labels = get_labels(DATA_DIR + ENTITY + 'labels.txt')
labels

['B-bio', 'I-bio', 'O']

In [14]:
import pandas as pd

train = pd.read_csv(DATA_DIR + ENTITY + 'train.tsv', delimiter='\t', names=['word', 'label'], quoting=3, error_bad_lines=False)
# train

tag_values = list(set(train["label"].values))
tag_values
# tag_values.append("PAD")
print(sorted(tag_values))
tag2idx = {t: i for i, t in enumerate(tag_values)}
print(tag2idx)

['B', 'I', 'O']
{'O': 0, 'I': 1, 'B': 2}


In [11]:
# MODEL = 'biobert'
MODEL = 'biobert_base_NER_NCBI-disease'

MODEL_PATH = '../../models/word_embeddings/'
# MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/models/' 

MODEL_PATH += MODEL

In [17]:
from transformers import BertForTokenClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained(MODEL_PATH, do_lower_case=False)

model = BertForTokenClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

In [22]:
import torch
import numpy as np

test_sentence = 'Doctor, I am feeling chest pain since yesterday. The pain is continuous and is located just in the middle of my chest, worsening when I breathe and when I lay down on my bed. I suffer from arterial hypertension and smoke 20 cigarettes every day. My father had a “heart attack” at my age and I am very worried about it.'
test_sentence = " Her eye is green. A 58-year-old African-American woman presents to the ER with episodic pressing/burning anterior chest pain that began two days earlier for the first time in her life. The pain started while she was walking, radiates to the eyes, and is accompanied by nausea, diaphoresis and mild dyspnea, but is not increased on inspiration. The latest episode of pain ended half an hour prior to her arrival. She is known to have hypertension and obesity. She denies smoking, diabetes, hypercholesterolemia, or a family history of heart disease. She currently takes no medications. Physical examination is normal. The EKG shows nonspecific changes. Alprazolan was administred."

tokenized_sentence = tokenizer.encode(test_sentence)
# input_ids = torch.tensor([tokenized_sentence]).cuda()
input_ids = torch.tensor([tokenized_sentence])

with torch.no_grad():
    output = model(input_ids)
# print(output)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
# join bpe split tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    print(token, label_idx)
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)
for token, label in zip(new_tokens, new_labels):
    print("{}\t\t{}".format(label, token))


[CLS] 2
Her 2
eye 2
is 2
green 2
. 2
A 2
58 2
- 2
year 2
- 2
old 2
African 2
- 2
American 2
woman 2
presents 2
to 2
the 2
ER 2
with 2
e 2
##pis 2
##od 2
##ic 2
pressing 1
/ 2
burning 0
anterior 1
chest 1
pain 1
that 2
began 2
two 2
days 2
earlier 2
for 2
the 2
first 2
time 2
in 2
her 2
life 2
. 2
The 2
pain 2
started 2
while 2
she 2
was 2
walking 2
, 2
r 2
##adia 2
##tes 2
to 2
the 2
eyes 2
, 2
and 2
is 2
accompanied 2
by 2
nausea 0
, 2
di 0
##aph 0
##ores 1
##is 1
and 2
mild 2
d 0
##ys 1
##p 1
##nea 1
, 2
but 2
is 2
not 2
increased 2
on 2
inspiration 2
. 2
The 2
latest 2
episode 2
of 2
pain 2
ended 2
half 2
an 2
hour 2
prior 2
to 2
her 2
arrival 2
. 2
She 2
is 2
known 2
to 2
have 2
h 0
##yper 1
##tens 1
##ion 1
and 2
o 0
##besity 1
. 2
She 2
denies 2
smoking 2
, 2
diabetes 0
, 2
h 0
##yper 0
##cho 1
##les 1
##tero 1
##lem 1
##ia 1
, 2
or 2
a 2
family 2
history 2
of 2
heart 0
disease 1
. 2
She 2
currently 2
takes 2
no 2
medications 2
. 2
Physical 2
examination 2
is 2
normal 2
. 2
The 2