In [6]:
import os
import sys
sys.path.append("../")
import json
from argparse import ArgumentParser
from datasets import Dataset
from transformers import DataCollatorForTokenClassification, BertForTokenClassification, BertTokenizer 
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from pprint import pprint
import pandas as pd

import spacy
from spacy import displacy


def load_data(path):
    """Read a headerless, tab-separated token file and group it by sentence.

    The file is expected to have exactly three columns, in this order:
    sentence id, token text, label.

    Args:
        path: Location of the TSV file (anything ``pd.read_csv`` accepts).

    Returns:
        A list with one ``(tokens, labels)`` tuple per sentence id, where both
        elements are lists in file order. Sentences are ordered by sentence id
        because ``groupby`` sorts its keys.

    Raises:
        ValueError: If the file does not have exactly three columns.
    """
    data = pd.read_csv(
        path,
        header=None,
        delimiter='\t',
        # Tokens and labels are text. Without this, pandas turns literal
        # tokens such as "NA", "null" or "None" into float NaN and may parse
        # numeric-looking tokens as numbers.
        dtype={1: str, 2: str},
        na_filter=False,
    )
    data.columns = ['sent_id', 'text', 'label']
    grouped = data.groupby('sent_id').agg(list).reset_index()
    return list(zip(grouped['text'], grouped['label']))

def tokenize_with_labels(tokenizer, sent_words, sent_labels, special_label):
    """Split words into sub-word tokens and propagate each word's label.

    Every sub-word produced for a word receives that word's label. The result
    is wrapped in ``[CLS]`` / ``[SEP]`` (labelled ``special_label``), and a
    ``.`` token labelled ``'O'`` is inserted before ``[SEP]`` when the sentence
    does not already end with one of ``. ! ? ;``.

    Args:
        tokenizer: Object exposing ``tokenize(str) -> list[str]``.
        sent_words: Iterable of words; non-string entries are skipped together
            with their label.
        sent_labels: Iterable of labels, aligned with ``sent_words``.
        special_label: Label assigned to the ``[CLS]`` and ``[SEP]`` tokens.

    Returns:
        Tuple ``(tokens, labels)`` of two equally long lists. If no string
        word is present the result is just ``[CLS]``, ``[SEP]``.
    """
    tok_sent = []
    labels = []
    for word, label in zip(sent_words, sent_labels):
        # Skip anything that is not text (e.g. NaN cells from a data frame).
        if not isinstance(word, str):
            continue
        tok_word = tokenizer.tokenize(word)
        tok_sent.extend(tok_word)
        labels.extend([label] * len(tok_word))

    # Nothing to tokenize: indexing tok_sent below would raise IndexError.
    if not tok_sent:
        return ['[CLS]', '[SEP]'], [special_label, special_label]

    # Add special tokens
    if tok_sent[0] != '[CLS]':
        tok_sent.insert(0, '[CLS]')
        labels.insert(0, special_label)
    if tok_sent[-1] != '[SEP]':
        # Compare against whole tokens; `in '.!?;'` would be a substring test
        # and would also accept tokens such as '.!' or ''.
        if tok_sent[-1] not in ('.', '!', '?', ';'):
            tok_sent.append('.')
            labels.append('O')
        tok_sent.append('[SEP]')
        labels.append(special_label)

    return tok_sent, labels

In [7]:
# raw_train_data = load_data("../data/ner_data_formatted/train.tsv")
# raw_test_data = load_data("../data/ner_data_formatted/test.tsv")
# raw_train_data = [(i, j) for i, j in raw_train_data if len(i) > 2 and not all(k=="O" for k in j)]
# raw_test_data = [(i, j) for i, j in raw_test_data if len(i) > 2 and not all(k=="O" for k in j)]

# Cased BERT tokenizer built from the vocab.txt that sits in the same
# directory the NER model is loaded from.
vocab_path = "/anvil/projects/tdm/corporate/battelle-nl/ADE_NER_2023-02-04_4/vocab.txt"
tokenizer = BertTokenizer(
    vocab_file=vocab_path,
    do_lower_case=False,
)

# # Tokenize data
# train_data = [tokenize_with_labels(tokenizer, i, j, '[PAD]') for i, j in raw_train_data if len(i) > 2]
# test_data = [tokenize_with_labels(tokenizer, i, j, '[PAD]') for i, j in raw_test_data if len(i) > 2]
# train_sents, train_labels = zip(*train_data)
# test_sents, test_labels = zip(*test_data)

# print("Labels:")
# pprint(set([l for sent in train_labels for l in sent])) 

# Tag inventory: position n in `labels` is the tag looked up for a pipeline
# entity named "LABEL_<n>". The order is the one produced by sorting the tag
# strings: B-*, I-*, L-*, 'O', U-*, '[PAD]'.
# NOTE(review): presumably this mirrors the label ids used at training time;
# confirm against the model config.
_ENTITY_TYPES = (
    'ADE',
    'Dosage',
    'Drug',
    'Duration',
    'Form',
    'Frequency',
    'Reason',
    'Route',
    'Strength',
)

labels = (
    [f'{prefix}-{entity}' for prefix in ('B', 'I', 'L') for entity in _ENTITY_TYPES]
    + ['O']
    + [f'U-{entity}' for entity in _ENTITY_TYPES]
    + ['[PAD]']
)



print("Loading pipeline")

# Token-classification pipeline backed by the locally stored NER checkpoint.
# NOTE(review): the tokenizer is loaded by hub name ("bert-base-cased"), not
# from the checkpoint directory -- confirm both use the same vocabulary.
model_dir = "/anvil/projects/tdm/corporate/battelle-nl/ADE_NER_2023-02-04_4"
nlp = pipeline(
    task="ner",
    model=model_dir,
    tokenizer="bert-base-cased",
)


Loading pipeline


In [5]:
print("Loading test.txt")

# Directory holding the plain-text files that can be run through the pipeline.
txt_dir = "../data/ner_data_formatted/txt/"

# display a menu to pick a file to process
# os.listdir returns entries in arbitrary order; sorting keeps the
# index -> file mapping stable from one run to the next.
files = sorted(os.listdir(txt_dir))
for i, file in enumerate(files):
    print(f"{i}: {file}")
file_index = int(input("Enter the index of the file to process: "))
file = files[file_index]


# Maps file name -> list of pipeline outputs, one entry per line of the file.
# This has to be a dict (not None) for the item assignment below to work.
processedfiles = {}
if file.endswith(".txt"):
    with open(os.path.join(txt_dir, file)) as f:
        # Iterating the handle yields one line at a time and stops at EOF.
        processedlines = [nlp(line) for line in f]
    processedfiles[file] = processedlines
else:
    # Make the no-op visible instead of silently leaving the dict empty.
    print(f"Skipping {file}: not a .txt file")

            

# print("Running pipeline")
# processed = nlp(text)

# pprint(processed[0:5])




Loading test.txt


KeyboardInterrupt: 

In [43]:
# pprint(processed)
# Flatten the pipeline output into a single line of text: each sub-word token
# is written out, followed by its tag whenever that tag is not 'O'.
pieces = []
for token in processed:
    # Entity names have the form "LABEL_<n>"; <n> indexes into `labels`.
    label_index = int(token["entity"].split("_")[1])
    token["label"] = labels[label_index]
    if token["label"] == 'O':
        pieces.append(token["word"] + " ")
    else:
        pieces.append(token["word"] + f" {token['label']} ")
processedline = "".join(pieces)

print(processedline)

Su U-Form ##sp U-Form ##ens U-Form ##ion U-Form Last Name ST ##it ##le Five B-Dosage 5 I-Dosage M L-Dosage ##L L-Dosage P U-Route ##O U-Route Q B-Frequency ##ID B-Frequency 4 I-Frequency times I-Frequency a I-Frequency day I-Frequency as I-Frequency needed L-Frequency 3 Al U-Drug ##but U-Drug ##ero U-Drug ##l U-Drug - U-Drug I U-Drug ##pra U-Drug ##tro U-Drug ##pi U-Drug ##um U-Drug 103 B-Strength - B-Strength 18 B-Strength m L-Strength ##c L-Strength ##g L-Strength / L-Strength Act L-Strength ##uation L-Strength Aero U-Form ##sol U-Form Last Name ST ##it ##le Six B-Dosage 6 L-Dosage P U-Form ##uff U-Form In U-Route ##hala U-Route ##tion U-Route Q B-Frequency ##4 B-Frequency ##H B-Frequency every I-Frequency 4 I-Frequency hours I-Frequency as I-Frequency needed L-Frequency for when on vent 4 Al U-Drug ##but U-Drug ##ero U-Drug ##l U-Drug 90 B-Strength m L-Strength ##c L-Strength ##g L-Strength / L-Strength Act L-Strength ##uation L-Strength Aero U-Form ##sol U-Form Last Name ST ##it ##

In [44]:
# Merge WordPiece continuation tokens ("##...") back into whole words.
# A merged word keeps the entity and start of its first piece and takes the
# end of its last piece.
combined = []
for token in processed:
    piece = token["word"]
    if piece.startswith('##'):
        # Continuation piece: append it to the word currently being built.
        # Pieces that appear before any full word are dropped.
        if combined:
            combined[-1]["word"] += piece[2:]
            combined[-1]["end"] = token["end"]
        continue
    combined.append({
        "word": piece,
        "entity": token["entity"],
        "start": token["start"],
        "end": token["end"],
    })

# Translate "LABEL_<n>" entity names into readable tags.
# example pipeline item: {'end': None, 'entity': 'LABEL_27', 'index': 131, 'score': 0.9996716, 'start': None, 'word': 'and'}
for merged in combined:
    merged["label"] = labels[int(merged["entity"].split("_")[1])]

pprint(combined[10:15])

# Build the manual-mode input for displacy: every tag except "O" is a
# displayable entity type, and every non-"O" word becomes one span.
options = {"ents": [tag for tag in labels if tag != "O"]}
entity_spans = [
    {"start": merged["start"], "end": merged["end"], "label": merged["label"]}
    for merged in combined
    if merged["label"] != "O"
]
doc = {"text": text, "ents": entity_spans}


[{'end': 52,
  'entity': 'LABEL_14',
  'label': 'I-Frequency',
  'start': 47,
  'word': 'times'},
 {'end': 54,
  'entity': 'LABEL_14',
  'label': 'I-Frequency',
  'start': 53,
  'word': 'a'},
 {'end': 58,
  'entity': 'LABEL_14',
  'label': 'I-Frequency',
  'start': 55,
  'word': 'day'},
 {'end': 61,
  'entity': 'LABEL_14',
  'label': 'I-Frequency',
  'start': 59,
  'word': 'as'},
 {'end': 68,
  'entity': 'LABEL_23',
  'label': 'L-Frequency',
  'start': 62,
  'word': 'needed'}]


In [45]:
# Render the pre-computed spans; manual=True tells displacy that `doc` is a
# plain dict with "text" and "ents" rather than a spaCy Doc object.
displacy.render(
    docs=doc,
    style="ent",
    manual=True,
    options=options,
)