In [1]:

# Toggle IPython pretty printing (the recorded cell output reports it as turned OFF)
%pprint
import sys

# Add the sibling ../py folder to the module search path at position 1
# NOTE(review): presumably this is where frvrs_utils, notebook_utils and scrape_utils live - confirm
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from datetime import timedelta
# from edstan_data import EdstanData
from frvrs_utils import FRVRSUtilities
from notebook_utils import NotebookUtilities
from pandas import DataFrame
import humanize
import matplotlib.pyplot as plt
import numpy as np
import os
import os.path as osp
import pandas as pd
import re
import random

# Both utility objects are pointed at the same data and saves folders
folder_kwargs = dict(
    data_folder_path=osp.abspath('../data'),
    saves_folder_path=osp.abspath('../saves'),
)
nu = NotebookUtilities(**folder_kwargs)
fu = FRVRSUtilities(**folder_kwargs)

# The web scraping helper takes the notebook utilities object plus the path to the secrets file
from scrape_utils import WebScrapingUtilities
secrets_json_path = osp.abspath(osp.join(nu.data_folder, 'secrets', 'itm_secrets.json'))
wsu = WebScrapingUtilities(s=nu, secrets_json_path=secrets_json_path)


# Parse Domain Documents for Entities

All of the documents were downloaded from https://nextcentury.atlassian.net/wiki/spaces/ITMC/pages/2991849482/Domain+Documents, converted to PDF files, and stored in the data folder.

In [3]:

import logging

# Send INFO-and-above log records to stdout so they show up in the cell output.
# basicConfig() attaches one stdout handler to the root logger and does nothing when the
# root logger already has handlers, so re-running this cell does not stack handlers.
# A second, hand-added StreamHandler on stdout would echo every record twice, so none is added.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Set the level explicitly as well, in case basicConfig() was a no-op
logging.getLogger().setLevel(logging.INFO)


## Option 1: Use a Hugging Face NER model

In [4]:

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Checkpoints used for the named entity recognition pipeline
ner_model_name = 'dbmdz/bert-large-cased-finetuned-conll03-english'
ner_tokenizer_name = 'bert-base-cased'

# Build the pipeline from an explicitly loaded model and tokenizer
model = AutoModelForTokenClassification.from_pretrained(ner_model_name)
tokenizer = AutoTokenizer.from_pretrained(ner_tokenizer_name)
token_classifier = pipeline(task='ner', model=model, tokenizer=tokenizer)

# Smoke-test the classifier on a sentence containing a person and a location
sentence = 'Barack Obama was the 44th President of the United States.'
tokens = token_classifier(sentence)
print(tokens)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-PER', 'score': 0.9988381, 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.9994398, 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'I-LOC', 'score': 0.9983613, 'index': 10, 'word': 'United', 'start': 43, 'end': 49}, {'entity': 'I-LOC', 'score': 0.9920671, 'index': 11, 'word': 'States', 'start': 50, 'end': 56}]



## Option 2: Use SpaCy

In [5]:

import PyPDF2
import spacy

# Load the spaCy model
try: nlp = spacy.load('en_core_web_sm')
except OSError as e:
    print(str(e).strip())
    command_str = f'{sys.executable} -m spacy download en_core_web_sm --quiet'
    print(command_str)
    !{command_str}
    nlp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

pdf_folder = '../data/Domain_Knowledge'
black_list = ['.ipynb_checkpoints', '$Recycle.Bin', '.git']


## Extract the text from PDFs and load it into documents

In [8]:

# Walk the PDF folder and record, per file path, the extracted text and its length
text_length_dict = {}
sentences_dict = {}
for sub_directory, directories_list, files_list in os.walk(pdf_folder):

    # Skip any folder whose path contains a black-listed fragment
    if any(black_str in sub_directory for black_str in black_list): continue

    for file_name in files_list:
        if not file_name.endswith('.pdf'): continue
        file_path = osp.join(sub_directory, file_name)
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Join the page texts in one pass instead of growing a string with +=;
            # the "or ''" is a defensive guard in case a page yields no text
            text = ''.join((page.extract_text() or '') for page in pdf_reader.pages)

        # Despite its name, sentences_dict holds the whole document text, not split sentences
        text_length_dict[file_path] = len(text)
        sentences_dict[file_path] = text

Overwriting cache for 0 1856
[0, IndirectObject(3080, 0, 140206642769104)]
unknown widths : 
[0, IndirectObject(3080, 0, 140206642769104)]
[0, IndirectObject(3075, 0, 140206642769104)]
unknown widths : 
[0, IndirectObject(3075, 0, 140206642769104)]
[0, IndirectObject(3070, 0, 140206642769104)]
unknown widths : 
[0, IndirectObject(3070, 0, 140206642769104)]
[0, IndirectObject(3065, 0, 140206642769104)]
unknown widths : 
[0, IndirectObject(3065, 0, 140206642769104)]


In [10]:

# Load the cached entities table if it has been pickled; otherwise build it from the extracted texts
if nu.pickle_exists('domain_doc_ners_df'): domain_doc_ners_df = nu.load_object('domain_doc_ners_df')
else:
    entities = []
    for file_path, text_length in text_length_dict.items():
        print(f'Text length for {file_path} is {text_length:,} characters.')
        text = sentences_dict[file_path]

        # Extract metadata from entity recognition pipeline and add it as a row dictionary to the entities rows list
        # NOTE(review): the whole document is passed to the classifier in a single call; presumably
        # only the leading part of a long text is tagged - TODO confirm and chunk long documents
        tokens = token_classifier(text)
        for metadata_dict in tokens:
            metadata_dict['file_path'] = file_path
        entities.extend(tokens)

        # Report documents that are too long for SpaCy instead of skipping them silently
        if text_length > nlp.max_length:
            print(
                f'Skipping SpaCy entities for {file_path}: {text_length:,} characters'
                f' exceeds nlp.max_length ({nlp.max_length:,}).'
            )
            continue

        # Extract SpaCy named entities and add them as a row dictionary to the entities rows list
        doc = nlp(text)
        entities.extend(
            {
                'file_path': file_path,
                'type': ent.label_,
                'text': ent.text, # Or lemma_, orth_, text_with_ws
                'start_pos': ent.start_char, # Or start
                'end_pos': ent.end_char # Or end
            }
            for ent in doc.ents
        )

    # Both row shapes share one frame; columns missing from a row are left empty
    domain_doc_ners_df = DataFrame(entities)
    nu.store_objects(domain_doc_ners_df=domain_doc_ners_df)


## Explore the dataset

In [15]:

# Distinct Hugging Face entity labels (rows that came from SpaCy have no value in this column)
domain_doc_ners_df['entity'].unique()

array(['I-ORG', 'I-MISC', 'I-LOC', nan, 'I-PER'], dtype=object)

In [20]:

# Print the sorted distinct words found under each Hugging Face entity label
entity_groups = domain_doc_ners_df.groupby('entity')
for entity, entity_df in entity_groups:
    print()
    unique_words_list = sorted(entity_df['word'].unique())
    print(entity, unique_words_list)


I-LOC ['##A', '##AN', '##AT', '##C', '##CH', '##D', '##E', '##ER', '##ET', '##H', '##HA', '##IR', '##IT', '##J', '##K', '##LA', '##ND', '##RO', '##TC', '##TH', '##U', '##WI', '##Z', '##a', '##ais', '##ani', '##aus', '##bert', '##bur', '##c', '##ch', '##d', '##da', '##dis', '##er', '##es', '##field', '##ford', '##ga', '##hall', '##hing', '##hl', '##hor', '##hu', '##ia', '##iform', '##ing', '##inia', '##ir', '##le', '##loc', '##ly', '##ma', '##man', '##ner', '##o', '##oi', '##pan', '##par', '##rick', '##rit', '##rl', '##sville', '##t', '##th', '##ton', '##tu', '##v', '##ville', '##vo', '##wood', '##worth', '##wson', '##yte', '##é', '-', '.', '11th', '210', '21st', 'AL', 'Afghanistan', 'Air', 'Albert', 'America', 'Andrews', 'Anthony', 'Antonio', 'Arabia', 'Arizona', 'Asia', 'Atlanta', 'Avenue', 'BS', 'Baltimore', 'Base', 'Beach', 'Bel', 'Berlin', 'Beth', 'Boston', 'Boulevard', 'Bragg', 'Bridge', 'Bronx', 'CT', 'California', 'Camp', 'Carolina', 'Carson', 'Carter', 'Center', 'Central', 'Ch

In [21]:

# Distinct SpaCy entity types (rows that came from Hugging Face have no value in this column)
domain_doc_ners_df['type'].unique()

array([nan, 'CARDINAL', 'DATE', 'ORG', 'GPE', 'PERSON', 'EVENT',
       'ORDINAL', 'FAC', 'PERCENT', 'TIME', 'WORK_OF_ART', 'LAW', 'LOC',
       'PRODUCT', 'NORP', 'QUANTITY', 'MONEY', 'LANGUAGE'], dtype=object)

In [26]:

# Print a random sample of up to 100 distinct texts for each SpaCy entity type.
# The loop variable is named entity_type so that the builtin type() is not rebound
# in the notebook's global namespace.
for entity_type, type_df in domain_doc_ners_df.groupby('type'):
    print()
    texts_list = sorted(type_df.text.unique())
    print(entity_type, random.sample(texts_list, min(len(texts_list), 100)))


CARDINAL ['23-4', '2014.640', '699this', '300–3,000', '100%).57', '58', '4(3', '1990s,“real', '7-148', '2010;175(8:57):57', 'only a half-day', '647', '14-8', '1990.5', '141.Mlitary', '5-135', '20:114', 'about 1,000', '1-79', '539', '1991;338:669–674', '1-93', '5.0', '6490.03', '670–691', '3-128', '809their', '688', '6-201', '99–109', '472', '1997).33', '233–238', '1229–1867', '6-1Chapter', '14-01', '6-22', '3-152', '2019;37(1):94–99', '#', '5-20', '27,29', 'a hundred', '1946–1947).Nazi', '2014', '335', 'hundred', '391power', '118', '14(p75', '2015;15(1):13–20', '1967,when', '35(p28', '2014;97(3):31–35', '33(p12', '1981;141(8):1062–1064', '1996:11(3):147–155', '324–332', '4,500Rest', '4,300', '1-61', '2,3', '8,100', '40-1', '219.108(b', '1995;17(2):124', '108', '9An', '150', '2008;35(3):230–236', '454', 'six', '8-20', '1-75', '557–577', '55,500', '4.17', '565INTRODUCTION', '3(p754', '2012;32:972–798', 'about half', '11.18.2.4).Occupation', '2009;374:405–415.554', '1-94', '23', 'numbero

In [12]:

# Show only the first per-file group; file_path and file_path_df stay bound for later cells
file_path_groups = domain_doc_ners_df.groupby('file_path')
for file_path, file_path_df in file_path_groups:
    print(file_path)
    display(file_path_df)

    # One group is enough for a preview
    break

../data/Domain_Knowledge/DoDTR-Data-Dictionary-External.pdf


Unnamed: 0,entity,score,index,word,start,end,file_path,type,text,start_pos,end_pos
0,I-ORG,0.738414,2.0,D,3.0,4.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
1,I-ORG,0.900860,3.0,##OD,4.0,6.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
2,I-ORG,0.801103,4.0,T,7.0,8.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
3,I-ORG,0.400040,5.0,##RA,8.0,10.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
4,I-ORG,0.742858,6.0,##UM,10.0,12.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
...,...,...,...,...,...,...,...,...,...,...,...
2108,,,,,,,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,CARDINAL,6,111492.0,111493.0
2109,,,,,,,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,ORG,Discharge Discharge Vitals Weight Discharge In...,111494.0,111552.0
2110,,,,,,,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,ORG,LabsWeight Patient's,111554.0,111574.0
2111,,,,,,,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,PERSON,discharge1/1/2007,111604.0,111621.0


In [13]:

# Within the file-level frame left by the per-file loop, show only the first entity-label group
file_entity_groups = file_path_df.groupby('entity')
for entity, entity_df in file_entity_groups:
    print(entity)
    display(entity_df)

    # One group is enough for a preview
    break

I-LOC


Unnamed: 0,entity,score,index,word,start,end,file_path,type,text,start_pos,end_pos
68,I-LOC,0.968093,351.0,Lands,1579.0,1584.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
69,I-LOC,0.915048,352.0,##tu,1584.0,1586.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
70,I-LOC,0.987361,353.0,##hl,1586.0,1588.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
71,I-LOC,0.808909,354.0,Regional,1589.0,1597.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
72,I-LOC,0.642502,355.0,Medical,1598.0,1605.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
73,I-LOC,0.832576,356.0,Center,1606.0,1612.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
77,I-LOC,0.999686,363.0,Germany,1624.0,1631.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,


In [14]:

# Within the file-level frame left by the per-file loop, show only the first word group
file_word_groups = file_path_df.groupby('word')
for word, word_df in file_word_groups:
    print(word)
    display(word_df)

    # One group is enough for a preview
    break

##B


Unnamed: 0,entity,score,index,word,start,end,file_path,type,text,start_pos,end_pos
102,I-ORG,0.73533,471.0,##B,2071.0,2072.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,
