In [1]:

# `%pprint` is a toggle: it flips IPython's pretty-printing setting each time it runs
# (the recorded output shows it being turned OFF), so re-running this cell turns it back on
%pprint
import sys
# Put the sibling ../py folder on the import path
# (presumably where notebook_utils and scrape_utils live — verify)
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from notebook_utils import NotebookUtilities
from scrape_utils import WebScrapingUtilities
from pandas import DataFrame
import os
import os.path as osp
import random
import re
import logging

# Notebook helper, configured with the absolute paths of the data and saves folders
nu = NotebookUtilities(data_folder_path=osp.abspath('../data'), saves_folder_path=osp.abspath('../saves'))

# Web-scraping helper: it is handed the notebook helper plus the absolute path
# of the secrets JSON file that sits under the helper's data folder
secrets_json_path = osp.abspath(osp.join(nu.data_folder, 'secrets', 'itm_secrets.json'))
wsu = WebScrapingUtilities(s=nu, secrets_json_path=secrets_json_path)


# Parse Domain Documents for Entities

All documents were downloaded from https://nextcentury.atlassian.net/wiki/spaces/ITMC/pages/2991849482/Domain+Documents, converted to PDF files, and stored in the data folder (`../data/Domain_Knowledge`).

In [3]:

# Route INFO-and-above log records to stdout through exactly ONE root handler.
# Calling basicConfig(stream=sys.stdout) and then attaching a second stdout
# StreamHandler made every record print twice, and once more for each re-run of
# this cell. force=True (Python 3.8+) replaces whatever handlers the root logger
# already has, so the cell is idempotent under Restart & Run All and under re-runs.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

# Folder that is walked for the domain-knowledge PDF files
pdf_folder = '../data/Domain_Knowledge'

# Any sub-directory whose path contains one of these fragments is skipped during the walk
black_list = ['.ipynb_checkpoints', '$Recycle.Bin', '.git']


## Option 1: Use a Hugging Face NER model

In [4]:

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Build the 'ner' pipeline from an explicitly chosen model checkpoint and tokenizer
ner_model_name = 'dbmdz/bert-large-cased-finetuned-conll03-english'
ner_tokenizer_name = 'bert-base-cased'
model = AutoModelForTokenClassification.from_pretrained(ner_model_name)
tokenizer = AutoTokenizer.from_pretrained(ner_tokenizer_name)
token_classifier = pipeline(task='ner', model=model, tokenizer=tokenizer)

# Smoke test on one sentence; the pipeline returns a list with one dict per tagged token
sentence = 'Barack Obama was the 44th President of the United States.'
tokens = token_classifier(sentence)
print(tokens)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-PER', 'score': 0.9988381, 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.9994398, 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'I-LOC', 'score': 0.9983613, 'index': 10, 'word': 'United', 'start': 43, 'end': 49}, {'entity': 'I-LOC', 'score': 0.9920671, 'index': 11, 'word': 'States', 'start': 50, 'end': 56}]



## Option 2: Use SpaCy

In [5]:

import subprocess

import spacy

# Load the small English spaCy pipeline, downloading it first if it is not installed
try:
    nlp = spacy.load('en_core_web_sm')
except OSError as e:
    print(str(e).strip())

    # Run the download with the kernel's own interpreter. Passing an argument list
    # (instead of a shell string) keeps working when sys.executable contains spaces,
    # and check=True surfaces a failed download here rather than as a second,
    # less helpful load error below.
    command_list = [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm', '--quiet']
    command_str = ' '.join(command_list)
    print(command_str)
    subprocess.run(command_list, check=True)
    nlp = spacy.load('en_core_web_sm')

# Importing the package confirms it is installed and keeps the `en_core_web_sm`
# name bound as before; the model is NOT loaded a second time, because `nlp`
# already holds the same pipeline
import en_core_web_sm

# Example usage: per-token tags first, then the merged named-entity spans
sentence = 'Barack Obama was the 44th President of the United States.'
doc = nlp(sentence)
print([{'text': word.text, 'tag_': word.tag_, 'ent_type_': word.ent_type_, 'pos_': word.pos_} for word in doc])
print([{'text': ent.text, 'label_': ent.label_} for ent in doc.ents])

[{'text': 'Barack', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'Obama', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'was', 'tag_': 'VBD', 'ent_type_': '', 'pos_': 'AUX'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': '', 'pos_': 'DET'}, {'text': '44th', 'tag_': 'JJ', 'ent_type_': 'ORDINAL', 'pos_': 'ADJ'}, {'text': 'President', 'tag_': 'NNP', 'ent_type_': '', 'pos_': 'PROPN'}, {'text': 'of', 'tag_': 'IN', 'ent_type_': '', 'pos_': 'ADP'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': 'GPE', 'pos_': 'DET'}, {'text': 'United', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': 'States', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': '.', 'tag_': '.', 'ent_type_': '', 'pos_': 'PUNCT'}]
[{'text': 'Barack Obama', 'label_': 'PERSON'}, {'text': '44th', 'label_': 'ORDINAL'}, {'text': 'the United States', 'label_': 'GPE'}]



## Extract the text from PDFs and load it into documents

In [15]:

from PyPDF2 import PdfReader

def convert(file_path, verbose=False):
    """
    Extract the text of a PDF with PyPDF2 and return it as one cleaned string.

    The text of every page is concatenated in page order, then cleaned in
    three passes: runs of non-ASCII characters are removed, runs of the
    control characters 0x00, 0x08, 0x0B, 0x0C and 0x0E-0x1F are removed, and
    every literal '##' is removed (presumably so the text cannot be mistaken
    for BERT subword markers — verify).

    NOTE: this notebook defines `convert` again in later cells; whichever
    definition ran last is the one in effect.

    Parameters:
        file_path: path of the PDF file to read.
        verbose: when truthy, print the length of the extracted text.

    Returns:
        The cleaned text of the whole document.
    """
    with open(file_path, 'rb') as file:
        page_texts = [page.extract_text() for page in PdfReader(file).pages]
    text = ''.join(page_texts)

    # Same cleanup order as always: non-ASCII first, then control characters
    for pattern in ('[^\x00-\x7F]+', '[\x00\x08\x0B\x0C\x0E-\x1F]+'):
        text = re.sub(pattern, '', text)
    text = text.replace('##', '')

    if verbose:
        print(f'Text length for {file_path} is {len(text):,} characters.')

    return text

In [21]:

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert(file_path, verbose=False):
    """
    Extract the text of a PDF with pdfminer and return it as a string.

    Every page yielded by PDFPage.get_pages is pushed through a
    PDFPageInterpreter whose TextConverter writes into an in-memory buffer;
    the buffer's content is the result.

    NOTE: this notebook defines `convert` in several cells; whichever
    definition ran last is the one in effect.

    Parameters:
        file_path: path of the PDF file to read.
        verbose: when truthy, print the length of the extracted text.

    Returns:
        The text written by the converter for the whole document.
    """
    resource_manager = PDFResourceManager()
    buffer = StringIO()
    try:
        device = TextConverter(resource_manager, buffer, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, device)
        with open(file_path, 'rb') as pdf_file:
            for pdf_page in PDFPage.get_pages(pdf_file):
                page_interpreter.process_page(pdf_page)
        device.close()

        # Read the buffer before it is closed
        text = buffer.getvalue()
    finally:
        buffer.close()

    if verbose:
        print(f'Text length for {file_path} is {len(text):,} characters.')

    return text

In [29]:

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed

def convert(file_path, verbose=False):
    """
    Extract the text of a PDF with pdfminer's PDFParser/PDFDocument model.

    The PDF file, the text buffer and the converter device are all released
    before returning, including when extraction is refused or a page fails
    to parse. Previously the file handle was opened inline and never closed.

    NOTE: this notebook defines `convert` in several cells; whichever
    definition ran last is the one in effect.

    Parameters:
        file_path: path of the PDF file to read.
        verbose: when truthy, print the length of the extracted text.

    Returns:
        The text written by the converter for the whole document.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is not
            extractable.
    """
    with open(file_path, 'rb') as infile, StringIO() as retstr:

        # Create the document model from the file
        document = PDFDocument(PDFParser(infile))

        # Refuse documents that do not permit text extraction
        if not document.is_extractable: raise PDFTextExtractionNotAllowed

        # Resource manager shared by the device and the interpreter
        rsrcmgr = PDFResourceManager()

        # Device that writes the parsed text into the buffer, using default layout parameters
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Process each page contained in the document
            for page in PDFPage.create_pages(document): interpreter.process_page(page)

            # Read the buffer while it is still open
            text = retstr.getvalue()
        finally:
            device.close()

    if verbose: print(f'Text length for {file_path} is {len(text):,} characters.')

    return text

In [32]:

import fitz

def convert(file_path, verbose=False):
    """
    Extract the text of a PDF with PyMuPDF (fitz) and return it as a string.

    The text of every page is concatenated in page order. The document is
    closed before returning, even if a page fails to extract; previously it
    was left open.

    NOTE: this notebook defines `convert` in several cells; whichever
    definition ran last is the one in effect.

    Parameters:
        file_path: path of the PDF file to read.
        verbose: when truthy, print the length of the extracted text.

    Returns:
        The concatenated text of all pages.
    """
    doc = fitz.open(file_path)
    try:
        text = ''.join(doc[page_num].get_text() for page_num in range(doc.page_count))
    finally:
        doc.close()
    if verbose: print(f'Text length for {file_path} is {len(text):,} characters.')

    return text

In [33]:

# Build (or reload) the mapping from PDF file path to extracted text.
# The walk only runs when no pickle exists yet; its result is pickled for later runs.
if not nu.pickle_exists('domain_knowledge_sentences_dict'):
    domain_knowledge_sentences_dict = {}
    for sub_directory, directories_list, files_list in os.walk(pdf_folder):

        # Skip any directory whose path contains a black-listed fragment
        if any(x in sub_directory for x in black_list): continue

        for file_name in files_list:

            # Only files with a lowercase .pdf extension are converted
            if not file_name.endswith('.pdf'): continue

            file_path = osp.join(sub_directory, file_name)
            text = convert(file_path)
            domain_knowledge_sentences_dict[file_path] = text
    nu.store_objects(domain_knowledge_sentences_dict=domain_knowledge_sentences_dict)
else:
    domain_knowledge_sentences_dict = nu.load_object('domain_knowledge_sentences_dict')

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/domain_knowledge_sentences_dict.pkl


In [66]:

# Build (or reload) one data frame holding three kinds of rows per document:
# BERT token rows (bert_* columns), spaCy token rows (nlp_* columns) and
# spaCy entity-span rows (ent_* columns). Every row carries its file_path.
if nu.pickle_exists('domain_doc_ners_df'):
    domain_doc_ners_df = nu.load_object('domain_doc_ners_df')
else:
    entities = []
    for file_path, text in domain_knowledge_sentences_dict.items():

        # BERT rows: prefix every key of the pipeline's token dict with 'bert_'.
        # Subword pieces (words starting with '##') are kept as-is.
        # NOTE(review): confirm the pipeline tags the whole document rather than
        # only the first max-sequence-length tokens — TODO verify
        for token_dict in token_classifier(text):
            row_dict = {f'bert_{k}': v for k, v in token_dict.items()}
            row_dict['file_path'] = file_path
            entities.append(row_dict)

        # spaCy rows: documents longer than nlp.max_length get no spaCy rows at all
        if len(text) > nlp.max_length: continue
        doc = nlp(text)
        for word in doc:
            entities.append({
                'file_path': file_path, 'nlp_word': word.text, 'nlp_tag': word.tag_,
                'nlp_type': word.ent_type_, 'nlp_pofs': word.pos_
            })
        for ent in doc.ents:
            entities.append({
                'file_path': file_path, 'ent_phrase': ent.text, 'ent_type': ent.label_,
                'ent_start': ent.start_char, 'ent_end': ent.end_char
            })

    domain_doc_ners_df = DataFrame(entities)
    nu.store_objects(domain_doc_ners_df=domain_doc_ners_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/domain_doc_ners_df.pkl


In [37]:

# List the columns, then show four randomly sampled BERT token rows,
# transposed and with the all-empty columns dropped
print(domain_doc_ners_df.columns.tolist())
mask_series = domain_doc_ners_df.bert_word.isnull()
bert_rows_df = domain_doc_ners_df[~mask_series]
display(bert_rows_df.sample(4).dropna(axis='columns', how='all').T)

['bert_entity', 'bert_score', 'bert_index', 'bert_word', 'bert_start', 'bert_end', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end']


Unnamed: 0,786531,497740,314766,70365
bert_entity,I-MISC,I-PER,I-ORG,I-MISC
bert_score,0.84228,0.986659,0.986794,0.584865
bert_index,160.0,29.0,320.0,179.0
bert_word,##alis,CH,Performance,Medicine
bert_start,437.0,84.0,1146.0,847.0
bert_end,441.0,86.0,1157.0,855.0
file_path,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Tactical_Combat_Casua...



## Explore the entity columns

In [11]:

# Distinct BERT entity labels (NaN marks the rows that did not come from BERT)
domain_doc_ners_df['bert_entity'].unique()

array(['I-ORG', 'I-MISC', 'I-LOC', nan, 'I-PER'], dtype=object)

In [12]:

# For each BERT entity label, print up to 100 randomly chosen distinct words
for entity, entity_df in domain_doc_ners_df.groupby('bert_entity'):
    texts_list = list(entity_df.bert_word.unique())
    texts_list.sort()
    sample_size = min(len(texts_list), 100)
    print(entity, random.sample(texts_list, sample_size))

I-LOC ['Marine', '##ldo', 'K', 'Chicago', 'Ho', 'Center', 'III', 'Point', 'Clinic', 'Fort', '##A', 'Haven', 'l', 'Spain', '##bert', '##c', '##field', 'Central', 'Lee', 'Field', 'Maryland', 'Carson', '##par', '##Z', '##ND', 'National', '-', 'Stanley', 'Arizona', '##TR', '##IR', 'PA', 'Chancellor', '##t', 'Asia', '##LA', 'China', 'Harbor', 'Station', 'Kingdom', 'Corps', 'Han', 'Val', 'Anthony', 'W', '##H', 'Einstein', 'Persian', 'and', 'Pearl', 'Joint', 'States', '##WA', '##TH', 'Colt', '##es', '##bur', 'Iraq', 'California', 'BC', 'London', 'Army', 'Tam', 'Drive', 'Baltimore', '##azar', '##U', 'Ken', '##man', 'D', 'Gordon', 'Sam', '##hu', 'Combat', 'Na', '##ma', 'Force', '##ia', 'Sai', '##oi', 'Dover', '##da', 'Medicine', '##E', 'College', 'Salvador', 'General', 'Hu', 'MD', 'West', 'Falls', 'St', 'Avenue', '##le', 'Somalia', 'Base', '##TC', 'Tennessee', 'of', 'Rock']
I-MISC ['##ern', 'Peace', 'S', 'Anti', 'Human', 'Service', 'G', 'Combat', 'Women', '##CI', 'Western', '##AR', 'Just', 'Naz


## Explore the entity type columns

In [13]:

# Print up to 100 randomly chosen distinct spaCy token entity types (stringified, so NaN shows as 'nan')
words_list = sorted(map(str, domain_doc_ners_df.nlp_type.unique()))
sample_size = min(len(words_list), 100)
print(random.sample(words_list, sample_size))

['ORDINAL', 'WORK_OF_ART', 'nan', 'PERCENT', 'PRODUCT', 'MONEY', 'LOC', 'LANGUAGE', 'LAW', 'ORG', 'NORP', 'TIME', '', 'PERSON', 'EVENT', 'QUANTITY', 'DATE', 'GPE', 'FAC', 'CARDINAL']


In [14]:

# For each spaCy token entity type, print a blank line and then up to 100 randomly chosen distinct words
for entity, entity_df in domain_doc_ners_df.groupby('nlp_type'):
    texts_list = sorted(map(str, entity_df.nlp_word.unique()))
    sample_size = min(len(texts_list), 100)
    print(f'\n{entity}', random.sample(texts_list, sample_size))


 ['nursingleaders', 'explains', 'unusualto', 'notlimited', 'Oper', 'T5', 'relinquish', 'inthe', 'deified', 'humbleness', 'superficialreforms', 'jectivism', 'carelessness', 'Autocratic', 'ethicaltogether', '2013;14:124144', 'possibilities', 'iatrogenic', 'Kind', 'polar', 'aft', 'blood-', 'waking', '24:3031', 'ELEMENTS', 'vio-', 'spectful', 'convalescent', 'doindeed', 'eak', 'accession', 'gions', 'battleship', 'training', 'erwise', 'commonhumanity', 'OCCUPYING', 'withdraws', 'creditfor', 'directives.102Military', '3(pp393394', 'openly', 'hall', 'oversees', 'whoattempted', 'becomesfutile', 'alveoli', 'reevaluate', 'tact', 'gradu', 'Travelers', 'percentages', 'raccoon', 'sional', 'Sinnreich', 'Mapstone', '02.3', 'Hogan', 'concurs', 'previouslynoted', 'ravage', 'notably', 'surprise', 'habituation', 'trial.49', 'connotation', 'feminist', 'wasabout', 'facilities)designed', 'persistence', 'Mil_TRISS', 'versial', 'MREs', 'encourag', 'thosethat', 'OSRDs', 'theperformance', 'outlawing', 'Hist', 

In [15]:

# Distinct spaCy entity-span labels (NaN marks the rows that are not entity spans)
domain_doc_ners_df['ent_type'].unique()

array([nan, 'CARDINAL', 'DATE', 'ORG', 'GPE', 'PERSON', 'EVENT',
       'ORDINAL', 'FAC', 'PERCENT', 'TIME', 'WORK_OF_ART', 'LAW', 'LOC',
       'PRODUCT', 'NORP', 'QUANTITY', 'MONEY', 'LANGUAGE'], dtype=object)

In [16]:

# For each spaCy entity-span label, print a blank line and then up to 100 randomly chosen distinct phrases
for entity_type, entity_type_df in domain_doc_ners_df.groupby('ent_type'):
    print()
    texts_list = list(entity_type_df.ent_phrase.unique())
    texts_list.sort()
    sample_size = min(len(texts_list), 100)
    print(entity_type, random.sample(texts_list, sample_size))


CARDINAL ['5.0', '487', '32-34', '51,559', '79', '1982;31:849853', 'ectopic', '18,000', '102(p149', '0.02', '252', '82 to 8-8 6', '14-3', '0.42', '6(pp5256', 'up to 9oF', '2):S307313', '5(pp230ff,295ff,784ff', '202', '33(pp1314,39,56', '2009;36:351373', '2-1', '2,969', '728', 'at least 3', '420', '12(p16', '11.1864', '9.6', '2011', '836example', '703Conventions', '5-126', '669', '2003.33,34,36', '470', '2019;37(1):9499', '1):S3237.619', '2008;35(3):230236', 'Hundreds', '13-2', '18-6', '4  Confused', '430', '6-136', 'plans.34', '1,344', '27-161-2', '10.31.4', '2005;23(2):230250', '13(2', '925', '100-5', '2014;14(3):1338', '22,23(pp4950', '6-191', '323', '399CONCLUSION', 'less \nthan', '339truth', '733service', 'scriptive', '2000:51(7):10871110', '3(p203', '350diers', '355', '2006;38(1):173178', '379', '86(p8', '102F', 'slightly more than 23,000', '1998:316(7133):771774', '5-108', '592', '1986;23:803813', '355comm', '2015;20(3):311321', '800', '1900', '1987;17(2):119132', '156', '74(pp1


## Concatenate BERT entities into NER phrases

In [82]:

# Concatenate BERT entities into NER phrases
# For each file and each BERT entity label, token rows are grouped, their words are joined
# into one phrase, and the phrase is printed next to the labels of the tokens that formed it.
for file_path, file_path_df in domain_doc_ners_df.groupby('file_path'):
    print()
    print(file_path)
    for entity, entity_df in file_path_df.groupby('bert_entity'):
        print()
        print(entity)
        
        # Token indices of this label, ordered by their character offset in the document
        bert_indices_list = entity_df.sort_values('bert_start').bert_index.tolist()
        
        # NOTE(review): split_row_indices_list presumably splits the list into runs of
        # consecutive indices, one run per phrase — verify in notebook_utils
        for indices_list in nu.split_row_indices_list(bert_indices_list):
            mask_series = entity_df.bert_index.isin(indices_list)
            df = entity_df[mask_series].sort_values('bert_start')
            ner_phrase = ' '.join(df.bert_word.tolist())
            entity_list = df.bert_entity.tolist()
            
            # Remove subword tokenization
            # (a ' ##' inside the phrase glues a subword piece onto the piece before it)
            ner_phrase = ner_phrase.replace(' ##', '')
            
            # A phrase that still starts with '##' begins with a subword piece whose preceding
            # token is not in this group; look that token up (index one lower) in the whole file
            if ner_phrase.startswith('##'):
                bert_index = df.bert_index.min() - 1
                mask_series = (file_path_df.bert_index == bert_index)
                df = file_path_df[mask_series]
                
                # Only when exactly one such token row exists are its word and its label
                # put in front; otherwise the phrase keeps its leading '##'
                if df.shape[0] == 1:
                    bert_word = df.bert_word.squeeze()
                    ner_phrase = bert_word + ner_phrase[2:]
                    entity_list.insert(0, df.bert_entity.squeeze())
            
            print(ner_phrase, entity_list)
    
    # Stop after the first file: this cell only previews one document
    break


../data/Domain_Knowledge/DoDTR-Data-Dictionary-External.pdf

I-LOC
Landstuhl Regional Medical Center ['I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC']
L ['I-LOC']
Germany ['I-LOC']

I-MISC
Tra ['I-ORG', 'I-MISC']
Joint Trauma System ['I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC']
Sailor ['I-MISC']
Marine ['I-MISC']
Air ['I-MISC']
Trauma ['I-ORG', 'I-MISC', 'I-MISC']
Global War on Terror ['I-MISC', 'I-MISC', 'I-MISC', 'I-MISC']
GW ['I-MISC', 'I-MISC']
##T ['I-MISC']
Combat Trauma Registry ['I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC']
CT ['I-MISC']
Windows ['I-MISC']

I-ORG
DOD TRAUM ['I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG']
##AT ['I-ORG']
Joint T ['I-ORG', 'I-ORG']
##rauma System ['I-MISC', 'I-ORG', 'I-ORG']
JTS ['I-ORG', 'I-ORG']
Department of Defense ['I-ORG', 'I-ORG', 'I-ORG']
DoD ['I-ORG', 'I-ORG']
JTS ['I-ORG', 'I-ORG']
JTS ['I-ORG', 'I-ORG']
DoD T ['I-ORG', 'I-ORG', 'I-ORG']
Registry ['I-ORG', 'I-ORG']
DoDTR ['I-ORG', 'I-ORG', 'I-ORG']
DoD ['I-ORG', 'I-


## Explore column groupbys

In [17]:

# Show the rows of the first file group only (nothing is shown when there are no groups)
first_group = next(iter(domain_doc_ners_df.groupby('file_path')), None)
if first_group is not None:
    file_path, file_path_df = first_group
    print(file_path)
    display(file_path_df)

../data/Domain_Knowledge/DoDTR-Data-Dictionary-External.pdf


Unnamed: 0,bert_entity,bert_score,bert_index,bert_word,bert_start,bert_end,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs,ent_phrase,ent_type,ent_start,ent_end
0,I-ORG,0.663520,2.0,D,3.0,4.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,,,,
1,I-ORG,0.882723,3.0,##OD,4.0,6.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,,,,
2,I-ORG,0.775356,4.0,T,7.0,8.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,,,,
3,I-ORG,0.370818,5.0,##RA,8.0,10.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,,,,
4,I-ORG,0.716090,6.0,##UM,10.0,12.0,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23451,,,,,,,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,6,CARDINAL,111430.0,111431.0
23452,,,,,,,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,Discharge Discharge Vitals Weight Discharge In...,ORG,111432.0,111490.0
23453,,,,,,,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,LabsWeight Patient's,ORG,111492.0,111512.0
23454,,,,,,,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,,,,,discharge1/1/2007,PERSON,111542.0,111559.0
