In [1]:

%pprint
import sys
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from notebook_utils import NotebookUtilities
from scrape_utils import WebScrapingUtilities
from pandas import DataFrame
import os
import os.path as osp
import random
import re
import logging

# Instantiate the notebook utilities with absolute data and saves folder paths
data_folder_path = osp.abspath('../data')
saves_folder_path = osp.abspath('../saves')
nu = NotebookUtilities(data_folder_path=data_folder_path, saves_folder_path=saves_folder_path)

# Instantiate the web scraping utilities, handing them the notebook utilities
# and the absolute path of the secrets file kept under the data folder
secrets_json_path = osp.join(nu.data_folder, 'secrets', 'itm_secrets.json')
wsu = WebScrapingUtilities(s=nu, secrets_json_path=osp.abspath(secrets_json_path))


# Parse Domain Documents for Entities

All of the documents listed at https://nextcentury.atlassian.net/wiki/spaces/ITMC/pages/2991849482/Domain+Documents were downloaded, converted to PDF files, and stored in the `Domain_Knowledge` subfolder of the data folder.

In [3]:

# Send INFO-and-above log records to stdout exactly once. Calling basicConfig
# and then adding a second stdout StreamHandler can print every record twice,
# and each re-run of the cell would stack one more handler, so a handler is
# attached only when the root logger has none writing to stdout yet.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
has_stdout_handler = any(
    isinstance(handler, logging.StreamHandler) and (getattr(handler, 'stream', None) is sys.stdout)
    for handler in root_logger.handlers
)
if not has_stdout_handler:
    stdout_handler = logging.StreamHandler(stream=sys.stdout)

    # Keep the "LEVEL:name:message" layout that basicConfig would have applied
    stdout_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    root_logger.addHandler(stdout_handler)

# Folder that is walked for PDF files, and the directory names to skip while walking it
pdf_folder = '../data/Domain_Knowledge'
black_list = ['.ipynb_checkpoints', '$Recycle.Bin', '.git']


## Option 1: Use a Hugging Face NER model

In [4]:

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Build the named entity recognition pipeline from an explicitly chosen model and tokenizer
ner_model_name = 'dbmdz/bert-large-cased-finetuned-conll03-english'
tokenizer_name = 'bert-base-cased'
model = AutoModelForTokenClassification.from_pretrained(ner_model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
token_classifier = pipeline('ner', model=model, tokenizer=tokenizer)

# Try the pipeline out on a single sentence and show the raw token metadata
sentence = 'Barack Obama was the 44th President of the United States.'
tokens = token_classifier(sentence)
print(tokens)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-PER', 'score': 0.9988381, 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.9994398, 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'I-LOC', 'score': 0.9983613, 'index': 10, 'word': 'United', 'start': 43, 'end': 49}, {'entity': 'I-LOC', 'score': 0.9920671, 'index': 11, 'word': 'States', 'start': 50, 'end': 56}]



## Option 2: Use SpaCy

In [5]:

import spacy

# Load the spaCy model once, downloading it first if it is not installed
try:
    nlp = spacy.load('en_core_web_sm')
except OSError as e:
    import subprocess

    print(str(e).strip())

    # Pass the command as an argument list so that an interpreter path
    # containing spaces is not split apart by a shell
    download_command = [sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm', '--quiet']
    command_str = ' '.join(download_command)
    print(command_str)
    completed_process = subprocess.run(download_command, capture_output=True, text=True)
    download_output = (completed_process.stdout + completed_process.stderr).strip()
    if download_output: print(download_output)

    # Fail here, with the downloader's exit status, rather than on the load below
    completed_process.check_returncode()
    nlp = spacy.load('en_core_web_sm')

# The model is already loaded above, so it is not loaded a second time here;
# the import only keeps the en_core_web_sm name bound as it was before
import en_core_web_sm

# Example usage
sentence = 'Barack Obama was the 44th President of the United States.'
doc = nlp(sentence)

# One dictionary per token, then one dictionary per recognized entity span
print([{'text': word.text, 'tag_': word.tag_, 'ent_type_': word.ent_type_, 'pos_': word.pos_} for word in doc])
print([{'text': ent.text, 'label_': ent.label_} for ent in doc.ents])

[{'text': 'Barack', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'Obama', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'was', 'tag_': 'VBD', 'ent_type_': '', 'pos_': 'AUX'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': '', 'pos_': 'DET'}, {'text': '44th', 'tag_': 'JJ', 'ent_type_': 'ORDINAL', 'pos_': 'ADJ'}, {'text': 'President', 'tag_': 'NNP', 'ent_type_': '', 'pos_': 'PROPN'}, {'text': 'of', 'tag_': 'IN', 'ent_type_': '', 'pos_': 'ADP'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': 'GPE', 'pos_': 'DET'}, {'text': 'United', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': 'States', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': '.', 'tag_': '.', 'ent_type_': '', 'pos_': 'PUNCT'}]
[{'text': 'Barack Obama', 'label_': 'PERSON'}, {'text': '44th', 'label_': 'ORDINAL'}, {'text': 'the United States', 'label_': 'GPE'}]



## Extract the text from PDFs and load it into documents

In [6]:

# Get text from PDFs, reusing the pickled dictionary when one has already been stored
if nu.pickle_exists('domain_knowledge_sentences_dict'):
    domain_knowledge_sentences_dict = nu.load_object('domain_knowledge_sentences_dict')
else:
    from PyPDF2 import PdfReader

    def convert(file_path, verbose=False):
        """
        Convert a PDF file to a single string of text.

        Parameters:
            file_path: path of the PDF file to read
            verbose: when True, print the length of the extracted text

        Returns:
            The text of every page, concatenated in page order with no separator
        """
        with open(file_path, 'rb') as file:
            pdf_reader = PdfReader(file)

            # Fall back to an empty string for a page that yields no text
            page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

        # Join once instead of growing a string page by page
        text = ''.join(page_texts)
        if verbose: print(f'Text length for {file_path} is {len(text):,} characters.')

        return text

    # Map each PDF's path to the full text extracted from it
    domain_knowledge_sentences_dict = {}
    for sub_directory, directories_list, files_list in os.walk(pdf_folder):

        # Prune black-listed directories in place so os.walk never descends into them.
        # Comparing whole directory names (rather than testing for a substring of the
        # path) keeps '.git' from also excluding a folder such as '.github'.
        directories_list[:] = sorted(d for d in directories_list if d not in black_list)

        # Sort so the dictionary is filled in the same order on every run
        for file_name in sorted(files_list):
            if file_name.lower().endswith('.pdf'):
                file_path = osp.join(sub_directory, file_name)
                domain_knowledge_sentences_dict[file_path] = convert(file_path)
    nu.store_objects(domain_knowledge_sentences_dict=domain_knowledge_sentences_dict)

In [7]:

import pandas as pd

def merge_subword_tokens(tokens):
    """
    Join '##'-prefixed subword tokens back onto the word they continue.

    Parameters:
        tokens: list of dictionaries from the token classification pipeline,
            each with 'word', 'entity' and 'score' keys and, when present,
            an 'index' key giving the token's position in the input

    Returns:
        List of (word, entity, score) tuples, one per reassembled word. The
        entity is the first mode of the labels of the word's pieces and the
        score is the mean of the pieces' scores.
    """
    merged_words = []
    pieces, piece_entities, piece_scores = [], [], []
    previous_index = None

    def flush():
        """Emit the word collected so far (if any) and reset the accumulators."""
        if pieces:
            mode_entity = pd.Series(piece_entities).mode().tolist()[0]
            mean_score = pd.Series(piece_scores).mean()
            merged_words.append((''.join(pieces), mode_entity, mean_score))
            pieces.clear()
            piece_entities.clear()
            piece_scores.clear()

    for metadata_dict in tokens:
        word = metadata_dict['word']
        token_index = metadata_dict.get('index')
        is_subword = word.startswith('##')

        # A subword only continues the current word if it directly follows the
        # previous returned token; when no index is available, assume it does
        is_adjacent = (token_index is None) or (previous_index is None) or (token_index == previous_index + 1)

        # Anything else starts a new word, so emit the one collected so far.
        # A '##' piece with nothing to attach to becomes its own word, minus the prefix.
        if not (is_subword and pieces and is_adjacent):
            flush()
        pieces.append(word[2:] if is_subword else word)
        piece_entities.append(metadata_dict['entity'])
        piece_scores.append(metadata_dict['score'])
        previous_index = token_index

    # Emit the last word
    flush()

    return merged_words


# load documents, reusing the saved data frame when one exists
if nu.csv_exists('domain_doc_ners_df'): domain_doc_ners_df = nu.load_data_frames(domain_doc_ners_df='domain_doc_ners_df')['domain_doc_ners_df']
else:
    entities = []
    for file_path, text in domain_knowledge_sentences_dict.items():
        text_length = len(text)

        # Extract metadata from entity recognition pipeline and add it as a row dictionary to the entities rows list
        # NOTE(review): the whole document is passed in one call; the pipeline may truncate
        # input beyond the model's maximum sequence length, so confirm whether the text
        # needs to be split into chunks to cover each document fully
        tokens = token_classifier(text)
        entities.extend(
            {'bert_word': word, 'bert_entity': entity, 'bert_score': score, 'file_path': file_path}
            for word, entity, score in merge_subword_tokens(tokens)
        )

        # Extract SpaCy named entities and add them as a row dictionary to the entities rows list
        if text_length <= nlp.max_length:
            doc = nlp(text)

            # One row per token
            entities.extend([
                {'file_path': file_path, 'nlp_word': word.text, 'nlp_tag': word.tag_, 'nlp_type': word.ent_type_, 'nlp_pofs': word.pos_}
                for word in doc
            ])

            # One row per entity span
            entities.extend([
                {'file_path': file_path, 'ent_phrase': ent.text, 'ent_type': ent.label_, 'ent_start': ent.start_char, 'ent_end': ent.end_char}
                for ent in doc.ents
            ])
        else:

            # Say so instead of silently leaving the document without spaCy rows
            logging.warning(
                'Skipping spaCy for %s: text length %s exceeds nlp.max_length %s',
                file_path, f'{text_length:,}', f'{nlp.max_length:,}'
            )
    domain_doc_ners_df = DataFrame(entities)
    nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

Attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/domain_doc_ners_df.pkl.
No pickle exists - attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv.


In [8]:

# Confirm that no BERT word still carries the '##' subword prefix
def is_subword_piece(x):
    """Return True if the string form of x starts with '##'."""
    return str(x).startswith('##')

print(list(domain_doc_ners_df.columns))
mask_series = domain_doc_ners_df.bert_word.map(is_subword_piece)
df = domain_doc_ners_df.loc[mask_series]

# Show up to four offending rows, transposed, hiding columns that are entirely empty
if len(df) > 0:
    sample_size = min(4, len(df))
    display(df.sample(sample_size).dropna(axis='columns', how='all').T)

['bert_word', 'bert_entity', 'bert_score', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end']


In [9]:

# Check that you indeed have only strings among the BERT entities
import numpy as np

# Collect the non-null BERT entity labels
mask_series = domain_doc_ners_df.bert_entity.isnull()
bert_entities_list = domain_doc_ners_df.loc[~mask_series, 'bert_entity'].tolist()

# Put the labels with the longest string form first and display the top ten
bert_entities_list.sort(key=lambda x: len(str(x)), reverse=True)
bert_entities_list[:10]

['I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC']


## Explore the tag and parts-of-speech columns

In [10]:

# Pick up to 20 of the distinct nlp_tag values at random
words_list = sorted(map(str, domain_doc_ners_df.nlp_tag.unique()))
sample_size = min(len(words_list), 20)
words_list = random.sample(words_list, sample_size)
print(words_list)

# For each sampled tag, print up to 50 of the distinct words carrying it
mask_series = domain_doc_ners_df.nlp_tag.isin(words_list)
tag_groupby = domain_doc_ners_df.loc[mask_series].groupby('nlp_tag')
for tag, tag_df in tag_groupby:
    mask_series = tag_df.nlp_word.isnull()
    texts_list = sorted(tag_df.loc[~mask_series, 'nlp_word'].unique())
    sample_size = min(len(texts_list), 50)
    print(f'\n{tag} {random.sample(texts_list, sample_size)}')

['JJ', 'UH', 'NNS', '-RRB-', 'RBS', 'EX', 'CD', 'MD', 'SYM', "''", 'WDT', 'WP', 'ADD', 'VBN', 'TO', 'VBP', 'VB', 'POS', 'NN', 'LS']

'' ['‘', '”64(p13', '”34(p32', '”46', '”17(p47', '”36', '“', '”28', '”12', 'sel.35–37', '”23,24', 'implant.1', '”17', '”19', '”13', '”24', '”8', '”89', 'https://ihl-databases.icrc.org/applic/ihl/ihl.nsf/.', '”62', "'", '”7(p78', '”38', '”20', '”3', '”10', '”73', '”26', '”43', '”32', '”5', '”7', '”20(p19', '’', '”34', '”37', '”6–12', '”54', '”50', '”64(p26', '"', '”1', '’s', '”15(p43', '”71,72', '”13(p65', '”42', '”33', 'II.29', '”4']

-RRB- ['…', 'af5e-11e1-8a84-0019bb2963f4.html', 'casualties.2', '):', 'international.9', '•', '}', 'activities.6,13', 'satisfaction.41(p86', 'Report.19', ')', 'tions).31', ']', 'feelings.2', 'ter(66):35–41', '≥39.1']

ADD [':', '.23(p44', 'www.cotccc.com', '.23(p40', '”21(p35', 'Div-', '”36(p424', 'Addition-', 'ICTY', '2005;59:25–35', '2010;68:204–210', 'ir-', '”2(p4', '5810.01D.', '<', '2003;12(1):12', 'DoD).7', '\uf076', '

In [11]:

# Pick up to 20 of the distinct nlp_pofs values at random
words_list = sorted(map(str, domain_doc_ners_df.nlp_pofs.unique()))
sample_size = min(len(words_list), 20)
words_list = random.sample(words_list, sample_size)
print(words_list)

# For each sampled part of speech, print up to 20 of the distinct words carrying it
mask_series = domain_doc_ners_df.nlp_pofs.isin(words_list)
pofs_groupby = domain_doc_ners_df.loc[mask_series].groupby('nlp_pofs')
for pofs, pofs_df in pofs_groupby:
    mask_series = pofs_df.nlp_word.isnull()
    texts_list = sorted(pofs_df.loc[~mask_series, 'nlp_word'].unique())
    sample_size = min(len(texts_list), 20)
    print(f'\n{pofs} {random.sample(texts_list, sample_size)}')

['PUNCT', 'SPACE', 'ADV', 'NUM', 'VERB', 'DET', 'SCONJ', 'PART', 'ADJ', 'PROPN', 'NOUN', 'ADP', 'INTJ', 'PRON', 'AUX', 'CCONJ', 'X', 'nan', 'SYM']

ADJ ['calcoaceticus', 'straightfor-', 'fluid', 'intermit-', 'restless', 'proportionate', 'commit-', 'human', 'uncondi-', 'Assist', 'erential', 'horizon-', '9–86', 'appen-', 'communicable', 'hardcore', 'superior-', 'movement.26', 'accomplishable', 'nonmalef-']

ADP ['requirements.2', 'cago', 'forfeit', 'strat-', '”32', 'amidst', 'on', 'in-', 'Hur-', 'adopt-', 'qd', 'Without', 'ODT', 'ap-', 'Rights.59', '†Lieutenant', 'notwithstanding', 'Mil_NISS', 'geograph-', 'lest']

ADV ['low-', 'ingly', 'po-', 'Besides', 'temporarily', 'functionally', 'ren-', 'clandestinely', 'implicitly', 'val-', 'will-', 'profoundly', 'importantly', 'ultimately', 'retrospectively', 'Additionally', 'pirically', 'treat-', 'Consciously', 'effectually']

AUX ['DOES', 'acclima-', '◦', 'MAY', 'advo-', 'BE', 'initi-', 'got', 'intro-', '’ve', 'Need', 'Mobil-', 'became', 'is-',


## Explore the entity columns

In [12]:

# Pick up to 20 of the distinct bert_entity values at random
words_list = sorted(map(str, domain_doc_ners_df.bert_entity.unique()))
sample_size = min(len(words_list), 20)
words_list = random.sample(words_list, sample_size)
print(words_list)

# For each sampled entity label, print up to 50 of the distinct BERT words carrying it
mask_series = domain_doc_ners_df.bert_entity.isin(words_list)
entity_groupby = domain_doc_ners_df.loc[mask_series].groupby('bert_entity')
for entity, entity_df in entity_groupby:
    mask_series = entity_df.bert_word.isnull()
    texts_list = sorted(entity_df.loc[~mask_series, 'bert_word'].unique())
    sample_size = min(len(texts_list), 50)
    print(f'\n{entity} {random.sample(texts_list, sample_size)}')

['I-LOC', 'I-PER', 'nan', 'I-MISC', 'I-ORG']

I-LOC ['Marine', 'Guam', 'He', 'Corps', 'chfield', 'Carolina', 'Diego', 'St', 'Reservoir', '21st', 'University', 'National', 'worth', 'Det', 'Geneva', 'My', 'America', 'Ken', 'To', 'Clinic', 'Florida', 'Delaware', 'ing', 'Silver', 'CO', 'Arizona', 'Illinois', 'ébert', 'Field', 'Tennessee', 'ia', 'Avenue', 'California', 'Neck', 'Saint', 'Mo', 'Gulf', 'Lewis', 'BC', 'Springs', 'R', 'New', 'Yu', 'Bragg', 'Em', 'Zealand', 'iformed', 'll', 'I', 'Carter']

I-MISC ['SICIAN', 'LITYSICI', 'Support', '-', 'HSICIANH', 'Semi', 'Darwin', 'United', 'RISINA', 'Lazarus', 'plain', 'D', 'Jew', 'Ph', 'AT', 'Swan', 'Greek', 'Structure', 'ccied', 'Nuclear', 'UC', 'tian', 'Second', 'duct', 'UMANITARIAN', 'Territories', 'Just', 'ernalis', 'Charter', 'North', 'I', '19th', 'CT', 'Conflict', 'NS', 'Medicine', 'World', 'ern', 'Seven', 'HILOSID', 'na', 'Earth', 'Advanced', 'E', 'HIV', 'Western', 'Alt', 'UN', 'CA', 'War']

I-ORG ['Group', 'USAF', 'SH', '264', 'Service'


## Explore the entity type columns

In [13]:

# Pick up to 20 of the distinct nlp_type values at random
words_list = sorted([str(w) for w in domain_doc_ners_df.nlp_type.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)

# For each sampled entity type, print up to 20 of the distinct words carrying it.
# The loop variable is named nlp_type rather than type so that the builtin type()
# is not shadowed for every cell that runs after this one.
mask_series = domain_doc_ners_df.nlp_type.isin(words_list)
for nlp_type, type_df in domain_doc_ners_df[mask_series].groupby('nlp_type'):
    mask_series = type_df.nlp_word.isnull()
    texts_list = sorted(type_df[~mask_series].nlp_word.unique())
    print()
    print(nlp_type, random.sample(texts_list, min(len(texts_list), 20)))

['QUANTITY', 'ORDINAL', 'PERSON', 'PERCENT', 'DATE', 'LOC', 'PRODUCT', 'EVENT', 'nan', 'GPE', 'WORK_OF_ART', 'LANGUAGE', 'FAC', 'MONEY', 'CARDINAL', 'ORG', 'NORP', 'LAW', 'TIME']

CARDINAL ['2.34', '414', '3,800', '580', '2000', '2003;54(5', '489–494', '219.117', '2010;125(1):16–25', '346', 'chemopro-', '18.9', '839', '206', '2.6.1', 'One', '2.b.(2', '2:269', '1970;211(11):1849–1850', '1998;163(11):743–746']

DATE ['1900–1999', '\n', '1855', 'Two', '0203', 'early', '1996:1', 'One', '1897;Sep:221–228', '1906', '1618', '2370', '1980–1988', '29510', '1645', 'fifteenth', '8061', 'IV', 'Atlantic', '100']

EVENT ['Rebellion', 'Assembly', 'Claims', 'Of', 'Paris', 'war', 'CURRENT', '’s', '1965', 'Relationship', 'Wide', 'Regeneration', '1980', 'Olympic', 'Junctional', 'St.', 'War.22,23(pp49–50', 'Convention', 'Med', 'Total']

FAC ['\n', 'Pickett', 'Appia', 'Middle', 'Refusal', 'Bampton', 'Tzu', 'PCB', 'Smith', 'District', '32', '1b', 'Palace', 'Metronidazole', 'Strip', 'Suppl', 'Examiner', 'Uni

In [14]:

# Pick up to 20 of the distinct ent_type values at random
words_list = sorted([str(w) for w in domain_doc_ners_df.ent_type.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)

# For each sampled entity type, print up to 20 of the distinct phrases carrying it.
# The loop variable is named ent_type rather than type so that the builtin type()
# is not shadowed for every cell that runs after this one.
mask_series = domain_doc_ners_df.ent_type.isin(words_list)
for ent_type, type_df in domain_doc_ners_df[mask_series].groupby('ent_type'):
    mask_series = type_df.ent_phrase.isnull()
    texts_list = sorted(type_df[~mask_series].ent_phrase.unique())
    print()
    print(ent_type, random.sample(texts_list, min(len(texts_list), 20)))

['ORDINAL', 'LAW', 'GPE', 'WORK_OF_ART', 'nan', 'MONEY', 'FAC', 'LANGUAGE', 'PERSON', 'DATE', 'PRODUCT', 'NORP', 'ORG', 'CARDINAL', 'LOC', 'PERCENT', 'TIME', 'EVENT', 'QUANTITY']

CARDINAL ['669', '60mL', '1977;62(3):295', 'two to eight', '2002;20:255–271', '3-94', '13', '2006;32:493–512', '994.1', '85–94', '1970s.21', '2005;9:416–422', '26.7', '2-108', '24-3', '24-28', '1(4', '6260.6A', '5.10', '24-6']

DATE ['26 April 1937', 'the next 3 years', 'February \n2018', 'the third decade', 'Chapters 46', 'many years earlier', '15 August 1942', '3-77', '21-9', '23 October 2002', '15 August', '7-5', 'Table 31-3', '32-1).17', 'February 1953', 'early 20th centuries', 'December 7, 2016', '3 January 1986', '10.16.4', '0.5-1mL']

EVENT ['EMMA', 'The Gulf War', 'the Korean War', 'Gulf War', 'the International Covenant on Economic, Social and Cultural Rights', 'The Persian Gulf War', 'War', 'The Civil War', 'the Cold War,20', 'World\nWar', 'Waging War', 'The Communist Revolution', 'Geneva revision o


## Explore column groupbys

In [15]:

# Peek at the first file_path group only (hence the break)
for file_path, file_path_df in domain_doc_ners_df.groupby('file_path'):
    print(file_path)

    # Cap the sample at the group size: sample(4) raises a ValueError when the
    # group holds fewer than four rows
    sample_size = min(4, file_path_df.shape[0])

    # Transpose the sampled rows and hide columns that are entirely empty
    display(file_path_df.sample(sample_size).dropna(axis='columns', how='all').T)
    break

../data/Domain_Knowledge/DoDTR-Data-Dictionary-External.pdf


Unnamed: 0,6366,6000,18415,9001
file_path,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,../data/Domain_Knowledge/DoDTR-Data-Dictionary...,../data/Domain_Knowledge/DoDTR-Data-Dictionary...
nlp_word,\n,\n,digit,ECCN
nlp_tag,_SP,_SP,NN,NNP
nlp_type,ORG,,,
nlp_pofs,SPACE,SPACE,NOUN,PROPN


In [16]:

# Peek at the first nlp_tag group only (hence the break)
for nlp_tag, nlp_tag_df in domain_doc_ners_df.groupby('nlp_tag'):
    print(nlp_tag)

    # Cap the sample at the group size: sample(4) raises a ValueError when the
    # group holds fewer than four rows
    sample_size = min(4, nlp_tag_df.shape[0])

    # Transpose the sampled rows and hide columns that are entirely empty
    display(nlp_tag_df.sample(sample_size).dropna(axis='columns', how='all').T)
    break

$


Unnamed: 0,338740,1336048,1375313,854972
file_path,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Military Medical Ethi...
nlp_word,$,•,#,objec-
nlp_tag,$,$,$,$
nlp_type,,PRODUCT,ORG,
nlp_pofs,SYM,SYM,SYM,SYM


In [17]:

from itertools import combinations

# columns_list = ['bert_entity', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_type']
# Walk every pair of columns and display whatever rows get_minority_combinations returns for it
for column_pair in combinations(domain_doc_ners_df.columns, 2):
    groupby_columns = list(column_pair)
    df = nu.get_minority_combinations(domain_doc_ners_df, groupby_columns)
    df = df.dropna(axis='columns', how='all')

    # Nothing to show for this pair
    if not df.shape[0]:
        continue
    print(nu.conjunctify_nouns(groupby_columns))

    # Keep only the part of the path after the last slash so the table stays narrow
    df.file_path = df.file_path.map(lambda x: str(x).rsplit('/', 1)[-1])
    display(df)

bert_word and bert_entity


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
630880,May,I-LOC,0.985751,Ethics-ch-02.pdf
1255948,drome,I-MISC,0.857681,Ethics-ch-22.pdf
564170,den,I-ORG,0.93838,Fund ch 6.pdf
590712,MUAL,I-ORG,0.58498,Fund ch 8.pdf


bert_word and bert_score


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
181027,&,I-ORG,0.901232,Fund ch 15.pdf
497182,Sam,I-LOC,0.993379,Fund ch 39.pdf
65058,Sam,I-LOC,0.993311,SchauerMedicBag.pdf
65107,Sam,I-LOC,0.992895,SchauerMedicBag.pdf


bert_word and file_path


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
1023195,Institute,I-ORG,0.946906,Ethics-ch-16.pdf
110535,World,I-MISC,0.997363,Fund ch 1.pdf
61,Medical,I-LOC,0.65229,DoDTR-Data-Dictionary-External.pdf
216935,”,I-PER,0.669389,Fund ch 19.pdf


bert_entity and bert_score


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
216972,MD,I-LOC,0.329505,Fund ch 19.pdf
65130,Team,I-ORG,0.974057,SchauerMedicBag.pdf
233231,University,I-ORG,0.974026,Fund ch 20.pdf
252726,Emergency,I-ORG,0.974023,Fund ch 22.pdf


bert_entity and file_path


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
375023,Medicine,I-MISC,0.656078,Fund ch 32.pdf
514387,Medicine,I-MISC,0.58028,Fund ch 4.pdf
374947,GO,I-PER,0.586359,Fund ch 32.pdf
323074,Medicine,I-MISC,0.892257,Fund ch 28.pdf


bert_score and file_path


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
1331774,CA,I-ORG,0.292829,Ethics-ch-25.pdf
1371307,Gulf,I-MISC,0.982336,Ethics-ch-27.pdf
1206618,ton,I-LOC,0.982311,Ethics-ch-20.pdf
279173,SC,I-PER,0.9823,Fund ch 24.pdf


file_path and nlp_word


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_pofs
587136,Fund ch 7.pdf,amount,NN,NOUN
704028,Ethics-ch-04.pdf,audiotapes,NNS,NOUN
715845,Ethics-ch-04.pdf,authors,NNS,NOUN
707413,Ethics-ch-04.pdf,automatic,JJ,ADJ


file_path and nlp_tag


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_pofs
257313,Fund ch 22.pdf,endomorphs,FW,X
509368,Fund ch 39.pdf,There,EX,PRON
218280,Fund ch 19.pdf,there,EX,PRON
487689,Fund ch 38.pdf,Hg,UH,INTJ


file_path and nlp_type


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
307524,Fund ch 26.pdf,"adaptation.39,40",NNP,LANGUAGE,PROPN
872569,Ethics-ch-11.pdf,200,CD,MONEY,NUM
324586,Fund ch 28.pdf,1992;8(1):41–47,CD,TIME,NUM
338741,Fund ch 29.pdf,4000,CD,MONEY,NUM


file_path and nlp_pofs


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_pofs
324125,Fund ch 28.pdf,Al-,UH,INTJ
997927,Ethics-ch-14.pdf,As-,UH,INTJ
487689,Fund ch 38.pdf,Hg,UH,INTJ
152513,Fund ch 12.pdf,right,UH,INTJ


file_path and ent_phrase


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
720642,Ethics-ch-04.pdf,the Case of the Netherlands,WORK_OF_ART,86698.0,86725.0
872113,Ethics-ch-10.pdf,Colo,PERSON,100474.0,100478.0
871580,Ethics-ch-10.pdf,Comayagua,GPE,1596.0,1605.0
871738,Ethics-ch-10.pdf,Consti-,PRODUCT,31700.0,31707.0


file_path and ent_type


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
1228285,Ethics-ch-20.pdf,Spanish,LANGUAGE,8653.0,8660.0
278472,Fund ch 23.pdf,English,LANGUAGE,23854.0,23861.0
1370731,Ethics-ch-26.pdf,5-kilometer,QUANTITY,35158.0,35169.0
630153,Ethics-ch-01.pdf,D-238,MONEY,1524.0,1529.0


file_path and ent_start


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
24151,DoDTR-Data-Dictionary-External.pdf,1,CARDINAL,0.0,1.0
1083145,Ethics-ch-17.pdf,Charlton,PERSON,76216.0,76224.0
1083144,Ethics-ch-17.pdf,Eda Schultz,PERSON,76204.0,76215.0
1083143,Ethics-ch-17.pdf,one,CARDINAL,76183.0,76186.0


file_path and ent_end


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
24151,DoDTR-Data-Dictionary-External.pdf,1,CARDINAL,0.0,1.0
1083145,Ethics-ch-17.pdf,Charlton,PERSON,76216.0,76224.0
1083144,Ethics-ch-17.pdf,Eda Schultz,PERSON,76204.0,76215.0
1083143,Ethics-ch-17.pdf,one,CARDINAL,76183.0,76186.0


nlp_word and nlp_tag


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_pofs
428732,Fund ch 34.pdf,centers.74,VBN,VERB
721915,Ethics-ch-05.pdf,debates,VBZ,VERB
740606,Ethics-ch-05.pdf,debilitating,NN,NOUN
860073,Ethics-ch-10.pdf,debilitation,NN,NOUN


nlp_word and nlp_type


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
55699,Prolonged_Casualty_Care_Guidelines_21_Dec_2021...,,NN,WORK_OF_ART,NOUN
429927,Fund ch 34.pdf,START,NNP,WORK_OF_ART,PROPN
903805,Ethics-ch-11.pdf,Exposures,NNS,ORG,NOUN
539595,Fund ch 40.pdf,Express,NNP,PRODUCT,PROPN


nlp_word and nlp_pofs


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_pofs
265596,Fund ch 23.pdf,cais-,JJ,ADJ
1007838,Ethics-ch-15.pdf,cloud,VB,VERB
876738,Ethics-ch-11.pdf,clouding,VBG,VERB
350693,Fund ch 3.pdf,cloudy,JJ,ADJ


nlp_tag and nlp_type


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
731053,Ethics-ch-05.pdf,Allocate,VB,LOC,VERB
613500,Fund ch 9.pdf,Who,WP,EVENT,PRON
1247976,Ethics-ch-21.pdf,;,:,TIME,PUNCT
262526,Fund ch 23.pdf,:,:,QUANTITY,PUNCT


nlp_tag and nlp_pofs


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_pofs
550433,Fund ch 5.pdf,”41,'',AUX
1487127,ARN19354_FM 6-27 _C1_FINAL_WEB_v2.pdf,doe,UH,AUX
1112745,Ethics-ch-19.pdf,per-,ADD,AUX
280512,Fund ch 24.pdf,al-,SYM,AUX


nlp_type and nlp_pofs


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
174295,Fund ch 14.pdf,that,IN,LOC,SCONJ
120391,Fund ch 1.pdf,Unfortunately,RB,EVENT,ADV
1075798,Ethics-ch-17.pdf,kid-,DT,NORP,PRON
504381,Fund ch 39.pdf,that,IN,DATE,SCONJ


ent_phrase and ent_type


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
1228351,Ethics-ch-20.pdf,Jerome,PERSON,12521.0,12527.0
109923,TCCC_Quick_Reference_Guide_2017.pdf,KK,ORG,81064.0,81066.0
110110,TCCC_Quick_Reference_Guide_2017.pdf,KIRKPATRICK JW,ORG,85757.0,85771.0
136068,Fund ch 10.pdf,a particular day,DATE,34898.0,34914.0


ent_phrase and ent_start


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
907155,Ethics-ch-11.pdf,#,CARDINAL,153158.0,153159.0
590583,Fund ch 7.pdf,O&M,ORG,64202.0,64205.0
590562,Fund ch 7.pdf,O&M,ORG,61982.0,61985.0
109450,TCCC_Quick_Reference_Guide_2017.pdf,Nystagmus,GPE,50625.0,50634.0


ent_phrase and ent_end


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
907155,Ethics-ch-11.pdf,#,CARDINAL,153158.0,153159.0
590583,Fund ch 7.pdf,O&M,ORG,64202.0,64205.0
590562,Fund ch 7.pdf,O&M,ORG,61982.0,61985.0
109450,TCCC_Quick_Reference_Guide_2017.pdf,Nystagmus,GPE,50625.0,50634.0


ent_type and ent_start


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
998575,Ethics-ch-14.pdf,Nazi,NORP,24497.0,24501.0
233009,Fund ch 2.pdf,US Army Combat Readiness Center,ORG,48399.0,48430.0
252691,Fund ch 21.pdf,US Department of Agriculture,ORG,48394.0,48422.0
158832,Fund ch 12.pdf,Navy,ORG,48390.0,48394.0


ent_type and ent_end


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
745105,Ethics-ch-05.pdf,European,NORP,24576.0,24584.0
849929,Ethics-ch-09.pdf,Army,ORG,48862.0,48866.0
25009,DoDTR-Data-Dictionary-External.pdf,GCS,ORG,48860.0,48863.0
291517,Fund ch 24.pdf,NASA,ORG,48848.0,48852.0


ent_start and ent_end


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
472973,Fund ch 36.pdf,Burlington,ORG,63291.0,63301.0
1353592,Ethics-ch-25.pdf,Rwanda,GPE,90370.0,90376.0
805990,Ethics-ch-07.pdf,A11,PERSON,90370.0,90373.0
1022929,Ethics-ch-15.pdf,the Nuremberg Doctors Trial,ORG,90368.0,90395.0
