In [1]:

%pprint
import sys
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from notebook_utils import NotebookUtilities
from scrape_utils import WebScrapingUtilities
from pandas import DataFrame
import os
import os.path as osp
import random
import re
import logging

nu = NotebookUtilities(
    data_folder_path=osp.abspath('../data'),
    saves_folder_path=osp.abspath('../saves')
)
wsu = WebScrapingUtilities(
    s=nu,
    secrets_json_path=osp.abspath(osp.join(nu.data_folder, 'secrets', 'itm_secrets.json'))
)


# Parse Domain Documents for Entities

Downloaded all documents from https://nextcentury.atlassian.net/wiki/spaces/ITMC/pages/2991849482/Domain+Documents and converted them all to PDF files and stored them in the data folder.

In [3]:

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

pdf_folder = '../data/Domain_Knowledge'
black_list = ['.ipynb_checkpoints', '$Recycle.Bin', '.git']


## Option 1: Use a Hugging Face NER model

In [4]:

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Named entity recognition pipeline, passing in a specific model and tokenizer
model = AutoModelForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
token_classifier = pipeline('ner', model=model, tokenizer=tokenizer)

# Example usage
sentence = 'Barack Obama was the 44th President of the United States.'
tokens = token_classifier(sentence)
print(tokens)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-PER', 'score': 0.9988381, 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.9994398, 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'I-LOC', 'score': 0.9983613, 'index': 10, 'word': 'United', 'start': 43, 'end': 49}, {'entity': 'I-LOC', 'score': 0.9920671, 'index': 11, 'word': 'States', 'start': 50, 'end': 56}]



## Option 2: Use SpaCy

In [5]:

import spacy

# Load the spaCy model
try: nlp = spacy.load('en_core_web_sm')
except OSError as e:
    print(str(e).strip())
    command_str = f'{sys.executable} -m spacy download en_core_web_sm --quiet'
    print(command_str)
    !{command_str}
    nlp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

# Example usage
sentence = 'Barack Obama was the 44th President of the United States.'
doc = nlp(sentence)
print([{'text': word.text, 'tag_': word.tag_, 'ent_type_': word.ent_type_, 'pos_': word.pos_} for word in doc])
print([{'text': ent.text, 'label_': ent.label_} for ent in doc.ents])

[{'text': 'Barack', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'Obama', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'was', 'tag_': 'VBD', 'ent_type_': '', 'pos_': 'AUX'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': '', 'pos_': 'DET'}, {'text': '44th', 'tag_': 'JJ', 'ent_type_': 'ORDINAL', 'pos_': 'ADJ'}, {'text': 'President', 'tag_': 'NNP', 'ent_type_': '', 'pos_': 'PROPN'}, {'text': 'of', 'tag_': 'IN', 'ent_type_': '', 'pos_': 'ADP'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': 'GPE', 'pos_': 'DET'}, {'text': 'United', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': 'States', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': '.', 'tag_': '.', 'ent_type_': '', 'pos_': 'PUNCT'}]
[{'text': 'Barack Obama', 'label_': 'PERSON'}, {'text': '44th', 'label_': 'ORDINAL'}, {'text': 'the United States', 'label_': 'GPE'}]



## Extract the text from PDFs and load it into documents

In [6]:

# Get text from PDFs
if nu.pickle_exists('domain_knowledge_sentences_dict'):
    domain_knowledge_sentences_dict = nu.load_object('domain_knowledge_sentences_dict')
else:
    from PyPDF2 import PdfReader
    def convert(file_path, verbose=False):
        """
        Convert PDF, return its text content as a string
        """
        text = ''
        with open(file_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            for page_number in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_number]
                text += page.extract_text()
        if verbose: print(f'Text length for {file_path} is {len(text):,} characters.')

        return text
    domain_knowledge_sentences_dict = {}
    for sub_directory, directories_list, files_list in os.walk(pdf_folder):
        if all(map(lambda x: x not in sub_directory, black_list)):
            for file_name in files_list:
                if file_name.endswith('.pdf'):
                    file_path = osp.join(sub_directory, file_name)
                    text = convert(file_path)
                    domain_knowledge_sentences_dict[file_path] = text
    nu.store_objects(domain_knowledge_sentences_dict=domain_knowledge_sentences_dict)

In [7]:

# Assert that you got all the hyphenated word wrappings out
for file_path, text in domain_knowledge_sentences_dict.items():
    assert not ('effec-' in text), f'{file_path} still has hyphenated word wrappings.'

In [8]:

import pandas as pd

# load documents
if nu.csv_exists('domain_doc_ners_df'): domain_doc_ners_df = nu.load_data_frames(domain_doc_ners_df='domain_doc_ners_df')['domain_doc_ners_df']
else:
    entities = []
    for file_path, text in domain_knowledge_sentences_dict.items():
        text_length = len(text)
        # print(f'Text length for {file_path} is {text_length:,} characters.')
        
        # Prepare to join subword tokens back together and keep track of entity and score
        output_words = []
        current_word = ''
        current_entities = []
        current_scores = []
        
        # Extract metadata from entity recognition pipeline and add it as a row dictionary to the entities rows list
        tokens = token_classifier(text)
        for metadata_dict in tokens:
            current_entities.append(metadata_dict['entity'])
            current_scores.append(metadata_dict['score'])
            if metadata_dict['word'].startswith('##'):
                current_word += metadata_dict['word'][2:]
            else:
                
                # Take the mode of entities and average of scores for the current_word
                if current_word:
                    mode_entity = pd.Series(current_entities).mode().tolist()[-1]
                    mean_score = pd.Series(current_scores).mean()
                    entity_tuple = (current_word, mode_entity, mean_score)
                    output_words.append(entity_tuple)
                    current_word = ''
                    current_entities = []
                    current_scores = []
                
                entity_tuple = (metadata_dict['word'], metadata_dict['entity'], metadata_dict['score'])
                output_words.append(entity_tuple)
        
        # Take the mode of entities for the last current_word
        if current_word:
            mode_entity = pd.Series(current_entities).mode().tolist()[-1]
            mean_score = pd.Series(current_scores).mean()
            entity_tuple = (current_word, mode_entity, mean_score)
            output_words.append(entity_tuple)
        
        for word, entity, score in output_words:
            metadata_dict = {'bert_word': word, 'bert_entity': entity, 'bert_score': score, 'file_path': file_path}
            entities.append(metadata_dict)

        # Extract SpaCy named entities and add them as a row dictionary to the entities rows list
        if text_length <= nlp.max_length:
            doc = nlp(text)
            entities.extend([
                {'file_path': file_path, 'nlp_word': word.text, 'nlp_tag': word.tag_, 'nlp_type': word.ent_type_, 'nlp_pofs': word.pos_}
                for word in doc
            ])
            entities.extend([
                {'file_path': file_path, 'ent_phrase': ent.text, 'ent_type': ent.label_, 'ent_start': ent.start_char, 'ent_end': ent.end_char}
                for ent in doc.ents
            ])
    domain_doc_ners_df = DataFrame(entities)
    nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


In [9]:

# Assert that all the subword tokens are gone
print(domain_doc_ners_df.columns.tolist())
mask_series = domain_doc_ners_df.bert_word.map(lambda x: str(x).startswith('##'))
df = domain_doc_ners_df[mask_series]
assert (df.shape[0] == 0), 'There still exist subword tokens.'

['bert_word', 'bert_entity', 'bert_score', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end']


In [10]:

# Check that you indeed have only strings among the BERT entities
mask_series = domain_doc_ners_df.bert_entity.isnull()
sorted(domain_doc_ners_df[~mask_series].bert_entity.tolist(), key=lambda x: len(str(x)), reverse=True)[:10]

['I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC']


## Explore the entity type columns

In [15]:

words_list = sorted([str(w) for w in domain_doc_ners_df.ent_type.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.ent_type.isin(words_list)
for type, type_df in domain_doc_ners_df[mask_series].groupby('ent_type'):
    mask_series = type_df.ent_phrase.isnull()
    texts_list = sorted(type_df[~mask_series].ent_phrase.unique())
    print()
    print(type, random.sample(texts_list, min(len(texts_list), 20)))

['PERSON', 'QUANTITY', 'ORG', 'EVENT', 'GPE', 'NORP', 'ORDINAL', 'PRODUCT', 'TIME', 'PERCENT', 'CARDINAL', 'LOC', 'DATE', 'MONEY', 'FAC', 'LAW', 'LANGUAGE', 'WORK_OF_ART', 'nan']

CARDINAL ['2015;180(11):1178–1183', '446,484', '22,28,41', '4-6', '700', '2M/', '362', '825', '3000', '905', '4,500', '1–3', '1981–1982', '2006;171(9):826', '3–48', '1-95', '3–8', '2d 553', '284', '9–22']

DATE ['1-7', 'the second half of the 20th century', '15 September 1983', 'August 1862', 'the next \nhour', '62', 'May 29–June 1, 1996', 'April 24, 1863', 'May 12, 2018', 'Nurs Adm. 1995;25:60–62', 'a single day', '78234', '15 December 1996', 'the end of the sixteenth\ncentury', 'as many years', 'fourteen days', '7-8)', 'the seventies', '5525', '1994 to 1995']

EVENT ['the Civil War Doctor Who Pioneered Battlefield Care', 'Operation Iraqi Freedom/Operation Enduring Freedom\nWW', 'Total War', 'the Military\nChapter 38', 'The Gulf War', 'World War I:', 'Nazi War on Cancer', 'Revolution', 'Gulf War\nOEF', 'the 

In [14]:

words_list = sorted([str(w) for w in domain_doc_ners_df.nlp_type.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.nlp_type.isin(words_list)
for type, type_df in domain_doc_ners_df[mask_series].groupby('nlp_type'):
    mask_series = type_df.nlp_word.isnull()
    texts_list = sorted(type_df[~mask_series].nlp_word.unique())
    print()
    print(type, random.sample(texts_list, min(len(texts_list), 20)))

['EVENT', 'PRODUCT', 'ORDINAL', 'ORG', 'FAC', 'LAW', 'GPE', '', 'DATE', 'MONEY', 'LANGUAGE', 'PERCENT', 'PERSON', 'nan', 'CARDINAL', 'LOC', 'WORK_OF_ART', 'QUANTITY', 'TIME', 'NORP']

 ['radiologic', 'issn/0026', 'scored', 'philosophically', '54,55', 'http://www.army.mil/cmh-pg/art/A&I/Vietnam/p_3_4_67.jpg', 'RHW', 'Shinseki', 'battlefront', 'CRITICAL', 'Upper', 'corn', 'POWs', 'jurisdictions', 'forced', 'Senior', 'dicentrics', 'FOLLOW', 'WOUNDS', 'exemption']

CARDINAL ['2006;2', '1968;78:269–279', '496', '1995;66(3):260–263', 'See', '358–389', "4'11", 'odd', '10(–5', '733', '2005;90(1):53–76', '2012;4(1):22', '380', '691', '1975;31(4):49–65', 'V1', '764', '1988;153(1):7–11', '2014;189(12):1479–1486', '1987;17:545–548']

DATE ['1854', 'old', '1932–45', '1650', 'MD', 'end', '1930s', '1990;20(3):19–22', 'Nineteenth', '1919', 'full', '3362', '47', '1885', 'generation', '2905', '46', 'era', 'wartime', 'Saturday']

EVENT ['FREEDOM', 'Humanitarian', 'K.', 'Lost', 'Marine', 'IRAQI', 'Detaine


## Explore the tag and parts-of-speech columns

In [11]:

words_list = sorted([str(w) for w in domain_doc_ners_df.nlp_tag.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.nlp_tag.isin(words_list)
for tag, tag_df in domain_doc_ners_df[mask_series].groupby('nlp_tag'):
    mask_series = tag_df.nlp_word.isnull()
    texts_list = sorted(tag_df[~mask_series].nlp_word.unique())
    print()
    print(tag, random.sample(texts_list, min(len(texts_list), 50)))

['nan', 'RBS', 'RBR', '.', 'JJ', 'VBZ', 'NN', ':', '``', 'VB', 'WP', 'VBG', '-RRB-', 'UH', '_SP', 'JJR', 'WP$', "''", 'PRP', 'JJS']

'' ['”3', '”13', '”19', '”13(p65', '”89', '”85', '”62(p82', "'", '”92', '”5', '”7(p78', '”12', '”42', '’', '”46', '”34', '”17(p47', '"', '‘', '”54', '”36', '”37', '”59(p49', '”62', '”4', '”24', '”34(p32', '”88', '”1', '\uf0a7', '“', '”7', '”64(p26', '”10', '”18,19', '”20(p19', '”71,72', '”26', '”2', '”', '”102(p149', '”64(p13', '”35', '”9', '”11', '”23,24', '”32', '’s', '”15(p43', '”8']

-RRB- ['…', ']', '•', '”34(Art23e', '}', ')', '):', '46,47']

. ['.....................................................................................................................', '”4(p448', '....................................................', '.........................................................................................', '................................................', '.............................................................................

In [12]:

words_list = sorted([str(w) for w in domain_doc_ners_df.nlp_pofs.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.nlp_pofs.isin(words_list)
for pofs, pofs_df in domain_doc_ners_df[mask_series].groupby('nlp_pofs'):
    mask_series = pofs_df.nlp_word.isnull()
    texts_list = sorted(pofs_df[~mask_series].nlp_word.unique())
    print()
    print(pofs, random.sample(texts_list, min(len(texts_list), 20)))

['NUM', 'SCONJ', 'X', 'PART', 'VERB', 'NOUN', 'INTJ', 'ADV', 'nan', 'PUNCT', 'PROPN', 'AUX', 'SPACE', 'SYM', 'DET', 'ADJ', 'PRON', 'CCONJ', 'ADP']

ADJ ['Discontinue', 'true', 'combatcapable', 'rocky', 'advanced', 'Russian', 'untenable', 'wholesome', 'painful', 'derive', 'Frequent', 'liquid', 'µg', 'ineradicable', 'Coalition', 'inconsistent', 'absolute', 'unsure', 'irreversible', 'succinct']

ADP ['throughout', 'Up', 'https://', 'Around', 'After', 'notwithstanding', 'With', 'irregulâr', '1933–1945', 'Into', 'v.', 'out', 'DURING', 'vs.', 'for', 'volvulus', 'Between', 'underneath', 'ta', 'at']

ADV ['beforehand', 'perbronchially', 'externally', 'Widely', 'than', 'easily', 'out', 'well', 'solemnly', 'comparably', 'unusually', '+', 'attitudes,25', 'Fundamentally', 'aptly', 'invariably', 'back', 'subsequently', 'closely', 'incredibly']

AUX ['”41', 'got', 'uses', 'See', 'becoming', 'Has', 'Spurting', 'would', 'remain', 'had', '’ve', 'Ca', 'aches', 'Can', 'deploym', 'see', 'did', 'became', '


## Explore the entity column

In [13]:

words_list = sorted([str(w) for w in domain_doc_ners_df.bert_entity.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.bert_entity.isin(words_list)
for entity, entity_df in domain_doc_ners_df[mask_series].groupby('bert_entity'):
    mask_series = entity_df.bert_word.isnull()
    texts_list = sorted(entity_df[~mask_series].bert_word.unique())
    print()
    print(entity, random.sample(texts_list, min(len(texts_list), 50)))

['I-LOC', 'I-ORG', 'I-MISC', 'nan', 'I-PER']

I-LOC ['Southeast', 'Anthony', 'M', 'Church', 'St', 'West', 'Bel', 'He', 'May', 'Einstein', 'chfield', 'Andrews', 'Pearl', 'Pennsylvania', 'Edward', 'Bridge', '’', 'III', 'Boston', 'London', 'My', 'MD', 'College', 'New', 'Baltimore', 'Iraq', 'TX', 'WA', 'Field', 'ridge', 'ERICA', 'wood', 'Room', 'US', 'ébert', 'Virginia', 'CO', 'Salvador', 'Vincent', 'Combat', 'Silver', 'China', 'Charleston', 'Guam', 'Em', 'DC', '28th', 'Tennessee', 'Staffordshire', 'El']

I-MISC ['RISINA', 'RRAUMAranexa', 'BL', 'Shield', 'Jews', 'chau', 'Convention', 'Seven', 'MS', 'AR', 'Fu', 'Chemical', 'Art', 'Darwin', 'O', 'Post', 'L', 'ET', 'Act', 'Modern', 'less', 'TI', 'LA', 'UMANITARIAN', 'Study', 'Sun', 'Territories', 'Medicine', 'Manual', 'Da', 'ATA', 'Program', 'Prussian', 'Democracy', '20th', 'Wars', 'Napoleonic', 'Cambodian', 'NS', 'N', 'Report', 'APAN', 'trine', 'APANR', 'Lazarus', 'Aristotle', 'Century', 'National', 'UMANITARIANN', 'lines']

I-ORG ['LGIDIOL'


## Explore column groupbys

In [16]:

for file_path, file_path_df in domain_doc_ners_df.groupby('file_path'):
    print(file_path)
    display(file_path_df.sample(4).dropna(axis='columns', how='all').T)
    break

../data/Domain_Knowledge/Fundamentals of Military Medicine/Fund ch 1.txt


Unnamed: 0,89871,90227,95310,94584
file_path,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Fundamentals of Milit...
nlp_word,an,a,",",units
nlp_tag,DT,DT,",",NNS
nlp_type,,,,
nlp_pofs,DET,DET,PUNCT,NOUN


In [17]:

for nlp_tag, nlp_tag_df in domain_doc_ners_df.groupby('nlp_tag'):
    print(nlp_tag)
    display(nlp_tag_df.sample(4).dropna(axis='columns', how='all').T)
    break

$


Unnamed: 0,1057944,1018594,1280591,1280584
file_path,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Military Medical Ethi...
nlp_word,$,$,•,•
nlp_tag,$,$,$,$
nlp_type,,MONEY,PRODUCT,PRODUCT
nlp_pofs,SYM,SYM,SYM,SYM


In [18]:

from itertools import combinations

# columns_list = ['bert_entity', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_type']
for groupby_columns in combinations(domain_doc_ners_df.columns, 2):
    groupby_columns = list(groupby_columns)
    df = nu.get_minority_combinations(domain_doc_ners_df, groupby_columns).dropna(axis='columns', how='all')
    if df.shape[0]:
        print(nu.conjunctify_nouns(groupby_columns))
        df.file_path = df.file_path.map(lambda x: str(x).split('/')[-1])
        display(df)

bert_word and bert_entity


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
928373,Max,I-ORG,0.937659,Ethics-ch-14.txt
1177445,ecessity,I-ORG,0.767861,Ethics-ch-21.txt
1203133,drome,I-ORG,0.88896,Ethics-ch-22.txt
558685,MUAL,I-ORG,0.633143,Fund ch 8.txt


bert_word and bert_score


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
153709,&,I-ORG,0.894564,Fund ch 15.txt
809741,Sai,I-LOC,0.979156,Ethics-ch-10.txt
1276560,Safe,I-ORG,0.661455,Ethics-ch-25.txt
767404,Sa,I-PER,0.995492,Ethics-ch-08.txt


bert_word and file_path


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
928376,ckInstut,I-ORG,0.845059,Ethics-ch-14.txt
1276555,Maryland,I-ORG,0.884252,Ethics-ch-25.txt
1297682,Maryland,I-LOC,0.996224,Ethics-ch-26.txt
1314548,Maryland,I-LOC,0.997555,Ethics-ch-27.txt


bert_entity and bert_score


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
189009,MD,I-LOC,0.329505,Fund ch 19.txt
276694,Branch,I-ORG,0.976236,Fund ch 26.txt
511864,Health,I-ORG,0.976196,Fund ch 5.txt
581377,Man,I-ORG,0.976157,Ethics-ch-01.txt


bert_entity and file_path


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
666595,John,I-PER,0.435961,Ethics-ch-04.txt
224444,Medicine,I-MISC,0.753931,Fund ch 22.txt
301115,MI,I-MISC,0.73435,Fund ch 29.txt
483488,Medicine,I-MISC,0.668454,Fund ch 4.txt


bert_score and file_path


Unnamed: 0,bert_word,bert_entity,bert_score,file_path
263068,Plato,I-PER,0.285925,Fund ch 25.txt
581350,Washington,I-ORG,0.983112,Ethics-ch-01.txt
511862,of,I-ORG,0.983095,Fund ch 5.txt
408105,US,I-ORG,0.983095,Fund ch 35.txt


file_path and nlp_word


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
484280,Fund ch 4.txt,rear,NN,,NOUN
1210289,Ethics-ch-22.txt,purpose,NN,,NOUN
1212582,Ethics-ch-22.txt,purposes,NNS,,NOUN
625903,Ethics-ch-02.txt,Bioeth,NNP,ORG,PROPN


file_path and nlp_tag


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
401533,Fund ch 34.txt,tag,UH,,INTJ
206130,Fund ch 20.txt,”4,FW,,X
674419,Ethics-ch-04.txt,per,FW,,X
1327672,3-84-D11-LEGAL-ROE.txt,best,JJS,,ADJ


file_path and nlp_type


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
831173,Ethics-ch-11.txt,200,CD,MONEY,NUM
429315,Fund ch 36.txt,English,JJ,LANGUAGE,ADJ
42171,SchauerMedicBag.txt,Nonissue,NNP,NORP,PROPN
405592,Fund ch 34.txt,4):S129,NNP,MONEY,PROPN


file_path and nlp_pofs


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
228719,Fund ch 22.txt,right,UH,,INTJ
271734,Fund ch 25.txt,Well,UH,,INTJ
1048598,Ethics-ch-18.txt,no,UH,,INTJ
215409,Fund ch 21.txt,quote,UH,,INTJ


file_path and ent_phrase


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
99400,Fund ch 1.txt,& de plusieurs Monstres,ORG,71673.0,71696.0
927109,Ethics-ch-13.txt,Army Medical Department,ORG,19446.0,19469.0
927881,Ethics-ch-13.txt,Article\n17,LAW,135453.0,135463.0
927369,Ethics-ch-13.txt,Article 12 of the Geneva Conventions,LAW,64225.0,64261.0


file_path and ent_type


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
1328463,3-84-D11-LEGAL-ROE.txt,Legal Support,PERSON,2174.0,2187.0
169844,Fund ch 16.txt,100%,PERCENT,20720.0,20724.0
1328453,3-84-D11-LEGAL-ROE.txt,Appendix F,PRODUCT,1262.0,1272.0
629967,Ethics-ch-02.txt,147,MONEY,146046.0,146049.0


file_path and ent_start


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
98489,Fund ch 1.txt,1,CARDINAL,0.0,1.0
1060383,Ethics-ch-18.txt,2,CARDINAL,42517.0,42518.0
1060382,Ethics-ch-18.txt,1,CARDINAL,42442.0,42443.0
1060381,Ethics-ch-18.txt,three,CARDINAL,42424.0,42429.0


file_path and ent_end


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
98489,Fund ch 1.txt,1,CARDINAL,0.0,1.0
1060383,Ethics-ch-18.txt,2,CARDINAL,42517.0,42518.0
1060382,Ethics-ch-18.txt,1,CARDINAL,42442.0,42443.0
1060381,Ethics-ch-18.txt,three,CARDINAL,42424.0,42429.0


nlp_word and nlp_tag


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
860886,Ethics-ch-11.txt,adjuvants,NNS,,NOUN
1251767,Ethics-ch-24.txt,ineffectual,JJ,,ADJ
1200736,Ethics-ch-21.txt,Norberg,NNP,,PROPN
1229025,Ethics-ch-23.txt,Noort,NN,,NOUN


nlp_word and nlp_type


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
1073675,Ethics-ch-19.txt,Protect,VB,ORG,VERB
251942,Fund ch 24.txt,proton,NN,,NOUN
232172,Fund ch 22.txt,Publication,NNP,DATE,PROPN
231524,Fund ch 22.txt,Public,NNP,EVENT,PROPN


nlp_word and nlp_pofs


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
145708,Fund ch 14.txt,M1126,NNP,,PROPN
158462,Fund ch 15.txt,lament,VBP,,VERB
155486,Fund ch 15.txt,laminated,VBN,,VERB
779883,Ethics-ch-08.txt,landbased,JJ,,ADJ


nlp_tag and nlp_type


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
453302,Fund ch 38.txt,EXHIBIT,VBZ,LOC,VERB
1046756,Ethics-ch-18.txt,Conducting,VBG,FAC,VERB
440400,Fund ch 36.txt,-,:,CARDINAL,PUNCT
1467716,ARN19354_FM 6-27 _C1_FINAL_WEB_v2.txt,],XX,PRODUCT,X


nlp_tag and nlp_pofs


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
518965,Fund ch 5.txt,”41,'',,AUX
786776,Ethics-ch-08.txt,[,XX,,AUX
582927,Ethics-ch-01.txt,EXHIBIT,NNS,,AUX
1429334,ARN19354_FM 6-27 _C1_FINAL_WEB_v2.txt,doe,UH,,AUX


nlp_type and nlp_pofs


Unnamed: 0,file_path,nlp_word,nlp_tag,nlp_type,nlp_pofs
1241087,Ethics-ch-23.txt,shall,MD,LAW,AUX
365739,Fund ch 33.txt,%,UH,ORG,INTJ
780641,Ethics-ch-08.txt,remaining,VBG,PERCENT,VERB
930681,Ethics-ch-14.txt,9,LS,EVENT,X


ent_phrase and ent_type


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
327054,Fund ch 3.txt,James Mattis,PERSON,66704.0,66716.0
1492436,ARN19354_FM 6-27 _C1_FINAL_WEB_v2.txt,Judgment,GPE,712647.0,712655.0
746606,Ethics-ch-06.txt,Zagreb,GPE,136598.0,136604.0
1297485,Ethics-ch-25.txt,Zaijtchuk R. MEDRETEs,PERSON,101413.0,101434.0


ent_phrase and ent_start


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
864486,Ethics-ch-11.txt,#,CARDINAL,151818.0,151819.0
952785,Ethics-ch-14.txt,Nazis,ORG,15020.0,15025.0
975559,Ethics-ch-15.txt,Nazis,NORP,10326.0,10331.0
975543,Ethics-ch-15.txt,Nazis,NORP,8774.0,8779.0


ent_phrase and ent_end


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
864486,Ethics-ch-11.txt,#,CARDINAL,151818.0,151819.0
952785,Ethics-ch-14.txt,Nazis,ORG,15020.0,15025.0
975559,Ethics-ch-15.txt,Nazis,NORP,10326.0,10331.0
975543,Ethics-ch-15.txt,Nazis,NORP,8774.0,8779.0


ent_type and ent_start


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
666509,Ethics-ch-03.txt,the Constitution of the United States,LAW,166397.0,166434.0
326650,Fund ch 3.txt,Congress,ORG,39415.0,39423.0
294018,Fund ch 27.txt,JUMP-ACL,ORG,39408.0,39416.0
214082,Fund ch 20.txt,US Army Infantry School,ORG,39407.0,39430.0


ent_type and ent_end


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
1487205,ARN19354_FM 6-27 _C1_FINAL_WEB_v2.txt,Chapter 4,LAW,179388.0,179397.0
36746,Prolonged_Casualty_Care_Guidelines_21_Dec_2021...,Assess,ORG,39839.0,39845.0
427437,Fund ch 35.txt,Investigational New \nDrug,ORG,39819.0,39844.0
574549,Fund ch 8.txt,Medical Logistics,ORG,39823.0,39840.0


ent_start and ent_end


Unnamed: 0,file_path,ent_phrase,ent_type,ent_start,ent_end
153327,Fund ch 14.txt,the Joint Chiefs of Staff,ORG,62322.0,62347.0
84205,TCCC_Quick_Reference_Guide_2017.txt,978,CARDINAL,89961.0,89964.0
767153,Ethics-ch-07.txt,6th,ORDINAL,89960.0,89963.0
532013,Fund ch 5.txt,1982,DATE,89957.0,89961.0
