In [1]:

%pprint
import sys
if (osp.join('..', 'py') not in sys.path): sys.path.insert(1, osp.join('..', 'py'))

Pretty printing has been turned OFF


In [2]:

from FRVRS import (nu, DataFrame, osp, re, walk, Series, display)
from scrape_utils import WebScrapingUtilities
import os
import random
import logging

wsu = WebScrapingUtilities(
    s=nu,
    secrets_json_path=osp.abspath(osp.join(nu.data_folder, 'secrets', 'itm_secrets.json'))
)


# Parse Domain Documents for Entities

Downloaded all documents from https://nextcentury.atlassian.net/wiki/spaces/ITMC/pages/2991849482/Domain+Documents and converted them all to PDF files and stored them in the data folder.

In [3]:

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

pdf_folder = '../data/Domain_Knowledge'
black_list = ['.ipynb_checkpoints', '$Recycle.Bin', '.git']

In [None]:

# Display all the files we ingested
for sub_directory, directories_list, files_list in walk(pdf_folder):
    if all(map(lambda x: x not in sub_directory, black_list)):
        if any(map(lambda x: x.endswith('.txt'), files_list)):
            for file_name in files_list:
                if file_name.endswith('.txt'):
                    file_path = osp.join(sub_directory, file_name)
                    print(file_path.replace('../data/Domain_Knowledge/', ''))
        else:
            for file_name in files_list:
                if file_name.endswith('.pdf'):
                    file_path = osp.join(sub_directory, file_name)
                    print(file_path.replace('../data/Domain_Knowledge/', ''))


## Option 1: Use a Hugging Face NER model

In [4]:

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Named entity recognition pipeline, passing in a specific model and tokenizer
model = AutoModelForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
token_classifier = pipeline('ner', model=model, tokenizer=tokenizer)

# Example usage
sentence = 'Barack Obama was the 44th President of the United States.'
tokens = token_classifier(sentence)
print(tokens)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-PER', 'score': 0.9988381, 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.9994398, 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'I-LOC', 'score': 0.9983613, 'index': 10, 'word': 'United', 'start': 43, 'end': 49}, {'entity': 'I-LOC', 'score': 0.9920671, 'index': 11, 'word': 'States', 'start': 50, 'end': 56}]



## Option 2: Use SpaCy

In [5]:

import spacy

# Load the spaCy model
try: nlp = spacy.load('en_core_web_sm')
except OSError as e:
    print(str(e).strip())
    command_str = f'{sys.executable} -m spacy download en_core_web_sm --quiet'
    print(command_str)
    !{command_str}
    nlp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

# Example usage
sentence = 'Barack Obama was the 44th President of the United States.'
doc = nlp(sentence)
print([{'text': word.text, 'tag_': word.tag_, 'ent_type_': word.ent_type_, 'pos_': word.pos_} for word in doc])
print([{'text': ent.text, 'label_': ent.label_} for ent in doc.ents])

[{'text': 'Barack', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'Obama', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'was', 'tag_': 'VBD', 'ent_type_': '', 'pos_': 'AUX'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': '', 'pos_': 'DET'}, {'text': '44th', 'tag_': 'JJ', 'ent_type_': 'ORDINAL', 'pos_': 'ADJ'}, {'text': 'President', 'tag_': 'NNP', 'ent_type_': '', 'pos_': 'PROPN'}, {'text': 'of', 'tag_': 'IN', 'ent_type_': '', 'pos_': 'ADP'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': 'GPE', 'pos_': 'DET'}, {'text': 'United', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': 'States', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': '.', 'tag_': '.', 'ent_type_': '', 'pos_': 'PUNCT'}]
[{'text': 'Barack Obama', 'label_': 'PERSON'}, {'text': '44th', 'label_': 'ORDINAL'}, {'text': 'the United States', 'label_': 'GPE'}]



## Extract the text from PDFs and load it into documents

In [10]:

# Get text from PDFs
if nu.pickle_exists('domain_knowledge_sentences_dict'):
    domain_knowledge_sentences_dict = nu.load_object('domain_knowledge_sentences_dict')
else:
    from PyPDF2 import PdfReader
    def convert(file_path, verbose=False):
        """
        Convert PDF, return its text content as a string
        """
        text = ''
        with open(file_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            for page_number in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_number]
                text += page.extract_text()
        if verbose: print(f'Text length for {file_path} is {len(text):,} characters.')

        return text
    domain_knowledge_sentences_dict = {}
    for sub_directory, directories_list, files_list in walk(pdf_folder):
        if all(map(lambda x: x not in sub_directory, black_list)):
            if any(map(lambda x: x.endswith('.txt'), files_list)):
                for file_name in files_list:
                    if file_name.endswith('.txt'):
                        file_path = osp.join(sub_directory, file_name)
                        with open(file_path, 'r', encoding=nu.encoding_type) as f:
                            text = f.read()
                            domain_knowledge_sentences_dict[file_path] = text
            else:
                for file_name in files_list:
                    if file_name.endswith('.pdf'):
                        file_path = osp.join(sub_directory, file_name)
                        text = convert(file_path, verbose=True)
                        domain_knowledge_sentences_dict[file_path] = text
    nu.store_objects(domain_knowledge_sentences_dict=domain_knowledge_sentences_dict)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/domain_knowledge_sentences_dict.pkl


In [11]:

# Assert that you got all the hyphenated word wrappings out
for file_path, text in domain_knowledge_sentences_dict.items():
    assert not ('effec-' in text), f'{file_path} still has hyphenated word wrappings.'

In [21]:

import pandas as pd

# load documents
if nu.csv_exists('domain_doc_ners_df'): domain_doc_ners_df = nu.load_data_frames(
    domain_doc_ners_df='domain_doc_ners_df'
)['domain_doc_ners_df']
else:
    entities = []
    for file_path, text in domain_knowledge_sentences_dict.items():
        text_length = len(text)
        # print(f'Text length for {file_path} is {text_length:,} characters.')
        
        # Prepare to join subword tokens back together and keep track of entity and score
        output_words = []
        current_word = ''
        current_entities = []
        current_scores = []
        current_starts = []
        current_ends = []
        
        # Extract metadata from entity recognition pipeline and add it as a row dictionary to the entities rows list
        tokens = token_classifier(text)
        for metadata_dict in tokens:
            current_entities.append(metadata_dict['entity'])
            current_scores.append(metadata_dict['score'])
            current_starts.append(metadata_dict['start'])
            current_ends.append(metadata_dict['end'])
            if metadata_dict['word'].startswith('##'): current_word += metadata_dict['word'][2:]
            else:
                
                # Take the mode of entities and average of scores for the current_word
                if current_word:
                    if len(current_entities) > 1: current_entities = current_entities[:-1]
                    if len(current_scores) > 1: current_scores = current_scores[:-1]
                    if len(current_ends) > 2: current_ends = current_ends[:-1]
                    mode_entity = Series(current_entities).mode().tolist()[-1]
                    mean_score = Series(current_scores).mean()
                    start_idx = current_starts[0]
                    end_idx = current_ends[-1]
                    entity_tuple = (current_word, mode_entity, mean_score, start_idx, end_idx)
                    output_words.append(entity_tuple)
                    current_word = ''
                    current_entities = []
                    current_scores = []
                    current_starts = []
                    current_ends = []
                else:
                    current_word = metadata_dict['word']
                    current_entities = [metadata_dict['entity']]
                    current_scores = [metadata_dict['score']]
                    current_starts = [metadata_dict['start']]
                    current_ends = [metadata_dict['end']]
        
        # Take the mode of entities for the last current_word
        if current_word:
            mode_entity = Series(current_entities).mode().tolist()[-1]
            mean_score = Series(current_scores).mean()
            start_idx = current_starts[0]
            end_idx = current_ends[-1]
            entity_tuple = (current_word, mode_entity, mean_score, start_idx, end_idx)
            output_words.append(entity_tuple)
        
        for word, entity, score, start, end in output_words:
            metadata_dict = {
                'bert_word': word, 'bert_entity': entity, 'bert_score': score, 'bert_start': start, 'bert_end': end, 'file_path': file_path
            }
            entities.append(metadata_dict)

        # Extract SpaCy named entities and add them as a row dictionary to the entities rows list
        if text_length <= nlp.max_length:
            doc = nlp(text)
            entities.extend([
                {'file_path': file_path, 'nlp_word': word.text, 'nlp_tag': word.tag_, 'nlp_type': word.ent_type_, 'nlp_pofs': word.pos_}
                for word in doc
            ])
            entities.extend([
                {'file_path': file_path, 'ent_phrase': ent.text, 'ent_type': ent.label_, 'ent_start': ent.start_char, 'ent_end': ent.end_char}
                for ent in doc.ents
            ])
    domain_doc_ners_df = DataFrame(entities)
    nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


In [23]:

# Assert that all the subword tokens are gone
print(domain_doc_ners_df.columns.tolist())
mask_series = domain_doc_ners_df.bert_word.map(lambda x: str(x).startswith('##'))
df = domain_doc_ners_df[mask_series]
assert (df.shape[0] == 0), 'There still exist subword tokens.'

['bert_word', 'bert_entity', 'bert_score', 'bert_start', 'bert_end', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end']


In [24]:

# Check that you indeed have only strings among the BERT entities
mask_series = domain_doc_ners_df.bert_entity.isnull()
sorted(domain_doc_ners_df[~mask_series].bert_entity.tolist(), key=lambda x: len(str(x)), reverse=True)[:10]

['I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC']


## Explore the entity type columns

In [15]:

words_list = sorted([str(w) for w in domain_doc_ners_df.ent_type.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.ent_type.isin(words_list)
for type, type_df in domain_doc_ners_df[mask_series].groupby('ent_type'):
    mask_series = type_df.ent_phrase.isnull()
    texts_list = sorted(type_df[~mask_series].ent_phrase.unique())
    print()
    print(type, random.sample(texts_list, min(len(texts_list), 20)))

['PERSON', 'QUANTITY', 'ORG', 'EVENT', 'GPE', 'NORP', 'ORDINAL', 'PRODUCT', 'TIME', 'PERCENT', 'CARDINAL', 'LOC', 'DATE', 'MONEY', 'FAC', 'LAW', 'LANGUAGE', 'WORK_OF_ART', 'nan']

CARDINAL ['2015;180(11):1178–1183', '446,484', '22,28,41', '4-6', '700', '2M/', '362', '825', '3000', '905', '4,500', '1–3', '1981–1982', '2006;171(9):826', '3–48', '1-95', '3–8', '2d 553', '284', '9–22']

DATE ['1-7', 'the second half of the 20th century', '15 September 1983', 'August 1862', 'the next \nhour', '62', 'May 29–June 1, 1996', 'April 24, 1863', 'May 12, 2018', 'Nurs Adm. 1995;25:60–62', 'a single day', '78234', '15 December 1996', 'the end of the sixteenth\ncentury', 'as many years', 'fourteen days', '7-8)', 'the seventies', '5525', '1994 to 1995']

EVENT ['the Civil War Doctor Who Pioneered Battlefield Care', 'Operation Iraqi Freedom/Operation Enduring Freedom\nWW', 'Total War', 'the Military\nChapter 38', 'The Gulf War', 'World War I:', 'Nazi War on Cancer', 'Revolution', 'Gulf War\nOEF', 'the 

In [14]:

words_list = sorted([str(w) for w in domain_doc_ners_df.nlp_type.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.nlp_type.isin(words_list)
for type, type_df in domain_doc_ners_df[mask_series].groupby('nlp_type'):
    mask_series = type_df.nlp_word.isnull()
    texts_list = sorted(type_df[~mask_series].nlp_word.unique())
    print()
    print(type, random.sample(texts_list, min(len(texts_list), 20)))

['EVENT', 'PRODUCT', 'ORDINAL', 'ORG', 'FAC', 'LAW', 'GPE', '', 'DATE', 'MONEY', 'LANGUAGE', 'PERCENT', 'PERSON', 'nan', 'CARDINAL', 'LOC', 'WORK_OF_ART', 'QUANTITY', 'TIME', 'NORP']

 ['radiologic', 'issn/0026', 'scored', 'philosophically', '54,55', 'http://www.army.mil/cmh-pg/art/A&I/Vietnam/p_3_4_67.jpg', 'RHW', 'Shinseki', 'battlefront', 'CRITICAL', 'Upper', 'corn', 'POWs', 'jurisdictions', 'forced', 'Senior', 'dicentrics', 'FOLLOW', 'WOUNDS', 'exemption']

CARDINAL ['2006;2', '1968;78:269–279', '496', '1995;66(3):260–263', 'See', '358–389', "4'11", 'odd', '10(–5', '733', '2005;90(1):53–76', '2012;4(1):22', '380', '691', '1975;31(4):49–65', 'V1', '764', '1988;153(1):7–11', '2014;189(12):1479–1486', '1987;17:545–548']

DATE ['1854', 'old', '1932–45', '1650', 'MD', 'end', '1930s', '1990;20(3):19–22', 'Nineteenth', '1919', 'full', '3362', '47', '1885', 'generation', '2905', '46', 'era', 'wartime', 'Saturday']

EVENT ['FREEDOM', 'Humanitarian', 'K.', 'Lost', 'Marine', 'IRAQI', 'Detaine


## Explore the tag and parts-of-speech columns

In [11]:

words_list = sorted([str(w) for w in domain_doc_ners_df.nlp_tag.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.nlp_tag.isin(words_list)
for tag, tag_df in domain_doc_ners_df[mask_series].groupby('nlp_tag'):
    mask_series = tag_df.nlp_word.isnull()
    texts_list = sorted(tag_df[~mask_series].nlp_word.unique())
    print()
    print(tag, random.sample(texts_list, min(len(texts_list), 50)))

['nan', 'RBS', 'RBR', '.', 'JJ', 'VBZ', 'NN', ':', '``', 'VB', 'WP', 'VBG', '-RRB-', 'UH', '_SP', 'JJR', 'WP$', "''", 'PRP', 'JJS']

'' ['”3', '”13', '”19', '”13(p65', '”89', '”85', '”62(p82', "'", '”92', '”5', '”7(p78', '”12', '”42', '’', '”46', '”34', '”17(p47', '"', '‘', '”54', '”36', '”37', '”59(p49', '”62', '”4', '”24', '”34(p32', '”88', '”1', '\uf0a7', '“', '”7', '”64(p26', '”10', '”18,19', '”20(p19', '”71,72', '”26', '”2', '”', '”102(p149', '”64(p13', '”35', '”9', '”11', '”23,24', '”32', '’s', '”15(p43', '”8']

-RRB- ['…', ']', '•', '”34(Art23e', '}', ')', '):', '46,47']

. ['.....................................................................................................................', '”4(p448', '....................................................', '.........................................................................................', '................................................', '.............................................................................

In [12]:

words_list = sorted([str(w) for w in domain_doc_ners_df.nlp_pofs.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.nlp_pofs.isin(words_list)
for pofs, pofs_df in domain_doc_ners_df[mask_series].groupby('nlp_pofs'):
    mask_series = pofs_df.nlp_word.isnull()
    texts_list = sorted(pofs_df[~mask_series].nlp_word.unique())
    print()
    print(pofs, random.sample(texts_list, min(len(texts_list), 20)))

['NUM', 'SCONJ', 'X', 'PART', 'VERB', 'NOUN', 'INTJ', 'ADV', 'nan', 'PUNCT', 'PROPN', 'AUX', 'SPACE', 'SYM', 'DET', 'ADJ', 'PRON', 'CCONJ', 'ADP']

ADJ ['Discontinue', 'true', 'combatcapable', 'rocky', 'advanced', 'Russian', 'untenable', 'wholesome', 'painful', 'derive', 'Frequent', 'liquid', 'µg', 'ineradicable', 'Coalition', 'inconsistent', 'absolute', 'unsure', 'irreversible', 'succinct']

ADP ['throughout', 'Up', 'https://', 'Around', 'After', 'notwithstanding', 'With', 'irregulâr', '1933–1945', 'Into', 'v.', 'out', 'DURING', 'vs.', 'for', 'volvulus', 'Between', 'underneath', 'ta', 'at']

ADV ['beforehand', 'perbronchially', 'externally', 'Widely', 'than', 'easily', 'out', 'well', 'solemnly', 'comparably', 'unusually', '+', 'attitudes,25', 'Fundamentally', 'aptly', 'invariably', 'back', 'subsequently', 'closely', 'incredibly']

AUX ['”41', 'got', 'uses', 'See', 'becoming', 'Has', 'Spurting', 'would', 'remain', 'had', '’ve', 'Ca', 'aches', 'Can', 'deploym', 'see', 'did', 'became', '


## Explore the entity column

In [25]:

words_list = sorted([str(w) for w in domain_doc_ners_df.bert_entity.unique()])
words_list = random.sample(words_list, min(len(words_list), 20))
print(words_list)
mask_series = domain_doc_ners_df.bert_entity.isin(words_list)
for entity, entity_df in domain_doc_ners_df[mask_series].groupby('bert_entity'):
    mask_series = entity_df.bert_word.isnull()
    texts_list = sorted(entity_df[~mask_series].bert_word.unique())
    print()
    print(entity, random.sample(texts_list, min(len(texts_list), 50)))

['I-LOC', 'nan', 'I-ORG', 'I-PER', 'I-MISC']

I-LOC ['t', 'Detrick', 'cChord', 'Towson', 'States', 'Iraq', 'Springs', 'China', 'Afghanistan', 'South', 'Geneva', 'Campanile', 'Center', 'Florida', 'Connecticut', 'Marine', 'Hurlburt', 'Asia', 'and', 'line', 'Chicago', 'Tennessee', 'Lackland', 'man', 'worth', 'S', 'Leyte', 'Andrews', 'Montefiore', 'Maryland', 'Einstein', 'Hebert', 'West', 'Haven', 'Durham', 'Vincent', 'EUTHANASIALE', 'Philippines', 'Mogadishu', 'Spring', 'Hospital', 'Glen', 'Delaware', 'Kingdom', 'pan', 'France', 'Germany', 'Arizona', 'Room', 'Silver']

I-MISC ['Experiment', 'Armed', 'Sun', 'Human', 'Aristotle', 'Doctrine', 's', 'SICIGI', 'ETUMAN', '21st', 'alis', 'LA', 'Remembrance', 'Persian', 'L', 'Roman', 'Biological', 'lines', '19th', 'Occied', 'Da', 'Doc', 'North', 'Ethics', 'mmde', 'erilization', 'NAZI', 'RAUM', 'Immunode', 'LAN', 'LNL', 'The', 'Confederate', 'Ra', 'A', 'Geneva', 'mitatus', 'Medical', 'Decision', 'Desert', 'Great', 'Just', 'VILDID', 'NE', 'HUMAN', '


## Explore column groupbys

In [16]:

for file_path, file_path_df in domain_doc_ners_df.groupby('file_path'):
    print(file_path)
    display(file_path_df.sample(4).dropna(axis='columns', how='all').T)
    break

../data/Domain_Knowledge/Fundamentals of Military Medicine/Fund ch 1.txt


Unnamed: 0,89871,90227,95310,94584
file_path,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Fundamentals of Milit...,../data/Domain_Knowledge/Fundamentals of Milit...
nlp_word,an,a,",",units
nlp_tag,DT,DT,",",NNS
nlp_type,,,,
nlp_pofs,DET,DET,PUNCT,NOUN


In [17]:

for nlp_tag, nlp_tag_df in domain_doc_ners_df.groupby('nlp_tag'):
    print(nlp_tag)
    display(nlp_tag_df.sample(4).dropna(axis='columns', how='all').T)
    break

$


Unnamed: 0,1057944,1018594,1280591,1280584
file_path,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Military Medical Ethi...,../data/Domain_Knowledge/Military Medical Ethi...
nlp_word,$,$,•,•
nlp_tag,$,$,$,$
nlp_type,,MONEY,PRODUCT,PRODUCT
nlp_pofs,SYM,SYM,SYM,SYM


In [26]:

from itertools import combinations

columns_list = [cn for cn in domain_doc_ners_df.columns if cn.startswith('bert_')]
for groupby_columns in combinations(columns_list, 2):
    groupby_columns = list(groupby_columns)
    df = nu.get_minority_combinations(domain_doc_ners_df, groupby_columns).dropna(axis='columns', how='all')
    if df.shape[0]:
        print(nu.conjunctify_nouns(groupby_columns))
        df.file_path = df.file_path.map(lambda x: str(x).split('/')[-1])
        display(df)

bert_word and bert_entity


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
311771,MEESEL,I-ORG,0.565598,172.0,371.0,Fund ch 3.txt
427113,OLYTRAUMA,I-ORG,0.824845,232.0,241.0,Fund ch 36.txt
828641,OMINICK,I-PER,0.836177,717.0,724.0,Ethics-ch-11.txt
542462,ONAT,I-ORG,0.579091,79.0,83.0,Fund ch 7.txt


bert_word and bert_score


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
481847,&,I-ORG,0.90407,476.0,487.0,Fund ch 4.txt
1294854,Sciences,I-ORG,0.953635,882.0,902.0,Ethics-ch-26.txt
807603,Sciences,I-ORG,0.955297,244.0,269.0,Ethics-ch-10.txt
213496,Sciences,I-ORG,0.968651,699.0,713.0,Fund ch 21.txt


bert_word and bert_start


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
481847,&,I-ORG,0.90407,476.0,487.0,Fund ch 4.txt
120606,Sciences,I-ORG,0.625684,1033.0,1047.0,Fund ch 12.txt
862723,Sciences,I-ORG,0.898472,1068.0,1088.0,Ethics-ch-12.txt
109083,Sciences,I-ORG,0.952303,1071.0,1085.0,Fund ch 11.txt


bert_word and bert_end


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
481847,&,I-ORG,0.90407,476.0,487.0,Fund ch 4.txt
1311682,Sciences,I-ORG,0.92341,1016.0,1036.0,Ethics-ch-27.txt
299962,Sciences,I-ORG,0.860803,1023.0,1037.0,Fund ch 29.txt
120606,Sciences,I-ORG,0.625684,1033.0,1047.0,Fund ch 12.txt


bert_entity and bert_score


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
1294842,Room,I-LOC,0.374644,594.0,604.0,Ethics-ch-26.txt
495184,H,I-ORG,0.970456,138.0,148.0,Fund ch 40.txt
573136,University,I-ORG,0.97049,466.0,479.0,Fund ch 9.txt
223549,University,I-ORG,0.97055,842.0,855.0,Fund ch 22.txt


bert_entity and bert_start


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
38723,Pass,I-LOC,0.767956,52.0,59.0,SchauerMedicBag.txt
262026,for,I-ORG,0.966432,956.0,968.0,Fund ch 25.txt
284233,Emergency,I-ORG,0.990785,955.0,973.0,Fund ch 27.txt
1324236,Department,I-ORG,0.998094,954.0,967.0,3-84-D11-LEGAL-ROE.txt


bert_entity and bert_end


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
38723,Pass,I-LOC,0.767956,52.0,59.0,SchauerMedicBag.txt
223557,iformed,I-ORG,0.98964,979.0,986.0,Fund ch 22.txt
275639,Program,I-ORG,0.825875,974.0,985.0,Fund ch 26.txt
1311679,iformed,I-ORG,0.980799,974.0,981.0,Ethics-ch-27.txt


bert_score and bert_start


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
862730,ggs,I-ORG,0.310521,1266.0,1382.0,Ethics-ch-12.txt
1153027,agency,I-ORG,0.980444,587.0,593.0,Ethics-ch-20.txt
542481,Medical,I-ORG,0.980487,592.0,610.0,Fund ch 7.txt
284234,Uniformed,I-ORG,0.980573,975.0,984.0,Fund ch 27.txt


bert_score and bert_end


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
862730,ggs,I-ORG,0.310521,1266.0,1382.0,Ethics-ch-12.txt
1153027,agency,I-ORG,0.980444,587.0,593.0,Ethics-ch-20.txt
542481,Medical,I-ORG,0.980487,592.0,610.0,Fund ch 7.txt
284234,Uniformed,I-ORG,0.980573,975.0,984.0,Fund ch 27.txt


bert_start and bert_end


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
43781,TCCC,I-ORG,0.97528,0.0,4.0,Tactical_Combat_Casualty_Care.txt
204181,Community,I-ORG,0.708247,1031.0,1049.0,Fund ch 20.txt
83930,AMERICAN,I-MISC,0.987863,1032.0,1040.0,Fund ch 1.txt
99174,Military,I-MISC,0.786213,1032.0,1049.0,Fund ch 10.txt
