In [1]:

%pprint
import sys
if (osp.join('..', 'py') not in sys.path): sys.path.insert(1, osp.join('..', 'py'))

Pretty printing has been turned OFF


In [2]:

from FRVRS import (nu, DataFrame, osp, re, walk, Series, DataFrame, display, concat)
from scrape_utils import WebScrapingUtilities
import os
import random
import logging

wsu = WebScrapingUtilities(
    s=nu,
    secrets_json_path=osp.abspath(osp.join(nu.data_folder, 'secrets', 'itm_secrets.json'))
)


# Parse Domain Documents for Entities

Downloaded all documents from https://nextcentury.atlassian.net/wiki/spaces/ITMC/pages/2991849482/Domain+Documents and converted them all to PDF files and stored them in the data folder.

In [3]:

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

pdf_folder = '../data/Domain_Knowledge/OSU Additions'
black_list = ['.ipynb_checkpoints', '$Recycle.Bin', '.git']


## Option 1: Use a Hugging Face NER model

In [4]:

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Named entity recognition pipeline, passing in a specific model and tokenizer
model = AutoModelForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
token_classifier = pipeline('ner', model=model, tokenizer=tokenizer)

# Example usage
sentence = 'Barack Obama was the 44th President of the United States.'
tokens = token_classifier(sentence)
print(tokens)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'I-PER', 'score': 0.9988381, 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.9994398, 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'I-LOC', 'score': 0.9983613, 'index': 10, 'word': 'United', 'start': 43, 'end': 49}, {'entity': 'I-LOC', 'score': 0.9920671, 'index': 11, 'word': 'States', 'start': 50, 'end': 56}]



## Option 2: Use SpaCy

In [5]:

import spacy

# Load the spaCy model
try: nlp = spacy.load('en_core_web_sm')
except OSError as e:
    print(str(e).strip())
    command_str = f'{sys.executable} -m spacy download en_core_web_sm --quiet'
    print(command_str)
    !{command_str}
    nlp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

# Example usage
sentence = 'Barack Obama was the 44th President of the United States.'
doc = nlp(sentence)
print([{'text': word.text, 'tag_': word.tag_, 'ent_type_': word.ent_type_, 'pos_': word.pos_} for word in doc])
print([{'text': ent.text, 'label_': ent.label_} for ent in doc.ents])

[{'text': 'Barack', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'Obama', 'tag_': 'NNP', 'ent_type_': 'PERSON', 'pos_': 'PROPN'}, {'text': 'was', 'tag_': 'VBD', 'ent_type_': '', 'pos_': 'AUX'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': '', 'pos_': 'DET'}, {'text': '44th', 'tag_': 'JJ', 'ent_type_': 'ORDINAL', 'pos_': 'ADJ'}, {'text': 'President', 'tag_': 'NNP', 'ent_type_': '', 'pos_': 'PROPN'}, {'text': 'of', 'tag_': 'IN', 'ent_type_': '', 'pos_': 'ADP'}, {'text': 'the', 'tag_': 'DT', 'ent_type_': 'GPE', 'pos_': 'DET'}, {'text': 'United', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': 'States', 'tag_': 'NNP', 'ent_type_': 'GPE', 'pos_': 'PROPN'}, {'text': '.', 'tag_': '.', 'ent_type_': '', 'pos_': 'PUNCT'}]
[{'text': 'Barack Obama', 'label_': 'PERSON'}, {'text': '44th', 'label_': 'ORDINAL'}, {'text': 'the United States', 'label_': 'GPE'}]



## Extract the text from PDFs and load it into documents

In [6]:

from PyPDF2 import PdfReader

def convert(file_path, verbose=False):
    """
    Convert PDF, return its text content as a string
    """
    text = ''
    with open(file_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        for page_number in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_number]
            text += page.extract_text()
    if verbose: print(f'Text length for {file_path} is {len(text):,} characters.')

    return text

In [7]:

def get_sentences_dictionary(pdf_folder, verbose=False):
    domain_knowledge_sentences_dict = {}
    for sub_directory, directories_list, files_list in walk(pdf_folder):
        if all(map(lambda x: x not in sub_directory, black_list)):
            if any(map(lambda x: x.endswith('.txt'), files_list)):
                for file_name in files_list:
                    if file_name.endswith('.txt'):
                        file_path = osp.join(sub_directory, file_name)
                        with open(file_path, 'r', encoding=nu.encoding_type) as f:
                            text = f.read()
                            domain_knowledge_sentences_dict[file_path] = text
            else:
                for file_name in files_list:
                    if file_name.endswith('.pdf'):
                        file_path = osp.join(sub_directory, file_name)
                        text = convert(file_path, verbose=True)
                        domain_knowledge_sentences_dict[file_path] = text
    
    return domain_knowledge_sentences_dict

In [22]:

# Get text from PDFs
if nu.pickle_exists('domain_knowledge_sentences_dict'): domain_knowledge_sentences_dict = nu.load_object('domain_knowledge_sentences_dict')
else:
    domain_knowledge_sentences_dict = get_sentences_dictionary(pdf_folder, verbose=False)
    nu.store_objects(domain_knowledge_sentences_dict=domain_knowledge_sentences_dict)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/domain_knowledge_sentences_dict.pkl


In [23]:

# Assert that you got all the hyphenated word wrappings out
for file_path, text in domain_knowledge_sentences_dict.items():
    assert not ('effec-' in text), f'{file_path} still has hyphenated word wrappings.'

In [24]:

import pandas as pd

# load documents
if nu.csv_exists('domain_doc_ners_df'): domain_doc_ners_df = nu.load_data_frames(domain_doc_ners_df='domain_doc_ners_df')['domain_doc_ners_df']
else:
    entities = []
    for file_path, text in domain_knowledge_sentences_dict.items():
        text_length = len(text)
        # print(f'Text length for {file_path} is {text_length:,} characters.')
        
        # Prepare to join subword tokens back together and keep track of entity and score
        output_words = []
        current_word = ''
        current_entities = []
        current_scores = []
        current_starts = []
        current_ends = []
        
        # Extract metadata from entity recognition pipeline and add it as a row dictionary to the entities rows list
        tokens = token_classifier(text)
        for metadata_dict in tokens:
            current_entities.append(metadata_dict['entity'])
            current_scores.append(metadata_dict['score'])
            current_starts.append(metadata_dict['start'])
            current_ends.append(metadata_dict['end'])
            if metadata_dict['word'].startswith('##'): current_word += metadata_dict['word'][2:]
            else:
                
                # Take the mode of entities and average of scores for the current_word
                if current_word:
                    if len(current_entities) > 1: current_entities = current_entities[:-1]
                    if len(current_scores) > 1: current_scores = current_scores[:-1]
                    if len(current_ends) > 2: current_ends = current_ends[:-1]
                    mode_entity = Series(current_entities).mode().tolist()[-1]
                    mean_score = Series(current_scores).mean()
                    start_idx = current_starts[0]
                    end_idx = current_ends[-1]
                    entity_tuple = (current_word, mode_entity, mean_score, start_idx, end_idx)
                    output_words.append(entity_tuple)
                    current_word = ''
                    current_entities = []
                    current_scores = []
                    current_starts = []
                    current_ends = []
                else:
                    current_word = metadata_dict['word']
                    current_entities = [metadata_dict['entity']]
                    current_scores = [metadata_dict['score']]
                    current_starts = [metadata_dict['start']]
                    current_ends = [metadata_dict['end']]
        
        # Take the mode of entities for the last current_word
        if current_word:
            mode_entity = Series(current_entities).mode().tolist()[-1]
            mean_score = Series(current_scores).mean()
            start_idx = current_starts[0]
            end_idx = current_ends[-1]
            entity_tuple = (current_word, mode_entity, mean_score, start_idx, end_idx)
            output_words.append(entity_tuple)
        
        for word, entity, score, start, end in output_words:
            metadata_dict = {'bert_word': word, 'bert_entity': entity, 'bert_score': score, 'bert_start': start, 'bert_end': end, 'file_path': file_path}
            entities.append(metadata_dict)

        # Extract SpaCy named entities and add them as a row dictionary to the entities rows list
        if text_length <= nlp.max_length:
            doc = nlp(text)
            entities.extend([
                {'file_path': file_path, 'nlp_word': word.text, 'nlp_tag': word.tag_, 'nlp_type': word.ent_type_, 'nlp_pofs': word.pos_}
                for word in doc
            ])
            entities.extend([
                {'file_path': file_path, 'ent_phrase': ent.text, 'ent_type': ent.label_, 'ent_start': ent.start_char, 'ent_end': ent.end_char}
                for ent in doc.ents
            ])
    domain_doc_ners_df = DataFrame(entities)
    nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


In [25]:

# Assert that all the subword tokens are gone
print(domain_doc_ners_df.columns.tolist())
mask_series = domain_doc_ners_df.bert_word.map(lambda x: str(x).startswith('##'))
df = domain_doc_ners_df[mask_series]
assert (df.shape[0] == 0), 'There still exist subword tokens.'

['bert_word', 'bert_entity', 'bert_score', 'bert_start', 'bert_end', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end']


In [26]:

# Check that you indeed have only strings among the BERT entities
mask_series = domain_doc_ners_df.bert_entity.isnull()
sorted(domain_doc_ners_df[~mask_series].bert_entity.tolist(), key=lambda x: len(str(x)), reverse=True)[:10]

['I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC']


## Explore the entity type columns

In [27]:

# Print a sample of the ent_type column
categories_list = sorted([str(w) for w in domain_doc_ners_df.ent_type.unique()])
categories_list = random.sample(categories_list, min(len(categories_list), 10))
print(categories_list)
mask_series = domain_doc_ners_df.ent_type.isin(categories_list)
for type, type_df in domain_doc_ners_df[mask_series].groupby('ent_type'):
    mask_series = type_df.ent_phrase.isnull()
    texts_list = sorted(type_df[~mask_series].ent_phrase.unique())
    print()
    display(type, random.sample(texts_list, min(len(texts_list), 10)))

['ORG', 'LAW', 'PERSON', 'MONEY', 'EVENT', 'LOC', 'PRODUCT', 'CARDINAL', 'QUANTITY', 'WORK_OF_ART']



'CARDINAL'

['9', '400', '3', '102', '20', '93', '1510.25', '4:1', '1-31', '1:1']




'EVENT'

['Operation Enduring Freedom', 'Operation Iraqi Freedom (OIF', 'Generation I and II IFAKs', 'The Generation II IFAK']




'LAW'

['Chapter 3', 'Option 2', '• •Check', 'Chapter 4\nMARCH/PAWS Treatment Algorithms', 'the Warrior Aid', 'Basic Management Plan for Tactical Field Care', 'Basic Management Plan for Care Under Fire/Threat', 'Chapter 2', 'Universal Splint 2', 'Nasal Trumpet 3\n']




'LOC'

['TCCC', '• Extraglottic', 'the Battlefield Part II', 'Black Talon', '• •Cut', 'analgesia', 'Luer', 'Elastic', 'Joint Publication', 'http://www.naemt.org/education/TCCC/guidelines_curriculum']




'MONEY'

['#', '90  \n• Hypothermia  \n• End', '500 mL bolus', '100 ml']




'ORG'

['AHS', 'Expeditionary Medical Support', 'OTFC', '− Direct', 'Hypothermia Prevention and Management Kit', 'Special Forces', 'C. Extraction', 'the Rapid Fielding Initiative', 'GlideScope', '• •Medical Simulation and Training Centers']




'PERSON'

['Flurbiprofen', '•••*NPA', '•Any', 'Andre Cap', 'Russ Kotwal', 'James Czarnik', '− Cover', 'MD CAPT', 'Zofran', '− Check']




'PRODUCT'

['• Chin', 'CRTS', 'a TCCC Card', 'c. Estimate', 'Task Evaluation', 'Tourniquet Conversion', '• •Assess', 'Fourth Edition', '•', '• Nasopharyngeal']




'QUANTITY'

['2 inches', '90-degree', '1 mg/2 ml', '4 1/2 inch', '8 cm', '3 inches', '6 mm', '90 degrees', '10 kg ABOVE', '30 degrees']




'WORK_OF_ART'

['Intraosseous Access with the Sternal EZ-IO Needle Set\nThe Sternal EZ-IO', '− Reassess', 'Request for Publications', '− Minimize', '•− News From the Front\n•− Training Lessons', 'Publications by Type', 'Aeromedical Evacuation', 'Fundamentals of Combat Casualty Care', 'Tactical Evacuation', 'A = Panels\nB = Pyrotechnic']

In [28]:

# Print a sample of the nlp_type column
categories_list = sorted([str(w) for w in domain_doc_ners_df.nlp_type.unique()])
categories_list = random.sample(categories_list, min(len(categories_list), 10))
print(categories_list)
mask_series = domain_doc_ners_df.nlp_type.isin(categories_list)
for type, type_df in domain_doc_ners_df[mask_series].groupby('nlp_type'):
    mask_series = type_df.nlp_word.isnull()
    texts_list = sorted(type_df[~mask_series].nlp_word.unique())
    print()
    display(type, random.sample(texts_list, min(len(texts_list), 10)))

['WORK_OF_ART', 'PERCENT', 'LOC', 'QUANTITY', 'DATE', 'PERSON', 'NORP', 'GPE', 'FAC', 'ORDINAL']



'DATE'

['242018Z', '5163120', '519', 'mid', '527', '1016', '1350', '524', '3204', '532']




'FAC'

['Palpated', '•', 'Casualty', 'Blizzard', 'Gram', 'Reflective', 'Key', 'Blanket', '•Move', 'Airway']




'GPE'

['tibia', '−', 'mmHG', 'MD', 'Chicago', 'IO', '.', 'NS', 'U.S.', 'Afghanistan']




'LOC'

['Black', 'Publication', 'analgesia', 'II', '•Cut', 'TCCC', 'Elastic', 'http://www.naemt.org/education/TCCC/guidelines_curriculum', 'Luer', 'Extraglottic']




'NORP'

['D.', 'Line', 'anesthetist', 'Heimlich', 'Melker', 'Soldier', 'Combat', 'Litter', 'q45min', 'R.']




'ORDINAL'

['6th', 'fifth', 'fourth', 'third', '5th', '3rd', 'second', '2nd', 'first', 'First']




'PERCENT'

['than', 'or', '33', 'greater', '10', '87', 'to', '20', 'percent', 'More']




'PERSON'

['Landing', 'Cover', 'Newsletters', 'Jay', 'Maitha', '\n', 'Lubricated', 'Donovan', 'Add', 'Expert']




'QUANTITY'

['2', 'milliliter', '6', 'ABOVE', 'centimeter', 'one', '/', '400', 'over', 'at']




'WORK_OF_ART'

['Management', 'Evacuation', 'Type', 'Shock', ':', 'Care', 'for', 'Request', 'EZ', 'A']


## Explore the tag and parts-of-speech columns

In [29]:

# Print a sample of the nlp_tag column
categories_list = sorted([str(w) for w in domain_doc_ners_df.nlp_tag.unique()])
categories_list = random.sample(categories_list, min(len(categories_list), 10))
print(categories_list)
mask_series = domain_doc_ners_df.nlp_tag.isin(categories_list)
for tag, tag_df in domain_doc_ners_df[mask_series].groupby('nlp_tag'):
    mask_series = tag_df.nlp_word.isnull()
    texts_list = sorted(tag_df[~mask_series].nlp_word.unique())
    print()
    display(tag, random.sample(texts_list, min(len(texts_list), 10)))

['nan', 'WP', 'UH', 'DT', 'VBD', '_SP', 'TO', 'VBG', 'POS', 'VBZ']



'DT'

['all', 'this', 'any', 'both', 'either', 'The', 'these', 'some', 'an', 'half']




'POS'

['s', "'s", '’s']




'TO'

['to', 'To']




'UH'

['O', 'Please', 'please']




'VBD'

['inserted', 'swung', 'Inserted', 'fasten', 'Applied', 'uncontrolled', 'were', '•Assess', 'cut', 'was']




'VBG'

['•Using', 'Folding', 'executing', 'aiming', 'becoming', 'spurting', 'speaking', 'marking', '•Secure', 'performing']




'VBZ'

['conducts', 'increases', 'has', 'is', 'consists', 'develops', 'helps', 'feels', 'scars', 'writes']




'WP'

['•Cover', 'whom', 'who', 'what']




'_SP'

['  \n', '  ', '\n \n \n', ' \n   ', ' \n \n', ' \n \n \n', '      ', '\n\n\n', ' ', '\n   ']

In [30]:

# Print a sample of the nlp_pofs column
categories_list = sorted([str(w) for w in domain_doc_ners_df.nlp_pofs.unique()])
categories_list = random.sample(categories_list, min(len(categories_list), 10))
print(categories_list)
mask_series = domain_doc_ners_df.nlp_pofs.isin(categories_list)
for pofs, pofs_df in domain_doc_ners_df[mask_series].groupby('nlp_pofs'):
    mask_series = pofs_df.nlp_word.isnull()
    texts_list = sorted(pofs_df[~mask_series].nlp_word.unique())
    print()
    display(pofs, random.sample(texts_list, min(len(texts_list), 10)))

['SCONJ', 'nan', 'PROPN', 'AUX', 'CCONJ', 'X', 'DET', 'NUM', 'ADV', 'SPACE']



'ADV'

['Therefore', 'regardless', 'Later', 'over', 'scientifically', 'least', 'longer', 'now', 'Continuously', 'readily']




'AUX'

['is', 'been', 'May', 'can', '•Did', 'Do', 'must', 'may', 'has', 'were']




'CCONJ'

['and', 'or', 'but', 'Either', 'nor', '•Place', 'OR', 'either', 'so', 'AND']




'DET'

['A', 'half', 'a', 'whose', 'the', 'no', 'No', 'that', 'Each', 'another']




'NUM'

['68', '119', '89', '1:1', '106', '4.5', '9', '8-', '5,000', 'six']




'PROPN'

['Pharyngeal', 'SOCIAL', 'Biological', 'Nored', 'Ankle', 'Arms', 'D.', 'MEDIA', 'TEC', 'Travis']




'SCONJ'

['for', 'After', 'so', 'While', 'whether', 'while', 'unless', 'As', 'after', 'when']




'SPACE'

['\n \n', ' \n \n                   ', ' \n \n \n', '\n ', ' \n  \n    ', '\n   ', ' ', ' \n   \n \n \n ', '\n \n ', ' \n ']




'X'

['15', '•Begin', '9', '•', '•Check', '.', '•••*Replace', 'TRAC2ES', 'MEDPROS', '•••*Combat']


## Explore the entity column

In [31]:

# Print a sample of the bert_entity column
categories_list = sorted([str(w) for w in domain_doc_ners_df.bert_entity.unique()])
categories_list = random.sample(categories_list, min(len(categories_list), 10))
print(categories_list)
mask_series = domain_doc_ners_df.bert_entity.isin(categories_list)
for entity, entity_df in domain_doc_ners_df[mask_series].groupby('bert_entity'):
    mask_series = entity_df.bert_word.isnull()
    texts_list = sorted(entity_df[~mask_series].bert_word.unique())
    print()
    display(entity, random.sample(texts_list, min(len(texts_list), 10)))

['I-LOC', 'I-ORG', 'I-MISC', 'nan']



'I-LOC'

['Afghanistan', 'S']




'I-MISC'

['US', 'Cas', 'DIA', 'Care', 'CareCCC', 'ian', 'Handbook', 'Tactical', 'as', 'lines']




'I-ORG'

['TCCC', 'CCC', 'All', 'CoTCCC', 'on', 'Co', 'Combat', 'S']


## Explore column groupbys

In [32]:

for file_path, file_path_df in domain_doc_ners_df.groupby('file_path'):
    print(file_path)
    display(file_path_df.sample(4).dropna(axis='columns', how='all').T)
    break

../data/Domain_Knowledge/OSU Additions/17-13-tactical-casualty-combat-care-handbook-v5-may-17-distro-a (1).txt


Unnamed: 0,7220,7748,22189,4312
file_path,../data/Domain_Knowledge/OSU Additions/17-13-t...,../data/Domain_Knowledge/OSU Additions/17-13-t...,../data/Domain_Knowledge/OSU Additions/17-13-t...,../data/Domain_Knowledge/OSU Additions/17-13-t...
nlp_word,axillary,.,easily,.
nlp_tag,JJ,.,RB,.
nlp_type,,,,
nlp_pofs,ADJ,PUNCT,ADV,PUNCT


In [34]:

for nlp_tag, nlp_tag_df in domain_doc_ners_df.groupby('nlp_tag'):
    if (nlp_tag_df.shape[0] >= 4):
        print(nlp_tag)
        display(nlp_tag_df.sample(4).dropna(axis='columns', how='all').T)
        break

''


Unnamed: 0,26745,26810,3478,23674
file_path,../data/Domain_Knowledge/OSU Additions/17-13-t...,../data/Domain_Knowledge/OSU Additions/17-13-t...,../data/Domain_Knowledge/OSU Additions/17-13-t...,../data/Domain_Knowledge/OSU Additions/17-13-t...
nlp_word,”,”,’,”
nlp_tag,'','','',''
nlp_type,,,,
nlp_pofs,PUNCT,PUNCT,PUNCT,PUNCT



## Explore combinations of BERT words

In [35]:

from itertools import combinations

columns_list = [cn for cn in domain_doc_ners_df.columns if cn.startswith('bert_')]
for groupby_columns in combinations(columns_list, 2):
    groupby_columns = list(groupby_columns)
    df = nu.get_minority_combinations(domain_doc_ners_df, groupby_columns).dropna(axis='columns', how='all')
    if df.shape[0]:
        print(nu.conjunctify_nouns(groupby_columns))
        df.file_path = df.file_path.map(lambda x: str(x).split('/')[-1])
        display(df)

bert_word and bert_entity


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
6,Afghanistan,I-LOC,0.999079,611.0,987.0,17-13-tactical-casualty-combat-care-handbook-v...
30463,ian,I-MISC,0.550099,1132.0,1143.0,9-LINE MEDEVAC AND MIST PREP.txt
11,as,I-MISC,0.485629,1505.0,1517.0,17-13-tactical-casualty-combat-care-handbook-v...
0,Tactical,I-MISC,0.952839,290.0,305.0,17-13-tactical-casualty-combat-care-handbook-v...


bert_word and bert_score


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
6,Afghanistan,I-LOC,0.999079,611.0,987.0,17-13-tactical-casualty-combat-care-handbook-v...
30883,lines,I-MISC,0.576725,46.0,159.0,Tactical Combat Casualty Care (TCCC) 2021.txt
30463,ian,I-MISC,0.550099,1132.0,1143.0,9-LINE MEDEVAC AND MIST PREP.txt
11,as,I-MISC,0.485629,1505.0,1517.0,17-13-tactical-casualty-combat-care-handbook-v...


bert_word and bert_start


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
6,Afghanistan,I-LOC,0.999079,611.0,987.0,17-13-tactical-casualty-combat-care-handbook-v...
30883,lines,I-MISC,0.576725,46.0,159.0,Tactical Combat Casualty Care (TCCC) 2021.txt
30463,ian,I-MISC,0.550099,1132.0,1143.0,9-LINE MEDEVAC AND MIST PREP.txt
11,as,I-MISC,0.485629,1505.0,1517.0,17-13-tactical-casualty-combat-care-handbook-v...


bert_word and bert_end


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
6,Afghanistan,I-LOC,0.999079,611.0,987.0,17-13-tactical-casualty-combat-care-handbook-v...
30883,lines,I-MISC,0.576725,46.0,159.0,Tactical Combat Casualty Care (TCCC) 2021.txt
30463,ian,I-MISC,0.550099,1132.0,1143.0,9-LINE MEDEVAC AND MIST PREP.txt
11,as,I-MISC,0.485629,1505.0,1517.0,17-13-tactical-casualty-combat-care-handbook-v...


bert_entity and bert_score


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
7,S,I-LOC,0.974557,988.0,1233.0,17-13-tactical-casualty-combat-care-handbook-v...
9,on,I-ORG,0.954088,1485.0,1496.0,17-13-tactical-casualty-combat-care-handbook-v...
10,Combat,I-ORG,0.898376,1497.0,1505.0,17-13-tactical-casualty-combat-care-handbook-v...
14,S,I-ORG,0.742123,1954.0,2068.0,17-13-tactical-casualty-combat-care-handbook-v...


bert_entity and bert_start


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
6,Afghanistan,I-LOC,0.999079,611.0,987.0,17-13-tactical-casualty-combat-care-handbook-v...
13,All,I-ORG,0.559561,1944.0,1953.0,17-13-tactical-casualty-combat-care-handbook-v...
30887,CCC,I-ORG,0.52063,1531.0,1534.0,Tactical Combat Casualty Care (TCCC) 2021.txt
12,CoTCCC,I-ORG,0.990001,1519.0,1525.0,17-13-tactical-casualty-combat-care-handbook-v...


bert_entity and bert_end


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
6,Afghanistan,I-LOC,0.999079,611.0,987.0,17-13-tactical-casualty-combat-care-handbook-v...
13,All,I-ORG,0.559561,1944.0,1953.0,17-13-tactical-casualty-combat-care-handbook-v...
30887,CCC,I-ORG,0.52063,1531.0,1534.0,Tactical Combat Casualty Care (TCCC) 2021.txt
12,CoTCCC,I-ORG,0.990001,1519.0,1525.0,17-13-tactical-casualty-combat-care-handbook-v...


bert_score and bert_start


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
3,DIA,I-MISC,0.462606,364.0,367.0,17-13-tactical-casualty-combat-care-handbook-v...
30465,US,I-MISC,0.984484,1164.0,1166.0,9-LINE MEDEVAC AND MIST PREP.txt
7,S,I-LOC,0.974557,988.0,1233.0,17-13-tactical-casualty-combat-care-handbook-v...
30464,US,I-MISC,0.968811,1144.0,1163.0,9-LINE MEDEVAC AND MIST PREP.txt


bert_score and bert_end


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
3,DIA,I-MISC,0.462606,364.0,367.0,17-13-tactical-casualty-combat-care-handbook-v...
30465,US,I-MISC,0.984484,1164.0,1166.0,9-LINE MEDEVAC AND MIST PREP.txt
7,S,I-LOC,0.974557,988.0,1233.0,17-13-tactical-casualty-combat-care-handbook-v...
30464,US,I-MISC,0.968811,1144.0,1163.0,9-LINE MEDEVAC AND MIST PREP.txt


bert_start and bert_end


Unnamed: 0,bert_word,bert_entity,bert_score,bert_start,bert_end,file_path
30880,Combat,I-MISC,0.575667,12.0,20.0,Tactical Combat Casualty Care (TCCC) 2021.txt
13,All,I-ORG,0.559561,1944.0,1953.0,17-13-tactical-casualty-combat-care-handbook-v...
30887,CCC,I-ORG,0.52063,1531.0,1534.0,Tactical Combat Casualty Care (TCCC) 2021.txt
12,CoTCCC,I-ORG,0.990001,1519.0,1525.0,17-13-tactical-casualty-combat-care-handbook-v...


In [37]:

nu.delete_ipynb_checkpoint_folders()

In [38]:

# Load the NER entities from a CSV
if nu.csv_exists('domain_doc_ners_df_old'):
    domain_doc_ners_old_df = nu.load_data_frames(domain_doc_ners_df_old='domain_doc_ners_df_old')['domain_doc_ners_df_old']
    print(domain_doc_ners_old_df.columns.tolist())

No pickle exists for domain_doc_ners_df_old - attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df_old.csv.
['bert_word', 'bert_entity', 'bert_score', 'bert_start', 'bert_end', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end', 'is_probe', 'is_probe_probability']


In [63]:

regexed_columns_list = ['bert_word', 'nlp_word', 'ent_phrase']
df = domain_doc_ners_old_df[domain_doc_ners_old_df.is_probe][regexed_columns_list]
sorted(set(
    [word_str for phrase_str in nu.modalize_columns(df, regexed_columns_list, 'phrase_str').phrase_str.unique() for word_str in re.split(
        r'[\s&./_•−◻®,†:‡\-]+', phrase_str.lower(), 0
    ) if word_str]
))

['01', '07002', '2018', '2nd', 'a', 'adjuncts', 'airway', 'airways', 'amedd', 'and', 'application', 'applied', 'area', 'army', 'assess', 'assessment', 'assets', 'background', 'based', 'battlefield', 'care', 'casualty', 'celox', 'chest', 'chin', 'chitogauze', 'chitosan', 'circulation', 'collection', 'com', 'combat', 'compression', 'concept', 'continue', 'control', 'controlled', 'convert', 'cpgs', 'current', 'decompress', 'decompressing', 'decompression', 'devices', 'docs', 'dressings', 'drugs', 'emergency', 'establishing', 'evacuation', 'external', 'extraction', 'field', 'for', 'forces', 'fulltext', 'gauze', 'guidelines', 'guiding', 'hemorrhage', 'hemostatic', 'https', 'immediate', 'impending', 'impregnated', 'in', 'individual', 'insert', 'interventions', 'intranasal', 'intranasally', 'jaw', 'journals', 'jtrauma', 'jts', 'junctional', 'kaolin', 'kit', 'learned', 'lessons', 'life', 'lifesaver', 'lift', 'likely', 'limb', 'lww', 'maintaining', 'management', 'maneuver', 'massive', 'medical'

In [64]:

canonical_phrases = [
    'airway', 'celox', 'chest', 'chin', 'chitogauze', 'chitosan', 'compression', 'cpgs', 'decompress', 'dressing', 'gauze', 'hemorrhage',
    'hemostatic',
    'intranasal', 'jaw', 'junctional', 'kaolin', 'lifesaver', 'limb', 'nasopharyngeal', 'needle', 'pernasal', 'pneumatic', 'prehospital',
    'quickclot',
    'respirations', 'tourniquet', 'trauma', 'triage'
]

In [66]:

# Build a data frame of just the phrases and words that were picked up and their various measures
rows_list = []
indices_list = []
cns_list = []
word_strs_list = []
word_columns_list = [
    'bert_word', 'nlp_word', 'ent_phrase',
    'bert_entity', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_type'
]
if 'is_probe' not in domain_doc_ners_df.columns: domain_doc_ners_df['is_probe'] = False
for cn in set(regexed_columns_list):
    for word_str in set(canonical_phrases):
        mask_series = domain_doc_ners_df[cn].map(lambda x: word_str in str(x).lower())
        df = domain_doc_ners_df[mask_series][word_columns_list].dropna(axis='columns', how='all')
        if df.shape[0]: domain_doc_ners_df.loc[mask_series, 'is_probe'] = True

In [67]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

vectorizer = CountVectorizer(
    lowercase=True, ngram_range=(1, 3)
)
tfidf_transformer = TfidfTransformer(
    norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True
)
classifier = SGDClassifier(loss='log_loss', warm_start=True)
mask_series = domain_doc_ners_df.ent_phrase.isnull()
columns_list = ['ent_phrase', 'is_probe']
df = domain_doc_ners_df[~mask_series][columns_list]
df.is_probe = df.is_probe.map(
    lambda x: {True: 1, False: 0}.get(x, x)
)
train_data_list = df.ent_phrase.tolist()
train_labels_list = df.is_probe.values
X_train_counts = vectorizer.fit_transform(train_data_list)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train on initial data
classifier.fit(X_train_tfidf, train_labels_list)

In [68]:

mask_series = domain_doc_ners_df.ent_phrase.isnull()
sample_ent_phrase = domain_doc_ners_df[~mask_series].sample(1).ent_phrase.squeeze()
display(sample_ent_phrase)
X_test = tfidf_transformer.transform(vectorizer.transform([sample_ent_phrase])).toarray()
display(classifier.predict_proba(X_test)[0][1])

'2006'

0.016255309990146363

In [69]:

if 'is_probe_probability' not in domain_doc_ners_df.columns: domain_doc_ners_df['is_probe_probability'] = 0.0
mask_series = domain_doc_ners_df.ent_phrase.isnull()
domain_doc_ners_df.loc[~mask_series, 'is_probe_probability'] = domain_doc_ners_df[~mask_series].ent_phrase.map(
    lambda x: classifier.predict_proba(tfidf_transformer.transform(vectorizer.transform([x])).toarray())[0][1]
)

In [70]:

nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


In [72]:

nu.save_data_frames(domain_doc_ners_df=concat([domain_doc_ners_df, domain_doc_ners_old_df]))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv
