In [1]:

# Toggle IPython's pretty printing (the recorded output shows this run switched it OFF)
%pprint
import sys
# Insert the sibling ../py directory at position 1 of the module search path;
# presumably this is where the FRVRS package imported in the next cell lives - TODO confirm
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from FRVRS import nu
import os
import os.path as osp
import re


# Search for Scenario and Probe Vocabulary

In [3]:

# Bring the named-entity results into memory, but only when a saved CSV is available
ners_frame_name = 'domain_doc_ners_df'
if nu.csv_exists(ners_frame_name):
    loaded_frames_dict = nu.load_data_frames(**{ners_frame_name: ners_frame_name})
    domain_doc_ners_df = loaded_frames_dict[ners_frame_name]

    # Show which columns came back
    print(domain_doc_ners_df.columns.tolist())

No pickle exists for domain_doc_ners_df - attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv.
['bert_word', 'bert_entity', 'bert_score', 'bert_start', 'bert_end', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end']


In [4]:

# Ask the project loader for the FRVRS logs data frame and unpack it from the returned mapping
logs_frames_dict = nu.load_data_frames(frvrs_logs_df='frvrs_logs_df')
frvrs_logs_df = logs_frames_dict['frvrs_logs_df']

# Report its (rows, columns) dimensions
print(frvrs_logs_df.shape)

Attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/frvrs_logs_df.pkl.
Argument 'placement' has incorrect type (expected pandas._libs.internals.BlockPlacement, got slice)
No pickle exists for frvrs_logs_df - attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/frvrs_logs_df.csv.
(829116, 114)


In [9]:

# Create a list of starting phrases to label for model training.
# The previously saved CSV carries the hand-applied is_probe flags; they are
# propagated onto the in-memory frame by matching on each of the word columns.
df = nu.load_csv(csv_name='domain_doc_ners_df')
print(df.columns.tolist())
base_mask_series = df.is_probe
domain_doc_ners_df['is_probe'] = False

# For every word column: gather the distinct non-null values that were flagged
# in the saved CSV, then flag each in-memory row carrying one of those values.
# (One loop replaces three copy-pasted stanzas so the columns cannot drift apart.)
probe_values_dict = {}
for word_column in ['bert_word', 'nlp_word', 'ent_phrase']:
    mask_series = base_mask_series & df[word_column].notna()
    probe_values_dict[word_column] = df[mask_series][word_column].unique().tolist()
    mask_series = domain_doc_ners_df[word_column].isin(probe_values_dict[word_column])
    domain_doc_ners_df.loc[mask_series, 'is_probe'] = True

# Keep the per-column lists available under their established names
bert_words_list = probe_values_dict['bert_word']    # the BERT words
nlp_words_list = probe_values_dict['nlp_word']      # the SpaCy words
ent_phrases_list = probe_values_dict['ent_phrase']  # the SpaCy entities

# Create the unique relevant words (sorted so the order is the same on every run;
# key=str keeps the sort safe should a column hold non-string values) and save the data frame
canonical_phrases = sorted(set(bert_words_list + nlp_words_list + ent_phrases_list), key=str)
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

['bert_word', 'bert_entity', 'bert_score', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end', 'is_probe', 'is_probe_probability']
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv



## Clean up ent_phrase column

In [10]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

# Token-count features over lowercased unigrams, bigrams and trigrams
vectorizer = CountVectorizer(
    lowercase=True, ngram_range=(1, 3)
)

# Re-weight the counts with TF-IDF, L1-normalizing each row
tfidf_transformer = TfidfTransformer(
    norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True
)

# Logistic-loss linear model fitted by SGD. The fixed random_state (an arbitrary
# seed) makes the data shuffling, and therefore the fitted weights and the
# probabilities derived from them, reproducible from run to run.
classifier = SGDClassifier(loss='log_loss', warm_start=True, random_state=42)

# Training frame: rows that have an entity phrase, reduced to phrase + label.
# .copy() gives an independent frame, so recoding the label below never writes
# into a slice of domain_doc_ners_df (which raises SettingWithCopyWarning).
mask_series = domain_doc_ners_df.ent_phrase.isnull()
columns_list = ['ent_phrase', 'is_probe']
df = domain_doc_ners_df.loc[~mask_series, columns_list].copy()

# Recode True/False as 1/0; any other value passes through unchanged
label_codes_dict = {True: 1, False: 0}
df['is_probe'] = df.is_probe.map(lambda x: label_codes_dict.get(x, x))

train_data_list = df.ent_phrase.tolist()
train_labels_list = df.is_probe.values
X_train_counts = vectorizer.fit_transform(train_data_list)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train on initial data
classifier.fit(X_train_tfidf, train_labels_list)

In [11]:

# Draw one random row among those that have an entity phrase and show the phrase
mask_series = domain_doc_ners_df.ent_phrase.isnull()
phrases_df = domain_doc_ners_df[~mask_series]
sample_ent_phrase = phrases_df.sample(1).ent_phrase.squeeze()
display(sample_ent_phrase)

# Run it through the fitted count vectorizer and TF-IDF transformer, then show
# the probability the classifier assigns to the positive (second) class
sample_counts = vectorizer.transform([sample_ent_phrase])
X_test = tfidf_transformer.transform(sample_counts).toarray()
sample_probability = classifier.predict_proba(X_test)[0][1]
display(sample_probability)

'2007;44(2):148–152'

0.0027281047226410384

In [12]:

# Make sure the probability column exists before filling it in
if 'is_probe_probability' not in domain_doc_ners_df.columns:
    domain_doc_ners_df['is_probe_probability'] = 0.0

# Score every row that has an entity phrase. All phrases are vectorized and
# scored in ONE batch rather than one predict_proba call per row: the per-row
# values are the same, but the vectorizer/classifier call overhead is paid once
# instead of once per row. The features stay sparse, which predict_proba accepts.
mask_series = domain_doc_ners_df.ent_phrase.isnull()
ent_phrases_list = domain_doc_ners_df.loc[~mask_series, 'ent_phrase'].tolist()
if ent_phrases_list:  # nothing to score when no row has a phrase
    X_phrases_tfidf = tfidf_transformer.transform(vectorizer.transform(ent_phrases_list))

    # Column 1 holds the probability of the positive class; rows come back in input order
    domain_doc_ners_df.loc[~mask_series, 'is_probe_probability'] = classifier.predict_proba(X_phrases_tfidf)[:, 1]

In [15]:

# Columns worth looking at when reviewing candidate probe phrases
analysis_columns = [
    'bert_word', 'bert_entity', 'bert_score',
    'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs',
    'ent_phrase', 'ent_type',
    'is_probe', 'is_probe_probability',
]

# Rows not flagged as probes that nevertheless received a positive probability
mask_series = ~domain_doc_ners_df.is_probe & (domain_doc_ners_df.is_probe_probability > 0.0)
print(domain_doc_ners_df.columns.tolist())

# Rank them by descending probability and show the last 20 of the top 60,
# hiding any column that is entirely empty for those rows
ranked_df = domain_doc_ners_df[mask_series][analysis_columns].sort_values('is_probe_probability', ascending=False)
page_df = ranked_df.head(60).tail(20)
display(page_df.dropna(axis='columns', how='all'))

['bert_word', 'bert_entity', 'bert_score', 'bert_start', 'bert_end', 'file_path', 'nlp_word', 'nlp_tag', 'nlp_type', 'nlp_pofs', 'ent_phrase', 'ent_type', 'ent_start', 'ent_end', 'is_probe', 'is_probe_probability']


Unnamed: 0,ent_phrase,ent_type,is_probe,is_probe_probability
743251,Combat Ethics,ORG,False,0.003823
743253,COMBAT ETHICS,ORG,False,0.003823
743256,COMBAT ETHICS,ORG,False,0.003823
743290,Combat Ethics,ORG,False,0.003823
743905,Combat Ethics,ORG,False,0.003823
743553,Combat Ethics,ORG,False,0.003823
743724,Combat Ethics,ORG,False,0.003823
743346,Combat Ethics,ORG,False,0.003823
744226,Combat Ethics,ORG,False,0.003823
275284,Combat Duty,ORG,False,0.0036



### ent_phrase Maintenance

In [31]:

# Hand-picked vocabulary to review one word at a time. The commented-out snippet
# below splits the probe-flagged phrases into candidate words (it appears to be
# how this list was originally produced):
# test_words_list = ' '.join([str(x).lower() for x in domain_doc_ners_df[domain_doc_ners_df.is_probe].ent_phrase.tolist()])
# print(sorted(set(re.compile(r'[\s/◻&‡:†®−\.•,-]+').split(test_words_list))))
test_words_list = [
    'airway', 'celox', 'chest', 'chin', 'chito', 'circulation',
    'compression', 'decompress', 'dressings', 'gauze', 'hemorrhage', 'hemostatic',
    'jaw', 'junctional', 'kaolin', 'keenan', 'lifesaver', 'limb',
    'nasopharyngeal', 'needle', 'pneumatic', 'quickclot', 'tourniquet', 'trauma',
    'triage', 'unconscious', 'wounds',
]

# Same review columns as before, with the source file path in front
word_analysis_columns = ['file_path', *analysis_columns]

# Echo the distinct words in alphabetical order
unique_words_set = set(test_words_list)
print(sorted(unique_words_set))

['airway', 'celox', 'chest', 'chin', 'chito', 'circulation', 'compression', 'decompress', 'dressings', 'gauze', 'hemorrhage', 'hemostatic', 'jaw', 'junctional', 'kaolin', 'keenan', 'lifesaver', 'limb', 'nasopharyngeal', 'needle', 'pneumatic', 'quickclot', 'tourniquet', 'trauma', 'triage', 'unconscious', 'wounds']


In [91]:

# Review the next vocabulary word. Each run of this cell removes one word from
# test_words_list and shows the probe-flagged rows whose phrase contains it.
# An explicit emptiness check replaces the former blanket `except Exception`,
# which reduced ANY failure (not just the exhausted list) to a one-line print.
if test_words_list:
    word_str = test_words_list.pop()

    # Case-insensitive substring match, restricted to rows already flagged as probes
    word_lower = word_str.lower()
    mask_series = domain_doc_ners_df.ent_phrase.map(lambda x: word_lower in str(x).lower()) & domain_doc_ners_df.is_probe

    # Copy the selection so shortening the paths below edits an independent frame
    # rather than a slice of domain_doc_ners_df (avoids SettingWithCopyWarning)
    df = domain_doc_ners_df.loc[mask_series, word_analysis_columns].copy()
    if df.shape[0]:
        display(word_str)

        # Drop the common directory prefix so the file names are readable
        df['file_path'] = df.file_path.map(lambda x: str(x).replace('../data/Domain_Knowledge/', ''))
        display(df.dropna(axis='columns', how='all'))
else:
    # Every word has been reviewed; same message the cell printed for this case before
    print('pop from empty list')

pop from empty list



----

In [95]:

# Clear the probe flag on every entity phrase that mentions "cpg" (case-insensitive)
# and persist the change.
# (To narrow the match, AND the mask with: domain_doc_ners_df.ent_type == 'PERSON')
mask_series = domain_doc_ners_df.ent_phrase.map(lambda phrase: 'cpg' in str(phrase).lower())
domain_doc_ners_df.loc[mask_series, 'is_probe'] = False
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show the affected rows: unflagged rows first, then by descending probability
df = domain_doc_ners_df[mask_series][word_analysis_columns]
df = df.sort_values(['is_probe', 'is_probe_probability'], ascending=[True, False])

# Strip the common directory prefix so the file names are readable
df['file_path'] = df.file_path.map(lambda path: str(path).replace('../data/Domain_Knowledge/', ''))
display(df.dropna(axis='columns', how='all'))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


Unnamed: 0,file_path,ent_phrase,ent_type,is_probe,is_probe_probability
38372,Prolonged_Casualty_Care_Guidelines_21_Dec_2021...,Tactical Triage Protocol \n \n \nProlonged Cas...,WORK_OF_ART,False,0.002818
38347,Prolonged_Casualty_Care_Guidelines_21_Dec_2021...,Prolonged Casualty Care Guidelines \nCPG,WORK_OF_ART,False,0.002744
37733,Prolonged_Casualty_Care_Guidelines_21_Dec_2021...,Prolonged Field Care CPG,WORK_OF_ART,False,0.002733
38143,Prolonged_Casualty_Care_Guidelines_21_Dec_2021...,https://jts.amedd.army.mil/assets/docs/cpgs/Da...,PERSON,False,0.002726
38269,Prolonged_Casualty_Care_Guidelines_21_Dec_2021...,https://jts.amedd.army.mil/assets/docs/cpgs/Bu...,ORG,False,0.002724
...,...,...,...,...,...
464501,Fundamentals of Military Medicine/Fund ch 38.txt,CPG,ORG,False,0.002635
464502,Fundamentals of Military Medicine/Fund ch 38.txt,CPG,ORG,False,0.002635
481139,Fundamentals of Military Medicine/Fund ch 39.txt,CPG,ORG,False,0.002635
481193,Fundamentals of Military Medicine/Fund ch 39.txt,CPG,ORG,False,0.002635


In [70]:

# Flag as probes the entity phrases that begin with this exact (newline-containing) text
phrase_prefix = 'Massive Hemorrhage\nAssess'
mask_series = domain_doc_ners_df.ent_phrase.map(lambda phrase: str(phrase).startswith(phrase_prefix))
domain_doc_ners_df.loc[mask_series, 'is_probe'] = True
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show what was flagged, most probable first, without all-empty columns
flagged_df = domain_doc_ners_df[mask_series][analysis_columns]
flagged_df = flagged_df.sort_values('is_probe_probability', ascending=False)
display(flagged_df.dropna(axis='columns', how='all'))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


Unnamed: 0,ent_phrase,ent_type,is_probe,is_probe_probability
82147,Massive Hemorrhage\nAssess,ORG,True,0.002849


In [71]:

# Flag as probes the entity phrases that begin with this exact (newline-containing) text
phrase_prefix = 'Massive External \nHemorrhage'
mask_series = domain_doc_ners_df.ent_phrase.map(lambda phrase: str(phrase).startswith(phrase_prefix))
domain_doc_ners_df.loc[mask_series, 'is_probe'] = True
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show what was flagged, most probable first, without all-empty columns
flagged_df = domain_doc_ners_df[mask_series][analysis_columns]
flagged_df = flagged_df.sort_values('is_probe_probability', ascending=False)
display(flagged_df.dropna(axis='columns', how='all'))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


Unnamed: 0,ent_phrase,ent_type,is_probe,is_probe_probability
82295,Massive External \nHemorrhage,ORG,True,0.00289
82622,Massive External \nHemorrhage,ORG,True,0.00289


In [72]:

# Flag as probes the entity phrases that begin with this exact (newline-containing) text
phrase_prefix = 'CONTINUE TACTICAL FIELD CARE\nHemorrhage Contr'
mask_series = domain_doc_ners_df.ent_phrase.map(lambda phrase: str(phrase).startswith(phrase_prefix))
domain_doc_ners_df.loc[mask_series, 'is_probe'] = True
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show what was flagged, most probable first, without all-empty columns
flagged_df = domain_doc_ners_df[mask_series][analysis_columns]
flagged_df = flagged_df.sort_values('is_probe_probability', ascending=False)
display(flagged_df.dropna(axis='columns', how='all'))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


Unnamed: 0,ent_phrase,ent_type,is_probe,is_probe_probability
82304,CONTINUE TACTICAL FIELD CARE\nHemorrhage Contr...,ORG,True,0.002793


In [63]:

# Flag as probes all entity phrases that mention "junctional" (case-insensitive)
# (To narrow the match, AND the mask with: domain_doc_ners_df.ent_type != 'PERSON')
mask_series = domain_doc_ners_df.ent_phrase.map(lambda phrase: 'junctional' in str(phrase).lower())
domain_doc_ners_df.loc[mask_series, 'is_probe'] = True
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show what was flagged, most probable first, without all-empty columns
flagged_df = domain_doc_ners_df[mask_series][analysis_columns]
flagged_df = flagged_df.sort_values('is_probe_probability', ascending=False)
display(flagged_df.dropna(axis='columns', how='all'))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


Unnamed: 0,ent_phrase,ent_type,is_probe,is_probe_probability
43482,Junctional,EVENT,True,0.003783
83572,SAM-Junctional Tourniquet,ORG,True,0.003396
82298,Junctional Wounds,FAC,True,0.003206
82625,Junctional Wounds,FAC,True,0.003206
383310,Junctional Hemorrhage,WORK_OF_ART,True,0.003188
383327,the SAM Junctional Tourniquet,ORG,True,0.003181
83052,Tourniquet Kit Junctional Compression,ORG,True,0.00312
384091,Control of junctional,ORG,True,0.002941
83047,JUNCTIONAL TOURNIQUETS & DEVICES,ORG,True,0.00293
383334,the Junctional Emergency Treatment Tool,ORG,True,0.002834


In [153]:

# Flag as probes all entity phrases that mention "nasopharyngeal" (case-insensitive)
# (To narrow the match, AND the mask with: domain_doc_ners_df.ent_type == 'PERSON')
mask_series = domain_doc_ners_df.ent_phrase.map(lambda phrase: 'nasopharyngeal' in str(phrase).lower())
domain_doc_ners_df.loc[mask_series, 'is_probe'] = True
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show what was flagged, most probable first, without all-empty columns
flagged_df = domain_doc_ners_df[mask_series][analysis_columns]
flagged_df = flagged_df.sort_values('is_probe_probability', ascending=False)
display(flagged_df.dropna(axis='columns', how='all'))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


Unnamed: 0,ent_phrase,ent_type,is_probe,is_probe_probability
82619,Insert Nasopharyngeal Airway,PERSON,True,0.002838
82949,Insert Nasopharyngeal Airway,PERSON,True,0.002838
54269,• Nasopharyngeal,PRODUCT,True,0.002654
54273,• Nasopharyngeal,PRODUCT,True,0.002654
62567,• Nasopharyngeal,PRODUCT,True,0.002654
62571,• Nasopharyngeal,PRODUCT,True,0.002654
82615,Impending Airway Obstruction\nUnconscious with...,WORK_OF_ART,True,0.00264
82944,Impending Airway Obstruction\nUnconscious with...,WORK_OF_ART,True,0.00264


In [43]:

# Both passes look only at entities typed as PERSON, matching on the lowercased phrase
is_person_series = domain_doc_ners_df.ent_type == 'PERSON'
lowered_phrases_series = domain_doc_ners_df.ent_phrase.map(lambda phrase: str(phrase).lower())

# First pass: clear the probe flag on every PERSON phrase that mentions a tourniquet
mask_series = lowered_phrases_series.map(lambda phrase: 'tourniquet' in phrase) & is_person_series
domain_doc_ners_df.loc[mask_series, 'is_probe'] = False

# Second pass: set it again for the ones that specifically say "limb tourniquet"
mask_series = lowered_phrases_series.map(lambda phrase: 'limb tourniquet' in phrase) & is_person_series
domain_doc_ners_df.loc[mask_series, 'is_probe'] = True
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show the re-flagged rows, most probable first, without all-empty columns
flagged_df = domain_doc_ners_df[mask_series][analysis_columns].dropna(axis='columns', how='all')
display(flagged_df.sort_values('is_probe_probability', ascending=False))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


Unnamed: 0,ent_phrase,ent_type,is_probe,is_probe_probability
82293,Limb Tourniquet,PERSON,True,0.004204
82620,Limb Tourniquet,PERSON,True,0.004204


In [None]:

# Flag as probes the entity phrases that mention "triage" (case-insensitive),
# leaving out anything typed as a PERSON
mentions_triage_series = domain_doc_ners_df.ent_phrase.map(lambda phrase: 'triage' in str(phrase).lower())
not_person_series = domain_doc_ners_df.ent_type != 'PERSON'
mask_series = mentions_triage_series & not_person_series
domain_doc_ners_df.loc[mask_series, 'is_probe'] = True
nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show what was flagged, most probable first, without all-empty columns
flagged_df = domain_doc_ners_df[mask_series][analysis_columns].dropna(axis='columns', how='all')
display(flagged_df.sort_values('is_probe_probability', ascending=False))

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ners_df.csv


Unnamed: 0,ent_phrase,ent_type,is_probe,is_probe_probability
405644,• Triage,ORG,True,0.003946
405699,Triage Area,LOC,True,0.003307
405705,the Triage Area,LOC,True,0.003095
405707,the Triage Area,LOC,True,0.003095
405708,the Triage Area,LOC,True,0.003095
36404,Tactical Triage Protocol,WORK_OF_ART,True,0.003018
38370,Tactical Triage Protocol,WORK_OF_ART,True,0.003018
36398,Triage Guiding Principles .......................,WORK_OF_ART,True,0.002992
38280,Triage Guiding Principles,WORK_OF_ART,True,0.002992
861794,Military Medical Triage\n,ORG,True,0.002949



----

In [None]:

# Template cell: replace the placeholder in search_str with the term to inspect.
# The phrases are lowercased before matching, so the term is lowercased too;
# previously a term typed with any capital letter could never match anything.
search_str = 'Xxxxxxxx'
search_lower = search_str.lower()
mask_series = domain_doc_ners_df.ent_phrase.map(lambda x: search_lower in str(x).lower())# & (domain_doc_ners_df.ent_type == 'PERSON')

# Uncomment the next two lines to flag the matches as probes and persist the change
# domain_doc_ners_df.loc[mask_series, 'is_probe'] = True
# nu.save_data_frames(domain_doc_ners_df=domain_doc_ners_df)

# Show the matches: unflagged rows first, then by descending probability,
# with the common directory prefix stripped from the file paths
df = domain_doc_ners_df[mask_series][word_analysis_columns].sort_values(['is_probe', 'is_probe_probability'], ascending=[True, False])
df.file_path = df.file_path.map(lambda x: str(x).replace('../data/Domain_Knowledge/', ''))
display(df.dropna(axis='columns', how='all'))


## Split the dataset by word column and save each subset separately

In [103]:

# One saved frame per word column, in this order: the BERT words, the SpaCy
# words, then the SpaCy phrases. Each subset keeps the rows where that column
# is populated and drops the columns that are entirely empty for those rows.
subset_names_dict = {
    'bert_word': 'domain_doc_bert_words_df',
    'nlp_word': 'domain_doc_nlp_words_df',
    'ent_phrase': 'domain_doc_ent_phrases_df',
}
for word_column, frame_name in subset_names_dict.items():
    mask_series = ~domain_doc_ners_df[word_column].isnull()
    subset_df = domain_doc_ners_df[mask_series].dropna(axis='columns', how='all')
    nu.save_data_frames(**{frame_name: subset_df})

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_bert_words_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_words_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_ent_phrases_df.csv


In [104]:

# Keep the rows that have a SpaCy word, discarding columns that are empty for that subset
mask_series = domain_doc_ners_df.nlp_word.notnull()
df = domain_doc_ners_df[mask_series].dropna(axis='columns', how='all')

# Print how many distinct values each SpaCy ("nlp_"-prefixed) column holds
columns_list = list(filter(lambda column_name: column_name.startswith('nlp_'), df.columns))
for cn in columns_list:
    distinct_count = len(df[cn].unique())
    print(cn, distinct_count)

nlp_word 47032
nlp_tag 50
nlp_type 19
nlp_pofs 18


In [109]:

# Write one frame per part-of-speech value; dropna=False keeps a group for
# rows whose nlp_pofs is missing instead of silently discarding them
pofs_groups = df.groupby('nlp_pofs', dropna=False)
for nlp_pofs, nlp_pofs_df in pofs_groups:
    frame_name = f'domain_doc_nlp_word_{str(nlp_pofs).lower()}_df'
    nu.save_data_frames(**{frame_name: nlp_pofs_df})

Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_word_adj_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_word_adp_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_word_adv_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_word_aux_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_word_cconj_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_word_det_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_word_intj_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/domain_doc_nlp_word_noun_df.csv
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-rep