In [1]:

%pprint
import sys
if ('../py' not in sys.path): sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from FRVRS import fu, nu
import numpy as np
import os
import os.path as osp
from pandas import DataFrame, Series, concat, notnull
import pandas as pd
import re

# Dataset Built for Metrics Evaluation Open World


Conduct some exploratory analysis of the open world segments for the ITM scenarios from the Metrics Evaluation.
For context, the results of these analyses are a goal for the 4/30 results meeting (stretch) or the PI meeting (more likely).

<h2>I want to ask the question: which factors contribute to the variance in these outcomes?</h2>
Conceptually, I want an exploratory factor analysis using these IVs and DVs. But I suspect we don’t have enough data for that so as close as we can get to that, let’s get creative.
My thought was to keep the environments separate because each participant did 2 of the environments so if we use 1 to explore, we can use the other to confirm. But again, I recognize we do not have power to do these properly.

<h2>Here is my initial list but I am open to suggestions and modifications</h2>
<h3>IVs (these are not available in the csv; we are working on calculating them now and can get you that info.)</h3>
<ul>
    <li>Participant medical role</li>
    <li>Years of experience</li>
    <li>ST alignment score (continuous or group assignment)</li>
    <li>AD alignment score (continuous or group assignment)</li>
</ul>
<h3>DVs</h3>
<ul>
    <li>Total number of actions</li>
    <li>Count of assessment actions</li>
    <li>Count of treatment actions</li>
    <li>Count of tags applied</li>
    <li>Order of patients engaged</li>
    <li>Tag color for each patient</li>
    <li>Treat expectant patient (yes/no)</li>
    <li>Triage efficiency</li>
    <li>Time to hemorrhage control</li>
</ul>

<h2>The csv files are available at https://nextcentury.atlassian.net/wiki/x/IYDJsg and are labeled: </h2>
<ul>
    <li>ITM 3.13.2024.zip; </li>
    <li>ITM 3.14.2024 405F.zip; </li>
    <li>ITM 3.14.2024 405E.zip;</li>
    <li>ITM 3.20.2024 405F.zip; </li>
    <li>ITM 3.20.2024 405E.zip;</li>
    <li>ITM 3.22.2024</li>
</ul>
<h2>In creating this dataset:</h2>
<ul>
    <li>Please keep these data segregated from all others before and after and label it “Metrics Evaluation Open World”</li>
    <li>Please keep the environments labeled: whether it is Jungle, Desert, Submarine, or Urban</li>
</ul>
<h2>We only want to use data from the following characters within each csv (by environment):</h2>
<h3>Desert:</h3>
<ul>
    <li>Open World Marine 1 Female</li>
    <li>Open World Marine 2 Male</li>
    <li>Open World Civilian 1 Male</li>
    <li>Open World Civilian 2 Female</li>
</ul>
<h3>Jungle:</h3>
<ul>
    <li>Open World Marine 1 Male</li>
    <li>Open World Marine 2 Female</li>
    <li>Open World Marine 3 Male</li>
    <li>Open World Marine 4 Male</li>
</ul>
<h3>Submarine:</h3>
<ul>
    <li>Navy Soldier 1 Male</li>
    <li>Navy Soldier 2 Male</li>
    <li>Navy Soldier 3 Male</li>
    <li>Navy Soldier 4 Female</li>
</ul>
<h3>Urban:</h3>
<ul>
    <li>Marine 1 Male</li>
    <li>Marine 2 Male</li>
    <li>Marine 3 Male</li>
    <li>Marine 4 Male</li>
    <li>Civilian 1 Female</li>
</ul>

In [3]:

# Get all the Metrics Evaluation Open World logs into one data frame
logs_path = '../data/logs/Metrics Evaluation Open World'

# Add the CSVs to the data frame
metrics_evaluation_open_world_df = fu.concatonate_logs(logs_folder=logs_path)

# Remove numerically-named columns, i.e. names made up entirely of digits.
# (Searching for a digit anywhere in the name would also discard every
# legitimate column that merely contains a number.)
columns_list = [x for x in metrics_evaluation_open_world_df.columns if not re.fullmatch(r'\d+', str(x))]
metrics_evaluation_open_world_df = metrics_evaluation_open_world_df[columns_list]

# Convert 'TRUE' and 'FALSE' to boolean values. Series.map turns every value missing
# from the dictionary into NaN, so real booleans are mapped onto themselves to survive.
boolean_map = {'TRUE': True, 'FALSE': False, 'True': True, 'False': False, True: True, False: False}
for cn in fu.boolean_columns_list:
    metrics_evaluation_open_world_df[cn] = metrics_evaluation_open_world_df[cn].map(boolean_map)

# Convert the 'null' strings into NaNs across the whole frame in one pass
metrics_evaluation_open_world_df = metrics_evaluation_open_world_df.replace('null', np.nan)

# Persist the ingested frame (the recorded output shows a pickle and a CSV being written)
nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)

print(metrics_evaluation_open_world_df.shape) # (276926, 95) in the recorded run

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 95)



## Check for proper ingestion (duplicate file ingestion, et al)

In [4]:

# Fail fast if the ingestion produced next to no columns
column_count = metrics_evaluation_open_world_df.shape[1]
assert column_count > 4, "Nothing ingested"

# Show how many records each logger version contributed
record_counts_df = metrics_evaluation_open_world_df.groupby('logger_version').size().to_frame('record_count')
display(record_counts_df)

Unnamed: 0_level_0,record_count
logger_version,Unnamed: 1_level_1
1.4,276926


In [6]:

# Flag every row whose session_uuid is associated with more than one distinct file_name;
# any such row means the same session was ingested from several files
file_counts_series = metrics_evaluation_open_world_df.groupby('session_uuid').file_name.transform('nunique')
mask_series = file_counts_series.gt(1)
assert not mask_series.any(), "You have duplicate files"

# Disabled clean-up that deletes the duplicate files from disk; kept for reference only
# columns_list = ['session_uuid', 'file_name']
# for (session_uuid, file_name), df in metrics_evaluation_open_world_df[mask_series][columns_list].drop_duplicates().sort_values(columns_list).groupby(
#     columns_list
# ):
#     if not file_name.startswith('Double runs removed/'):
#         file_path = osp.join(fu.data_logs_folder, *file_name.split('/'))
#         os.remove(file_path)

In [7]:

# Show how many records belong to aborted versus non-aborted scenes
record_counts_df = metrics_evaluation_open_world_df.groupby('is_scene_aborted').size().to_frame('record_count')
display(record_counts_df)

# Check that all your junk (aborted) scenes are the last scenes of their session
mask_series = metrics_evaluation_open_world_df.is_scene_aborted
aborted_scenes_df = metrics_evaluation_open_world_df[mask_series]
for (session_uuid, scene_id), scene_df in aborted_scenes_df.groupby(fu.scene_groupby_columns):

    # Compare this aborted scene against the highest scene_id logged in the same session
    mask_series = metrics_evaluation_open_world_df.session_uuid.eq(session_uuid)
    max_scene_id = metrics_evaluation_open_world_df.loc[mask_series, 'scene_id'].max()
    assert max_scene_id == scene_id, "You've got junk scenes in strange places"

Unnamed: 0_level_0,record_count
is_scene_aborted,Unnamed: 1_level_1
False,273964
True,2962



## Add new features according to your increasing domain knowledge


### Modalize separate columns into one

In [8]:

# Modalize the columns in fu.patient_id_columns_list into one patient_id column, unless it already exists
new_column_name = 'patient_id'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.patient_id_columns_list, new_column_name
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 96) in the recorded run

# List the five most frequent patient IDs
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df.sort_values('record_count', ascending=False).head(5))

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 96)


Unnamed: 0_level_0,record_count
patient_id,Unnamed: 1_level_1
Patient U Root,3787
Patient V Root,3426
Navy Soldier 2 Male Root,2530
Marine 3 Male Root,2489
patient U Root,1984


In [9]:

# Modalize the columns in fu.injury_id_columns_list into one injury_id column, unless it already exists
new_column_name = 'injury_id'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.injury_id_columns_list, new_column_name
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 97) in the recorded run

# List the five most frequent injury IDs
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df.sort_values('record_count', ascending=False).head(5))

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 97)


Unnamed: 0_level_0,record_count
injury_id,Unnamed: 1_level_1
L Leg Broken,267
L Shoulder Broken,207
R Forearm Burn,193
L Bicep Puncture,192
Unspecified,170


In [10]:

# Modalize the columns in fu.location_id_columns_list into one location_id column, unless it already exists
new_column_name = 'location_id'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.location_id_columns_list, new_column_name
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 98) in the recorded run

# List the five most frequent location IDs
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df.sort_values('record_count', ascending=False).head(5))

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 98)


Unnamed: 0_level_0,record_count
location_id,Unnamed: 1_level_1
"(0.0, 0.0, 0.0)",3259
"(0.2, 0.0, 0.3)",2395
"(-22.5, 0.0, -10.7)",2341
"(11.8, 0.0, 10.3)",2181
"(-19.6, 0.0, -10.0)",2085


In [11]:

# Modalize the columns in fu.sort_columns_list into one patient_sort column, unless it already exists
new_column_name = 'patient_sort'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.sort_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.sort_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.sort_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 99) in the recorded run

# Show the record count for each patient sort value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 99)


Unnamed: 0_level_0,record_count
patient_sort,Unnamed: 1_level_1
still,978
waver,1117
walker,426


In [12]:

# Modalize the columns in fu.pulse_columns_list into one patient_pulse column, unless it already exists
new_column_name = 'patient_pulse'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.pulse_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.pulse_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.pulse_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 100) in the recorded run

# Show the record count for each patient pulse value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 100)


Unnamed: 0_level_0,record_count
patient_pulse,Unnamed: 1_level_1
none,2
faint,449
fast,1205
normal,865


In [13]:

# Modalize the columns in fu.salt_columns_list into one patient_salt column, unless it already exists
new_column_name = 'patient_salt'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.salt_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.salt_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.salt_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 101) in the recorded run

# Show the record count for each patient SALT value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 101)


Unnamed: 0_level_0,record_count
patient_salt,Unnamed: 1_level_1
DEAD,0
EXPECTANT,448
IMMEDIATE,942
DELAYED,836
MINIMAL,295


In [14]:

# Modalize the columns in fu.hearing_columns_list into one patient_hearing column, unless it already exists
new_column_name = 'patient_hearing'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.hearing_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.hearing_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.hearing_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 102) in the recorded run

# Show the record count for each patient hearing value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 102)


Unnamed: 0_level_0,record_count
patient_hearing,Unnamed: 1_level_1
none,0
limited,25
normal,1781


In [15]:

# Modalize the columns in fu.breath_columns_list into one patient_breath column, unless it already exists
new_column_name = 'patient_breath'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.breath_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.breath_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.breath_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 103) in the recorded run

# Show the record count for each patient breath value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 103)


Unnamed: 0_level_0,record_count
patient_breath,Unnamed: 1_level_1
none,93
collapsedRight,0
restricted,214
fast,891
normal,1206


In [16]:

# Modalize the columns in fu.mood_columns_list into one patient_mood column, unless it already exists
new_column_name = 'patient_mood'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.mood_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.mood_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.mood_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 104) in the recorded run

# Show the record count for each patient mood value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 104)


Unnamed: 0_level_0,record_count
patient_mood,Unnamed: 1_level_1
dead,416
unresponsive,0
agony,653
upset,368
calm,239


In [17]:

# Modalize the columns in fu.pose_columns_list into one patient_pose column, unless it already exists
new_column_name = 'patient_pose'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.pose_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.pose_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.pose_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 105) in the recorded run

# Show the record count for each patient pose value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 105)


Unnamed: 0_level_0,record_count
patient_pose,Unnamed: 1_level_1
supine,1038
fetal,153
sittingGround,329
kneeling,26
recovery,0
standing,129


In [18]:

# Modalize the columns in fu.severity_columns_list into one injury_severity column, unless it already exists
new_column_name = 'injury_severity'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.severity_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.severity_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.severity_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 106) in the recorded run

# Show the record count for each injury severity value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 106)


Unnamed: 0_level_0,record_count
injury_severity,Unnamed: 1_level_1
high,1438
medium,1386
low,119


In [19]:

# Modalize the columns in fu.required_procedure_columns_list into one injury_required_procedure column, unless it already exists
new_column_name = 'injury_required_procedure'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.required_procedure_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.required_procedure_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.required_procedure_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 107) in the recorded run

# Show the record count for each required procedure value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 107)


Unnamed: 0_level_0,record_count
injury_required_procedure,Unnamed: 1_level_1
tourniquet,507
gauzePressure,125
decompress,91
woundpack,469
airway,31
none,200


In [20]:

# Modalize the columns in fu.body_region_columns_list into one injury_body_region column, unless it already exists
new_column_name = 'injury_body_region'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.body_region_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.body_region_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.body_region_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 108) in the recorded run

# Show the record count for each injury body region value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 108)


Unnamed: 0_level_0,record_count
injury_body_region,Unnamed: 1_level_1
head,57
neck,85
chest,463
abdomen,184
leftLeg,459
rightLeg,513
rightArm,439
leftArm,662


In [21]:

# Modalize the columns in fu.tool_type_columns_list into one tool_type column, unless it already exists
new_column_name = 'tool_type'
if new_column_name not in metrics_evaluation_open_world_df.columns:
    metrics_evaluation_open_world_df = nu.modalize_columns(
        metrics_evaluation_open_world_df, fu.tool_type_columns_list, new_column_name
    )

    # Cast the new column to the dtype held in fu.tool_type_category_order
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(
        fu.tool_type_category_order
    )

    # Persist the result and report the new shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    print(metrics_evaluation_open_world_df.shape) # (276926, 109) in the recorded run

# Show the record count for each tool type value
record_counts_df = metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame('record_count')
display(record_counts_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv
(276926, 109)


Unnamed: 0_level_0,record_count
tool_type,Unnamed: 1_level_1
Tourniquet,9110
Gauze_Pack,289
Needle,16455
Naso,82
Nasal Airway,7350
Gauze_Dressing,235



### Convert text columns to categorical

In [22]:

# Convert the existing pulse_taken_pulse_name text column to the dtype held in fu.pulse_name_category_order
new_column_name = 'pulse_taken_pulse_name'
if (new_column_name in metrics_evaluation_open_world_df.columns):
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(fu.pulse_name_category_order)

    # Persist the converted data frame
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)

    # Show the record count per value. This stays under the guard because grouping
    # by a column that is absent would raise a KeyError and defeat the check above.
    display(metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}))
else:
    print(f'The {new_column_name} column is not in the data frame, so there is nothing to convert')

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv


Unnamed: 0_level_0,record_count
pulse_taken_pulse_name,Unnamed: 1_level_1
pulse_none,12
pulse_faint,491
pulse_fast,754
pulse_normal,283


In [23]:

# Convert the existing tool_applied_data text column to the dtype held in fu.tool_data_category_order
new_column_name = 'tool_applied_data'
if (new_column_name in metrics_evaluation_open_world_df.columns):
    metrics_evaluation_open_world_df[new_column_name] = metrics_evaluation_open_world_df[new_column_name].astype(fu.tool_data_category_order)

    # Persist the converted data frame
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)

    # Show the record count per value. This stays under the guard because grouping
    # by a column that is absent would raise a KeyError and defeat the check above.
    display(metrics_evaluation_open_world_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}))
else:
    print(f'The {new_column_name} column is not in the data frame, so there is nothing to convert')

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_df.csv


Unnamed: 0_level_0,record_count
tool_applied_data,Unnamed: 1_level_1
right_chest,14
left_chest,38
right_underarm,0
left_underarm,0



## Mask PII

In [25]:

# Mask voice capture PII. OSU screened all of the **VOICE_COMMAND** and **VOICE_CAPTURE** lines and
# replaced any names with either Max or Jane, regardless of whether the name was that of the responder.
# But, just to make sure...
columns_list = ['voice_command_command_description', 'voice_capture_message']
if not metrics_evaluation_open_world_df[columns_list].applymap(lambda x: '[PERSON]' in str(x), na_action='ignore').sum().sum():
    import spacy
    try: nlp = spacy.load('en_core_web_sm')
    except OSError as e:
        print(str(e).strip())
        command_str = f'{sys.executable} -m spacy download en_core_web_sm --quiet'
        print(command_str)
        !{command_str}
        nlp = spacy.load('en_core_web_sm')
    import en_core_web_sm
    nlp = en_core_web_sm.load()
    
    mask_series = metrics_evaluation_open_world_df.voice_command_command_description.isnull() & metrics_evaluation_open_world_df.voice_capture_message.isnull()
    df = metrics_evaluation_open_world_df[~mask_series]
    def mask_pii(srs):
        for idx in columns_list:
            new_text = srs[idx]
            if notnull(new_text):
                doc = nlp(new_text)
                for entity in doc.ents:
                    if entity.label_ == 'PERSON': new_text = re.sub('\\b' + entity.text + '\\b', '[PERSON]', new_text)
                srs[idx] = new_text
    
        return srs
    
    for row_index, row_series in df.apply(mask_pii, axis='columns')[columns_list].iterrows():
        for column_name, column_value in row_series.items():
            if notnull(column_value): metrics_evaluation_open_world_df.loc[row_index, column_name] = column_value
    
    # Store the results and show the new data frame shape
    nu.store_objects(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    nu.save_data_frames(metrics_evaluation_open_world_df=metrics_evaluation_open_world_df)
    
    print(metrics_evaluation_open_world_df.shape) # (66069, 109)

ImportError: cannot import name 'registry' from 'thinc' (/home/dbabbitt/anaconda3/envs/itm_analysis_reporting/lib/python3.7/site-packages/thinc/__init__.py)

In [26]:

# USE:
# Navy Soldier 1 Male, Navy Soldier 2 Male, Navy Soldier 3 Male, Navy Soldier 4 Female - patient ids from the submarine
# Marine 1 Male, Marine 2 Male, Marine 3 Male, Marine 4 Male, Civilian 1 Female - patient ids from urban
# DISCARD:
# Marine with Leg Amputation Root, Marine with Narrative Root, Tutorial Military Marine Root, Intelligence Officer Root, Broken Bob Root,
# Local Civilian with Internal Bleeding Root, Simulation Root, Broken Helga Root, and Broken Gloria Root
# Use "resident" for all responder types and "OSU" for all site names
# NOTE(review): the responder type written below is 'EM-RES1', not the literal "resident" - confirm that this is the intended code
from pandas import to_datetime

rows_list = []

# 'Navy Soldier 4 Female Root' is the spelling used in the list above; the misspelled
# 'Navy Solider 4 Female Root' is kept as well in case any log uses it - TODO confirm against the logs
submarine_patients_list = [
    'Navy Soldier 1 Male Root', 'Navy Soldier 2 Male Root', 'Navy Soldier 3 Male Root', 'Navy Soldier 4 Female Root',
    'Navy Solider 4 Female Root'
]
urban_patients_list = ['Marine 1 Male Root', 'Marine 2 Male Root', 'Marine 3 Male Root', 'Marine 4 Male Root', 'Civilian 1 Female Root']
discard_patients_list = [
    'Marine with Leg Amputation Root', 'Marine with Narrative Root', 'Tutorial Military Marine Root', 'Intelligence Officer Root', 'Broken Bob Root',
    'Local Civilian with Internal Bleeding Root', 'Simulation Root', 'Broken Helga Root', 'Broken Gloria Root'
]
if 'file_name' in metrics_evaluation_open_world_df.columns:
    
    # Build one summary row per ingested log file
    for file_name, file_name_df in metrics_evaluation_open_world_df.groupby('file_name'):
        
        # Classify the file by the first patient list that shares a patient ID with it
        mask_series = ~file_name_df.patient_id.isnull()
        patient_ids_list = file_name_df[mask_series].patient_id.unique().tolist()
        if any(x in submarine_patients_list for x in patient_ids_list): encounter_layout = 'Submarine'
        elif any(x in urban_patients_list for x in patient_ids_list): encounter_layout = 'Urban'
        elif any(x in discard_patients_list for x in patient_ids_list): encounter_layout = 'Discard'
        else: encounter_layout = 'Other'
        
        # The session date is the earliest event time in the file
        row_dict = {
            'session_file_date': to_datetime(file_name_df.event_time, infer_datetime_format=True).min().strftime('%B %d, %Y'),
            'session_file_name': osp.basename(file_name),
            'session_uuid': file_name_df.session_uuid.min(),
            'responder_name': 'N/A',
            'responder_type': 'EM-RES1',
            'site_name': 'OSU',
            'encounter_layout': encounter_layout,
        }
        rows_list.append(row_dict)
    
    # Assemble the rows in a fixed column order and export them to a spreadsheet
    df = DataFrame(rows_list, columns=[
        'session_file_date', 'session_file_name', 'session_uuid', 'responder_name', 'responder_type', 'site_name', 'encounter_layout'
    ])
    print(df.shape)
    file_path = '../data/xlsx/metrics_evaluation_open_world_3.6.2024.xlsx'
    df.to_excel(file_path, index=False)

(22, 7)
