In [1]:

# Set up notebook
%pprint
import sys
if ('../py' not in sys.path): sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

# load libraries
from FRVRS import nu, fu
from numpy import nan, isnan
from os import listdir as listdir, makedirs as makedirs, path as osp, remove as remove, sep as sep, walk as walk
from pandas import (
    CategoricalDtype, DataFrame, Index, NaT, Series, concat, get_dummies, isna, notnull, read_csv, read_excel, to_datetime, to_numeric
)
from re import split, search, sub, MULTILINE
from scipy.stats import f_oneway, ttest_ind, kruskal, norm
import itertools
import re
import statsmodels.api as sm


## Use the CSV Data from the CACI Logs (Human_Sim_Metrics_Data_4-12-2024)

In [3]:

# In the zip there are 51 folders, (51 JSON, 51 CSV).
# All the files are named appropriated in the folder/csv/json UUID_ParticipantID.
# Some of the internal Participants IDs might be off because the moderator forgot to enter a Participant ID or didn't enter
# the Participant ID correctly so we needed to figure out which participant it was.
# So only utilize the UUID and Participant ID that is on the file name to identify and ignore the internal Participant IDs.
print("\nGet all the Open World logs into one data frame")
csv_stats_df = DataFrame([])
logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
directories_list = listdir(logs_path)
for dir_name in directories_list:
    
    # Add the CSVs to the data frame
    folder_path = osp.join(logs_path, dir_name)
    df = fu.concatonate_logs(logs_folder=folder_path)
    
    session_uuid, participant_id = dir_name.split('_')
    df['session_uuid'] = session_uuid
    df['participant_id'] = int(participant_id)
    
    # Remove numerically-named columns
    columns_list = [x for x in df.columns if not search(r'\d+', str(x))]
    df = df[columns_list]
    
    # Convert 'TRUE' and 'FALSE' to boolean values
    for cn in fu.boolean_columns_list:
        df[cn] = df[cn].map({'TRUE': True, 'FALSE': False, 'True': True, 'False': False})
    
    # Convert the nulls into NaNs
    for cn in df.columns: df[cn] = df[cn].replace(['null', 'nan', 'n'], nan)
    
    # Append the data frame for the current subdirectory to the main data frame and break the participant ID loop
    csv_stats_df = concat([csv_stats_df, df], axis='index')

csv_stats_df = csv_stats_df.reset_index(drop=True).drop_duplicates()
csv_stats_df['csv_file_name'] = csv_stats_df.csv_file_subpath.map(lambda x: str(x).split('/')[-1])

# Check for proper ingestion (duplicate file ingestion, et al)
assert len(csv_stats_df.columns) > 4, "Nothing ingested"
assert csv_stats_df.participant_id.nunique() == 26, f"Participant count should be 26, it's {csv_stats_df.participant_id.nunique()} instead"

# Check that all the rows that have more than one unique value in the file_name column for each value in the session_uuid column
mask_series = (csv_stats_df.groupby('session_uuid').csv_file_subpath.transform(Series.nunique) > 1)
assert not mask_series.any(), "You have duplicate files"

print(csv_stats_df.shape)
display(csv_stats_df.groupby('participant_id').size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))
display(csv_stats_df.sample(4).dropna(axis='columns', how='all').T.sample(5))


Get all the Open World logs into one data frame
(158663, 110)


Unnamed: 0_level_0,record_count
participant_id,Unnamed: 1_level_1
2024202,11231
2024211,10888
2024209,10503
2024224,10365
2024218,10261


Unnamed: 0,121755,140998,198214,171829
tool_hover_count,1,,1,1
tool_hover_type,Pulse Oximeter,,Tourniquet,Pulse Oximeter
event_time,2024-03-14 14:32:22,2024-04-05 11:19:13,2024-03-20 10:31:17,2024-03-20 14:17:26
scene_id,0,0,0,0
participant_id,2024207,2024224,2024213,2024216


In [4]:

# Remove the Unity suffix from all patient_id columns
# The one without "Root" is the ID that CACI sets for it. Unity
# then takes the ID and adds "Root" to the end when it
# creates the hierarchy, so there's less room for human
# error. They're going to match perfectly.
for cn in fu.patient_id_columns_list:
    if cn in csv_stats_df.columns:
        mask_series = ~csv_stats_df[cn].isnull()
        csv_stats_df.loc[mask_series, cn] = csv_stats_df[mask_series][cn].map(lambda x: str(x).replace(' Root', ''))

In [5]:

# Modalize separate patient ID columns into one
new_column_name = 'patient_id'
patient_id_columns_list = sorted(set(fu.patient_id_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, patient_id_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
patient_id,Unnamed: 1_level_1
Open World Marine 1 Female,2321
Patient V,2068
Patient U,1907
Patient X,1026
Patient W,1015


In [6]:

# Modalize separate patient SORT columns into one
new_column_name = 'patient_sort'
patient_sort_columns_list = sorted(set(fu.patient_sort_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, patient_sort_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
patient_sort,Unnamed: 1_level_1
still,1066
waver,670
walker,336


In [7]:

# Modalize separate injury severity columns into one
new_column_name = 'injury_severity'
injury_severity_columns_list = sorted(set(fu.injury_severity_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, injury_severity_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
injury_severity,Unnamed: 1_level_1
medium,1020
high,966
low,110


In [8]:

# Modalize separate location ID columns into one
new_column_name = 'location_id'
location_id_columns_list = sorted(set(fu.location_id_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, location_id_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
location_id,Unnamed: 1_level_1
"(0.0, 0.0, 0.0)",2675
"(15.1, 0.0, -27.9)",2102
"(-19.6, 0.0, -10.0)",1053
"(-22.5, 0.0, -10.7)",843
"(-3.3, 0.0, 0.0)",824


In [9]:

# Modalize separate injury ID columns into one
new_column_name = 'injury_id'
injury_id_columns_list = sorted(set(fu.injury_id_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, injury_id_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
injury_id,Unnamed: 1_level_1
L Leg Broken,179
R Forearm Burn,151
L Shoulder Broken,138
Unspecified,136
R Shoulder Puncture,131


In [10]:

# Modalize separate injury required procedure columns into one
new_column_name = 'injury_required_procedure'
injury_required_procedure_columns_list = sorted(set(fu.injury_required_procedure_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, injury_required_procedure_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
injury_required_procedure,Unnamed: 1_level_1
burnDressing,539
splint,478
tourniquet,353
woundpack,334
none,162


In [11]:

# Modalize separate patient SALT columns into one
new_column_name = 'patient_salt'
patient_salt_columns_list = sorted(set(fu.patient_salt_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, patient_salt_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
patient_salt,Unnamed: 1_level_1
DELAYED,767
IMMEDIATE,674
EXPECTANT,373
MINIMAL,258



### Truncate the CSV data to only include our patients at the times they were engaged

In [12]:

# Build a dataset of each CSV file and the action tick where CACI patients first appear
patients_set = set(fu.desert_patients_list + fu.jungle_patients_list + fu.submarine_patients_list + fu.urban_patients_list)
mask_series = ~csv_stats_df.patient_id.isnull()
all_set = set(csv_stats_df[mask_series].patient_id)
assert patients_set.issubset(all_set), f"You're missing {patients_set.difference(all_set)} from the patients in the CSVs"
mask_series = csv_stats_df.patient_id.isin(patients_set)
rows_list = []
for csv_file_name, csv_file_name_df in csv_stats_df[mask_series].groupby('csv_file_name'):
    row_dict = {'csv_file_name': csv_file_name}
    action_tick = csv_file_name_df.action_tick.min()
    row_dict['first_occurence'] = action_tick
    rows_list.append(row_dict)
first_occurence_df = DataFrame(rows_list)
print(first_occurence_df.shape)
display(first_occurence_df.sample(5))

AssertionError: You're missing {'Open World Civilian 1 Male Root', 'Open World Civilian 2 Female Root', 'Marine 3 Male Root', 'Navy Soldier 4 Female Root', 'Open World Marine 4 Male Root', 'Marine 4 Male Root', 'Open World Marine 1 Female Root', 'Navy Soldier 3 Male Root', 'Open World Marine 3 Male Root', 'Marine 2 Male Root', 'Open World Marine 2 Male Root', 'Open World Marine 1 Male Root', 'Navy Soldier 2 Male Root', 'Marine 1 Male Root', 'Civilian 1 Female Root', 'Navy Soldier 1 Male Root', 'Open World Marine 2 Female Root'} from the patients in the CSVs

In [13]:

# From each file, filter out the time before the first appearance of the CACI patients
truncated_df = DataFrame([])
for (csv_file_name, first_occurence), _ in first_occurence_df.groupby(['csv_file_name', 'first_occurence']):
    mask_series = (csv_stats_df.csv_file_name == csv_file_name) & (csv_stats_df.action_tick >= first_occurence)
    df = csv_stats_df[mask_series]
    truncated_df = concat([truncated_df, df], axis='index')
print(truncated_df.shape)
display(truncated_df.sample(4).T)

(137063, 117)


Unnamed: 0,3217,103928,40547,66061
action_type,TOOL_HOVER,INJURY_RECORD,TOOL_HOVER,VOICE_COMMAND
action_tick,234170,15253,655337,631548
event_time,2024-03-20 09:42:14,2024-03-14 15:19:59,2024-03-14 10:47:47,2024-03-20 12:52:57
session_uuid,02d472ac-f6fe-474c-815d-6125fadfcbf7,67dc0230-511d-41ac-ae9b-850900ab9e6a,220b609b-0e35-454e-9afd-c84cbfa3e3ad,3cf14e31-f416-4c78-8a69-91bf0c685448
csv_file_subpath,Human_Sim_Metrics_Data_4-12-2024/02d472ac-f6fe...,Human_Sim_Metrics_Data_4-12-2024/67dc0230-511d...,Human_Sim_Metrics_Data_4-12-2024/220b609b-0e35...,Human_Sim_Metrics_Data_4-12-2024/3cf14e31-f416...
...,...,...,...,...
injury_severity,,medium,,
location_id,,"(2.5, 1.4, 2.6)",,
injury_id,,R Shoulder Broken,,
injury_required_procedure,,splint,,


In [33]:

# Create a list of patients that CACI doesn't care about
mask_series = ~csv_stats_df.patient_id.isin(patients_set) & ~csv_stats_df.patient_id.isnull()
non_patients_list = csv_stats_df[mask_series].patient_id.unique().tolist()
non_patients_list

['patient U', 'patient V', 'patient W', 'patient X', 'electrician', 'bystander', 'Player', 'Simulation', 'Patient V', 'Patient U', 'Patient W', 'Adept Victim', 'Adept Shooter', 'Patient X', 'NPC 2', 'NPC 1', 'NPC 3', 'NPC 4', 'Submarine Level Core', 'US Soldier 1', 'Local Soldier 1', 'Civilian 1', 'NPC', 'Civilian 2', 'Urban Level Core', 'Desert Level Core']

In [34]:

typos_df = nu.check_for_typos(
    sorted(patients_set), non_patients_list, rename_dict={'left_item': 'caci_patient', 'right_item': 'ta1_patient'}
)

In [36]:

print(len(patients_set))
typos_df.head()

34


Unnamed: 0,caci_patient,ta1_patient,max_similarity
0,Civilian 1 Female,Civilian 1,0.740741
1,Civilian 1 Female Root,Civilian 1,0.625
2,Marine 1 Male,Submarine Level Core,0.484848
3,Marine 1 Male Root,Submarine Level Core,0.421053
4,Marine 2 Male,Submarine Level Core,0.484848


In [37]:

# Filter out just the patients that CACI doesn't care about
mask_series = ~truncated_df.patient_id.isin(non_patients_list)
truncated_df = truncated_df[mask_series]
print(truncated_df.shape) # (124046, 117)

(124046, 117)



## Create the Scene Data

In [38]:

# Create the scene data frame precursor
distance_delta_df = fu.get_distance_deltas_data_frame(truncated_df)
print(distance_delta_df.shape)
display(distance_delta_df.sample(5).T)
display(distance_delta_df.groupby('patient_count', dropna=False).size().to_frame().rename(columns={0: 'record_count'}))

(67, 22)


Unnamed: 0,36,42,19,18,15
session_uuid,70eef02d-d2d0-458e-a8bb-f6511bf47a0c,922ad146-241a-4ea6-8ff1-413d7e0d16ec,45365e18-6e38-48e7-b4a2-6b448b209034,3cf14e31-f416-4c78-8a69-91bf0c685448,37a554ee-fc49-4730-819c-2d97727bb0b7
scene_id,0,0,0,1,0
patient_count,5,7,8,0,7
engaged_patient00_metadata,"Open World Marine 4 Male Root|770553|(0.0, 0.0...","Open World Civilian 1 Male Root|711909|(8.2, -...","Marine 1 Male Root|586668|(7.4, 21.4)|still|No...",,Open World Civilian 2 Female Root|725617|(14.2...
engaged_patient01_metadata,"Open World Marine 3 Male Root|904151|(0.5, -2....","Open World Marine 2 Male Root|736441|(10.4, -2...","Marine 4 Male Root|642551|(10.1, 19.1)|waver|N...",,"Open World Marine 2 Male Root|775248|(10.3, -2..."
engaged_patient02_metadata,"Open World Marine 3 Male|932736|(0.0, 0.0)|Non...",Open World Civilian 2 Female Root|805702|(14.4...,"Marine 3 Male Root|699746|(11.3, 10.4)|waver|N...",,"Open World Marine 2 Male|780303|(0.0, 0.0)|Non..."
engaged_patient03_metadata,,"Open World Marine 1 Female Root|862629|(14.4, ...","Marine 3 Male|707363|(0.0, 0.0)|None|None|nan",,"Open World Civilian 1 Male Root|895159|(8.7, -..."
engaged_patient04_metadata,,"Open World Marine 2 Male|936471|(0.0, 0.0)|Non...","Marine 2 Male Root|746656|(14.1, 9.6)|waver|No...",,"Open World Civilian 1 Male|920787|(0.0, 0.0)|N..."
engaged_patient05_metadata,,"Open World Civilian 1 Male|977557|(0.0, 0.0)|N...","Marine 2 Male|776978|(0.0, 0.0)|None|None|nan",,"Open World Marine 1 Female|948274|(0.0, 0.0)|N..."
engaged_patient06_metadata,,"Open World Civilian 2 Female|1042812|(0.0, 0.0...","Civilian 1 Female Root|828435|(13.5, 4.9)|wave...",,"Open World Marine 1 Female Root|948274|(0.0, 0..."


Unnamed: 0_level_0,record_count
patient_count,Unnamed: 1_level_1
0,13
4,13
5,13
6,6
7,13
8,4
9,3
10,2


In [19]:

# Create the scene stats data frame
rows_list = []
for (session_uuid, scene_id), idx_df in distance_delta_df.groupby(fu.scene_groupby_columns):
    row_dict = list(idx_df.T.to_dict().values())[0]
    
    # mean_patient_count
    patient_count = row_dict.pop('patient_count')
    row_dict.update({'scene_patient_count': patient_count})
    
    # Get the whole scene history
    mask_series = True
    for cn in fu.scene_groupby_columns: mask_series &= (truncated_df[cn] == eval(cn))
    scene_df = truncated_df[mask_series]
    
    if scene_df.shape[0]:
        row_dict['participant_id'] = scene_df.participant_id.iloc[0]
        
        # Get the count of all the patient injuries
        all_patient_injuries_count = 0
        for patient_id, patient_df in scene_df.groupby('patient_id'):
            all_patient_injuries_count += patient_df.injury_id.nunique()
        row_dict['scene_patient_injuries_count'] = all_patient_injuries_count
        
        # percent_injury_correctly_treated
        correctly_treated_count = fu.get_injury_correctly_treated_count(scene_df)
        row_dict['scene_correctly_treated_count'] = correctly_treated_count
        try: percent_injury_correctly_treated = 100 * correctly_treated_count / all_patient_injuries_count
        except ZeroDivisionError: percent_injury_correctly_treated = nan
        row_dict['scene_percent_injury_correctly_treated'] = percent_injury_correctly_treated
        
        # mean_pulse_taken_count
        pulse_taken_count = fu.get_pulse_taken_count(scene_df)
        row_dict['scene_pulse_taken_count'] = pulse_taken_count
        
        # mean_stills_value
        row_dict['scene_stills_value'] = fu.get_stills_value(scene_df)
        
        # mean_teleport_count
        row_dict['scene_teleport_count'] = fu.get_teleport_count(scene_df)
        
        # mean_time_to_hemorrhage_control_per_patient
        row_dict['scene_time_to_hemorrhage_control_per_patient'] = fu.get_time_to_hemorrhage_control_per_patient(scene_df)
        
        # mean_triage_time
        row_dict['scene_triage_time'] = fu.get_triage_time(scene_df)
        
        # total_action_count
        mask_series = scene_df.action_type.isin(fu.action_types_list)
        row_dict['scene_action_count'] = scene_df[mask_series].shape[0]
        
        # total_assessment_count
        mask_series = scene_df.action_type.isin(['PATIENT_ENGAGED', 'PULSE_TAKEN'])
        row_dict['scene_assessment_count'] = scene_df[mask_series].shape[0]
        
        # total_treatment_count
        mask_series = scene_df.action_type.isin(['INJURY_TREATED'])
        row_dict['scene_treatment_count'] = scene_df[mask_series].shape[0]
        
        # total_tag_application_count
        mask_series = scene_df.action_type.isin(['TAG_APPLIED'])
        row_dict['scene_tag_application_count'] = scene_df[mask_series].shape[0]
    
    rows_list.append(row_dict)
scene_stats_df = DataFrame(rows_list).drop_duplicates()
print(scene_stats_df.shape)
display(scene_stats_df.sample(5).T)
display(scene_stats_df.groupby('scene_patient_count', dropna=False).size().to_frame().rename(columns={0: 'record_count'}))

(67, 35)


Unnamed: 0,24,18,53,23,11
session_uuid,50b15e40-9860-4574-8ab8-0bd960fe27de,3cf14e31-f416-4c78-8a69-91bf0c685448,b5989edc-8348-4b84-b649-87fc4f1cca53,50b15e40-9860-4574-8ab8-0bd960fe27de,23081f6e-875e-44f5-8bd0-edc3905f5c2c
scene_id,1,1,0,0,2
engaged_patient00_metadata,,,"Open World Civilian 1 Male Root|584795|(8.1, -...","Navy Soldier 2 Male Root|432760|(-0.3, -6.6)|w...",
engaged_patient01_metadata,,,"Open World Civilian 1 Male|591939|(0.0, 0.0)|N...","Navy Soldier 2 Male|459325|(0.0, 0.0)|None|Non...",
engaged_patient02_metadata,,,"Open World Marine 2 Male Root|658810|(10.3, -2...","Navy Soldier 1 Male Root|498393|(0.0, 0.0)|sti...",
engaged_patient03_metadata,,,"Open World Marine 2 Male|692739|(0.0, 0.0)|Non...","Navy Soldier 3 Male Root|511124|(-0.3, 3.4)|wa...",
engaged_patient04_metadata,,,"Open World Civilian 2 Female Root|760294|(0.0,...","Navy Soldier 4 Female Root|552381|(-5.3, -0.1)...",
engaged_patient05_metadata,,,"Open World Marine 1 Female Root|797091|(14.8, ...","Navy Soldier 4 Female|562777|(0.0, 0.0)|None|N...",
engaged_patient06_metadata,,,"Open World Marine 1 Female|838703|(0.0, 0.0)|N...",,
engaged_patient07_metadata,,,,,


Unnamed: 0_level_0,record_count
scene_patient_count,Unnamed: 1_level_1
0,13
4,13
5,13
6,6
7,13
8,4
9,3
10,2


In [27]:

# Add the sim environment column
new_column_name = 'encounter_layout'
encounter_layouts_list = ['Desert', 'Jungle', 'Submarine', 'Urban']
for (session_uuid, scene_id), scene_df in csv_stats_df.groupby(fu.scene_groupby_columns):
    mask_series = ~scene_df.patient_id.isnull()
    spl = sorted(scene_df[mask_series].patient_id.unique())
    for env_str in encounter_layouts_list:
        patients_list = eval(f'fu.{env_str.lower()}_patients_list')
        root_patients_list = [c for c in patients_list if c.endswith(' Root')]
        nonroot_patients_list = sorted(set(patients_list).difference(set(root_patients_list)))
        if all(map(lambda p: p in spl, root_patients_list)) or all(map(lambda p: p in spl, nonroot_patients_list)):
            mask_series = (scene_stats_df.session_uuid == session_uuid) & (scene_stats_df.scene_id == scene_id)
            scene_stats_df.loc[mask_series, new_column_name] = env_str
display(scene_stats_df.groupby([new_column_name, 'scene_patient_count'], dropna=False).size().to_frame().rename(columns={0: 'record_count'}))

Unnamed: 0_level_0,Unnamed: 1_level_0,record_count
encounter_layout,scene_patient_count,Unnamed: 2_level_1
Desert,4,5
Desert,5,3
Desert,7,6
Desert,8,1
Jungle,4,4
Jungle,5,5
Jungle,6,2
Jungle,7,2
Submarine,4,4
Submarine,5,2



### Add the mean % accurate tagging column

In [None]:

# Create the tag-to-SALT data frame
tag_to_salt_df = fu.get_is_tag_correct_data_frame(truncated_df, groupby_column='participant_id')
assert tag_to_salt_df.max_salt.dropna().shape[0] > 0, "You have to add the patient_salt column to truncated_df"
display(tag_to_salt_df.sample(5))

In [None]:

# Get the percentage tag correct counts for each scene for each group
correct_count_by_tag_df = fu.get_percentage_tag_correct_data_frame(tag_to_salt_df, groupby_column='participant_id')
display(correct_count_by_tag_df.sample(5))

In [None]:

# Group the scene stats by participant ID to get the tagging accuracy measure
for participant_id, idx_df in scene_stats_df.groupby('participant_id'):
    
    # mean_percent_accurate_tagging
    mask_series = (correct_count_by_tag_df.participant_id == participant_id)
    scene_stats_df.loc[idx_df.index, 'mean_percent_accurate_tagging'] = correct_count_by_tag_df[mask_series].percentage_tag_correct.mean()


### Add the treated-expectant count column

In [None]:

# Loop through each patient to build the max salt and treated-expectant measures
assert 'patient_salt' in truncated_df.columns, "You have to add the patient_salt column to truncated_df"
rows_list = []
for (session_uuid, scene_id, patient_id), patient_df in truncated_df.groupby(fu.patient_groupby_columns):
    row_dict = {cn: eval(cn) for cn in fu.patient_groupby_columns}
    row_dict['max_salt'] = fu.get_max_salt(patient_df, session_uuid=session_uuid, scene_id=scene_id, random_patient_id=patient_id)[1]
    if (row_dict['max_salt'] == 'EXPECTANT'):
        mask_series = ~patient_df.injury_treated_required_procedure.isnull() | ~patient_df.tool_applied_type.isnull()
        row_dict['treated_expectant'] = {True: 'yes', False: 'no'}[mask_series.any()]
    else: row_dict['treated_expectant'] = nan
    rows_list.append(row_dict)
treated_expectant_df = DataFrame(rows_list)
display(treated_expectant_df.sample(5))

In [None]:

# Loop through each scene to build the treated-expectant counts
for (session_uuid, scene_id), scene_df in treated_expectant_df.groupby(fu.scene_groupby_columns):
    mask_series = (scene_df.treated_expectant == 'yes')
    
    # treated_expectant_count
    treated_expectant_count = mask_series.sum()
    mask_series = (scene_stats_df.session_uuid == session_uuid) & (scene_stats_df.scene_id == scene_id)
    scene_stats_df.loc[mask_series, 'treated_expectant_count'] = treated_expectant_count

In [None]:

# Add the survey columns
survey_columns = ['AD_KDMA_Sim', 'AD_KDMA_Text', 'PropTrust', 'ST_KDMA_Sim', 'ST_KDMA_Text']
if any(map(lambda x: x not in scene_stats_df.columns, survey_columns)):
    file_path = osp.join(nu.data_folder, 'xlsx', 'participant_data_0420.xlsx')
    participant_data_df = read_excel(file_path).rename(columns={'ParticipantID': 'participant_id'})
    print(participant_data_df.shape)
    display(participant_data_df.sample(12).T)
    
    print("\nColumns to merge the participant data with the scene stats on:")
    on_columns = sorted(set(scene_stats_df.columns).intersection(set(participant_data_df.columns)))
    assert on_columns, "You have nothing to merge the participant dataset with the scene stats on"
    print(on_columns)

    print("\nThe participant data columns we want to have in the merge:")
    survey_set = set(on_columns + survey_columns)
    all_set = set(participant_data_df.columns)
    assert survey_set.issubset(all_set), f"You're missing {survey_set.difference(all_set)} from participant_data_0420.xlsx"
    columns_list = sorted(survey_set)
    print(columns_list)
    
    # mean_AD_KDMA_Sim
    # mean_AD_KDMA_Text
    # mean_PropTrust
    # mean_ST_KDMA_Sim
    # mean_ST_KDMA_Text
    df = participant_data_df[columns_list]
    print(scene_stats_df.shape)
    print(df.shape)
    scene_stats_df = scene_stats_df.merge(df, how='left', on=on_columns)
    print(scene_stats_df.shape)
    display(scene_stats_df.groupby(survey_columns, dropna=False).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
        'record_count', ascending=False
    ).head())


### Truncate the scene data to only include our patients at the times they were engaged

In [None]:

# Filter out the unnamed layouts
mask_series = scene_stats_df.encounter_layout.isin(encounter_layouts_list)
pre_count = scene_stats_df.shape[0]
scene_stats_df = scene_stats_df[mask_series]
print(f"\nFiltered out {pre_count - scene_stats_df.shape[0]} unnamed encounter layouts")
display(scene_stats_df.groupby([new_column_name, 'scene_patient_count'], dropna=False).size().to_frame().rename(columns={0: 'record_count'}))

In [None]:

# Check for zero teleports
mask_series = (scene_stats_df.scene_teleport_count < 1)
print(f"\nThere are {scene_stats_df[mask_series].shape[0]} out of {scene_stats_df.shape[0]} scenes with no teleports")

In [None]:

# Save the scene stats dataset
nu.save_data_frames(truncated_scene_stats_df=scene_stats_df, verbose=True)


## Create the ANOVA Dataframe

In [None]:

# Get the columns to merge the scene stats dataset with the CSV stats on
print("\nColumns to merge the scene stats dataset with the CSV stats on:")
on_columns = sorted(set(csv_stats_df.columns).intersection(set(scene_stats_df.columns)))
print(on_columns)

In [None]:

# Get the scene stats dataset columns we want to have in the merge
print('\nThe scene stats dataset columns we want to have in the merge:')
mean_analysis_columns = sorted([
    'scene_percent_injury_correctly_treated', 'scene_pulse_taken_count', 'scene_stills_value', 'scene_teleport_count',
    'scene_time_to_hemorrhage_control_per_patient', 'scene_triage_time', 'scene_patient_count', 'mean_percent_accurate_tagging'
] + survey_columns)
sum_analysis_columns = sorted([
    'scene_action_count', 'scene_assessment_count', 'scene_treatment_count', 'scene_tag_application_count', 'treated_expectant_count'
])
analysis_set = set(mean_analysis_columns + sum_analysis_columns)
all_set = set(scene_stats_df.columns)
assert analysis_set.issubset(all_set), f"You're missing {analysis_set.difference(all_set)} from your analysis_columns"
print(analysis_set)

In [None]:

# Merge the scene stats with the CSV stats
print("\nMerge the scene stats with the CSV stats")
columns_list = on_columns + mean_analysis_columns + sum_analysis_columns
assert set(columns_list).issubset(set(scene_stats_df.columns)), "You've lost access to the analysis columns"
df = scene_stats_df[columns_list]
print(csv_stats_df.shape)
print(df.shape)
merge_df = csv_stats_df.merge(df, on=on_columns, how='left').drop_duplicates()
print(merge_df.shape)
display(merge_df.sample(4).T)
display(merge_df.groupby('scene_patient_count', dropna=False).size().to_frame().rename(columns={0: 'record_count'}))

In [None]:

# Get the merge dataset columns we want to have in the groupby
print('\nThe merge dataset columns we want to have in the groupby:')
columns_list = sorted(set(
    on_columns + mean_analysis_columns + sum_analysis_columns
).intersection(set(merge_df.columns)))
print(columns_list)

In [None]:

# Get the numeric columns we want to take the mean of
print("\nThe numeric columns we want to take the mean of:")
df = merge_df[on_columns + mean_analysis_columns]
assert set(df.columns).issubset(set(merge_df.columns)), "You've lost access to the mean analysis columns"
mean_numeric_columns = sorted(set(nu.get_numeric_columns(df)).difference(set(
    on_columns
)))
print(mean_numeric_columns)

In [None]:

# Get the numeric columns we want to take the sum of
print("\nThe numeric columns we want to take the sum of:")
df = merge_df[on_columns + sum_analysis_columns]
assert set(df.columns).issubset(set(merge_df.columns)), "You've lost access to the sum analysis columns"
sum_numeric_columns = sorted(set(nu.get_numeric_columns(df)).difference(set(
    on_columns
)))
print(sum_numeric_columns)

In [None]:

# Get the other columns we do not want to take the mean or sum of
print("\nThe other columns we do not want to take the mean or sum of:")
other_columns = sorted(set(on_columns + mean_analysis_columns + sum_analysis_columns).difference(set(
    mean_numeric_columns + sum_numeric_columns
)))
print(other_columns)

In [None]:

# Get the means and sums datasets and the columns to merge the summed half of the merge with the meaned half of the merge on
means_df = merge_df[mean_numeric_columns+on_columns].groupby(on_columns).mean().reset_index(drop=False).rename(
    columns={cn: 'mean_'+cn.replace('mean_', '').replace('scene_', '') for cn in mean_numeric_columns}
).dropna(axis='columns', how='all')
# display(means_df.sample(7).T)
# display(means_df.groupby('mean_patient_count', dropna=False).size().to_frame().rename(columns={0: 'record_count'}))
sums_df = merge_df[sum_numeric_columns+on_columns].groupby(on_columns).sum().reset_index(drop=False).rename(
    columns={cn: 'sum_'+cn.replace('sum_', '').replace('scene_', '') for cn in sum_numeric_columns}
).dropna(axis='columns', how='all')
# display(sums_df.sample(7).T)
print("\nColumns to merge the summed half of the merge with the meaned half of the merge on:")
on_columns = sorted(set(means_df.columns).intersection(set(sums_df.columns)))
print(on_columns)
print(means_df.shape)
print(sums_df.shape)
left_df = means_df.merge(sums_df, on=on_columns, how='outer').drop_duplicates()
print(left_df.shape)
# display(left_df.sample(7).T)
# display(left_df.groupby('mean_patient_count', dropna=False).size().to_frame().rename(columns={0: 'record_count'}))

In [None]:

# Get the columns to merge the unaggregated half of the merge with the aggregated half of the merge on
right_df = merge_df[other_columns].drop_duplicates().dropna(axis='columns', how='all')
print(right_df.shape)
display(right_df.sample(5))
print("\nColumns to merge the unaggregated half of the merge with the aggregated half of the merge on:")
on_columns = sorted(set(left_df.columns).intersection(set(right_df.columns)))
print(on_columns)

In [None]:

# Merge the unaggregated half of the merge with the aggregated half
print(
    "\nAggregate the data from the merged datasets and group by participant, session,"
    " and scene to get the means and sums of the numeric columns"
)
print(left_df.shape)
print(right_df.shape)
anova_df = left_df.merge(right_df, on=on_columns, how='outer').drop_duplicates()
print(anova_df.shape)
display(anova_df.sample(7).T)
display(anova_df.groupby('mean_patient_count', dropna=False).size().to_frame().rename(columns={0: 'record_count'}))
assert set(
    ['mean_'+cn for cn in survey_columns]
).issubset(set(anova_df.columns)), "You've lost acces to the survey columns (PropTrust, et al)"
assert len(anova_df.groupby(
    ['participant_id', 'scene_id', 'session_uuid']
).groups.keys()) == anova_df.shape[0], "You have duplicate rows in anova_df"

In [None]:

# Add the sim environment back in
new_column_name = 'encounter_layout'
if new_column_name not in anova_df.columns:
    print("\nAdd the sim environment back in")
    on_columns = sorted(set(anova_df.columns).intersection(set(scene_stats_df.columns)))
    columns_list = on_columns + [new_column_name]
    assert set(columns_list).issubset(set(scene_stats_df.columns)), f"You've lost acces to the {new_column_name} column"
    df = scene_stats_df[columns_list]
    print(anova_df.shape)
    print(df.shape)
    anova_df = anova_df.merge(
        df, on=on_columns, how='left'
    )
    print(anova_df.shape)
    display(anova_df.groupby([new_column_name, 'mean_patient_count'], dropna=False).size().to_frame().rename(
        columns={0: 'record_count'}
    ).sort_values('record_count', ascending=False))

In [None]:

# Filter out the unnamed layouts
mask_series = anova_df.encounter_layout.isin(encounter_layouts_list)
pre_count = anova_df.shape[0]
anova_df = anova_df[mask_series]
print(f"\nFiltered out {pre_count - anova_df.shape[0]} unnamed encounter layouts")
display(anova_df.groupby(['encounter_layout', 'mean_patient_count'], dropna=False).size().to_frame().rename(
    columns={0: 'record_count'}
).sort_values('record_count', ascending=False))

In [None]:

# Check for low patient counts
mask_series = (anova_df.mean_patient_count < 4)
print(f"\nThere are {anova_df[mask_series].shape[0]} out of {anova_df.shape[0]} participations with low patient counts")

In [None]:

# Store the results
columns_list = anova_df.columns.tolist()
nu.save_data_frames(truncated_anova_df=anova_df[columns_list], verbose=True)

In [None]:

# Get statistics using OSU format
columns_list = ['mean_'+cn for cn in survey_columns] + [
    'mean_percent_accurate_tagging', 'mean_patient_count', 'mean_percent_injury_correctly_treated', 'mean_pulse_taken_count',
    'mean_stills_value', 'mean_teleport_count', 'mean_time_to_hemorrhage_control_per_patient', 'mean_triage_time',
    'sum_action_count', 'sum_assessment_count', 'sum_tag_application_count', 'sum_treatment_count', 'sum_treated_expectant_count'
]
description_df = nu.get_statistics(anova_df, columns_list).T
assert float(description_df.loc['mean_patient_count', 'min']) >= 4.0, "There are not less than 4 patients in any scene"

# Calculate range and IQR
description_df['range'] = description_df['max'] - description_df['min']
description_df['IQR'] = description_df['75%'] - description_df['25%']

# Define the index list
index_list = ['min', 'median', 'max', 'IQR', 'range', 'mean', 'SD']

# Set formatting to prevent scientific notation (assuming numeric columns)
description_df = description_df[index_list].applymap('{:.4f}'.format)  # Format as floats with 4 decimals

# Save and show the description data frame
nu.save_data_frames(description_df=description_df, verbose=True)
display(description_df)


# Plot the Correlations

In [None]:

# Define a means to plot a correlation graph
import matplotlib.colors as colors
import numpy as np
import matplotlib.pyplot as plt

def plot_correlation_graph(correlations, title_str='Correlation Matrix of ??', fig=None, ax=None):
    
    # Plotting the correlation matrix using matplotlib
    cmap = plt.get_cmap('coolwarm')
    norm = colors.Normalize(vmin=-1, vmax=1)

    # Create the heatmap
    if (fig is None) or (ax is None):
        fig, ax = plt.subplots(figsize=(8, 6))
    cax = ax.matshow(correlations, cmap=cmap, norm=norm)

    # Add a color bar
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticks(np.arange(len(correlations.columns)))
    ax.set_yticks(np.arange(len(correlations.columns)))
    ax.set_xticklabels(correlations.columns)
    ax.set_yticklabels(correlations.columns)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='left', rotation_mode='anchor')

    # Add text annotations.
    for (i, j), val in np.ndenumerate(correlations):
        ax.text(j, i, f'{val:.3f}', ha='center', va='center', color='black', fontsize=84/correlations.shape[0])
    plt.title(title_str)
    plt.show()

In [None]:

# Plot the correlations of all the numeric columns in the ANOVA dataset
columns_list = ['mean_'+cn for cn in survey_columns] + [
    'mean_percent_accurate_tagging', 'mean_patient_count', 'mean_percent_injury_correctly_treated', 'mean_pulse_taken_count',
    'mean_stills_value', 'mean_teleport_count', 'mean_time_to_hemorrhage_control_per_patient', 'mean_triage_time',
    'sum_action_count', 'sum_assessment_count', 'sum_tag_application_count', 'sum_treatment_count', 'sum_treated_expectant_count'
]
# columns_list = ['mean_'+cn for cn in survey_columns]
df = anova_df[columns_list]
correlations_df = df.corr().round(3)
display(correlations_df.sample(4).T)

In [None]:

import os

nu.save_data_frames(correlations_df=correlations_df, verbose=True)
fig_height = 7
fig_size=(nu.twitter_aspect_ratio*fig_height, fig_height)
fig, ax = plt.subplots(figsize=fig_size)
title_str = "Correlation Matrix of 18 Analytic and KDMA Columns"
plot_correlation_graph(correlations_df, title_str=title_str, fig=fig, ax=ax)
dir_names_list=['png', 'svg']
for dir_name in dir_names_list:
    try:
        dir_path = osp.join(nu.saves_folder, dir_name)
        os.makedirs(name=dir_path, exist_ok=True)
        file_path = osp.join(dir_path, '{}.{}'.format(re.sub('[^A-Za-z0-9]+', '_', title_str).lower(), dir_name))
        if osp.exists(file_path): os.remove(file_path)
        print(f'Saving to {osp.abspath(file_path)}')
        fig.savefig(file_path, bbox_inches='tight')
    except Exception as e:
        print(print(f'{e.__class__} error saving the figure as a {dir_name}: {str(e).strip()}'))