In [1]:

# Set up notebook
%pprint
import sys
if ('../py' not in sys.path): sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

# load libraries
from FRVRS import nu, fu
from numpy import nan, isnan
from os import listdir as listdir, makedirs as makedirs, path as osp, remove as remove, sep as sep, walk as walk
from pandas import (
    CategoricalDtype, DataFrame, Index, NaT, Series, concat, get_dummies, isna, notnull, read_csv, read_excel, to_datetime, to_numeric
)
from re import split, search, sub, MULTILINE
from scipy.stats import f_oneway, ttest_ind, kruskal, norm
import itertools
import re
import statsmodels.api as sm

In [3]:

# Find unaccounted-for patient IDs in weird columns
desert_patients_list = [
    'Open World Marine 1 Female Root', 'Open World Marine 2 Male Root', 'Open World Civilian 1 Male Root', 'Open World Civilian 2 Female Root'
]
jungle_patients_list = [
    'Open World Marine 1 Male Root', 'Open World Marine 2 Female Root', 'Open World Marine 3 Male Root', 'Open World Marine 4 Male Root'
]
submarine_patients_list = ['Navy Soldier 1 Male Root', 'Navy Soldier 2 Male Root', 'Navy Soldier 3 Male Root', 'Navy Soldier 4 Female Root']
urban_patients_list = ['Marine 1 Male Root', 'Marine 2 Male Root', 'Marine 3 Male Root', 'Marine 4 Male Root', 'Civilian 1 Female Root']
patients_set = set(desert_patients_list + jungle_patients_list + submarine_patients_list + urban_patients_list)
patients_regex = re.compile(r',(' + '|'.join(patients_set) + r')\b')

In [None]:

# Add all mentions of TA3 patients to its own CSV file
import csv

logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
directories_list = listdir(logs_path)
output_file_path = osp.join(fu.saves_folder, 'csv', 'desert_jungle_submarine_urban_patients.csv')
with open(output_file_path, mode='w', encoding=nu.encoding_type) as f: print('', file=f)
with open(output_file_path, mode='a', encoding=nu.encoding_type) as f_output:
    for dir_name in directories_list:
        folder_path = osp.join(logs_path, dir_name)
        for sub_directory, directories_list, files_list in walk(folder_path):
            
            # Iterate over the files in the current subdirectory
            for file_name in files_list:
                
                # If the file is a CSV file, merge it into the subdirectory data frame
                if file_name.endswith('.csv'):
                    
                    # Construct the full path to the file
                    file_path = osp.join(sub_directory, file_name)
                    
                    # Read CSV file
                    with open(file_path, 'r') as f_input:
                        for line_str in f_input:
                            if patients_regex.search(line_str):
                                print(line_str, end='', file=f_output)

# Read CSV file using a CSV reader
rows_list = []
with open(osp.abspath(output_file_path), 'r', encoding=nu.encoding_type) as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    for values_list in reader:
        rows_list.append({i: v for i, v in enumerate(values_list)})
file_df = DataFrame(rows_list).dropna(axis='columns', how='all').dropna(axis='index', how='all')

# Show the columns that have the patient's names in them
patients_regex = re.compile(r'\b(' + '|'.join(patients_set) + r')\b')
for column_name in range(19):
    if any(map(lambda x: patients_regex.search(str(x)), file_df[column_name].tolist())):
        print(column_name)

In [7]:

# In the zip there are 51 folders, (51 JSON, 51 CSV).
# All the files are named appropriated in the folder/csv/json UUID_ParticipantID.
# Some of the internal Participants IDs might be off because the moderator forgot to enter a Participant ID or didn't enter
# the Participant ID correctly so we needed to figure out which participant it was.
# So only utilize the UUID and Participant ID that is on the file name to identify and ignore the internal Participant IDs.
print("\nGet all the Open World logs into one data frame")
csv_stats_df = DataFrame([])
logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
directories_list = listdir(logs_path)
for dir_name in directories_list:
    
    # Add the CSVs to the data frame
    folder_path = osp.join(logs_path, dir_name)
    df = fu.concatonate_logs(logs_folder=folder_path, verbose=True)
    
    session_uuid, participant_id = dir_name.split('_')
    df['session_uuid'] = session_uuid
    df['participant_id'] = int(participant_id)
    
    # Remove numerically-named columns
    columns_list = [x for x in df.columns if not search(r'\d+', str(x))]
    df = df[columns_list]
    
    # Convert 'TRUE' and 'FALSE' to boolean values
    for cn in fu.boolean_columns_list:
        df[cn] = df[cn].map({'TRUE': True, 'FALSE': False, 'True': True, 'False': False})
    
    # Convert the nulls into NaNs
    for cn in df.columns: df[cn] = df[cn].replace(['null', 'nan', 'n'], nan)
    
    # Append the data frame for the current subdirectory to the main data frame and break the participant ID loop
    csv_stats_df = concat([csv_stats_df, df], axis='index')

csv_stats_df = csv_stats_df.reset_index(drop=True).drop_duplicates()
csv_stats_df['csv_file_name'] = csv_stats_df.csv_file_subpath.map(lambda x: str(x).split('/')[-1])

# Check for proper ingestion (duplicate file ingestion, et al)
assert len(csv_stats_df.columns) > 4, "Nothing ingested"
assert csv_stats_df.participant_id.nunique() == 26, f"Participant count should be 26, it's {csv_stats_df.participant_id.nunique()} instead"

# Check that all the rows that have more than one unique value in the file_name column for each value in the session_uuid column
mask_series = (csv_stats_df.groupby('session_uuid').csv_file_subpath.transform(Series.nunique) > 1)
assert not mask_series.any(), "You have duplicate files"

print(csv_stats_df.shape)
display(csv_stats_df.groupby('participant_id').size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))
display(csv_stats_df.sample(4).dropna(axis='columns', how='all').T.sample(5))


Get all the Open World logs into one data frame
(158663, 113)


Unnamed: 0_level_0,record_count
participant_id,Unnamed: 1_level_1
2024202,11231
2024211,10888
2024209,10503
2024224,10365
2024218,10261


Unnamed: 0,184600,182194,85357,43864
logger_version,1.4,1.4,1.4,1.4
event_time,2024-03-20 12:21:14,2024-03-20 14:26:07,2024-03-22 14:06:08,2024-03-14 10:48:04
is_scene_aborted,False,False,False,False
patient_record_rotation,"(0.0, -0.1, 0.0, 1.0)",,,
action_tick,24324,456034,499246,751686


In [8]:

# Modalize separate patient ID columns into one
new_column_name = 'patient_id'
patient_id_columns_list = sorted(set(fu.patient_id_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, patient_id_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
patient_id,Unnamed: 1_level_1
Open World Marine 1 Female Root,2313
Patient V Root,2052
Patient U Root,1900
Patient X Root,1003
patient U Root,992


In [9]:

[cn for cn in csv_stats_df.columns if 'applied' in cn]

['tool_applied_row_shape', 'tool_applied_patient_id', 'tool_applied_type', 'tool_applied_attachment_point', 'tool_applied_tool_location', 'tool_applied_sender', 'tool_applied_attach_message', 'tool_applied_data', 'tag_applied_patient_id', 'tag_applied_type']

In [11]:

mask_series = csv_stats_df.patient_id.isnull()
action_types_list = sorted(csv_stats_df[mask_series].action_type.unique())
print(action_types_list)
for action_type in action_types_list:
    mask_series = (csv_stats_df.action_type == action_type) & ~csv_stats_df.patient_id.isnull()
    if mask_series.any():
        print(action_type)
        mask_series = (csv_stats_df.action_type == action_type) & csv_stats_df.patient_id.isnull()
        df = csv_stats_df[mask_series]
        print(
            f'When you are applying a tool ({action_type}) of these types you are not recording a patient ID in the logs:'
            f' {nu.conjunctify_nouns(sorted(df.tool_applied_type.unique()))}'
        )
        break
        display(df.sample(min(df.shape[0], 4)).dropna(axis='columns', how='all').T)

['BAG_ACCESS', 'BAG_CLOSED', 'BUTTON_CLICKED', 'PLAYER_LOCATION', 'SESSION_END', 'SESSION_START', 'SP_O2_TAKEN', 'TAG_DISCARDED', 'TAG_SELECTED', 'TELEPORT', 'TOOL_DISCARDED', 'TOOL_HOVER', 'TOOL_SELECTED', 'TRIAGE_LEVEL_WALK_IF_CAN', 'TRIAGE_LEVEL_WAVED', 'TRIAGE_LEVEL_WAVE_IF_CAN', 'VOICE_CAPTURE', 'VOICE_COMMAND']


In [23]:

sorted(df.tool_applied_type.unique())

['Burn_Dressing', 'Gauze_Dressing', 'Gauze_Pack', 'IV_Blood', 'IV_Saline', 'Pain_Meds', 'SAM_Splint']

In [25]:

print('\\b(' + '|'.join(df.tool_applied_type.unique()) + ')\\b')

\b(IV_Blood|IV_Saline|Gauze_Dressing|Burn_Dressing|Pain_Meds|SAM_Splint|Gauze_Pack)\b


In [33]:

logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
for metrics_type in fu.known_mcivr_metrics_types:
    output_file_path = osp.join(fu.saves_folder, 'csv', f'{metrics_type}.csv')
    with open(output_file_path, mode='w', encoding=nu.encoding_type) as f: print('', file=f)
    for dir_name in listdir(logs_path):
        folder_path = osp.join(logs_path, dir_name)
        for sub_directory, directories_list, files_list in walk(folder_path):
            
            # Iterate over the files in the current subdirectory
            for file_name in files_list:
                
                # If the file is a CSV file, merge it into the subdirectory data frame
                if file_name.endswith('.csv'):
                    
                    # Construct the full path to the file
                    file_path = osp.join(sub_directory, file_name)
                    # if (metrics_type != 'BAG_ACCESS'): print(file_path)
                    
                    # Read CSV file
                    with open(file_path, 'r') as f_input:
                        for line_str in f_input:
                            if re.search(f'^{metrics_type},', line_str):
                                
                                # Add all mentions of each metrics type to its own CSV file
                                with open(output_file_path, mode='a', encoding=nu.encoding_type) as f_output:
                                    print(line_str, end='', file=f_output)

In [37]:

import csv

print()
for metrics_type in fu.known_mcivr_metrics_types:
    
    # Read CSV file using a CSV reader
    output_file_path = osp.join(fu.saves_folder, 'csv', f'{metrics_type}.csv')
    rows_list = []
    with open(osp.abspath(output_file_path), 'r', encoding=nu.encoding_type) as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        for values_list in reader:
            rows_list.append({i: v for i, v in enumerate(values_list)})
    file_df = DataFrame(rows_list).dropna(axis='columns', how='all').dropna(axis='index', how='all')
    # print(metrics_type, file_df.shape)
    
    # Show the columns that have the patient's names in them
    patients_regex = re.compile(r'\b(' + '|'.join(patients_set) + r')\b')
    column_names_list = []
    for column_name in range(19):
        if (column_name in file_df.columns) and any(map(lambda x: patients_regex.search(str(x)), file_df[column_name].tolist())):
            column_names_list.append(column_name-1)

    if column_names_list:
        print(metrics_type, column_names_list)


INJURY_RECORD [5]
INJURY_TREATED [5]
PATIENT_DEMOTED [6]
PATIENT_ENGAGED [6]
BREATHING_CHECKED [5]
PATIENT_RECORD [6]
PULSE_TAKEN [5]
SP_O2_TAKEN [5]
TRIAGE_LEVEL_WALKED [6]
TRIAGE_LEVEL_WALK_IF_CAN [6]
TRIAGE_LEVEL_WAVED [6]
TRIAGE_LEVEL_WAVE_IF_CAN [6]
TAG_APPLIED [4]
TOOL_APPLIED [4]
PLAYER_GAZE [5]


In [19]:

file_df