In [1]:

# Set up notebook
%pprint
import sys

# osp must come from the standard library here: the FRVRS import that also
# provides osp lives in the next cell and runs only after this path entry exists.
import os.path as osp

# Make the sibling ../py folder importable (idempotent: only inserted once)
py_folder = osp.join('..', 'py')
if (py_folder not in sys.path): sys.path.insert(1, py_folder)

Pretty printing has been turned OFF


In [2]:

# load libraries
from FRVRS import (nu, fu, DataFrame, osp, listdir, nan, concat, Series)
from pandas import get_dummies
from re import MULTILINE, search, split, sub
from scipy.stats import f_oneway, ttest_ind, kruskal, norm
import inspect
import itertools
import json
import matplotlib.pyplot as plt

# Debug switch: when True, the cells below print diagnostic record counts
# and pass verbose=True to the FRVRS helper calls
IS_DEBUG = False

In [None]:

# In the zip there are 51 folders, (51 JSON, 51 CSV).
# All the files are named appropriately in the folder/CSV/json UUID_ParticipantID.
# Some of the internal Participants IDs might be off because the moderator forgot to enter a Participant ID or didn't enter
# the Participant ID correctly so we needed to figure out which participant it was.
# So only utilize the UUID and Participant ID that is on the file name to identify and ignore the internal Participant IDs.

# Get all the Open World logs into one data frame
logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
directories_list = listdir(logs_path)
frames_list = []
for dir_name in directories_list:
    folder_path = osp.join(logs_path, dir_name)
    
    # Skip stray files (e.g., OS metadata) that are not session folders
    if not osp.isdir(folder_path): continue
    
    # Add the CSVs to the data frame
    df = fu.concatonate_logs(logs_folder=folder_path)
    
    # Trust the folder name (UUID_ParticipantID) over the IDs inside the logs;
    # split on the last underscore only, since the participant ID is the final token
    session_uuid, participant_id = dir_name.rsplit('_', 1)
    df['session_uuid'] = session_uuid
    df['participant_id'] = int(participant_id)
    
    # Remove columns whose names contain any digit; take a copy so the
    # assignments below don't write into a slice of the original frame
    columns_list = [x for x in df.columns if not search(r'\d+', str(x))]
    df = df[columns_list].copy()
    
    # Convert 'TRUE' and 'FALSE' to boolean values (any other value becomes NaN)
    for cn in fu.boolean_columns_list:
        df[cn] = df[cn].map({'TRUE': True, 'FALSE': False, 'True': True, 'False': False})
    
    # Convert the nulls into NaNs
    df = df.replace(['null', 'nan'], nan)
    
    # Collect the data frame for the current subdirectory; concatenating once
    # after the loop avoids re-copying the accumulated frame on every iteration
    frames_list.append(df)

# Fail with a clear message instead of an obscure error further down
assert frames_list, "Nothing ingested"
csv_stats_df = concat(frames_list, axis='index')

# De-duplicate first, then renumber, so the resulting index has no gaps
csv_stats_df = csv_stats_df.drop_duplicates().reset_index(drop=True)
csv_stats_df['csv_file_name'] = csv_stats_df.csv_file_subpath.map(lambda x: str(x).split('/')[-1])

# Check for proper ingestion (duplicate file ingestion, et al)
assert len(csv_stats_df.columns) > 4, "Nothing ingested"
assert csv_stats_df.participant_id.nunique() == 26, f"Participant count should be 26, it's {csv_stats_df.participant_id.nunique()} instead"

# NOTE(review): 276926 is presumably the record count seen on an earlier run - confirm
if IS_DEBUG: print(csv_stats_df.groupby('logger_version').size().to_frame().rename(columns={0: 'record_count'})) # 276926

# Verify that no session_uuid is associated with more than one CSV file subpath
mask_series = (csv_stats_df.groupby('session_uuid').csv_file_subpath.transform(Series.nunique) > 1)
assert not mask_series.any(), "You have duplicate files"

# Check that all your junk scenes are the last scenes
if IS_DEBUG: print(csv_stats_df.groupby('is_scene_aborted').size().to_frame().rename(columns={0: 'record_count'}))

# Compare against True so NaNs left by the boolean mapping count as "not aborted"
# instead of making the boolean indexing raise
mask_series = csv_stats_df.is_scene_aborted.eq(True)

# NOTE(review): the tuple unpacking assumes fu.scene_groupby_columns is
# [session_uuid, scene_id], in that order - confirm
for (session_uuid, scene_id), scene_df in csv_stats_df[mask_series].groupby(fu.scene_groupby_columns):
    
    # An aborted scene must be the highest-numbered scene of its session
    session_mask_series = (csv_stats_df.session_uuid == session_uuid)
    max_scene_id = csv_stats_df[session_mask_series].scene_id.max()
    assert max_scene_id == scene_id, "You've got junk scenes in strange places"

In [None]:

# Strip the Unity suffix from every patient ID column.
# CACI assigns the ID without "Root"; Unity appends "Root" when it
# builds the hierarchy. Dropping the suffix makes the two forms match.
for cn in fu.patient_id_columns_list + ['patient_id']:
    
    # Some of the listed columns may be absent from the ingested logs
    if cn not in csv_stats_df.columns:
        continue
    
    # Only rewrite populated cells so nulls stay null rather than becoming 'nan' strings
    mask_series = csv_stats_df[cn].notnull()
    csv_stats_df.loc[mask_series, cn] = csv_stats_df.loc[mask_series, cn].map(
        lambda x: str(x).replace(' Root', '')
    )

In [None]:

# Verify that every patient in our Open World list shows up in the
# injury-record patient IDs (this only asserts; no rows are removed)
patients_set = set(fu.ow_patients_list)
mask_series = csv_stats_df.injury_record_patient_id.notnull()
all_set = set(csv_stats_df.loc[mask_series, 'injury_record_patient_id'])
assert patients_set <= all_set, f"You're missing {patients_set.difference(all_set)} from the patients in the CSVs"

In [None]:

# Modalize separate columns into one
# NOTE(review): presumably each call folds the per-action variants of a field
# into a single column of that name - confirm against fu.add_modal_column_to_dataframe

# ID-like columns are explicitly kept non-categorical
for modal_cn in ['patient_id', 'injury_id', 'location_id']:
    csv_stats_df = fu.add_modal_column_to_dataframe(modal_cn, csv_stats_df, is_categorical=False, verbose=IS_DEBUG)

# The remaining columns use the helper's default is_categorical setting
for modal_cn in [
    'patient_sort', 'patient_pulse', 'patient_salt', 'patient_hearing', 'patient_breath', 'patient_mood',
    'patient_pose', 'injury_severity', 'injury_required_procedure', 'injury_body_region', 'tool_type'
]:
    csv_stats_df = fu.add_modal_column_to_dataframe(modal_cn, csv_stats_df, verbose=IS_DEBUG)

for categorical_cn in ['pulse_taken_pulse_name', 'tool_applied_data']:
    csv_stats_df = fu.convert_column_to_categorical(csv_stats_df, categorical_cn, verbose=IS_DEBUG)

# Verify that every patient in our Open World list survived into the modal patient_id column
patients_set = set(fu.ow_patients_list)
mask_series = ~csv_stats_df.patient_id.isnull()
all_set = set(csv_stats_df[mask_series].patient_id)
assert patients_set.issubset(all_set), f"You're missing {patients_set.difference(all_set)} from the patients in the CSVs"

AssertionError: Our patients lists are not in the CSVs

In [4]:

# Show the distinct values present in the patient_id column
csv_stats_df.patient_id.unique()

array([nan])