In [2]:

# Set up notebook
%pprint
import sys
if (osp.join('..', 'py') not in sys.path): sys.path.insert(1, osp.join('..', 'py'))

Pretty printing has been turned OFF


In [3]:

# load libraries
from FRVRS import (nu, fu, osp, re, listdir, walk, DataFrame, nan, concat, Series, display, csv)
from pandas import get_dummies
from re import split, search, sub, MULTILINE
from scipy.stats import f_oneway, ttest_ind, kruskal, norm
import itertools


# Find unaccounted-for patient IDs in weird columns

In [24]:

# Analyze the non-patients to see if there are typos of Jennifer's patients
patients_set = set(fu.ow_patients_list)

In [9]:

logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
rows_list = []
for id_prefix in patients_set:
    prefix_regex = re.compile(',' + id_prefix)
    for dir_name in listdir(logs_path):
        logs_folder = osp.join(logs_path, dir_name)
        for sub_directory, directories_list, files_list in walk(logs_folder):
            for file_name in files_list:
                if file_name.endswith('.csv'):
                    file_path = osp.join(sub_directory, file_name)
                    with open(file_path, 'r') as f_input:
                        for line_str in f_input:
                            if prefix_regex.search(line_str):
                                row_dict = {'id_prefix': id_prefix, 'file_path': file_path, 'line_str': line_str}
                                nonroot_regex = re.compile(',' + id_prefix + ',')
                                if nonroot_regex.search(line_str):
                                    row_dict['is_nonroot'] = True
                                else:
                                    row_dict['is_nonroot'] = False
                                root_regex = re.compile(',' + id_prefix + ' Root,')
                                if root_regex.search(line_str):
                                    row_dict['has_root_suffix'] = True
                                else:
                                    row_dict['has_root_suffix'] = False
                                rows_list.append(row_dict)
root_prefix_df = DataFrame(rows_list)

In [23]:

from IPython.display import HTML

print(
    "\n@Jacob Audick @Kaitlyn Choy For the CSV logs in @Brian Pippin (TA3/CACI)'s Human_Sim_Metrics_Data_4-12-2024,"
    " for @Jennifer McVay (TA3/CACI)'s list of characters within each CSV"
    " (n the ITM BBAI Exploratory analysis email on March 25th), we have the following alternate spellings:\n"
)
for index_tuple, row_series in root_prefix_df.groupby(
    ['id_prefix', 'has_root_suffix']
).size().to_frame().rename(columns={0: 'line_count'}).iterrows():
    id_prefix, has_root_suffix = index_tuple
    line_count = row_series.line_count
    if has_root_suffix:
        print(f'"{id_prefix} Root": {line_count} instances')
    else:
        print(f'"{id_prefix}": {line_count} instances')
print('\nShould we trust both the "Root" suffix and the non-root spelling as among the list of characters?')


@Jacob Audick @Kaitlyn Choy For the CSV logs in @Brian Pippin (TA3/CACI)'s Human_Sim_Metrics_Data_4-12-2024, for @Jennifer McVay (TA3/CACI)'s list of characters within each CSV (n the ITM BBAI Exploratory analysis email on March 25th), we have the following alternate spellings:

"Civilian 1 Female": 17 instances
"Civilian 1 Female Root": 259 instances
"Marine 1 Male": 7 instances
"Marine 1 Male Root": 674 instances
"Marine 2 Male": 9 instances
"Marine 2 Male Root": 391 instances
"Marine 3 Male": 25 instances
"Marine 3 Male Root": 522 instances
"Marine 4 Male": 7 instances
"Marine 4 Male Root": 791 instances
"Navy Soldier 1 Male": 4 instances
"Navy Soldier 1 Male Root": 483 instances
"Navy Soldier 2 Male": 11 instances
"Navy Soldier 2 Male Root": 467 instances
"Navy Soldier 3 Male": 3 instances
"Navy Soldier 3 Male Root": 480 instances
"Navy Soldier 4 Female": 10 instances
"Navy Soldier 4 Female Root": 2553 instances
"Open World Civilian 1 Male": 11 instances
"Open World Civilian 1 Mal

In [3]:

# In the zip there are 51 folders, (51 JSON, 51 CSV).
# All the files are named appropriated in the folder/CSV/json UUID_ParticipantID.
# Some of the internal Participants IDs might be off because the moderator forgot to enter a Participant ID or didn't enter
# the Participant ID correctly so we needed to figure out which participant it was.
# So only utilize the UUID and Participant ID that is on the file name to identify and ignore the internal Participant IDs.
print("\nGet all the Open World logs into one data frame")
csv_stats_df = DataFrame([])
logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
for dir_name in listdir(logs_path):
    
    # Add the CSVs to the data frame
    folder_path = osp.join(logs_path, dir_name)
    df = fu.concatonate_logs(logs_folder=folder_path, verbose=True)
    
    session_uuid, participant_id = dir_name.split('_')
    df['session_uuid'] = session_uuid
    df['participant_id'] = int(participant_id)
    
    # Remove numerically-named columns
    columns_list = [x for x in df.columns if not search(r'\d+', str(x))]
    df = df[columns_list]
    
    # Convert 'TRUE' and 'FALSE' to boolean values
    for cn in fu.boolean_columns_list:
        df[cn] = df[cn].map({'TRUE': True, 'FALSE': False, 'True': True, 'False': False})
    
    # Convert the nulls into NaNs
    for cn in df.columns: df[cn] = df[cn].replace(['null', 'nan'], nan)
    
    # Append the data frame for the current subdirectory to the main data frame and break the participant ID loop
    csv_stats_df = concat([csv_stats_df, df], axis='index')

csv_stats_df = csv_stats_df.reset_index(drop=True).drop_duplicates()
csv_stats_df['csv_file_name'] = csv_stats_df.csv_file_subpath.map(lambda x: str(x).split('/')[-1])

# Check for proper ingestion (duplicate file ingestion, et al)
assert len(csv_stats_df.columns) > 4, "Nothing ingested"
assert csv_stats_df.participant_id.nunique() == 26, f"Participant count should be 26, it's {csv_stats_df.participant_id.nunique()} instead"

# Check that all the rows that have more than one unique value in the file_name column for each value in the session_uuid column
mask_series = (csv_stats_df.groupby('session_uuid').csv_file_subpath.transform(Series.nunique) > 1)
assert not mask_series.any(), "You have duplicate files"

# Show what the datset looks like
print(csv_stats_df.shape)
display(csv_stats_df.groupby('participant_id').size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))
display(csv_stats_df.sample(4).dropna(axis='columns', how='all').T.sample(5))


Get all the Open World logs into one data frame
(158663, 111)


Unnamed: 0_level_0,record_count
participant_id,Unnamed: 1_level_1
2024202,11231
2024211,10888
2024209,10503
2024224,10365
2024218,10261


Unnamed: 0,62716,77340,34596,48571
player_gaze_direction_of_gaze,,,"(0.4, 0.9, 0.4)",
player_location_location,,"(-12.9, 1.5, 1.3)",,
is_scene_aborted,False,False,False,False
session_uuid,385032e9-9801-4dcf-a841-b3703a0d9acd,499179ba-3138-4bae-918e-ffc7fb943760,1995e7ef-ef02-4fc1-b1ab-f137dbf69d48,37a554ee-fc49-4730-819c-2d97727bb0b7
player_gaze_patient_id,,,Patient V Root,


In [4]:

# Get the patients lists
patients_regex = re.compile(r',(' + '|'.join(patients_set) + r')\b')

In [5]:

# Add all mentions of TA3 patients to its own CSV file
logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
directories_list = listdir(logs_path)
output_file_path = osp.join(fu.saves_folder, 'csv', 'desert_jungle_submarine_urban_patients.csv')
with open(output_file_path, mode='w', encoding=nu.encoding_type) as f: print('', file=f)
with open(output_file_path, mode='a', encoding=nu.encoding_type) as f_output:
    for dir_name in directories_list:
        folder_path = osp.join(logs_path, dir_name)
        for sub_directory, directories_list, files_list in walk(folder_path):
            
            # Iterate over the files in the current subdirectory
            for file_name in files_list:
                
                # If the file is a CSV file, merge it into the subdirectory data frame
                if file_name.endswith('.csv'):
                    
                    # Construct the full path to the file
                    file_path = osp.join(sub_directory, file_name)
                    
                    # Read CSV file
                    with open(file_path, 'r') as f_input:
                        for line_str in f_input:
                            if patients_regex.search(line_str):
                                print(line_str, end='', file=f_output)

# Read CSV file using a CSV reader
rows_list = []
with open(osp.abspath(output_file_path), 'r', encoding=nu.encoding_type) as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    for values_list in reader:
        rows_list.append({i: v for i, v in enumerate(values_list)})
file_df = DataFrame(rows_list).dropna(axis='columns', how='all').dropna(axis='index', how='all')

# Show the columns that have the patient's names in them
patients_regex = re.compile(r'\b(' + '|'.join(patients_set) + r')\b')
for column_name in range(19):
    if any(map(lambda x: patients_regex.search(str(x)), file_df[column_name].tolist())):
        print(column_name)

5
6
7


In [6]:

# Modalize separate patient ID columns into one
new_column_name = 'patient_id'
patient_id_columns_list = sorted(set(fu.patient_id_columns_list).intersection(set(csv_stats_df.columns)))
csv_stats_df = nu.modalize_columns(csv_stats_df, patient_id_columns_list, new_column_name)
display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
    'record_count', ascending=False
).head(5))

Unnamed: 0_level_0,record_count
patient_id,Unnamed: 1_level_1
Open World Marine 1 Female Root,2313
Patient V Root,2052
Patient U Root,1900
Patient X Root,1003
patient U Root,992


In [7]:

# Get the name of the row_dict.shape column
[cn for cn in csv_stats_df.columns if 'applied' in cn]

['tool_applied_row_shape', 'tool_applied_patient_id', 'tool_applied_type', 'tool_applied_attachment_point', 'tool_applied_tool_location', 'tool_applied_data', 'tag_applied_patient_id', 'tag_applied_type']

In [8]:

# Show all action types that make no mention of a patient and those that sometimes do
mask_series = csv_stats_df.patient_id.isnull()
action_types_list = sorted(csv_stats_df[mask_series].action_type.unique())
print(action_types_list)
for action_type in action_types_list:
    mask_series = (csv_stats_df.action_type == action_type) & ~csv_stats_df.patient_id.isnull()
    if mask_series.any():
        print(action_type)
        mask_series = (csv_stats_df.action_type == action_type) & csv_stats_df.patient_id.isnull()
        df = csv_stats_df[mask_series]
        print(
            f'When you are applying a tool ({action_type}) of these types you are not recording a patient ID in the logs:'
            f' {nu.conjunctify_nouns(sorted(df.tool_applied_type.unique()))}'
        )
        break
        display(df.sample(min(df.shape[0], 4)).dropna(axis='columns', how='all').T)

['BAG_ACCESS', 'BAG_CLOSED', 'BUTTON_CLICKED', 'PLAYER_LOCATION', 'SESSION_END', 'SESSION_START', 'SP_O2_TAKEN', 'TAG_DISCARDED', 'TAG_SELECTED', 'TELEPORT', 'TOOL_DISCARDED', 'TOOL_HOVER', 'TOOL_SELECTED', 'TRIAGE_LEVEL_WALK_IF_CAN', 'TRIAGE_LEVEL_WAVED', 'TRIAGE_LEVEL_WAVE_IF_CAN', 'VOICE_CAPTURE', 'VOICE_COMMAND']


In [9]:

# Add all mentions of each metrics type to its own CSV file
logs_path = osp.join(nu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')
for metrics_type in fu.known_mcivr_metrics:
    output_file_path = osp.join(fu.saves_folder, 'csv', f'{metrics_type}.csv')
    with open(output_file_path, mode='w', encoding=nu.encoding_type) as f: print('', file=f)
    for dir_name in listdir(logs_path):
        folder_path = osp.join(logs_path, dir_name)
        for sub_directory, directories_list, files_list in walk(folder_path):
            
            # Iterate over the files in the current subdirectory
            for file_name in files_list:
                
                # If the file is a CSV file, merge it into the subdirectory data frame
                if file_name.endswith('.csv'):
                    
                    # Construct the full path to the file
                    file_path = osp.join(sub_directory, file_name)
                    # if (metrics_type != 'BAG_ACCESS'): print(file_path)
                    
                    # Read CSV file
                    with open(file_path, 'r') as f_input:
                        for line_str in f_input:
                            if re.search(f'^{metrics_type},', line_str):
                                
                                # Add all mentions of each metrics type to its own CSV file
                                with open(output_file_path, mode='a', encoding=nu.encoding_type) as f_output:
                                    print(line_str, end='', file=f_output)

In [10]:

# Show the columns that have the patient's names in them for each metrics type
print()
for metrics_type in fu.known_mcivr_metrics:
    
    # Read CSV file using a CSV reader
    output_file_path = osp.join(fu.saves_folder, 'csv', f'{metrics_type}.csv')
    if osp.isfile(output_file_path):
        rows_list = []
        with open(osp.abspath(output_file_path), 'r', encoding=nu.encoding_type) as f:
            reader = csv.reader(f, delimiter=',', quotechar='"')
            for values_list in reader:
                rows_list.append({i: v for i, v in enumerate(values_list)})
        file_df = DataFrame(rows_list).dropna(axis='columns', how='all').dropna(axis='index', how='all')
        # print(metrics_type, file_df.shape)
        
        # Show the columns that have the patient's names in them
        patients_regex = re.compile(r'\b(' + '|'.join(patients_set) + r')\b')
        column_names_list = []
        for column_name in range(19):
            if (column_name in file_df.columns) and any(map(lambda x: patients_regex.search(str(x)), file_df[column_name].tolist())):
                column_names_list.append(column_name-1)
        if column_names_list:
            # print(metrics_type, column_names_list)
            print(f"            df.loc[row_index, '{metrics_type.lower()}_patient_id'] = row_series{column_names_list} # patientId")


            df.loc[row_index, 'injury_record_patient_id'] = row_series[5] # patientId
            df.loc[row_index, 'injury_treated_patient_id'] = row_series[5] # patientId
            df.loc[row_index, 'patient_demoted_patient_id'] = row_series[6] # patientId
            df.loc[row_index, 'patient_engaged_patient_id'] = row_series[6] # patientId
            df.loc[row_index, 'breathing_checked_patient_id'] = row_series[5] # patientId
            df.loc[row_index, 'patient_record_patient_id'] = row_series[6] # patientId
            df.loc[row_index, 'pulse_taken_patient_id'] = row_series[5] # patientId
            df.loc[row_index, 'sp_o2_taken_patient_id'] = row_series[5] # patientId
            df.loc[row_index, 'triage_level_walked_patient_id'] = row_series[6] # patientId
            df.loc[row_index, 'triage_level_walk_if_can_patient_id'] = row_series[6] # patientId
            df.loc[row_index, 'triage_level_waved_patient_id'] = row_series[6] # patientId
            df.loc[row_in

In [11]:

# Simplify the contingencies in TOOL_APPLIED
metrics_type = 'TOOL_APPLIED'
output_file_path = osp.join(fu.saves_folder, 'csv', f'{metrics_type}.csv')
if osp.isfile(output_file_path):
    rows_list = []
    with open(osp.abspath(output_file_path), 'r', encoding=nu.encoding_type) as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        for values_list in reader:
            rows_list.append({i: v for i, v in enumerate(values_list)})
        file_df = DataFrame(rows_list).dropna(axis='columns', how='all').dropna(axis='index', how='all')

In [12]:

# Write out a statement you can show to the developers
print(f"""\nOkay, as far as {metrics_type} goes, I'm assuming that:""")
for i, list_name in zip(range(5, 13), ['patientId', 'type', 'attachmentPoint', 'toolLocation', 'data']):
    parts_list = re.split('([A-Z]+)', list_name, 0)
    if len(parts_list) > 1:
        column_suffix = parts_list[0] + '_' + ''.join(parts_list[1:]).lower()
    else:
        column_suffix = parts_list[0]
    mask_series = ~file_df[i].isnull()
    list_str = nu.conjunctify_nouns(sorted([str(x).replace(' (UnityEngine.GameObject)', '') for x in file_df[mask_series][i].unique()]))
    print(f'These are all examples of {list_name}: {list_str}')
    # print(f"            df.loc[row_index, 'tool_applied_{column_suffix}'] = row_series[{i-1}] # {list_name}")
print('Is this correct?')


Okay, as far as TOOL_APPLIED goes, I'm assuming that:
These are all examples of patientId: Adept Shooter Root, Adept Victim Root, Civilian 1, Civilian 1 Female, Civilian 1 Female Root, Civilian 1 Root, Desert Level Core, Local Soldier 1, Local Soldier 1 Root, Marine 1 Male, Marine 1 Male Root, Marine 2 Male, Marine 2 Male Root, Marine 3 Male, Marine 3 Male Root, Marine 4 Male, Marine 4 Male Root, Navy Soldier 1 Male, Navy Soldier 1 Male Root, Navy Soldier 2 Male, Navy Soldier 2 Male Root, Navy Soldier 3 Male, Navy Soldier 3 Male Root, Navy Soldier 4 Female, Navy Soldier 4 Female Root, Open World Civilian 1 Male, Open World Civilian 1 Male Root, Open World Civilian 2 Female, Open World Civilian 2 Female Root, Open World Marine 1 Female, Open World Marine 1 Female Root, Open World Marine 1 Male, Open World Marine 1 Male Root, Open World Marine 2 Female, Open World Marine 2 Female Root, Open World Marine 2 Male, Open World Marine 2 Male Root, Open World Marine 3 Male, Open World Marine 3