In [1]:

%pprint
# `os` and `osp` are needed by this very cell, so import them here rather than
# relying on a later cell (a fresh kernel would otherwise raise a NameError)
import os
import os.path as osp
import sys

# Put the sibling `py` folder on the module search path (once) so that
# the project's local packages can be imported by the cells that follow
if (osp.join(os.pardir, 'py') not in sys.path):
    sys.path.insert(1, osp.join(os.pardir, 'py'))

Pretty printing has been turned OFF


In [2]:

import os

import pandas as pd

from FRVRS import (fu, nu, osp, DataFrame, np, re, walk, to_numeric, to_datetime, nan, Series, display, notnull, concat, remove)


# Replace UUIDs with Cleaned and Revised File Info

In [3]:

# Restore the combined logs data frame (all CSVs in one frame) from its pickle, when one has been stored
if nu.pickle_exists('frvrs_logs_df'):
    frvrs_logs_df = nu.load_object('frvrs_logs_df')

    # Report the (rows, columns) of what was loaded
    print(frvrs_logs_df.shape)

(829277, 113)


In [4]:

# Get all deduped logs into one data frame, reusing the pickled copy when there is one
if nu.pickle_exists('sub_directory_df'):
    sub_directory_df = nu.load_object('sub_directory_df')
else:

    # Build the folder path with osp.join so the comparison below matches what walk
    # yields on every platform (a hard-coded backslash only ever matches on Windows)
    deduped_folder = osp.join(nu.data_logs_folder, 'Double runs removed')

    # Iterate over the subdirectories, directories, and files in the logs folder
    for sub_directory, directories_list, files_list in walk(nu.data_logs_folder):
        if (sub_directory == deduped_folder):

            # Create a data frame to store the data for the current subdirectory
            sub_directory_df = DataFrame([])

            # Iterate over the files in the current subdirectory
            for file_name in files_list:

                # If the file is a CSV file, merge it into the subdirectory data frame
                if file_name.endswith('.csv'): sub_directory_df = fu.process_files(sub_directory_df, sub_directory, file_name)

    # Convert event time to a datetime
    if ('event_time' in sub_directory_df.columns): sub_directory_df['event_time'] = to_datetime(sub_directory_df['event_time'], format='mixed')

    # Convert elapsed time to an integer
    # NOTE(review): astype('int64') raises if the coercion above it produced any NaN
    if ('action_tick' in sub_directory_df.columns):
        sub_directory_df['action_tick'] = to_numeric(sub_directory_df['action_tick'], errors='coerce')
        sub_directory_df['action_tick'] = sub_directory_df['action_tick'].astype('int64')

    sub_directory_df = sub_directory_df.reset_index(drop=True)

    # Remove every column whose name contains a digit
    # NOTE(review): the pattern matches a digit anywhere in the name, not only all-numeric names
    columns_list = [x for x in sub_directory_df.columns if not re.search(r'\d+', str(x))]
    sub_directory_df = sub_directory_df[columns_list]

    # Convert 'TRUE' and 'FALSE' to boolean values (anything not in the mapping becomes NaN)
    for cn in [
        'injury_record_injury_treated_with_wrong_treatment', 'injury_record_injury_treated',
        'injury_treated_injury_treated_with_wrong_treatment', 'injury_treated_injury_treated'
    ]:
        sub_directory_df[cn] = sub_directory_df[cn].map({'TRUE': True, 'FALSE': False, 'True': True, 'False': False})

    # Convert the nulls into NaNs in the frame being built here
    # (this used to clean frvrs_logs_df, leaving the 'null' strings in sub_directory_df)
    for cn in sub_directory_df.columns: sub_directory_df[cn] = sub_directory_df[cn].replace('null', nan)

    # Store the results and show the new data frame shape
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape)

In [7]:

# Show up to four randomly-chosen rows, without their all-null columns, transposed so each row reads as a column
display(
    sub_directory_df
    .sample(min(4, len(sub_directory_df)))
    .dropna(axis='columns', how='all')
    .T
)

Unnamed: 0,24969,28663,22629,7203
action_type,TOOL_HOVER,TOOL_HOVER,TOOL_HOVER,TOOL_HOVER
action_tick,782340,412856,627142,569178
event_time,2022-03-16 12:28:00,2023-08-03 08:02:00,2022-03-16 11:14:00,2022-03-15 11:15:00
session_uuid,0b1d6253-9c4f-43b7-8217-6f0e486aabe7,ab1f8cd1-8d65-45da-b087-89b25ff46c66,0d3e0c62-db52-40f9-9ee8-7fc84a1dcbf2,22e2c9a3-b93f-4f7f-896a-ca188c78505b
file_name,Double runs removed/22.03.16.1215r.csv,Double runs removed/23.08.03.0755r.csv,Double runs removed/22.03.16.1104r.csv,Double runs removed/22.03.15.1106r.csv
logger_version,1.0,1.0,1.0,1.0
tool_hover_type,Hemostatic Gauze,Hemostatic Gauze,Gauze,Tourniquet
tool_hover_count,1001,1000,1006,994
scene_id,0,0,0,0
is_scene_aborted,False,False,False,False



## Check for duplicate file ingestion

In [8]:

# Flag every row whose session UUID is associated with more than one distinct file name
mask_series = sub_directory_df.groupby('session_uuid')['file_name'].transform('nunique').gt(1)

# A session that shows up under several file names means the same log was ingested more than once
assert not mask_series.any(), "You have duplicate files"


## Add new features according to your increasing domain knowledge

In [9]:

# Modalize into one patient ID column if possible (skipped when the column is already there)
new_column_name = 'patient_id'
if new_column_name not in sub_directory_df.columns:

    # The event-specific columns that each carry a patient ID
    columns_list = [
        'patient_demoted_patient_id',
        'patient_record_patient_id',
        'injury_record_patient_id',
        's_a_l_t_walk_if_can_patient_id',
        's_a_l_t_walked_patient_id',
        's_a_l_t_wave_if_can_patient_id',
        's_a_l_t_waved_patient_id',
        'patient_engaged_patient_id',
        'pulse_taken_patient_id',
        'injury_treated_patient_id',
        'tool_applied_patient_id',
        'tag_applied_patient_id',
        'player_gaze_patient_id',
    ]
    sub_directory_df = nu.modalize_columns(sub_directory_df, columns_list, new_column_name)

    # Persist the updated frame and report its (rows, columns)
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape)

In [10]:

# Modalize into one location ID column if possible (skipped when the column is already there)
new_column_name = 'location_id'
if new_column_name not in sub_directory_df.columns:

    # The event-specific columns that each carry a location or position
    columns_list = [
        'teleport_location',
        'patient_demoted_position',
        'patient_record_position',
        'injury_record_injury_injury_locator',
        's_a_l_t_walk_if_can_sort_location',
        's_a_l_t_walked_sort_location',
        's_a_l_t_wave_if_can_sort_location',
        's_a_l_t_waved_sort_location',
        'patient_engaged_position',
        'bag_access_location',
        'injury_treated_injury_injury_locator',
        'bag_closed_location',
        'tag_discarded_location',
        'tool_discarded_location',
        'player_location_location',
        'player_gaze_location',
    ]
    sub_directory_df = nu.modalize_columns(sub_directory_df, columns_list, new_column_name)

    # Persist the updated frame and report its (rows, columns)
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape)

In [11]:

# Modalize into one injury ID column if possible (skipped when the column is already there)
new_column_name = 'injury_id'
if new_column_name not in sub_directory_df.columns:

    # Only the injury record and injury treated events carry an injury ID
    sub_directory_df = nu.modalize_columns(
        sub_directory_df,
        ['injury_record_id', 'injury_treated_id'],
        new_column_name
    )

    # Persist the updated frame and report its (rows, columns)
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape)

In [12]:

# Add a voice capture sentiment score (skipped when the column is already there)
new_column_name = 'voice_capture_sentiment_score'
if new_column_name not in sub_directory_df.columns:
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()

    # Score only the rows that actually have a voice capture message
    mask_series = sub_directory_df.voice_capture_message.isnull()
    for row_index, message in sub_directory_df.loc[~mask_series, 'voice_capture_message'].items():

        # The message is scored with a leading newline; VADER's compound value is what gets stored
        polarity_scores_dict = sid.polarity_scores('\n' + message)
        sub_directory_df.loc[row_index, new_column_name] = polarity_scores_dict['compound']

    # Persist the updated frame and report its (rows, columns)
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape)

In [13]:

# Mask voice capture PII
# OSU screened all of the **VOICE_COMMAND** and **VOICE_CAPTURE** lines and
# replaced any names with either Max or Jane, regardless of whether the name was that of the responder.
# But, just to make sure...
columns_list = ['voice_command_command_description', 'voice_capture_message']

# Only run when no '[PERSON]' placeholder is present yet in either text column
if not sub_directory_df[columns_list].applymap(lambda x: '[PERSON]' in str(x), na_action='ignore').sum().sum():
    import spacy

    # Load the language pipeline once (it used to be loaded twice, discarding the first copy)
    nlp = spacy.load('en_core_web_sm')

    # Restrict the work to rows that have text in at least one of the two columns
    mask_series = sub_directory_df.voice_command_command_description.isnull() & sub_directory_df.voice_capture_message.isnull()
    df = sub_directory_df[~mask_series]
    def mask_pii(srs):
        """
        Replace every PERSON entity found in the row's text columns with '[PERSON]'.

        Parameters:
            srs: one row of the data frame; the columns named in columns_list are read and overwritten.

        Returns:
            The same row with its non-null text columns masked.
        """
        for idx in columns_list:
            new_text = srs[idx]
            if notnull(new_text):
                doc = nlp(new_text)
                for entity in doc.ents:
                    if entity.label_ == 'PERSON':

                        # Escape the entity so any regex metacharacters in a name are matched literally
                        new_text = re.sub(r'\b' + re.escape(entity.text) + r'\b', '[PERSON]', new_text)
                srs[idx] = new_text

        return srs

    # Write the masked, non-null values back into the full data frame
    for row_index, row_series in df.apply(mask_pii, axis='columns')[columns_list].iterrows():
        for column_name, column_value in row_series.items():
            if notnull(column_value):
                sub_directory_df.loc[row_index, column_name] = column_value

    # Store the results and show the new data frame shape
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape)

In [14]:

# Any run longer than 16 minutes is probably an instance
# of someone taking off the headset and setting it on the ground.
# 1 second = 1,000 milliseconds; 1 minute = 60 seconds
new_column_name = 'is_scene_aborted'

# Only rebuild the flag when its source data is available; an existing column is
# otherwise left alone (it used to be dropped even when it could not be rebuilt)
if nu.pickle_exists('time_to_engagement_df'):

    # Recompute from scratch: discard any stale flag and default every row to not aborted
    if (new_column_name in sub_directory_df.columns): sub_directory_df = sub_directory_df.drop(columns=new_column_name)
    sub_directory_df[new_column_name] = False

    # Get the per-scene minimums of the rows that have a start-to-last-engagement value
    time_to_engagement_df = nu.load_object('time_to_engagement_df')
    mask_series = ~time_to_engagement_df.start_to_last_engagement.isnull()
    min_start_to_engagement_df = time_to_engagement_df[mask_series].groupby(fu.scene_groupby_columns).min().reset_index(drop=False).sort_values('start_to_last_engagement')

    # Filter it for overly-long runs
    sixteen_minutes = 1_000 * 60 * 16
    mask_series = (min_start_to_engagement_df.start_to_last_engagement > sixteen_minutes)

    # Get the run's entire history and mark it as aborted
    for group_key, _ in min_start_to_engagement_df[mask_series].groupby(fu.scene_groupby_columns):

        # Pair each groupby column with its key value instead of eval-ing the column name
        if not isinstance(group_key, tuple): group_key = (group_key,)
        mask_series = True
        for cn, cv in zip(fu.scene_groupby_columns, group_key): mask_series &= (sub_directory_df[cn] == cv)
        sub_directory_df.loc[mask_series, new_column_name] = True

    # Store the results and show the new data frame shape
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape)
    display(sub_directory_df.groupby('is_scene_aborted').size().to_frame().rename(columns={0: 'count'}))

Pickling to C:\Users\DaveBabbitt\Documents\GitHub\itm-analysis-reporting\saves\pkl\sub_directory_df.pkl
(28978, 113)


Unnamed: 0_level_0,count
is_scene_aborted,Unnamed: 1_level_1
False,28978


In [15]:

# Check if all the patient IDs in any run are some variant of Mike and designate those runs as "Orientation"
# Every row starts out as "Triage" the first time through
if 'scene_type' not in sub_directory_df.columns:
    sub_directory_df['scene_type'] = 'Triage'
column_value = 'Orientation'

# Skip the work when some rows have already been designated
if not sub_directory_df.scene_type.eq(column_value).any():

    def all_ids_mention_mike(patient_ids_srs):
        """Return True when every patient ID in the scene contains 'mike', ignoring case."""
        # NOTE(review): with object-dtype IDs a missing value yields NaN here, which all() counts as truthy - confirm that is intended
        return all(patient_ids_srs.str.lower().str.contains('mike'))

    # Evaluate the test once per scene, broadcast the answer to the scene's rows, and mark the matches
    patient_ids_groupby = sub_directory_df.groupby(fu.scene_groupby_columns).patient_id
    base_mask_series = patient_ids_groupby.transform(all_ids_mention_mike)
    sub_directory_df.loc[base_mask_series, 'scene_type'] = column_value

    # Persist the updated frame and report its (rows, columns)
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape)

In [16]:

# Get a sample with a clear count of responders
new_column_name = 'is_a_one_triage_file'
if (new_column_name not in sub_directory_df.columns):
    sub_directory_df[new_column_name] = False

    # Create the triage count data frame
    time_groups_dict = {}
    mask_series = (sub_directory_df.scene_type == 'Triage') & (sub_directory_df.is_scene_aborted == False)
    for (file_name), file_name_df in sub_directory_df[mask_series].groupby('file_name'):
        actions_list = []
        
        # Add the scene type for this run
        for (scene_id), scene_df in file_name_df.groupby('scene_id'):
            scene_type = fu.get_scene_type(scene_df)
            if len(scene_type) != 1: raise
            else: scene_type = scene_type[0]
            actions_list.append(scene_type)
        
        time_groups_dict[file_name] = actions_list
    triage_count_df = DataFrame([{'file_name': k, 'triage_count': v.count('Triage')} for k, v in time_groups_dict.items()])

    # Filter only those file which have only one triage run
    mask_series = (triage_count_df.triage_count == 1)
    file_names_list = triage_count_df[mask_series].file_name.tolist()
    mask_series = sub_directory_df.file_name.isin(file_names_list)
    sub_directory_df.loc[mask_series, new_column_name] = True
    
    nu.store_objects(sub_directory_df=sub_directory_df)
    print(sub_directory_df.shape) # (28978, 113)

In [17]:

# Overwrite the player gaze and player location columns with NaN (creating any that are absent)
for cn in (
    'player_gaze_distance_to_patient',
    'player_gaze_direction_of_gaze',
    'player_location_right_hand_location',
    'player_location_left_hand_location',
    'player_gaze_location',
    'player_location_location',
    'player_gaze_patient_id',
):
    sub_directory_df[cn] = nan

# Report, as a parenthesized regex alternation, any main-frame columns the deduped frame still lacks
missing_columns = list(set(frvrs_logs_df.columns) - set(sub_directory_df.columns))
if missing_columns:
    print(f"'({'|'.join(missing_columns)})'")

In [19]:

# Replace the old file rows with the new file rows
replaced_sessions_list = []
replacement_dfs_list = []
old_file_paths_list = []
for session_uuid, file_df in sub_directory_df.groupby('session_uuid'):

    # Find this session's rows in the main data frame
    mask_series = (frvrs_logs_df.session_uuid == session_uuid)

    # A session is expected to come from exactly one file; .item() raises otherwise
    file_name = frvrs_logs_df[mask_series].file_name.unique().item()
    old_file_paths_list.append(osp.join(nu.data_logs_folder, file_name))
    print(file_name, frvrs_logs_df[mask_series].shape, file_df.shape)

    # Remember what to swap so the big frame is filtered and concatenated only once
    replaced_sessions_list.append(session_uuid)
    replacement_dfs_list.append(file_df)

# Remove the old rows of every replaced session, then append all of the new rows in one pass
mask_series = frvrs_logs_df.session_uuid.isin(replaced_sessions_list)
frvrs_logs_df = concat([frvrs_logs_df[~mask_series]] + replacement_dfs_list, axis='index')

# Convert event time to a datetime
if ('event_time' in frvrs_logs_df.columns): frvrs_logs_df['event_time'] = to_datetime(frvrs_logs_df['event_time'], format='mixed')

# Convert elapsed time to an integer
# NOTE(review): astype('int64') raises if the coercion above it produced any NaN
if ('action_tick' in frvrs_logs_df.columns):
    frvrs_logs_df['action_tick'] = to_numeric(frvrs_logs_df['action_tick'], errors='coerce')
    frvrs_logs_df['action_tick'] = frvrs_logs_df['action_tick'].astype('int64')

frvrs_logs_df = frvrs_logs_df.reset_index(drop=True)
nu.store_objects(frvrs_logs_df=frvrs_logs_df)

# Remove the old files only now that the updated frame is safely stored,
# so that a failure above cannot leave the source files deleted with nothing saved
for file_path in old_file_paths_list:
    if osp.exists(file_path): remove(file_path)

Disaster Day 2022/AS_1056.csv (1737, 113) (1371, 113)
Disaster Day 2022/NEM_1053.csv (1046, 113) (719, 113)
Disaster Day 2022/JH_1235.csv (1440, 113) (872, 113)
Disaster Day 2022/CF_1117.csv (1518, 113) (1079, 113)
Disaster Day 2022/EB_0128.csv (1875, 113) (1085, 113)
Disaster Day 2022/JM_1119.csv (1054, 113) (882, 113)
Disaster Day 2022/MM_1136.csv (1593, 113) (537, 113)
Disaster Day 2022/RO_1152.csv (1168, 113) (645, 113)
Disaster Day 2022/BC_1136.csv (1350, 113) (1133, 113)
Disaster Day 2022/HL_1034.csv (1456, 113) (969, 113)
Disaster Day 2022/RP_0853.csv (934, 113) (526, 113)
Disaster Day 2022/AD_1154.csv (2883, 113) (1890, 113)
Disaster Day 2022/MB_0927.csv (1194, 113) (789, 113)
Disaster Day 2022/JL_1013.csv (5093, 113) (3095, 113)
Disaster Day 2022/ES_0938.csv (1625, 113) (1065, 113)
All CSV files renamed by date/12.01.22.1551.csv (420, 113) (404, 113)
Disaster Day 2022/PA_132.csv (2251, 113) (1373, 113)
Disaster Day 2022/TS_0900.csv (2252, 113) (1919, 113)
All CSV files renamed

In [4]:

# Strip the local repository path prefix from every file name in the main data frame
path_prefix = 'Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/data/logs/'

def strip_path_prefix(file_name):
    """Return the file name as a string with every occurrence of path_prefix removed."""
    return str(file_name).replace(path_prefix, '')

frvrs_logs_df['file_name'] = frvrs_logs_df['file_name'].map(strip_path_prefix)

# Persist the updated frame and report its (rows, columns)
nu.store_objects(frvrs_logs_df=frvrs_logs_df)
print(frvrs_logs_df.shape)

Pickling to C:\Users\DaveBabbitt\Documents\GitHub\itm-analysis-reporting\saves\pkl\frvrs_logs_df.pkl
(829277, 113)
