In [1]:

# Toggle IPython's pretty printing (the recorded output shows this run turned it OFF)
%pprint
import sys

# Add the ../py folder to the module search path
# NOTE(review): presumably this is where frvrs_utils and notebook_utils live — confirm
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from datetime import timedelta
from frvrs_utils import FRVRSUtilities
from notebook_utils import NotebookUtilities
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import os
import os.path as osp
import pandas as pd
import re
import re

# Both utility objects are pointed at the same data and saves folders
folder_kwargs = dict(
    data_folder_path=osp.abspath('../data'),
    saves_folder_path=osp.abspath('../saves'),
)
nu = NotebookUtilities(**folder_kwargs)
fu = FRVRSUtilities(**folder_kwargs)


# Build and deidentify the OSU dataset of Simulation Sessions

In [3]:

# Get all logs into one data frame, reusing the pickled copy when one exists
if nu.pickle_exists('frvrs_logs_df'):
    frvrs_logs_df = nu.load_object('frvrs_logs_df')
else:
    
    # Add the CSVs to the data frame
    frvrs_logs_df = fu.concatonate_logs()
    
    # Remove numerically-named columns (any column whose name contains a digit is dropped)
    columns_list = [x for x in frvrs_logs_df.columns if not re.search(r'\d+', str(x))]
    frvrs_logs_df = frvrs_logs_df[columns_list]
    
    # Convert 'TRUE' and 'FALSE' to boolean values. The True/False keys keep values that are
    # already boolean; without them Series.map would turn those values into NaN.
    boolean_dict = {'TRUE': True, 'FALSE': False, 'True': True, 'False': False, True: True, False: False}
    for cn in [
        'injury_record_injury_treated_with_wrong_treatment', 'injury_record_injury_treated',
        'injury_treated_injury_treated_with_wrong_treatment', 'injury_treated_injury_treated'
    ]:
        frvrs_logs_df[cn] = frvrs_logs_df[cn].map(boolean_dict)
    
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
print(frvrs_logs_df.shape) # (842663, 106)

# Show a few of the rows that have a value in at least one of the "appl" columns.
# Building the mask with any() also works when no column matches (the mask is then all False).
columns_list = [cn for cn in frvrs_logs_df.columns if 'appl' in str(cn)]
mask_series = frvrs_logs_df[columns_list].notnull().any(axis='columns')
df = frvrs_logs_df.loc[mask_series, columns_list]
display(df.sample(min(4, df.shape[0])).dropna(axis='columns', how='all').T)

All CSV files renamed by date/03.10.23.0753.csv


RuntimeError: No active exception to reraise


## Check for duplicate file ingestion

In [None]:

# Flag every row whose session_uuid is associated with more than one distinct file_name;
# any flagged row means a session was ingested from more than one file
file_counts_series = frvrs_logs_df.groupby('session_uuid')['file_name'].transform('nunique')
mask_series = (file_counts_series > 1)
assert not mask_series.any(), "You have duplicate files"

In [7]:

# List the distinct session/file pairs among the rows flagged by mask_series, keyed by row index
columns_list = ['session_uuid', 'file_name']
duplicates_df = frvrs_logs_df.loc[mask_series, columns_list].drop_duplicates()
duplicates_df.T.to_dict()

{56265: {'session_uuid': 'a7804ee3-6a1c-4462-957a-17976a540483', 'file_name': 'Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/data/logs/All CSV files renamed by date/03.15.23.1220.csv'}, 137347: {'session_uuid': '91a84c6a-ba27-4116-8644-e5373b3588de', 'file_name': 'Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/data/logs/All CSV files renamed by date/12.01.22.1551.csv'}, 193921: {'session_uuid': 'ab1f8cd1-8d65-45da-b087-89b25ff46c66', 'file_name': 'Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/data/logs/DCEMS Round 2 only triage sessions/ab1f8cd1-8d65-45da-b087-89b25ff46c66.csv'}, 228447: {'session_uuid': '677d1c18-f292-4bcb-924d-52f5e762533f', 'file_name': 'Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/data/logs/Disaster Day 2022/AD_1154.csv'}, 231330: {'session_uuid': 'aca8a746-0f72-4c8d-87a5-70d836da8768', 'file_name': 'Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/data/logs/Disaster Day 2022/AF.csv'}, 232175: {'session_uuid'


## Add new features according to your increasing domain knowledge

In [None]:

# Modalize the patient ID columns of the individual event types into
# one patient_id column (skipped when that column already exists)
new_column_name = 'patient_id'
if new_column_name not in frvrs_logs_df.columns:
    columns_list = [
        'patient_demoted_id',
        'patient_record_id',
        'injury_record_patient_id',
        's_a_l_t_walk_if_can_patient_id',
        's_a_l_t_walked_patient_id',
        's_a_l_t_wave_if_can_patient_id',
        's_a_l_t_waved_patient_id',
        'patient_engaged_id',
        'pulse_taken_patient_id',
        'injury_treated_patient_id',
        'tool_applied_patient_id',
        'tag_applied_patient_id',
        'player_gaze_patient_id',
    ]
    frvrs_logs_df = nu.modalize_columns(frvrs_logs_df, columns_list, new_column_name)
    
    # Pickle the updated data frame and report its new shape
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    print(frvrs_logs_df.shape) # (842663, 107)

In [None]:

# Modalize the location/position columns of the individual event types into
# one location_id column (skipped when that column already exists)
new_column_name = 'location_id'
if new_column_name not in frvrs_logs_df.columns:
    columns_list = [
        'teleport_location',
        'patient_demoted_position',
        'patient_record_position',
        'injury_record_injury_injury_locator',
        's_a_l_t_walk_if_can_sort_location',
        's_a_l_t_walked_sort_location',
        's_a_l_t_wave_if_can_sort_location',
        's_a_l_t_waved_sort_location',
        'patient_engaged_position',
        'bag_access_location',
        'injury_treated_injury_injury_locator',
        'bag_closed_location',
        'tag_discarded_location',
        'tool_discarded_location',
        'player_location_location',
        'player_gaze_location',
    ]
    frvrs_logs_df = nu.modalize_columns(frvrs_logs_df, columns_list, new_column_name)
    
    # Pickle the updated data frame and report its new shape
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    print(frvrs_logs_df.shape) # (842663, 108)

In [None]:

# Modalize the two injury ID columns into one injury_id column
# (skipped when that column already exists)
new_column_name = 'injury_id'
if new_column_name not in frvrs_logs_df.columns:
    frvrs_logs_df = nu.modalize_columns(
        frvrs_logs_df,
        ['injury_record_id', 'injury_treated_id'],
        new_column_name
    )
    
    # Pickle the updated data frame and report its new shape
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    print(frvrs_logs_df.shape) # (842663, 109)

In [None]:

# Modalize the three patient sort columns into one patient_sort column
# (skipped when that column already exists)
new_column_name = 'patient_sort'
if new_column_name not in frvrs_logs_df.columns:
    frvrs_logs_df = nu.modalize_columns(
        frvrs_logs_df,
        ['patient_demoted_sort', 'patient_record_sort', 'patient_engaged_sort'],
        new_column_name
    )
    
    # Pickle the updated data frame and report its new shape
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    print(frvrs_logs_df.shape) # (829277, 109)

In [None]:

# Any runs longer than 16 minutes are probably an instance
# of someone taking off the headset and setting it on the ground.
# 1 second = 1,000 milliseconds; 1 minute = 60 seconds
new_column_name = 'is_scene_aborted'

# The column is always rebuilt: any existing copy is dropped before it is recomputed
if (new_column_name in frvrs_logs_df.columns): frvrs_logs_df = frvrs_logs_df.drop(columns=new_column_name)
frvrs_logs_df[new_column_name] = False
for group_key, scene_df in frvrs_logs_df.groupby(fu.scene_groupby_columns):
    
    # Pair each grouping column with its value in this group's key rather than using
    # eval() to find a loop variable that happens to be named after the column
    if not isinstance(group_key, tuple): group_key = (group_key,)
    mask_series = True
    for cn, cv in zip(fu.scene_groupby_columns, group_key): mask_series &= (frvrs_logs_df[cn] == cv)
    
    # Every row of the scene gets the scene-level result
    frvrs_logs_df.loc[mask_series, new_column_name] = fu.get_is_scene_aborted(scene_df)

# Store the results and show the new data frame shape
nu.store_objects(frvrs_logs_df=frvrs_logs_df)
print(frvrs_logs_df.shape) # (842663, 111)
display(frvrs_logs_df.groupby('is_scene_aborted').size().to_frame().rename(columns={0: 'count'}))

In [None]:

# Designate a run as "Orientation" when all of its patient IDs are some variant of Mike;
# every other run keeps the default "Triage" scene type
if 'scene_type' not in frvrs_logs_df.columns:
    frvrs_logs_df['scene_type'] = 'Triage'
column_value = 'Orientation'
if column_value not in frvrs_logs_df.scene_type.unique():
    
    def is_all_mike(srs):
        """
        Tell whether every patient ID of one scene contains 'mike', ignoring case.
        
        NOTE(review): str.contains yields NaN for a missing patient ID and all() treats
        NaN as truthy, so missing IDs do not stop a scene from being flagged — confirm
        that this is intended.
        """
        return all(srs.str.lower().str.contains('mike'))
    
    # Broadcast the scene-level answer to each row of the scene and mark the matching rows
    scene_groupby = frvrs_logs_df.groupby(fu.scene_groupby_columns)
    base_mask_series = scene_groupby.patient_id.transform(is_all_mike)
    frvrs_logs_df.loc[base_mask_series, 'scene_type'] = column_value
    
    # Pickle the updated data frame and report its new shape
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    print(frvrs_logs_df.shape) # (842663, 113)

In [None]:

# Get a sample with a clear count of responders: each file's rows are flagged
# with the result of fu.get_is_a_one_triage_file for that file
new_column_name = 'is_a_one_triage_file'
if new_column_name not in frvrs_logs_df.columns:
    
    # Start with every row unflagged, then fill in the flag one file at a time
    frvrs_logs_df[new_column_name] = False
    for file_name in frvrs_logs_df.file_name.unique():
        is_a_one_triage_file = fu.get_is_a_one_triage_file(frvrs_logs_df, file_name)
        frvrs_logs_df.loc[frvrs_logs_df.file_name == file_name, new_column_name] = is_a_one_triage_file
    
    # Pickle the updated data frame and report its new shape
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    print(frvrs_logs_df.shape) # (842663, 111)

In [None]:

# Mask voice capture PII
# OSU screened all of the **VOICE_COMMAND** and **VOICE_CAPTURE** lines and
# replaced any names with either Max or Jane, regardless of whether the name was that of the responder.
# But, just to make sure...
columns_list = ['voice_command_command_description', 'voice_capture_message']

# Only run the masking when no '[PERSON]' placeholder is present in either column yet
if not any(frvrs_logs_df[cn].astype(str).str.contains('[PERSON]', regex=False).any() for cn in columns_list):
    
    # Load the spaCy English pipeline once, falling back to the installed
    # model package when spacy.load cannot find the model by name
    import spacy
    try:
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        import en_core_web_sm
        nlp = en_core_web_sm.load()
    
    def mask_person_names(text):
        """
        Replace every PERSON entity that the spaCy pipeline finds in text with '[PERSON]'.
        
        Parameters:
            text: The value of one voice command or voice capture cell.
        
        Returns:
            The masked string, or the value unchanged when it is not a string.
        """
        if not isinstance(text, str): return text
        masked_text = text
        for entity in nlp(text).ents:
            if entity.label_ == 'PERSON':
                
                # Escape the name so that regex metacharacters in it are matched literally, and use
                # lookarounds instead of \b so that names ending in punctuation are still matched
                name_regex = r'(?<!\w)' + re.escape(entity.text) + r'(?!\w)'
                masked_text = re.sub(name_regex, '[PERSON]', masked_text)
        
        return masked_text
    
    # Mask each column in a single pass; missing values are left untouched
    for cn in columns_list:
        frvrs_logs_df[cn] = frvrs_logs_df[cn].map(mask_person_names, na_action='ignore')
    
    # Store the results and show the new data frame shape
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    print(frvrs_logs_df.shape) # (842663, 110)

In [None]:

# Add a voice capture sentiment score
if ('voice_capture_sentiment_score' not in frvrs_logs_df.columns):
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()
    
    def get_compound_score(voice_capture_message):
        """Return the VADER compound polarity score of the message prefixed with a newline."""
        return sid.polarity_scores('\n' + voice_capture_message)['compound']
    
    # Create the column up front so that it exists even when there are no messages
    # (otherwise the guard above would never become false), then score all the
    # non-null messages in one assignment instead of writing one cell at a time
    mask_series = frvrs_logs_df.voice_capture_message.isnull()
    frvrs_logs_df['voice_capture_sentiment_score'] = np.nan
    frvrs_logs_df.loc[~mask_series, 'voice_capture_sentiment_score'] = [
        get_compound_score(message) for message in frvrs_logs_df.loc[~mask_series, 'voice_capture_message']
    ]
    
    # Store the results and show the new data frame shape
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    print(frvrs_logs_df.shape) # (842663, 110)