In [1]:

# Toggle IPython pretty printing (this cell's output reports it is now OFF)
%pprint
import sys
# Add ../py to the module search path (presumably where notebook_utils lives -- confirm)
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

import os
import numpy as np
from pandas import DataFrame
import pandas as pd
import re
from notebook_utils import NotebookUtilities
from datetime import timedelta
import matplotlib.pyplot as plt
import re

# Shared utilities object rooted at ../data; later cells use it to cache and save
# clean_csvs_df (pickle_exists, load_object, store_objects, save_dataframes)
nu = NotebookUtilities(data_folder_path=os.path.abspath('../data'))


# OSU dataset of Simulation Sessions, deidentified

Doug screened all of the **VOICE_COMMAND** and **VOICE_CAPTURE** lines and replaced any names with either Max or Jane, regardless of whether the name was that of the trainee.

In [3]:

# Group the event times by consecutive minutes
def get_datetime_indexes(df):
    """
    Split the distinct event times of a DataFrame into runs with no gap longer
    than one minute and return one minute-frequency DatetimeIndex per run.

    Parameters:
        df: DataFrame with an ``event_time`` column of timestamps.

    Returns:
        A list of ``pd.DatetimeIndex`` objects, one per run, ordered by time.
        Each index spans from the first to the last time stamp of its run at a
        one-minute frequency. An empty list is returned when ``event_time`` has
        no non-null values.
    """
    by_minute = timedelta(minutes=1)

    # Distinct, non-null time stamps in ascending order (NaT cannot be ordered
    # or differenced meaningfully, so it is dropped up front)
    time_stamps = df.event_time.dropna().drop_duplicates().sort_values().tolist()
    if not time_stamps:
        return []

    # Start a new group whenever the gap to the previous time stamp exceeds a minute
    groups = []
    current_group = [time_stamps[0]]
    for previous_stamp, time_stamp in zip(time_stamps[:-1], time_stamps[1:]):
        if (time_stamp - previous_stamp) > by_minute:
            groups.append(current_group)
            current_group = []
        current_group.append(time_stamp)
    groups.append(current_group)

    # Each group is already sorted, so its first and last entries are its bounds
    return [
        pd.date_range(start=stamps_list[0], end=stamps_list[-1], freq=by_minute)
        for stamps_list in groups
    ]

In [4]:

# Get all CSVs into one DataFrame

# The first four CSV columns are renamed to action_type, elapsed_time, event_time
# and session_uuid below; the action-specific fields start at this column label
FIRST_FIELD_COLUMN = 4

# Column-name suffixes shared by several action types, in positional CSV order
INJURY_FIELDS = (
    'id', 'patient_id', 'required_procedure', 'severity', 'body_region', 'injury_treated',
    'injury_treated_with_wrong_treatment', 'injury_injury_locator'
)
PATIENT_FIELDS = (
    'health_level', 'health_time_remaining', 'id', 'position', 'rotation', 'salt', 'sort', 'pulse', 'breath',
    'hearing', 'mood', 'pose'
)
SALT_FIELDS = ('sort_location', 'sort_command_text', 'patient_id')

# Action type -> suffixes of the columns filled from CSV fields 4, 5, 6, ...
# SESSION_START and SESSION_END carry no extra fields, so they are not listed
ACTION_TYPE_FIELDS = {
    'BAG_ACCESS': ('location',),
    'BAG_CLOSED': ('location',),
    'INJURY_RECORD': INJURY_FIELDS,
    'INJURY_TREATED': INJURY_FIELDS,
    'PATIENT_DEMOTED': PATIENT_FIELDS,
    'PATIENT_ENGAGED': PATIENT_FIELDS,
    'PATIENT_RECORD': PATIENT_FIELDS,
    'PULSE_TAKEN': ('pulse_name', 'patient_id'),
    'S_A_L_T_WALKED': SALT_FIELDS,
    'S_A_L_T_WALK_IF_CAN': SALT_FIELDS,
    'S_A_L_T_WAVED': SALT_FIELDS,
    'S_A_L_T_WAVE_IF_CAN': SALT_FIELDS,
    'TAG_APPLIED': ('patient_id', 'type'),
    'TAG_DISCARDED': ('type', 'location'),
    'TAG_SELECTED': ('type',),
    'TELEPORT': ('location',),
    'TOOL_APPLIED': (
        'patient_id', 'type', 'attachment_point', 'tool_location', 'data', 'sender', 'attach_message'
    ),
    'TOOL_DISCARDED': ('type', 'count', 'location'),
    'TOOL_HOVER': ('type', 'count'),
    'TOOL_SELECTED': ('type', 'count'),
    'VOICE_CAPTURE': ('message', 'command_description'),
    'VOICE_COMMAND': ('message', 'command_description'),
}

# Destination column names: the lower-cased action type joined to each suffix,
# e.g. BAG_ACCESS -> bag_access_location
ACTION_TYPE_COLUMNS = {
    action_type: [f'{action_type.lower()}_{suffix}' for suffix in suffixes]
    for action_type, suffixes in ACTION_TYPE_FIELDS.items()
}


def read_clean_csvs(csvs_folder):
    """
    Read every header-less CSV under csvs_folder whose name starts with "clean"
    (case-insensitive) and stack them into one DataFrame.

    Each file's rows get a file_name column holding the file name with the
    '.csv', 'clean-' and 'Clean ' parts removed.

    Raises:
        FileNotFoundError: if no matching CSV file is found.
    """
    file_dfs_list = []
    for sub_directory, directories_list, files_list in os.walk(csvs_folder):
        for file_name in files_list:
            if file_name.endswith('.csv') and file_name.lower().startswith('clean'):
                file_path = os.path.join(sub_directory, file_name)
                file_df = pd.read_csv(file_path, header=None, index_col=False)
                file_df['file_name'] = file_name.replace('.csv', '').replace('clean-', '').replace('Clean ', '')
                file_dfs_list.append(file_df)
    if not file_dfs_list:
        raise FileNotFoundError(f'No clean CSV files found under {csvs_folder}')

    # Concatenate once instead of growing the frame inside the loop
    return pd.concat(file_dfs_list, axis='index')


def add_action_columns(df):
    """
    Add, in place, a time_group column and the action-specific named columns.

    time_group restarts at 0 whenever session_uuid differs from the previous
    row's and is incremented by each SESSION_START row. For every action type in
    ACTION_TYPE_COLUMNS the positional fields (column labels 4, 5, ...) of each
    row are copied into that action type's named columns.
    """
    old_session_uuid = ''
    time_group = 0

    # Kept row-wise on purpose so column creation order and stored values stay
    # exactly as they were with the original if/elif chain
    for row_index, row_series in df.iterrows():
        action_type = row_series.action_type
        new_session_uuid = row_series.session_uuid
        if new_session_uuid != old_session_uuid:
            time_group = 0
            old_session_uuid = new_session_uuid
        if action_type == 'SESSION_START':
            time_group += 1
        df.loc[row_index, 'time_group'] = time_group

        for offset, column_name in enumerate(ACTION_TYPE_COLUMNS.get(action_type, ())):
            field_value = row_series[FIRST_FIELD_COLUMN + offset]
            if column_name == 'tool_applied_patient_id':

                # Only values containing ' Root' are stored as the patient ID;
                # the isinstance guard keeps a missing (NaN) field from raising
                if not (isinstance(field_value, str) and (' Root' in field_value)):
                    continue
            df.loc[row_index, column_name] = field_value


def get_first_valid_value(srs):
    """Return the value found at the first non-null position of a row Series."""
    return srs[srs.first_valid_index()]


def collapse_columns(df, source_columns, target_column):
    """
    Fill target_column, in place, from the rows of source_columns.

    Rows with exactly one distinct non-null value across source_columns get that
    value; all other rows get NaN.
    """
    mask_series = (df[source_columns].apply(pd.Series.nunique, axis='columns') == 1)
    df.loc[~mask_series, target_column] = np.nan
    df.loc[mask_series, target_column] = df[mask_series][source_columns].apply(
        get_first_valid_value, axis='columns'
    )


if nu.pickle_exists('clean_csvs_df'):
    clean_csvs_df = nu.load_object('clean_csvs_df')
else:
    clean_csvs_df = read_clean_csvs('../data/csv')
    clean_csvs_df[2] = pd.to_datetime(clean_csvs_df[2], format='%m/%d/%Y %H:%M')

    # Fix columns
    clean_csvs_df = clean_csvs_df.reset_index(drop=True)
    columns_list = ['action_type', 'elapsed_time', 'event_time', 'session_uuid']
    clean_csvs_df.columns = columns_list + clean_csvs_df.columns.tolist()[len(columns_list):]

    # Label every row with its time group and spread the positional fields into named columns
    add_action_columns(clean_csvs_df)

    # Add time groups to any rows still missing one
    mask_series = clean_csvs_df.time_group.isnull()
    if mask_series.any():
        columns_list = ['session_uuid', 'elapsed_time', 'event_time']
        srs = clean_csvs_df[columns_list].groupby(['session_uuid']).apply(get_datetime_indexes)
        for session_uuid in srs.index:
            for datetime_index in srs[session_uuid]:
                mask_series = (clean_csvs_df.session_uuid == session_uuid) & clean_csvs_df.event_time.isin(datetime_index)

                # mode() returns a Series; assigning it through .loc would align on
                # index labels, so take the scalar and fill only the missing rows
                modes_series = clean_csvs_df.loc[mask_series, 'time_group'].mode()
                if not modes_series.empty:
                    null_mask_series = mask_series & clean_csvs_df.time_group.isnull()
                    clean_csvs_df.loc[null_mask_series, 'time_group'] = modes_series.iloc[0]
    clean_csvs_df.time_group = clean_csvs_df.time_group.astype('int64')

    # Remove numeric columns (copy so the assignments below act on an independent frame)
    columns_list = [x for x in clean_csvs_df.columns if not re.search(r'\d+', str(x))]
    clean_csvs_df = clean_csvs_df[columns_list].copy()

    # Convert 'TRUE' and 'FALSE' to boolean values
    for cn in [
        'injury_record_injury_treated_with_wrong_treatment', 'injury_record_injury_treated',
        'injury_treated_injury_treated_with_wrong_treatment', 'injury_treated_injury_treated'
    ]:
        clean_csvs_df[cn] = clean_csvs_df[cn].map({'TRUE': True, 'FALSE': False})

    # Collapse into one patient ID column from the rest
    columns_list = [
        'patient_demoted_id', 'patient_record_id', 'injury_record_patient_id', 's_a_l_t_walk_if_can_patient_id',
        's_a_l_t_walked_patient_id', 's_a_l_t_wave_if_can_patient_id', 's_a_l_t_waved_patient_id', 'patient_engaged_id',
        'pulse_taken_patient_id', 'injury_treated_patient_id', 'tool_applied_patient_id', 'tag_applied_patient_id'
    ]
    collapse_columns(clean_csvs_df, columns_list, 'patient_id')

    # Collapse into one location ID column from the rest; the source columns are
    # those holding at least one value that looks like "(x.x, y.y, z.z)"
    location_regex = re.compile(r'\((-?\d\.\d, ){2}-?\d\.\d\)')
    srs = clean_csvs_df.applymap(lambda x: bool(location_regex.fullmatch(str(x))), na_action='ignore').sum()
    columns_list = srs[srs != 0].index.tolist()
    collapse_columns(clean_csvs_df, columns_list, 'location_id')

    nu.store_objects(clean_csvs_df=clean_csvs_df)
    nu.save_dataframes(clean_csvs_df=clean_csvs_df)
print(clean_csvs_df.shape)
df = clean_csvs_df.sample(4).dropna(axis='columns', how='all')
df.T

(149166, 101)


Unnamed: 0,71169,96460,2035,85550
action_type,PATIENT_ENGAGED,TOOL_HOVER,TOOL_HOVER,TOOL_HOVER
elapsed_time,544227,225104,1421254,127898
event_time,2023-05-10 10:49:00,2023-06-12 14:33:00,2023-05-16 15:22:00,2023-05-09 08:55:00
session_uuid,741e494b-63ff-4ea2-ad88-657f2276045d,a0987257-801e-44c5-a1ad-81e0083bfa46,06574b6f-ab02-432c-9a65-7b031218a270,8f6097ce-795d-483a-a811-e6a5423bc59d
file_name,741e494b-63ff-4ea2-ad88-657f2276045d,a0987257-801e-44c5-a1ad-81e0083bfa46,06574b6f-ab02-432c-9a65-7b031218a270,8f6097ce-795d-483a-a811-e6a5423bc59d
time_group,2,1,5,1
patient_engaged_health_level,100,,,
patient_engaged_health_time_remaining,Infinity,,,
patient_engaged_id,Gloria_6 Root,,,
patient_engaged_position,"(-1.2, 0.0, 3.1)",,,


In [5]:

# Mask voice capture PII
columns_list = ['voice_command_command_description', 'voice_capture_message']

# Skip the whole step when a '[PERSON]' placeholder is already present in either column
if not clean_csvs_df[columns_list].applymap(lambda x: '[PERSON]' in str(x), na_action='ignore').sum().sum():

    # Loaded only when masking is still needed; one load of the model is enough
    import spacy
    nlp = spacy.load('en_core_web_sm')

    def mask_person_names(text):
        """
        Return text with every span that the spaCy model labels PERSON replaced
        by the literal '[PERSON]'. Non-string values are returned unchanged.
        """
        if not isinstance(text, str):
            return text
        new_text = text
        for entity in nlp(text).ents:
            if entity.label_ == 'PERSON':

                # Escape the entity so regex metacharacters in a name are matched
                # literally; the lookarounds act as word boundaries that also work
                # when the entity starts or ends with a non-word character
                name_regex = r'(?<!\w)' + re.escape(entity.text) + r'(?!\w)'
                new_text = re.sub(name_regex, '[PERSON]', new_text)

        return new_text

    # Rewrite only the non-null cells of each column
    for column_name in columns_list:
        mask_series = clean_csvs_df[column_name].notnull()
        clean_csvs_df.loc[mask_series, column_name] = clean_csvs_df.loc[mask_series, column_name].map(
            mask_person_names
        )

    nu.store_objects(clean_csvs_df=clean_csvs_df)
    nu.save_dataframes(clean_csvs_df=clean_csvs_df)

Pickling to C:\Users\DaveBabbitt\Documents\GitHub\notebooks\saves\pkl\clean_csvs_df.pkl
Saving to C:\Users\DaveBabbitt\Documents\GitHub\notebooks\saves\csv\clean_csvs_df.csv
