In [1]:

%pprint
import sys
sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

import os
import numpy as np
from pandas import DataFrame
import pandas as pd
import re
from notebook_utils import NotebookUtilities
from datetime import timedelta
import matplotlib.pyplot as plt
import re

# Project helper object; later cells call its pickle_exists, load_object, store_objects
# and save_dataframes methods. It is pointed at the ../data folder (resolved to an
# absolute path here).
nu = NotebookUtilities(data_folder_path=os.path.abspath('../data'))


# OSU dataset of Simulation Sessions, deidentified

Doug screened all of the **VOICE_COMMAND** and **VOICE_CAPTURE** lines and replaced any names with either Max or Jane, regardless of whether the name was that of the trainee.

In [3]:

# Group the event times by consecutive minutes
def get_datetime_indexes(df):
    """
    Split a frame's event times into runs separated by gaps of more than one minute
    and return a minute-frequency DatetimeIndex spanning each run.

    Parameters:
        df: DataFrame with an ``event_time`` column holding datetime values.

    Returns:
        list of pandas.DatetimeIndex, one per run in chronological order. Each index
        goes from the run's earliest to its latest time stamp in one-minute steps.
        The list is empty when ``df`` has no non-null event times.
    """
    by_minute = timedelta(minutes=1)

    # Distinct time stamps in chronological order. Missing values cannot be ordered
    # or used as date_range endpoints, so they are dropped up front.
    time_stamps = sorted(df.event_time.dropna().unique())
    if not time_stamps:
        return []

    # Start a new group whenever the gap since the previous time stamp exceeds a minute
    groups = [[time_stamps[0]]]
    for previous_stamp, time_stamp in zip(time_stamps[:-1], time_stamps[1:]):
        if time_stamp - previous_stamp > by_minute:
            groups.append([])
        groups[-1].append(time_stamp)

    # Each group is already sorted, so its endpoints are its first and last items
    return [
        pd.date_range(start=stamps_list[0], end=stamps_list[-1], freq=by_minute)
        for stamps_list in groups
    ]

In [4]:

def set_time_groups(frvrs_logs_df):
    """
    Number every row with its time group and unpack the per-action payload columns.

    Rows are visited in frame order. The time group counter restarts at zero whenever
    ``session_uuid`` differs from the previous row's and is incremented by every
    SESSION_START row, so a SESSION_START row carries the new group number and a
    SESSION_END row carries the number of the group it closes. All other rows are
    also handed to ``set_mcivr_metrics_types``, which copies their positional payload
    into named columns.

    Parameters:
        frvrs_logs_df: DataFrame with ``action_type``, ``session_uuid``,
            ``elapsed_time`` and ``event_time`` columns plus the numbered payload
            columns. It is modified in place.

    Returns:
        tuple ``(gaze_rows_list, frvrs_logs_df)`` where ``gaze_rows_list`` holds the
        PLAYER_GAZE and PLAYER_LOCATION rows as yielded by ``iterrows`` and
        ``frvrs_logs_df`` has an int64 ``time_group`` column.
    """

    # Section off player actions by session start and end
    old_session_uuid = ''
    time_group = 0  # bound up front so a first row with an empty session_uuid cannot hit an unbound name
    gaze_rows_list = []
    for row_index, row_series in frvrs_logs_df.iterrows():
        action_type = row_series.action_type
        new_session_uuid = row_series.session_uuid
        if new_session_uuid != old_session_uuid:
            time_group = 0
            old_session_uuid = new_session_uuid
        if action_type == 'SESSION_START':
            time_group += 1
        frvrs_logs_df.loc[row_index, 'time_group'] = time_group

        # Session boundary rows carry no payload to unpack
        if action_type in ('SESSION_START', 'SESSION_END'):
            continue
        frvrs_logs_df = set_mcivr_metrics_types(action_type, frvrs_logs_df, row_index, row_series)
        if action_type in ('PLAYER_GAZE', 'PLAYER_LOCATION'):
            gaze_rows_list.append(row_series)

    # A frame without rows never enters the loop, so the column has to be created here
    if 'time_group' not in frvrs_logs_df.columns:
        frvrs_logs_df['time_group'] = float('nan')

    # Fallback: give rows that still lack a time group the most common group among the
    # rows of the same session that fall into the same run of consecutive minutes
    if frvrs_logs_df['time_group'].isnull().any():
        columns_list = ['session_uuid', 'elapsed_time', 'event_time']
        srs = frvrs_logs_df[columns_list].groupby(['session_uuid']).apply(get_datetime_indexes)
        for session_uuid in srs.index:
            for datetime_index in srs[session_uuid]:
                mask_series = (frvrs_logs_df.session_uuid == session_uuid) & frvrs_logs_df.event_time.isin(datetime_index)
                modes_srs = frvrs_logs_df.loc[mask_series, 'time_group'].mode()
                if modes_srs.empty:
                    continue

                # mode() returns a Series indexed 0..k; assigning it through .loc would
                # align on those labels, so broadcast its first value instead and only
                # into the rows that are actually missing a group
                fill_mask_series = mask_series & frvrs_logs_df['time_group'].isnull()
                frvrs_logs_df.loc[fill_mask_series, 'time_group'] = modes_srs.iloc[0]
    frvrs_logs_df['time_group'] = frvrs_logs_df['time_group'].astype('int64')

    return gaze_rows_list, frvrs_logs_df

In [5]:

def modalize_patient_id(frvrs_logs_df):
    """
    Collapse the per-action patient ID columns into a single ``patient_id`` column.

    A row receives a patient ID only when its patient ID columns hold exactly one
    distinct non-null value; rows with no value, or with conflicting values, get NaN.

    Parameters:
        frvrs_logs_df: logs DataFrame, modified in place. Patient ID columns that
            are absent from the frame are ignored.

    Returns:
        The same DataFrame with the ``patient_id`` column (re)written.
    """
    columns_list = [
        'patient_demoted_id', 'patient_record_id', 'injury_record_patient_id', 's_a_l_t_walk_if_can_patient_id',
        's_a_l_t_walked_patient_id', 's_a_l_t_wave_if_can_patient_id', 's_a_l_t_waved_patient_id', 'patient_engaged_id',
        'pulse_taken_patient_id', 'injury_treated_patient_id', 'tool_applied_patient_id', 'tag_applied_patient_id',
        'player_gaze_patient_id'
    ]

    # A source column only exists if its action type occurred in the logs, so
    # consult just the ones that are present instead of raising a KeyError
    columns_list = [cn for cn in columns_list if cn in frvrs_logs_df.columns]
    if not columns_list:
        frvrs_logs_df['patient_id'] = np.nan

        return frvrs_logs_df

    # First non-null value of each row, scanning the columns in the order listed
    first_valid_srs = pd.Series(np.nan, index=frvrs_logs_df.index, dtype='object')
    for cn in columns_list:
        first_valid_srs = first_valid_srs.where(first_valid_srs.notnull(), frvrs_logs_df[cn])

    # Keep that value only where it is the row's single distinct patient ID
    mask_series = (frvrs_logs_df[columns_list].nunique(axis='columns') == 1)
    frvrs_logs_df['patient_id'] = first_valid_srs.where(mask_series)

    return frvrs_logs_df

In [6]:

def modalize_location_id(frvrs_logs_df):
    """
    Collapse the per-action location columns into a single ``location_id`` column.

    A row receives a location only when its location columns hold exactly one
    distinct non-null value; rows with no value, or with conflicting values, get NaN.

    Parameters:
        frvrs_logs_df: logs DataFrame, modified in place. Location columns that are
            absent from the frame are ignored.

    Returns:
        The same DataFrame with the ``location_id`` column (re)written.
    """

    # Original author's note: all locations have zero in the y dimension and positive
    # floats in the x and z dimensions

    # Modalize into one location ID column if possible
    columns_list = [
        'teleport_location', 'patient_demoted_position', 'patient_record_position', 'injury_record_injury_injury_locator',
        's_a_l_t_walk_if_can_sort_location', 's_a_l_t_walked_sort_location', 's_a_l_t_wave_if_can_sort_location',
        's_a_l_t_waved_sort_location', 'patient_engaged_position', 'bag_access_location', 'injury_treated_injury_injury_locator',
        'bag_closed_location', 'tag_discarded_location', 'tool_discarded_location', 'player_location_location',
        'player_gaze_location'
    ]

    # A source column only exists if its action type occurred in the logs, so
    # consult just the ones that are present instead of raising a KeyError
    columns_list = [cn for cn in columns_list if cn in frvrs_logs_df.columns]
    if not columns_list:
        frvrs_logs_df['location_id'] = np.nan

        return frvrs_logs_df

    # First non-null value of each row, scanning the columns in the order listed
    first_valid_srs = pd.Series(np.nan, index=frvrs_logs_df.index, dtype='object')
    for cn in columns_list:
        first_valid_srs = first_valid_srs.where(first_valid_srs.notnull(), frvrs_logs_df[cn])

    # Keep that value only where it is the row's single distinct location
    mask_series = (frvrs_logs_df[columns_list].nunique(axis='columns') == 1)
    frvrs_logs_df['location_id'] = first_valid_srs.where(mask_series)

    return frvrs_logs_df

In [7]:

def concatonate_logs(logs_folder='../data/logs'):
    """
    Read every log CSV found in the ``v.1.0`` and ``v.1.3`` folders into one DataFrame.

    Folders whose path ends in ``v.1.0`` contribute the ``.csv`` files whose name
    starts with "clean" (case-insensitive); they are read header-less by pandas.
    Folders whose path ends in ``v.1.3`` contribute every ``.csv`` file; each row has
    its fifth field (the logger version) removed and one trailing empty field
    dropped, so the remaining fields line up with the v.1.0 layout. Files in any
    other folder are ignored.

    Parameters:
        logs_folder: root folder that is walked recursively.

    Returns:
        DataFrame with integer-named positional columns plus ``file_name`` and
        ``logger_version``, a fresh 0..n-1 index, column 2 parsed as datetimes and
        column 1 converted to int64. Empty when no log file is found.

    Raises:
        IndexError: if a non-blank v.1.3 row has fewer than five fields.
        ValueError: if a value in column 1 is not numeric (pandas raises its
            ``IntCastingNaNError`` subclass from the int64 cast).
    """
    import csv  # imported here so the function does not rely on a later cell having run

    sub_directory_dfs_list = []
    for sub_directory, directories_list, files_list in os.walk(logs_folder):
        is_v1_0 = sub_directory.endswith('v.1.0')
        is_v1_3 = sub_directory.endswith('v.1.3')

        # Files outside the two known logger-version folders have no defined layout
        if not (is_v1_0 or is_v1_3):
            continue

        file_dfs_list = []
        for file_name in files_list:
            if not file_name.endswith('.csv'):
                continue
            if is_v1_0 and not file_name.lower().startswith('clean'):
                continue
            file_path = os.path.join(sub_directory, file_name)
            if is_v1_0:
                df = pd.read_csv(file_path, header=None, index_col=False)
                logger_version = 1.0
            else:
                version_number = '1.3'
                rows_list = []

                # The csv module expects files opened with newline=''
                with open(file_path, 'r', newline='') as f:
                    for values_list in csv.reader(f, delimiter=',', quotechar='"'):
                        if not values_list:
                            continue  # blank line
                        version_number = values_list.pop(4)  # Remove version number column
                        if values_list[-1] == '':
                            values_list.pop(-1)
                        rows_list.append(dict(enumerate(values_list)))
                df = pd.DataFrame(rows_list)

                # The version recorded on the file's last row is applied to the whole file
                logger_version = float(version_number)
            df['file_name'] = file_name
            df['logger_version'] = logger_version
            file_dfs_list.append(df)
        if not file_dfs_list:
            continue
        sub_directory_df = pd.concat(file_dfs_list, axis='index')

        # Parse the third column as a date column
        if 2 in sub_directory_df.columns:
            date_format = '%m/%d/%Y %H:%M' if is_v1_0 else 'mixed'
            sub_directory_df[2] = pd.to_datetime(sub_directory_df[2], format=date_format)

        sub_directory_dfs_list.append(sub_directory_df)

    if sub_directory_dfs_list:
        frvrs_logs_df = pd.concat(sub_directory_dfs_list, axis='index')
    else:
        frvrs_logs_df = pd.DataFrame([])

    # Convert event time to a datetime
    if 2 in frvrs_logs_df.columns:
        frvrs_logs_df[2] = pd.to_datetime(frvrs_logs_df[2], format='mixed')

    # Convert elapsed time to an integer; the cast raises if any value failed to parse
    if 1 in frvrs_logs_df.columns:
        frvrs_logs_df[1] = pd.to_numeric(frvrs_logs_df[1], errors='coerce')
        frvrs_logs_df[1] = frvrs_logs_df[1].astype('int64')

    frvrs_logs_df = frvrs_logs_df.reset_index(drop=True)

    return frvrs_logs_df

In [8]:

# Payload layout per action type: the numbered column the first name is read from,
# followed by the named columns that consecutive numbered columns are copied into.
# TOOL_APPLIED and PLAYER_GAZE list only their unconditional columns here; their
# leading columns depend on the row's content and are handled in the function.
_MCIVR_PAYLOAD_COLUMNS = {
    'BAG_ACCESS': (4, ('bag_access_location',)),
    'BAG_CLOSED': (4, ('bag_closed_location',)),
    'INJURY_RECORD': (4, (
        'injury_record_id', 'injury_record_patient_id', 'injury_record_required_procedure',
        'injury_record_severity', 'injury_record_body_region', 'injury_record_injury_treated',
        'injury_record_injury_treated_with_wrong_treatment', 'injury_record_injury_injury_locator',
    )),
    'INJURY_TREATED': (4, (
        'injury_treated_id', 'injury_treated_patient_id', 'injury_treated_required_procedure',
        'injury_treated_severity', 'injury_treated_body_region', 'injury_treated_injury_treated',
        'injury_treated_injury_treated_with_wrong_treatment', 'injury_treated_injury_injury_locator',
    )),
    'PATIENT_DEMOTED': (4, (
        'patient_demoted_health_level', 'patient_demoted_health_time_remaining', 'patient_demoted_id',
        'patient_demoted_position', 'patient_demoted_rotation', 'patient_demoted_salt',
        'patient_demoted_sort', 'patient_demoted_pulse', 'patient_demoted_breath',
        'patient_demoted_hearing', 'patient_demoted_mood', 'patient_demoted_pose',
    )),
    'PATIENT_ENGAGED': (4, (
        'patient_engaged_health_level', 'patient_engaged_health_time_remaining', 'patient_engaged_id',
        'patient_engaged_position', 'patient_engaged_rotation', 'patient_engaged_salt',
        'patient_engaged_sort', 'patient_engaged_pulse', 'patient_engaged_breath',
        'patient_engaged_hearing', 'patient_engaged_mood', 'patient_engaged_pose',
    )),
    'PATIENT_RECORD': (4, (
        'patient_record_health_level', 'patient_record_health_time_remaining', 'patient_record_id',
        'patient_record_position', 'patient_record_rotation', 'patient_record_salt',
        'patient_record_sort', 'patient_record_pulse', 'patient_record_breath',
        'patient_record_hearing', 'patient_record_mood', 'patient_record_pose',
    )),
    'PULSE_TAKEN': (4, ('pulse_taken_pulse_name', 'pulse_taken_patient_id')),
    'S_A_L_T_WALKED': (4, (
        's_a_l_t_walked_sort_location', 's_a_l_t_walked_sort_command_text', 's_a_l_t_walked_patient_id',
    )),
    'S_A_L_T_WALK_IF_CAN': (4, (
        's_a_l_t_walk_if_can_sort_location', 's_a_l_t_walk_if_can_sort_command_text',
        's_a_l_t_walk_if_can_patient_id',
    )),
    'S_A_L_T_WAVED': (4, (
        's_a_l_t_waved_sort_location', 's_a_l_t_waved_sort_command_text', 's_a_l_t_waved_patient_id',
    )),
    'S_A_L_T_WAVE_IF_CAN': (4, (
        's_a_l_t_wave_if_can_sort_location', 's_a_l_t_wave_if_can_sort_command_text',
        's_a_l_t_wave_if_can_patient_id',
    )),
    'TAG_APPLIED': (4, ('tag_applied_patient_id', 'tag_applied_type')),
    'TAG_DISCARDED': (4, ('tag_discarded_type', 'tag_discarded_location')),
    'TAG_SELECTED': (4, ('tag_selected_type',)),
    'TELEPORT': (4, ('teleport_location',)),
    'TOOL_APPLIED': (5, (
        'tool_applied_type', 'tool_applied_attachment_point', 'tool_applied_tool_location',
        'tool_applied_data', 'tool_applied_sender', 'tool_applied_attach_message',
    )),
    'TOOL_DISCARDED': (4, ('tool_discarded_type', 'tool_discarded_count', 'tool_discarded_location')),
    'TOOL_HOVER': (4, ('tool_hover_type', 'tool_hover_count')),
    'TOOL_SELECTED': (4, ('tool_selected_type', 'tool_selected_count')),
    'VOICE_CAPTURE': (4, ('voice_capture_message', 'voice_capture_command_description')),
    'VOICE_COMMAND': (4, ('voice_command_message', 'voice_command_command_description')),
    'PLAYER_LOCATION': (4, (
        'player_location_location', 'player_location_left_hand_location',
        'player_location_right_hand_location',
    )),
    'PLAYER_GAZE': (6, ('player_gaze_distance_to_patient', 'player_gaze_direction_of_gaze')),
}


def set_mcivr_metrics_types(action_type, frvrs_logs_df, row_index, row_series):
    """
    Copy one log row's numbered payload columns into columns named after its action.

    Parameters:
        action_type: the row's action type, e.g. 'TELEPORT'. Unrecognized values
            leave the frame untouched.
        frvrs_logs_df: logs DataFrame that receives the named columns (in place).
        row_index: index label of the row being filled in.
        row_series: the row's values, looked up by the integer labels 4, 5, ...

    Returns:
        ``frvrs_logs_df``, the same object that was passed in.
    """
    layout = _MCIVR_PAYLOAD_COLUMNS.get(action_type)
    if layout is None:
        return frvrs_logs_df

    if action_type == 'TOOL_APPLIED':

        # Column 4 is only stored as the patient ID when it contains ' Root'
        candidate_patient_id = row_series[4]
        if ' Root' in candidate_patient_id:
            frvrs_logs_df.loc[row_index, 'tool_applied_patient_id'] = candidate_patient_id
    elif action_type == 'PLAYER_GAZE':

        # The patient ID (recognized by ' Root') and the gaze location occupy columns
        # 4 and 5 in either order; a row with neither marker stores no ID or location
        if ' Root' in row_series[4]:
            gaze_pairs = (('player_gaze_patient_id', 4), ('player_gaze_location', 5))
        elif ' Root' in row_series[5]:
            gaze_pairs = (('player_gaze_location', 4), ('player_gaze_patient_id', 5))
        else:
            gaze_pairs = ()
        for column_name, position in gaze_pairs:
            frvrs_logs_df.loc[row_index, column_name] = row_series[position]

    # Copy the consecutive numbered columns into their named counterparts
    first_position, column_names = layout
    for position, column_name in enumerate(column_names, start=first_position):
        frvrs_logs_df.loc[row_index, column_name] = row_series[position]

    return frvrs_logs_df

In [9]:

import csv

# Get all logs into one DataFrame
# Presumably reuses a previously pickled frame when one exists (depends on the nu
# helper's semantics); otherwise the frame is rebuilt from the raw CSVs below.
# NOTE: gaze_rows_list is only defined on the rebuild path.
if nu.pickle_exists('frvrs_logs_df'):
    frvrs_logs_df = nu.load_object('frvrs_logs_df')
else:

    # Add the CSVs to the data frame
    frvrs_logs_df = concatonate_logs()
        
    # Name the global columns
    # (the first four positional columns get names; the rest keep their current labels)
    columns_list = ['action_type', 'elapsed_time', 'event_time', 'session_uuid']
    frvrs_logs_df.columns = columns_list + frvrs_logs_df.columns.tolist()[len(columns_list):]

    # Section off player actions by session start and end
    gaze_rows_list, frvrs_logs_df = set_time_groups(frvrs_logs_df)

    # Remove numerically-named columns
    # (the test drops every column whose label contains a digit anywhere, which
    # covers the positional payload columns 4, 5, ...)
    columns_list = [x for x in frvrs_logs_df.columns if not re.search(r'\d+', str(x))]
    frvrs_logs_df = frvrs_logs_df[columns_list]
    
    # Convert 'TRUE' and 'FALSE' to boolean values
    # (values missing from the mapping, including NaN, become NaN, so these columns
    # are deliberately not cast to a plain bool dtype)
    for cn in [
        'injury_record_injury_treated_with_wrong_treatment', 'injury_record_injury_treated',
        'injury_treated_injury_treated_with_wrong_treatment', 'injury_treated_injury_treated'
    ]:
        frvrs_logs_df[cn] = frvrs_logs_df[cn].map({'TRUE': True, 'FALSE': False, 'True': True, 'False': False})#.astype('bool')
    
    # Modalize into one patient ID column if possible
    frvrs_logs_df = modalize_patient_id(frvrs_logs_df)
    
    # Modalize into one location ID column if possible
    frvrs_logs_df = modalize_location_id(frvrs_logs_df)
    
    # Cache the rebuilt frame (the recorded run reports a pickle and a CSV being written)
    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    nu.save_dataframes(frvrs_logs_df=frvrs_logs_df)
print(frvrs_logs_df.shape) # (168528, 108) in the recorded run

# Show four random rows, transposed, without the columns that are empty for all of them
df = frvrs_logs_df.sample(4).dropna(axis='columns', how='all')
df.T

Pickling to C:\Users\DaveBabbitt\Documents\GitHub\itm-analysis-reporting\saves\pkl\frvrs_logs_df.pkl
Saving to C:\Users\DaveBabbitt\Documents\GitHub\itm-analysis-reporting\saves\csv\frvrs_logs_df.csv
(168528, 108)


Unnamed: 0,124107,16376,129842,117718
action_type,TOOL_APPLIED,TOOL_HOVER,VOICE_COMMAND,TOOL_HOVER
elapsed_time,490446,3786366,386240,180986
event_time,2023-06-23 09:49:00,2023-05-16 17:18:00,2023-04-21 12:14:00,2023-05-31 10:37:00
session_uuid,e0b7bea0-1cf9-42a8-a02c-e8699a962779,724fdf45-3165-43b8-b9ca-dc07102d2886,60d42fe6-f30b-4eab-ba01-644dde714e4d,d605640c-fde5-412f-8d33-b9b2744172ab
file_name,clean-e0b7bea0-1cf9-42a8-a02c-e8699a962779.csv,Clean 724fdf45-3165-43b8-b9ca-dc07102d2886.csv,clean-max1.csv,clean-d605640c-fde5-412f-8d33-b9b2744172ab.csv
logger_version,1.0,1.0,1.0,1.0
time_group,1,10,2,1
voice_command_message,,,wave if you can,
voice_command_command_description,,,alright if you can hear my voice wave your arm...,
tool_hover_type,,Needle,,Gauze


In [10]:

import numpy as np

def modalize_injury_id(frvrs_logs_df):
    """
    Collapse the per-action injury ID columns into a single ``injury_id`` column.

    A row receives an injury ID only when its injury ID columns hold exactly one
    distinct non-null value; rows with no value, or with conflicting values, get NaN.

    Parameters:
        frvrs_logs_df: logs DataFrame, modified in place. Injury ID columns that are
            absent from the frame are ignored.

    Returns:
        The same DataFrame with the ``injury_id`` column (re)written.
    """
    columns_list = [
        'injury_record_id', 'injury_treated_id'
    ]

    # A source column only exists if its action type occurred in the logs, so
    # consult just the ones that are present instead of raising a KeyError
    columns_list = [cn for cn in columns_list if cn in frvrs_logs_df.columns]
    if not columns_list:
        frvrs_logs_df['injury_id'] = np.nan

        return frvrs_logs_df

    # First non-null value of each row, scanning the columns in the order listed
    first_valid_srs = pd.Series(np.nan, index=frvrs_logs_df.index, dtype='object')
    for cn in columns_list:
        first_valid_srs = first_valid_srs.where(first_valid_srs.notnull(), frvrs_logs_df[cn])

    # Keep that value only where it is the row's single distinct injury ID
    mask_series = (frvrs_logs_df[columns_list].nunique(axis='columns') == 1)
    frvrs_logs_df['injury_id'] = first_valid_srs.where(mask_series)

    return frvrs_logs_df

# Modalize into one injury ID column if possible
# NOTE: this runs on every execution of the cell, whether the frame was loaded from
# the cache or rebuilt, and the result is written back to the cache below.
frvrs_logs_df = modalize_injury_id(frvrs_logs_df)

nu.store_objects(frvrs_logs_df=frvrs_logs_df)
nu.save_dataframes(frvrs_logs_df=frvrs_logs_df)
print(frvrs_logs_df.shape) # (168528, 109) in the recorded run

Pickling to C:\Users\DaveBabbitt\Documents\GitHub\itm-analysis-reporting\saves\pkl\frvrs_logs_df.pkl
Saving to C:\Users\DaveBabbitt\Documents\GitHub\itm-analysis-reporting\saves\csv\frvrs_logs_df.csv
(168528, 109)


In [11]:

# Mask voice capture PII
columns_list = ['voice_command_command_description', 'voice_capture_message']

# Run the named-entity pass only when no '[PERSON]' marker is present yet, i.e.
# when an earlier run has not already masked the text
already_masked = any(
    '[PERSON]' in str(x) for cn in columns_list for x in frvrs_logs_df[cn].dropna()
)
if not already_masked:
    import spacy

    # Load the small English pipeline once; the model package must be installed
    nlp = spacy.load('en_core_web_sm')

    # Only rows with text in at least one of the two columns need processing
    mask_series = frvrs_logs_df.voice_command_command_description.isnull() & frvrs_logs_df.voice_capture_message.isnull()
    df = frvrs_logs_df[~mask_series]

    def mask_pii(srs):
        """
        Replace every PERSON entity found in the row's voice columns with '[PERSON]'.

        Parameters:
            srs: one row of the logs frame; non-string cells are left untouched.

        Returns:
            The same row with the masked text written back.
        """
        for idx in columns_list:
            new_text = srs[idx]
            if isinstance(new_text, str):
                for entity in nlp(new_text).ents:
                    if entity.label_ == 'PERSON':

                        # The entity text is data, not a pattern: escape it so that
                        # characters such as '(' or '.' cannot break or widen the match
                        new_text = re.sub(r'\b' + re.escape(entity.text) + r'\b', '[PERSON]', new_text)
                srs[idx] = new_text

        return srs

    # Write the masked text back, leaving cells that had no text as they were
    masked_df = df.apply(mask_pii, axis='columns')[columns_list]
    for column_name in columns_list:
        masked_srs = masked_df[column_name].dropna()
        frvrs_logs_df.loc[masked_srs.index, column_name] = masked_srs

    nu.store_objects(frvrs_logs_df=frvrs_logs_df)
    nu.save_dataframes(frvrs_logs_df=frvrs_logs_df)

Pickling to C:\Users\DaveBabbitt\Documents\GitHub\itm-analysis-reporting\saves\pkl\frvrs_logs_df.pkl
Saving to C:\Users\DaveBabbitt\Documents\GitHub\itm-analysis-reporting\saves\csv\frvrs_logs_df.csv
