In [1]:

%pprint
import sys
if (osp.join('..', 'py') not in sys.path): sys.path.insert(1, osp.join('..', 'py'))

Pretty printing has been turned OFF


In [2]:

from FRVRS import (DataFrame, nu, re, concat, np)
import os
import pandas as pd

In [3]:

# Get all CSVs into one data frame
if nu.pickle_exists('frvrs_logs_df'):
    frvrs_logs_df = nu.load_object('frvrs_logs_df')
    print(frvrs_logs_df.shape)
    # df = frvrs_logs_df.sample(4).dropna(axis='columns', how='all')
    # display(df.T)

(832366, 110)



# Triage Accuracy

(How many patients did they get right and do they do it in the right order, for example: still or obvious life threat first (Red, Gray, Black), then wavers, then walkers last.)


### Build a model to predict *tag_applied_type*

In [9]:

# Display a sample of the data frame, dropping columns with all NaN values and transposing it
# display(frvrs_logs_df[mask_series].sample(min(4, frvrs_logs_df[mask_series].shape[0])).dropna(axis='columns', how='all').T)

# Get the supervised learning and group by columns
input_features = [
    'patient_record_salt', 'patient_record_sort', 'patient_record_pulse', 'patient_record_breath', 'patient_record_hearing',
    'patient_record_mood',
    'patient_record_pose', 'injury_record_id', 'injury_record_required_procedure', 'injury_record_severity', 'injury_record_body_region',
    'injury_record_injury_treated', 'injury_record_injury_treated_with_wrong_treatment', 'pulse_taken_pulse_name', 'patient_engaged_salt',
    'patient_engaged_sort', 'patient_engaged_pulse', 'patient_engaged_breath', 'patient_engaged_hearing', 'patient_engaged_mood',
    'patient_engaged_pose',
    'injury_treated_id', 'injury_treated_required_procedure', 'injury_treated_severity', 'injury_treated_body_region',
    'injury_treated_injury_treated',
    'injury_treated_injury_treated_with_wrong_treatment', 'tool_applied_type', 'tool_applied_attachment_point', 'tool_applied_tool_location',
    'tool_applied_data', 'tool_applied_sender'
    ]
target_variable = 'tag_applied_type'


def one_hot_encode(df, columns):
    '''
    One-hot encodes the given columns in the given DataFrame.
    
    Args:
        df: A DataFrame.
        columns: A list of column names to encode.
    
    Returns:
        A DataFrame with the encoded columns.
    '''
    
    dummies = pd.get_dummies(df[columns], dummy_na=True)
    df = concat([df, dummies], axis='columns').drop(columns, axis='columns')
    
    return df


#### Create a data frame with all the feature columns

In [10]:

# Initialize (if necessary) an empty data frame to store patient history data and iterate over the patient groups
if nu.pickle_exists('patient_history_df'): patient_history_df = nu.load_object('patient_history_df')
else:
    patient_history_df = DataFrame([], columns=renamed_columns_list)
    
    # Keep track of the target variable and input features columns and the input features columns prepended with each of the action types
    columns_list = [target_variable] + input_features
    renamed_columns_list = [target_variable] + [
        action_type.lower() + '_' + cn for cn in input_features for action_type in frvrs_logs_df.action_type.unique()
    ]
    
    # Group each patient, run, and session with non-null tag applied types
    tag_mask_series = ~frvrs_logs_df.tag_applied_type.isnull()
    gb = frvrs_logs_df[tag_mask_series].sort_values(['action_tick']).groupby(fu.patient_groupby_columns)
    
    from tqdm import tqdm
    for (session_uuid, scene_id, patient_id), patient_df in tqdm(gb, total=gb.size().shape[0]):
        
        # Create a mask for the specific patient
        patient_mask_series = True
        for cn in fu.patient_groupby_columns: patient_mask_series &= (frvrs_logs_df[cn] == eval(cn))
    
        # Create a mask for the patient's entire history, excluding the rows with applied tag types
        mask_series = ~tag_mask_series & patient_mask_series
    
        # If the patient's history is empty, create a new data frame with the patient's data
        if not mask_series.any():
    
            # Expand the data frame to the patient's entire history
            patient_df = frvrs_logs_df[patient_mask_series].sort_values('action_tick')
            
            # Initialize an empty data frame for each action type and loop over the action type groups
            action_type_df = DataFrame([], columns=renamed_columns_list)
            gb1 = patient_df.groupby('action_type')
            for action_type, df in gb1:
                
                # Rename the columns of the data frame to match the desired format and fill
                # in any missing values using the forward fill and backward fill methods
                df = df[columns_list].rename(
                    columns={cn: action_type.lower() + '_' + cn for cn in input_features}
                ).fillna(method='ffill').fillna(method='bfill')
    
                # Concatenate the data frame to the action_type_df data frame
                action_type_df = concat([action_type_df, df], axis='index')
            
            # Rename the columns of the action type data frame to match the desired format, and
            # fill in any missing values using the forward fill and backward fill methods
            df = action_type_df[renamed_columns_list].fillna(method='ffill').fillna(method='bfill')
    
            # Concatenate the data frame to the patient history data frame
            patient_history_df = concat([patient_history_df, df], axis='index')
    
    # Store the patient history data frame
    nu.store_objects(patient_history_df=patient_history_df, verbose=False)

In [11]:

print(patient_history_df.shape)
# sorted(patient_history_df.columns)

(129867, 833)


In [12]:

df = patient_history_df.sample(min(4, patient_history_df.shape[0])).dropna(axis='columns', how='all').T
display(df.sample(min(20, df.shape[0])).sort_index())

Unnamed: 0,82414,317618,620826,795079
injury_record_injury_record_body_region,chest,leftArm,rightLeg,abdomen
injury_record_injury_record_injury_treated,False,False,False,False
injury_record_injury_record_injury_treated_with_wrong_treatment,False,False,False,False
injury_record_injury_record_required_procedure,woundpack,gauzePressure,tourniquet,woundpack
injury_record_injury_record_severity,medium,low,high,medium
injury_treated_injury_treated_injury_treated,True,,True,True
injury_treated_injury_treated_severity,medium,,high,medium
patient_engaged_patient_engaged_breath,normal,normal,fast,normal
patient_engaged_patient_engaged_hearing,normal,normal,normal,normal
patient_engaged_patient_engaged_pulse,fast,normal,fast,fast



#### One-hot encode it

In [13]:

# One-hot encode the input features columns in the one-hot encode data frame
if nu.pickle_exists('one_hot_encode_df'): one_hot_encode_df = nu.load_object('one_hot_encode_df')
else:
    ascii_regex = re.compile('[^a-z0-9]+')
    one_hot_encode_df = one_hot_encode(patient_history_df, [
        action_type.lower() + '_' + cn for cn in input_features for action_type in frvrs_logs_df.action_type.unique()
    ])
    one_hot_encode_df = one_hot_encode_df.rename(
        columns={cn: ascii_regex.sub('_', cn.lower()).strip('_') for cn in one_hot_encode_df.columns}
    )
    columns_list = [cn for cn in one_hot_encode_df.columns if any(map(lambda x: cn.endswith(x), ['_null', '_nan']))]
    
    # Store the patient history data frame
    nu.store_objects(one_hot_encode_df=one_hot_encode_df, verbose=False)
    
# print(columns_list)
df = one_hot_encode_df.sample(min(4, one_hot_encode_df.shape[0])).dropna(axis='columns', how='all').T
display(df.sample(min(20, df.shape[0])).sort_index())

Unnamed: 0,257508,332065,775681,756861
injury_record_injury_record_id_l_stomach_puncture,0,0,0,0
injury_record_tool_applied_type_nan,1,1,1,1
injury_treated_injury_treated_body_region_abdomen,0,1,0,0
injury_treated_injury_treated_id_r_chest_collapse,0,0,0,0
injury_treated_injury_treated_injury_treated_with_wrong_treatment_true,1,1,0,1
patient_demoted_patient_record_mood_nan,1,1,1,1
player_gaze_tool_applied_type_nan,1,1,1,1
s_a_l_t_walk_if_can_injury_record_id_nan,1,1,1,1
s_a_l_t_walk_if_can_injury_record_severity_nan,1,1,1,1
s_a_l_t_walk_if_can_patient_engaged_pose_nan,1,1,1,1


In [14]:

one_hot_encode_df.shape, one_hot_encode_df.dropna(axis='index', how='any').shape

((129867, 1025), (129867, 1025))

In [15]:

# Analyze the input features
if nu.pickle_exists('one_hot_column_descriptions_df'): one_hot_column_descriptions_df = nu.load_object('one_hot_column_descriptions_df')
else:
    one_hot_column_descriptions_df = nu.get_column_descriptions(one_hot_encode_df.sample(min(20, one_hot_encode_df.shape[0])))
    # mask_series = one_hot_column_descriptions_df.min_salt.isnull()
    
    # Store the column description data frame
    nu.store_objects(one_hot_column_descriptions_df=one_hot_column_descriptions_df, verbose=False)
    
display(one_hot_column_descriptions_df.sample(min(20, one_hot_column_descriptions_df.shape[0])).sort_index())

Unnamed: 0,column_name,dtype,count_blanks,count_uniques,count_zeroes,has_dates,min_value,max_value,only_integers
41,s_a_l_t_walked_patient_record_sort_nan,uint8,0,1,0,True,1,1,True
142,tag_selected_patient_record_hearing_nan,uint8,0,1,0,True,1,1,True
197,bag_access_patient_record_pose_nan,uint8,0,1,0,True,1,1,True
243,injury_record_injury_record_id_r_thigh_laceration,uint8,0,2,19,True,0,1,True
266,patient_demoted_injury_record_required_procedu...,uint8,0,1,0,True,1,1,True
311,teleport_injury_record_severity_nan,uint8,0,1,0,True,1,1,True
356,tool_discarded_injury_record_body_region_nan,uint8,0,1,0,True,1,1,True
363,voice_command_injury_record_injury_treated_nan,uint8,0,1,0,True,1,1,True
380,tag_selected_injury_record_injury_treated_nan,uint8,0,1,0,True,1,1,True
426,bag_access_pulse_taken_pulse_name_nan,uint8,0,1,0,True,1,1,True



#### Train a classifier on it

In [16]:

# Train a classifier on the patient history data frame
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
df = one_hot_encode_df.dropna(axis='index', how='any')
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(target_variable, axis='columns'),
    df.tag_applied_type,
    test_size=0.25,
    random_state=42
)

# Convert the uint8 features to floats
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [17]:

# Create a random forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = rf_classifier.predict(X_test)
rf_accuracy = np.mean(y_pred == y_test)

# Calculate the feature importances
feature_importances = rf_classifier.feature_importances_

# Create a data frame to store the feature names and feature importances
if nu.pickle_exists('feature_importances_df'): feature_importances_df = nu.load_object('feature_importances_df')
else:
    feature_importances_df = DataFrame()
    feature_importances_df['feature_name'] = df.drop(target_variable, axis='columns').columns
    feature_importances_df['feature_importance'] = feature_importances
    
    # Store the feature importances data frame
    nu.store_objects(feature_importances_df=feature_importances_df, verbose=True)

In [18]:

# Train a logistic regression classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = lr_classifier.predict(X_test)
lr_accuracy = np.mean(y_pred == y_test)

# Get the coefficients of the logistic regression model
feature_coefficients = lr_classifier.coef_[0]

# Create a data frame to store the feature names and feature coefficients
if nu.pickle_exists('feature_coefficients_df'): feature_coefficients_df = nu.load_object('feature_coefficients_df')
else:
    feature_coefficients_df = DataFrame()
    feature_coefficients_df['feature_name'] = df.drop(target_variable, axis='columns').columns
    feature_coefficients_df['feature_coefficient'] = feature_coefficients
    feature_coefficients_df['absolute_coefficient'] = feature_coefficients_df.feature_coefficient.map(lambda x: abs(x))
    
    # Store the feature coefficients data frame
    nu.store_objects(feature_coefficients_df=feature_coefficients_df, verbose=True)

In [19]:

# Train a histogram-based gradient boosting classifier
if nu.pickle_exists('hgb_classifier'): hgb_classifier = nu.load_object('hgb_classifier')
else:
    hgb_classifier = HistGradientBoostingClassifier().fit(X_train, y_train)
    
    # Store the hgb classifier
    nu.store_objects(hgb_classifier=hgb_classifier, verbose=True)

# Evaluate the classifier on the test set
y_pred = hgb_classifier.predict(X_test)
hgb_accuracy = np.mean(y_pred == y_test)

# Calculate the feature importances using the Permutation Importance algorithm
if nu.pickle_exists('hgb_permutation_importances'): hgb_permutation_importances = nu.load_object('hgb_permutation_importances')
else:
    from sklearn.inspection import permutation_importance
    hgb_permutation_importances = permutation_importance(hgb_classifier, X_test, y_test)
    
    # Store the hgb permutation importances data frame
    nu.store_objects(hgb_permutation_importances=hgb_permutation_importances, verbose=True)

# Create a data frame to store the feature names and feature coefficients
if nu.pickle_exists('hgb_permutation_importances_df'): hgb_permutation_importances_df = nu.load_object('hgb_permutation_importances_df')
else:
    hgb_permutation_importances_df = DataFrame()
    hgb_permutation_importances_df['feature_name'] = df.drop(target_variable, axis='columns').columns
    for fn in dir(hgb_permutation_importances):
        if (fn == 'importances'):
            for action_type, importances in zip(frvrs_logs_df.action_type.unique(), hgb_permutation_importances.importances):
                hgb_permutation_importances_df[f'{action_type.lower()}_importance'] = feature_importances
        else: hgb_permutation_importances_df[fn] = eval(f'hgb_permutation_importances.{fn}')
    
    # Store the hgb permutation importances
    nu.store_objects(hgb_permutation_importances_df=hgb_permutation_importances_df, verbose=True)

In [20]:

[fn for fn in dir(hgb_permutation_importances)]

['importances', 'importances_mean', 'importances_std']

In [21]:

display(hgb_classifier.classes_)

array(['black', 'gray', 'green', 'red', 'yellow'], dtype=object)

In [22]:

display(hgb_classifier.feature_names_in_)

array(['patient_demoted_patient_record_salt_nan',
       'voice_capture_patient_record_salt_nan',
       'voice_command_patient_record_salt_nan', ...,
       'session_end_tool_applied_sender_nan',
       'player_location_tool_applied_sender_nan',
       'player_gaze_tool_applied_sender_nan'], dtype=object)

In [23]:

display(hgb_classifier.n_features_in_)

1024


#### Evaluate the accuracies and importances

In [24]:

# Print the accuracies
print('RF Accuracy:', rf_accuracy)
print('LR Accuracy:', lr_accuracy)
print('HGB Accuracy:', hgb_accuracy)

# Display the feature importances data frame
display(feature_importances_df.sort_values('feature_importance', ascending=False).head(10))

# Display the feature coefficients data frame
columns_list = ['feature_name', 'feature_coefficient']
display(feature_coefficients_df.sort_values('absolute_coefficient', ascending=False)[columns_list].head(10))

# Display the permutation importances data frame
columns_list = ['feature_name', 'importances_mean']
df = hgb_permutation_importances_df.drop(columns_list, axis='columns')
max_importance = df.max().max()
columns_list += df.columns[df.eq(max_importance).any()].tolist()[:7-len(columns_list)]
display(hgb_permutation_importances_df.sort_values('importances_mean', ascending=False)[columns_list].head(10))

RF Accuracy: 0.8954322850894755
LR Accuracy: 0.8710690855330028
HGB Accuracy: 0.8946006714510117


Unnamed: 0,feature_name,feature_importance
67,patient_record_patient_record_pulse_normal,0.031695
335,injury_record_injury_record_body_region_leftarm,0.030701
65,patient_record_patient_record_pulse_fast,0.029878
463,patient_engaged_patient_engaged_salt_immediate,0.028024
645,patient_engaged_patient_engaged_pose_standing,0.0278
523,patient_engaged_patient_engaged_pulse_normal,0.02597
304,injury_record_injury_record_severity_low,0.021834
521,patient_engaged_patient_engaged_pulse_fast,0.021723
305,injury_record_injury_record_severity_medium,0.021155
981,tool_applied_tool_applied_data_null,0.019367


Unnamed: 0,feature_name,feature_coefficient
429,pulse_taken_pulse_taken_pulse_name_pulse_none,0.347312
96,patient_record_patient_record_breath_none,0.291966
4,patient_record_patient_record_salt_dead,0.291966
66,patient_record_patient_record_pulse_none,0.287684
156,patient_record_patient_record_mood_dead,0.277065
552,patient_engaged_patient_engaged_breath_none,0.264113
460,patient_engaged_patient_engaged_salt_dead,0.264113
522,patient_engaged_patient_engaged_pulse_none,0.259831
612,patient_engaged_patient_engaged_mood_dead,0.249212
126,patient_record_patient_record_hearing_none,0.248663


Unnamed: 0,feature_name,importances_mean,patient_demoted_importance,voice_capture_importance,voice_command_importance,session_start_importance,patient_record_importance
950,tool_applied_tool_applied_tool_location_null,0.07272,0.018928,0.018928,0.018928,0.018928,0.018928
335,injury_record_injury_record_body_region_leftarm,0.064151,0.01045,0.01045,0.01045,0.01045,0.01045
645,patient_engaged_patient_engaged_pose_standing,0.050864,0.024678,0.024678,0.024678,0.024678,0.024678
7,patient_record_patient_record_salt_immediate,0.035168,0.023434,0.023434,0.023434,0.023434,0.023434
67,patient_record_patient_record_pulse_normal,0.030936,0.02446,0.02446,0.02446,0.02446,0.02446
690,injury_treated_injury_treated_id_r_chest_collapse,0.025706,0.002159,0.002159,0.002159,0.002159,0.002159
428,pulse_taken_pulse_taken_pulse_name_pulse_fast,0.020981,0.008683,0.008683,0.008683,0.008683,0.008683
881,tool_applied_tool_applied_type_nan,0.019626,0.016165,0.016165,0.016165,0.016165,0.016165
691,injury_treated_injury_treated_id_r_forearm_lac...,0.018493,0.002626,0.002626,0.002626,0.002626,0.002626
225,injury_record_injury_record_id_l_forearm_lacer...,0.017932,0.021697,0.021697,0.021697,0.021697,0.021697



#### Perform a sample inference

In [25]:

# Display the one-hot encoded sample
input_encode_df = one_hot_encode_df.sample(1)
input_encode_idx = input_encode_df.index.iloc[0]
print(input_encode_idx)
print(input_encode_df.shape)
mask_series = (input_encode_df.T[input_encode_idx] == 0)
df = input_encode_df.T[~mask_series]
enc_idx_list = df.index.tolist()
display(df)

711351
(1, 1025)


Unnamed: 0,711351
tag_applied_type,green
patient_demoted_patient_record_salt_nan,1
voice_capture_patient_record_salt_nan,1
voice_command_patient_record_salt_nan,1
session_start_patient_record_salt_nan,1
...,...
tool_discarded_tool_applied_sender_nan,1
tag_discarded_tool_applied_sender_nan,1
session_end_tool_applied_sender_nan,1
player_location_tool_applied_sender_nan,1


In [26]:

# Display the FRVRS Logs sample
mask_series = (frvrs_logs_df.index == input_encode_idx)
df = frvrs_logs_df[mask_series]
print(df.shape)
cn_set = set()
for cn in frvrs_logs_df.columns:
    for enc_idx in enc_idx_list:
        if enc_idx.startswith(cn): cn_set.add(cn)
columns_list = [target_variable] + list(cn_set)
display(df[columns_list].T)

(1, 110)


Unnamed: 0,711351
tag_applied_type,
patient_engaged_pulse,
injury_treated_injury_treated,
injury_record_injury_treated,
patient_demoted_pulse,
patient_record_pulse,
tag_applied_type,


In [27]:

# Display the patient history sample
mask_series = (patient_history_df.index == input_encode_idx)
df = patient_history_df[mask_series]
print(df.shape)
cn_set = set()
for cn in patient_history_df.columns:
    for enc_idx in enc_idx_list:
        if enc_idx.startswith(cn): cn_set.add(cn)
columns_list = [target_variable] + list(cn_set - set([target_variable]))
display(df[columns_list].T)

(1, 833)


Unnamed: 0,711351
tag_applied_type,green
pulse_taken_injury_treated_id,
session_end_patient_record_sort,
session_end_patient_engaged_hearing,
player_gaze_patient_engaged_salt,
...,...
tool_applied_patient_engaged_pulse,
tool_hover_patient_record_sort,
tag_applied_injury_record_injury_treated_with_wrong_treatment,
s_a_l_t_waved_injury_record_id,


In [28]:

# Convert the input features to a NumPy array
input_features_array = np.array(input_encode_df.drop(target_variable, axis='columns').values)
actual_tag_applied_type = input_encode_df.tag_applied_type.squeeze()

In [29]:

# Predict the applied tag type
predicted_tag_applied_type = sorted(
    [(c, p) for c, p in zip(lr_classifier.classes_, lr_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1], reverse=True
)[0][0]
print(f'LR: predicted: {predicted_tag_applied_type}, actual: {actual_tag_applied_type}')

LR: predicted: green, actual: green


In [30]:

# Predict the applied tag type
predicted_tag_applied_type = sorted(
    [(c, p) for c, p in zip(rf_classifier.classes_, rf_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1], reverse=True
)[0][0]
print(f'RF: predicted: {predicted_tag_applied_type}, actual: {actual_tag_applied_type}')

RF: predicted: green, actual: green


In [31]:

# Predict the applied tag type
predicted_tag_applied_type = sorted(
    [(c, p) for c, p in zip(hgb_classifier.classes_, hgb_classifier.predict_proba(input_features_array)[0])], key=lambda x: x[1], reverse=True
)[0][0]
print(f'HGB: predicted: {predicted_tag_applied_type}, actual: {actual_tag_applied_type}')

HGB: predicted: green, actual: green
