In [1]:

# Set up notebook
%pprint
import sys
if ('../py' not in sys.path): sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

# load libraries
from FRVRS import nu, fu
from numpy import nan
from pandas import DataFrame, read_csv, read_excel, concat, get_dummies, isna
from re import split, search, sub
from scipy.stats import f_oneway, ttest_ind, kruskal, norm
import itertools
import os.path as osp
import statsmodels.api as sm


According to the program, KDMA values help define, or rather help influence, a person's behavior in these triage situations.
The scenario data tells you what situation they were put in.
Now the overall question is: how can we describe whether the KDMA values are influencing these behaviors?

In [5]:

# Load the four data frames this notebook works with and report their shapes
frame_names_list = [
    'metrics_evaluation_open_world_csv_stats_df', 'metrics_evaluation_open_world_json_stats_df',
    'metrics_evaluation_open_world_scene_stats_df', 'metrics_evaluation_open_world_anova_df'
]

# Each frame is requested by passing its name as a keyword with an empty-string value
data_frames_dict = nu.load_data_frames(verbose=False, **{frame_name: '' for frame_name in frame_names_list})

# Shapes recorded on the last run: (199476, 124), (51, 2141), (68, 61), (199476, 62)
logs_df, json_stats_df, scene_stats_df, anova_df = (data_frames_dict[frame_name] for frame_name in frame_names_list)
for loaded_df in (logs_df, json_stats_df, scene_stats_df, anova_df):
    print(loaded_df.shape)

(199476, 124)
(51, 2141)
(68, 61)
(199476, 62)


In [8]:

# Could the feedback to stop searching be training the participant to be less maximizing?
# Find the log rows where a 'modbutton yes/no' message was sent by button or by voice,
# then pull the matching anova_df rows for the first such scene.
modbutton_messages_list = ['modbutton yes', 'modbutton no']
mask_series = (
    logs_df.button_command_message.isin(modbutton_messages_list)
    | logs_df.voice_command_message.isin(modbutton_messages_list)
)
groupby_columns = ['participant_id', 'session_uuid', 'scene_id']
for group_key, scene_df in logs_df[mask_series].groupby(groupby_columns):
    participant_id, session_uuid, scene_id = group_key

    # Pair each key column with its group value directly instead of eval()-ing the
    # column name, which only worked while the loop variables shared those names
    scene_mask_series = True
    for cn, group_value in zip(groupby_columns, group_key):
        scene_mask_series &= (anova_df[cn] == group_value)
    df = anova_df[scene_mask_series].dropna(axis='columns', how='all')

    # Optional inspection of the first matching scene:
    # display(df.sample(min(4, df.shape[0])).T)
    # print(df.mean_PropTrust.squeeze(), sorted(scene_df.action_tick.unique()))

    # Only the first scene is examined
    break

In [6]:

# Break up the metadata columns into their own columns
import re

metadata_columns = sorted([cn for cn in anova_df.columns if cn.endswith('_metadata')])

# Names given, in order, to the six pipe-delimited fields of every metadata value
metadata_field_names = [
    'patient_id', 'engagement_start', 'location_tuple', 'patient_sort', 'predicted_priority', 'injury_severity'
]

split_dfs_list = []
for cn in metadata_columns:

    # maxsplit is passed by keyword because the positional form is deprecated in recent Python versions
    str_prefix = re.split('_metadata', cn, maxsplit=0)[0]

    # Split the pipe-delimited values into a DataFrame
    split_df = anova_df[cn].str.split('|', expand=True)

    # Change the column names to reflect the content
    split_df.columns = [f'{str_prefix}_{field_name}' for field_name in metadata_field_names]

    # The predicted priority field is not kept
    split_dfs_list.append(split_df.drop(columns=[f'{str_prefix}_predicted_priority']))

# Drop the original columns and add all the split columns in a single concat,
# rather than re-copying the whole frame once per metadata column
if split_dfs_list:
    anova_df = concat([anova_df.drop(columns=metadata_columns)] + split_dfs_list, axis='columns')


Pipe-delimited fields in each metadata column: patient ID | action tick at first engagement | location (x, y, z) at first engagement | SORT designation | estimated priority group score (not used) | severity of the most severe injury

In [9]:

print("\nThe numeric columns we want to take the mean of:")
injury_severity_columns = [cn for cn in anova_df.columns if cn.endswith('_injury_severity')]

# Numeric columns, minus the metadata, grouping and injury severity columns
excluded_columns_set = set(metadata_columns) | set(groupby_columns) | set(injury_severity_columns)
numeric_columns = sorted(set(nu.get_numeric_columns(anova_df)) - excluded_columns_set)
print(numeric_columns)

print("\nThe other columns we want to one-hot encode:")

# Of the remaining columns, keep only those with fewer than 20 distinct values
candidate_columns_set = set(anova_df.columns) - set(numeric_columns) - set(groupby_columns)
other_columns = sorted(
    column_name for column_name in candidate_columns_set if anova_df[column_name].nunique() < 20
)
print(other_columns)


The numeric columns we want to take the mean of:
['mean_AD_KDMA_Sim', 'mean_AD_KDMA_Text', 'mean_PropTrust', 'mean_ST_KDMA_Sim', 'mean_ST_KDMA_Text', 'mean_YrsMilExp', 'mean_actual_engagement_distance', 'mean_first_engagement', 'mean_first_treatment', 'mean_injury_correctly_treated_count', 'mean_injury_not_treated_count', 'mean_injury_treatments_count', 'mean_injury_wrongly_treated_count', 'mean_last_engagement', 'mean_last_still_engagement', 'mean_measure_of_right_ordering', 'mean_patient_count', 'mean_percent_hemorrhage_controlled', 'mean_pulse_taken_count', 'mean_stills_value', 'mean_teleport_count', 'mean_time_to_hemorrhage_control_per_patient', 'mean_time_to_last_hemorrhage_controlled', 'mean_total_actions_count', 'mean_triage_time', 'mean_voice_capture_count', 'mean_walk_command_count', 'mean_walk_value', 'mean_walkers_value', 'mean_wave_command_count', 'mean_wave_value']

The other columns we want to one-hot encode:
['encounter_layout', 'engaged_patient00_injury_severity', 'eng

In [10]:

# One-hot encode the low-cardinality columns one at a time, replacing anova_df after each pass
for column_name in other_columns:
    anova_df = nu.one_hot_encode(anova_df, [column_name], dummy_na=False)

In [11]:

print("\nThe numeric columns we want to use in modeling:")
injury_severity_columns = [cn for cn in anova_df.columns if cn.endswith('_injury_severity')]

# Recompute the numeric columns now that anova_df has been one-hot encoded
excluded_columns_set = set(metadata_columns) | set(groupby_columns) | set(injury_severity_columns)
numeric_columns = sorted(set(nu.get_numeric_columns(anova_df)) - excluded_columns_set)
print(numeric_columns)

print("\nThe other columns we want to one-hot encode:")

# Whatever is left over with fewer than 20 distinct values
candidate_columns_set = set(anova_df.columns) - set(numeric_columns) - set(groupby_columns)
other_columns = sorted(
    column_name for column_name in candidate_columns_set if anova_df[column_name].nunique() < 20
)
print(other_columns)


The numeric columns we want to use in modeling:
['encounter_layout_Desert', 'encounter_layout_Jungle', 'encounter_layout_Submarine', 'encounter_layout_Urban', 'engaged_patient00_injury_severity_high', 'engaged_patient00_injury_severity_medium', 'engaged_patient00_injury_severity_nan', 'engaged_patient00_location_tuple_(-19.9, -10.0)', 'engaged_patient00_location_tuple_(-3.8, -0.5)', 'engaged_patient00_location_tuple_(-3.9, -0.5)', 'engaged_patient00_location_tuple_(-3.9, -0.6)', 'engaged_patient00_location_tuple_(0.0, 0.0)', 'engaged_patient00_location_tuple_(2.2, 1.9)', 'engaged_patient00_location_tuple_(2.2, 2.2)', 'engaged_patient00_location_tuple_(2.3, 1.9)', 'engaged_patient00_location_tuple_(2.7, 2.9)', 'engaged_patient00_location_tuple_(3.4, 2.4)', 'engaged_patient00_location_tuple_(4.2, -0.4)', 'engaged_patient00_location_tuple_(4.2, -0.5)', 'engaged_patient00_location_tuple_(4.6, 3.8)', 'engaged_patient00_location_tuple_(4.6, 3.9)', 'engaged_patient00_location_tuple_(4.7, 3.8

In [12]:

print("Get column and value descriptions")
file_path = osp.join(nu.data_folder, 'xlsx', 'Metrics_Evaluation_Dataset_organization_for_BBAI.xlsx')
dataset_organization_df = read_excel(file_path)

print("Fix the doubled up descriptions")
# A Labels cell containing semicolons holds several labels. Keep the first label on
# the original row and give every further label its own copy of that row.
mask_series = dataset_organization_df.Labels.map(lambda x: ';' in str(x))
extra_rows_list = []
for row_index, label in dataset_organization_df[mask_series].Labels.items():
    first_label, *other_labels_list = split(' *; *', str(label))
    dataset_organization_df.loc[row_index, 'Labels'] = first_label
    for other_label in other_labels_list:

        # A trailing semicolon yields an empty piece; it is not a label
        if not other_label:
            continue

        # Copy the row as a one-row DataFrame (not a Series): concat treats a Series
        # as a single column, so it would not be appended as a row
        extra_rows_list.append(dataset_organization_df.loc[[row_index]].assign(Labels=other_label))

# Append all the new rows to the DataFrame at once
if extra_rows_list:
    dataset_organization_df = concat([dataset_organization_df] + extra_rows_list, ignore_index=True)

# Get a copy of the AD_Del_Omni row(s)
mask_series = (dataset_organization_df.Variable == 'AD_Del_Omni')
new_row = dataset_organization_df.loc[mask_series].copy()

# Modify the desired column value
new_row['Variable'] = 'AD_Del_Omni_Text'

# Append the new row to the DataFrame
dataset_organization_df = concat([dataset_organization_df, new_row], ignore_index=True)

print("Get the column value descriptions")
mask_series = ~dataset_organization_df.Description.isnull()
df = dataset_organization_df[mask_series]
value_description_dict = df.set_index('Variable').Description.to_dict()

# Let every variable's description also serve its _Text variant, unless that
# variant already has a description of its own
new_description_dict = value_description_dict.copy()
for k, v in value_description_dict.items():
    if (not k.endswith('_Text')):
        new_description_dict.setdefault(f'{k}_Text', v)
value_description_dict = new_description_dict

Get column and value descriptions
Fix the doubled up descriptions
Get the column value descriptions


In [14]:

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Fit a decision tree that predicts each target column from the non-KDMA numeric
# columns, then report its test-set MSE and the most important features.
# NOTE(review): the recorded MSE is ~1e-25 and the top features are one-hot patient
# and location columns. anova_df appears to repeat the same values over many rows
# (199476 rows), so the random row split probably leaks between train and test -
# confirm before interpreting the importances.

# Split your data into features (X) and target variable (y)
kdma_columns = [cn for cn in anova_df.columns if 'KDMA' in cn]
df = anova_df[numeric_columns].dropna(axis='columns', how='all').dropna(axis='index', how='any')
for cn in ['mean_PropTrust']:
    print()

    # Fall back to a placeholder rather than raising KeyError for an undescribed column
    column_description = value_description_dict.get(cn.replace('mean_', ''), 'no description available')
    print(f'{cn} ({column_description})')

    # Features are every remaining column except the target and the KDMA columns
    columns_list = sorted(set(df.columns).difference(set([cn] + kdma_columns)))
    X = df[columns_list]
    y = df[cn]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create the decision tree model; seeded so that re-running gives the same tree
    model = DecisionTreeRegressor(random_state=42)

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)

    # Print the MSE value
    print("Mean Squared Error:", mse)

    # Get feature importances (same order as columns_list)
    feature_importances = model.feature_importances_

    # Show the five most important features
    importances_df = DataFrame({'feature': columns_list, 'importance': feature_importances})
    display(importances_df.sort_values('importance', ascending=False).head())


mean_PropTrust (Average rating on 3-item propensity to trust measure; higher is higher propensity to trust)
Mean Squared Error: 1.3334432167114253e-25


Unnamed: 0,feature,importance
174,engaged_patient07_patient_id_Navy Soldier 3 Ma...,0.377253
217,"engaged_patient09_location_tuple_(0.6, -2.7)",0.197766
208,engaged_patient09_injury_severity_nan,0.166195
307,mean_first_treatment,0.059938
328,mean_wave_command_count,0.044439


In [15]:

# Is there a correlation between propensity to trust and command counts (walk or wave)?
# First, list the columns whose names mention a command.
list(filter(lambda column_name: 'command' in column_name, anova_df.columns))

['mean_walk_command_count', 'mean_wave_command_count']

In [47]:

print("Create the value description function")

# Rows whose label text contains an equals sign
labels_as_strings_series = dataset_organization_df.Labels.map(str)
numeric_categories_mask_series = labels_as_strings_series.str.contains('=', regex=False)

# The distinct variables that have at least one such label
value_descriptions_columns = dataset_organization_df.loc[numeric_categories_mask_series, 'Variable'].unique().tolist()
def get_value_description(column_name, column_value, organization_df=None):
    """
    Look up the description attached to a coded value of a column.

    The labels of the rows whose Variable equals column_name are searched for one
    of the form '<code> = <description>' whose code equals the integer form of
    column_value.

    Parameters:
        column_name: The Variable name to look up.
        column_value: The coded value; anything float() accepts (e.g. 1, 1.0, '1').
        organization_df: Optional DataFrame with Variable and Labels columns.
            Defaults to the notebook-level dataset_organization_df.

    Returns:
        str: The description text after the first '=', or '' when the value is
        missing, is not numeric, or has no matching label.
    """
    if organization_df is None:
        organization_df = dataset_organization_df
    if isna(column_value):
        return ''

    # Labels are keyed by integer codes; a value that cannot be read as one has no description
    try:
        value_code = str(int(float(column_value)))
    except (TypeError, ValueError, OverflowError):
        return ''

    mask_series = (organization_df.Variable == column_name) & ~organization_df.Labels.isnull()
    for label in organization_df[mask_series].Labels:

        # Split on the first '=' only, so a description containing '=' stays whole
        label_parts = split(' *= *', str(label), maxsplit=1)

        # Return the first match; labels without an '=' carry no description
        if (len(label_parts) == 2) and (label_parts[0] == value_code):
            return label_parts[1]

    return ''

Create the value description function


In [46]:

# For every KDMA variable that has labels, report whether any label contains an equals sign
mask_series = dataset_organization_df.Variable.map(lambda x: 'KDMA' in str(x))
columns_list = sorted(dataset_organization_df[mask_series].Variable.unique())
for cn in columns_list:
    mask_series = (dataset_organization_df.Variable == cn) & ~dataset_organization_df.Labels.isnull()
    if mask_series.any():
        df = dataset_organization_df[mask_series]

        # str() keeps a non-string label (e.g. a bare number) from raising TypeError
        mask_series = df.Labels.map(lambda label: '=' in str(label))
        print(cn, mask_series.any())

AD_KDMA_Sim False
AD_KDMA_Text False
ST_KDMA_Sim False
ST_KDMA_Text False


In [27]:

# For each numeric column, fit a decision tree that predicts it from the KDMA columns
# and record the column's variance next to the model's test-set MSE.
rows_list = []
df = anova_df[numeric_columns].dropna(axis='columns', how='all').dropna(axis='index', how='any')

# Iterate over the columns that survived the dropna; a column dropped for being
# all-NaN is still in numeric_columns and would raise KeyError below
for cn in df.columns:
    row_dict = {'column_name': cn, 'variance': df[cn].var()}

    # Never use the target as one of its own features
    X = df[[kdma_cn for kdma_cn in kdma_columns if kdma_cn != cn]]
    y = df[cn]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create the decision tree model; seeded so that re-running gives the same tree
    model = DecisionTreeRegressor(random_state=42)

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate MSE
    row_dict['mse'] = mean_squared_error(y_test, y_pred)

    rows_list.append(row_dict)
kdma_mse_df = DataFrame(rows_list)


According to the program, KDMA values help define, or rather help influence, a person's behavior in these triage situations.
The scenario data tells you what situation they were put in.
Now the overall question is: how can we describe whether the KDMA values are influencing these behaviors?

In [52]:

# The lower the MSE, the closer the model's predictions are to the actual values (a better fit);
# the higher the MSE, the further off they are (a poorer fit).
# List the columns that vary at all and yet were predicted with an MSE of exactly zero.
zero_mse_mask_series = kdma_mse_df.mse.eq(0.0)
has_variance_mask_series = kdma_mse_df.variance.gt(0.0)
mask_series = zero_mse_mask_series & has_variance_mask_series
print(nu.conjunctify_nouns(kdma_mse_df.loc[mask_series, 'column_name'].tolist()))
# kdma_mse_df.sort_values('mse').head(20)

engaged_patient10_patient_sort_still, mean_YrsMilExp, medical_role_EM faculty, medical_role_EM resident, medical_role_Medical student, medical_role_Other, and medical_role_Paramedic


In [53]:

# How many rows have each value of mean_stills_value
anova_df['mean_stills_value'].value_counts()

0.0    146681
1.0     52795
Name: mean_stills_value, dtype: int64

In [3]:

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Box plot of mean_AD_KDMA_Sim for each value of mean_stills_value.
# anova_df stores both measures under their mean_-prefixed names, and it must
# already be loaded: running this cell on a fresh kernel raises NameError.
fig, ax = plt.subplots()
sns.boxplot(
    x='mean_stills_value',
    y='mean_AD_KDMA_Sim',
    showmeans=True,
    data=anova_df,
    ax=ax
)
ax.set_title('mean_AD_KDMA_Sim by mean_stills_value')

# Rotate x labels to prevent overlapping
plt.xticks(rotation=45)
plt.show()

NameError: name 'anova_df' is not defined