In [1]:

# Set up the notebook
%pprint
import sys
if (osp.join('..', 'py') not in sys.path): sys.path.insert(1, osp.join('..', 'py'))

Pretty printing has been turned OFF


In [2]:

from FRVRS import (
    fu, nu, warnings, osp, read_excel, re, concat, isna, nan
)
import json

# Suppress every warning category for the rest of the session to keep the cell outputs readable;
# remove this line while debugging, since it also hides deprecation and dtype warnings
warnings.filterwarnings('ignore')


# ANOVA Stats Created for Metrics Evaluation Open World

In [3]:

# Load the three stats data frames to get a reliable representation
frame_names_list = [
    'metrics_evaluation_open_world_csv_stats_df', 'metrics_evaluation_open_world_json_stats_df',
    'metrics_evaluation_open_world_scene_stats_df'
]

# Each name is passed as a keyword with an empty-string value, exactly as a literal keyword call would
data_frames_dict = nu.load_data_frames(**{frame_name: '' for frame_name in frame_names_list})

# Unpack the returned dictionary in the same order as the names above
csv_stats_df, json_stats_df, scene_stats_df = (
    data_frames_dict[frame_name] for frame_name in frame_names_list
)

No pickle exists for metrics_evaluation_open_world_csv_stats_df - attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_csv_stats_df.csv.
Attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_json_stats_df.pkl.
Attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_scene_stats_df.pkl.


In [4]:

# Merge in the JSON scenario dataset

# Join on every column the CSV stats and JSON stats data frames have in common
on_columns = sorted(set(csv_stats_df.columns).intersection(set(json_stats_df.columns)))
print(on_columns)

# Bring over the scenario description columns plus the participant-level survey/KDMA columns
senario_columns = sorted(
    [cn for cn in json_stats_df.columns if 'scenarioData' in cn] + 
    ['AD_KDMA_Sim', 'AD_KDMA_Text', 'PropTrust', 'ST_KDMA_Sim', 'ST_KDMA_Text', 'YrsMilExp']
)
print(senario_columns)
columns_list = on_columns + senario_columns
merge1_df = csv_stats_df.merge(json_stats_df[columns_list], on=on_columns, how='left')

# A left merge only keeps the row count when the join keys are unique on the right-hand side
assert merge1_df.shape[0] == csv_stats_df.shape[0], 'The JSON stats merge changed the row count: check for duplicate join keys'

# The trailing comments are the shapes recorded the last time this cell was run
print(csv_stats_df.shape) # (199476, 124)
print(json_stats_df[columns_list].shape) # (43, 12)
print(merge1_df.shape) # (199476, 133)

['csv_file_name', 'participant_id', 'session_uuid']
['AD_KDMA_Sim', 'AD_KDMA_Text', 'PropTrust', 'ST_KDMA_Sim', 'ST_KDMA_Text', 'YrsMilExp', 'configData_scenarioData_description', 'configData_scenarioData_difficulty', 'configData_scenarioData_name']
(199476, 124)
(43, 12)
(199476, 133)


In [5]:

# Merge in the scene stats dataset

# Join on every column the first merge result and the scene stats data frame have in common
on_columns = sorted(set(merge1_df.columns).intersection(set(scene_stats_df.columns)))
print(on_columns)

# The scene-level measures to carry into the ANOVA dataset
analysis_columns = [
    'actual_engagement_distance', 'first_engagement', 'first_treatment', 'injury_correctly_treated_count', 'injury_not_treated_count',
    'injury_treatments_count', 'injury_wrongly_treated_count', 'last_engagement', 'last_still_engagement', 'measure_of_right_ordering', 'patient_count',
    'percent_hemorrhage_controlled', 'pulse_taken_count', 'stills_value', 'teleport_count', 'time_to_hemorrhage_control_per_patient',
    'time_to_last_hemorrhage_controlled', 'total_actions_count', 'triage_time', 'voice_capture_count', 'walk_command_count', 'walk_value', 'walkers_value',
    'wave_command_count', 'wave_value'
]
print(analysis_columns)
columns_list = on_columns + analysis_columns
merge2_df = merge1_df.merge(scene_stats_df[columns_list], on=on_columns, how='left')

# A left merge only keeps the row count when the join keys are unique on the right-hand side
assert merge2_df.shape[0] == merge1_df.shape[0], 'The scene stats merge changed the row count: check for duplicate join keys'

# The trailing comments are the shapes recorded the last time this cell was run
print(merge1_df.shape) # (199476, 133)
print(scene_stats_df[columns_list].shape) # (76, 28)
print(merge2_df.shape) # (199476, 158)

['participant_id', 'scene_id', 'session_uuid']
['actual_engagement_distance', 'first_engagement', 'first_treatment', 'injury_correctly_treated_count', 'injury_not_treated_count', 'injury_treatments_count', 'injury_wrongly_treated_count', 'last_engagement', 'last_still_engagement', 'measure_of_right_ordering', 'patient_count', 'percent_hemorrhage_controlled', 'pulse_taken_count', 'stills_value', 'teleport_count', 'time_to_hemorrhage_control_per_patient', 'time_to_last_hemorrhage_controlled', 'total_actions_count', 'triage_time', 'voice_capture_count', 'walk_command_count', 'walk_value', 'walkers_value', 'wave_command_count', 'wave_value']
(199476, 133)
(76, 28)
(199476, 158)


In [6]:

# Average the merged columns within each group of join-key values (the on_columns from the previous merge)
participant_columns = [
    'AD_KDMA_Sim', 'AD_KDMA_Text', 'PropTrust', 'ST_KDMA_Sim', 'ST_KDMA_Text', 'YrsMilExp', 'configData_scenarioData_difficulty'
]
columns_list = on_columns + participant_columns + analysis_columns
print(columns_list)

# Every aggregated column gets a mean_ prefix; names in the mapping that are absent from the frame are ignored by rename
mean_names_dict = {}
for cn in senario_columns + analysis_columns:
    mean_names_dict[cn] = 'mean_'+cn

# Group, average, relabel, and then move the group keys back out of the index
grouped_means_df = merge2_df[columns_list].groupby(on_columns).mean()
anova_df = grouped_means_df.rename(columns=mean_names_dict).reset_index(drop=False)

['participant_id', 'scene_id', 'session_uuid', 'AD_KDMA_Sim', 'AD_KDMA_Text', 'PropTrust', 'ST_KDMA_Sim', 'ST_KDMA_Text', 'YrsMilExp', 'configData_scenarioData_difficulty', 'actual_engagement_distance', 'first_engagement', 'first_treatment', 'injury_correctly_treated_count', 'injury_not_treated_count', 'injury_treatments_count', 'injury_wrongly_treated_count', 'last_engagement', 'last_still_engagement', 'measure_of_right_ordering', 'patient_count', 'percent_hemorrhage_controlled', 'pulse_taken_count', 'stills_value', 'teleport_count', 'time_to_hemorrhage_control_per_patient', 'time_to_last_hemorrhage_controlled', 'total_actions_count', 'triage_time', 'voice_capture_count', 'walk_command_count', 'walk_value', 'walkers_value', 'wave_command_count', 'wave_value']


In [7]:

# Get column and value descriptions
file_path = osp.join(fu.data_folder, 'xlsx', 'Metrics_Evaluation_Dataset_organization_for_BBAI.xlsx')
dataset_organization_df = read_excel(file_path)

# Fix the doubled up descriptions: a Labels cell holding several semicolon-separated labels
# keeps its first label and gets one appended copy of its row for each additional label
mask_series = dataset_organization_df.Labels.map(lambda x: ';' in str(x))
new_rows_list = []
for row_index, label in dataset_organization_df[mask_series].Labels.items():
    
    # Ignore the empty pieces that a leading, trailing or doubled semicolon would produce
    labels_list = [lbl for lbl in re.split(' *; *', str(label)) if lbl]
    if not labels_list:
        continue
    dataset_organization_df.loc[row_index, 'Labels'] = labels_list[0]
    for extra_label in labels_list[1:]:
        
        # Get a copy of the row as a one-row Data Frame (double brackets); concatenating a Series
        # would instead append its values as a new column rather than as a new row
        new_row_df = dataset_organization_df.loc[[row_index]].copy()
        
        # Modify the desired column value
        new_row_df['Labels'] = extra_label
        new_rows_list.append(new_row_df)

# Append all the new rows to the Data Frame in one pass
if new_rows_list:
    dataset_organization_df = concat([dataset_organization_df] + new_rows_list, ignore_index=True)

# Get a copy of the AD_Del_Omni row(s)
mask_series = (dataset_organization_df.Variable == 'AD_Del_Omni')
new_row = dataset_organization_df.loc[mask_series].copy()

# Modify the desired column value
new_row['Variable'] = 'AD_Del_Omni_Text'

# Append the new row to the Data Frame
dataset_organization_df = concat([dataset_organization_df, new_row], ignore_index=True)

# Get the column value descriptions
mask_series = ~dataset_organization_df.Description.isnull()
df = dataset_organization_df[mask_series]
value_description_dict = df.set_index('Variable').Description.to_dict()

# Give every variable without a _Text suffix a <name>_Text entry that reuses its description,
# unless a description for that _Text name already exists
new_description_dict = value_description_dict.copy()
for k, v in value_description_dict.items():
    if isinstance(k, str) and (not k.endswith('_Text')):
        new_description_dict.setdefault(f'{k}_Text', v)
value_description_dict = new_description_dict.copy()

# List the variables whose labels are coded as "<number> = <description>"
numeric_categories_mask_series = dataset_organization_df.Labels.map(lambda x: '=' in str(x))
value_descriptions_columns = dataset_organization_df[numeric_categories_mask_series].Variable.unique().tolist()
def get_value_description(column_name, column_value):
    """
    Look up the text description of a coded value in the dataset organization table.
    
    Reads the module-level dataset_organization_df, whose Labels column holds
    entries of the form "<code> = <description>", one per row.
    
    Parameters:
        column_name: The Variable name whose labels should be searched.
        column_value: The coded value; it is normalized with int(float(...))
            so that 2, 2.0 and '2' all match the label code '2'.
    
    Returns:
        str: The description of the first matching label, or an empty string
        when the value is missing, is not numeric, or has no matching label.
    """
    if isna(column_value):
        return ''
    
    # Non-numeric and infinite values cannot match a numeric label code
    try:
        value_code = str(int(float(column_value)))
    except (TypeError, ValueError, OverflowError):
        return ''
    
    mask_series = (dataset_organization_df.Variable == column_name) & ~dataset_organization_df.Labels.isnull()
    for label in dataset_organization_df[mask_series].Labels:
        
        # Split on the first equals sign only, so a description containing "=" stays whole
        label_parts = re.split(' *= *', str(label), maxsplit=1)
        
        # Return the first match, so duplicated label rows cannot produce a multi-row result
        if (len(label_parts) == 2) and (label_parts[0] == value_code):
            return label_parts[1]
    
    return ''

In [8]:

# Add medical role back in
new_column = 'MedRole'
column_name = 'medical_role'

# Skip the merge when the column was already added, so re-running the cell cannot create a duplicate column
if (new_column in json_stats_df.columns) and (column_name not in anova_df.columns):
    on_columns = sorted(set(anova_df.columns).intersection(set(json_stats_df.columns)))
    columns_list = on_columns + [new_column]
    
    # Drop repeated key/value rows first, so the left merge cannot multiply the ANOVA rows
    anova_df = anova_df.merge(
        json_stats_df[columns_list].drop_duplicates(), on=on_columns, how='left'
    ).rename(columns={new_column: column_name})
    
    # Replace the numeric codes with their text descriptions; codes without a description become NaN
    anova_df[column_name] = anova_df[column_name].map(
        lambda cv: get_value_description('MedRole', cv)
    ).replace('', nan)

# Show the five most frequent roles, but only when the column exists
if column_name in anova_df.columns:
    print(anova_df.groupby(column_name).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
        'record_count', ascending=False
    ).head(5))
else:
    print(f'{new_column} is not in json_stats_df, so {column_name} was not added')

                 record_count
medical_role                 
Other                      24
Medical student            23
Paramedic                   3
EM faculty                  2
EM resident                 2


In [9]:

# Add the sim environment back in
new_column = 'encounter_layout'

# Skip the merge when the column was already added; otherwise a re-run would treat it as both a join key and a value column
if (new_column in json_stats_df.columns) and (new_column not in anova_df.columns):
    on_columns = sorted(set(anova_df.columns).intersection(set(json_stats_df.columns)))
    columns_list = on_columns + [new_column]
    
    # Drop repeated key/value rows first, so the left merge cannot multiply the ANOVA rows
    anova_df = anova_df.merge(
        json_stats_df[columns_list].drop_duplicates(), on=on_columns, how='left'
    )

# Show the five most frequent environments, but only when the column exists
if new_column in anova_df.columns:
    print(anova_df.groupby(new_column).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
        'record_count', ascending=False
    ).head(5))
else:
    print(f'{new_column} is not in json_stats_df, so it was not added')

                  record_count
encounter_layout              
Desert                      18
Submarine                   16
Jungle                      15
Urban                       11


In [10]:

# Persist the ANOVA data frame through both nu helpers, then report its shape and columns
results_dict = dict(metrics_evaluation_open_world_anova_df=anova_df)
for persist_function in (nu.store_objects, nu.save_data_frames):
    persist_function(**results_dict, verbose=True)
for summary in (anova_df.shape, anova_df.columns.tolist()):
    print(summary)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_anova_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_anova_df.csv
(68, 37)
['participant_id', 'scene_id', 'session_uuid', 'mean_AD_KDMA_Sim', 'mean_AD_KDMA_Text', 'mean_PropTrust', 'mean_ST_KDMA_Sim', 'mean_ST_KDMA_Text', 'mean_YrsMilExp', 'mean_configData_scenarioData_difficulty', 'mean_actual_engagement_distance', 'mean_first_engagement', 'mean_first_treatment', 'mean_injury_correctly_treated_count', 'mean_injury_not_treated_count', 'mean_injury_treatments_count', 'mean_injury_wrongly_treated_count', 'mean_last_engagement', 'mean_last_still_engagement', 'mean_measure_of_right_ordering', 'mean_patient_count', 'mean_percent_hemorrhage_controlled', 'mean_pulse_taken_count', 'mean_stills_value', 'mean_teleport_count', 'mean_time_to_hemorrhage_control_per_patient', 'mean_time_to_last_hemorrhage_contr