In [None]:

# Set up the notebook
%pprint
import sys
if (osp.join(os.pardir, 'py') not in sys.path): sys.path.insert(1, osp.join(os.pardir, 'py'))

In [None]:

import json

import pandas as pd

from FRVRS import (nu, fu, warnings, read_excel, re, isna, nan, display, osp)

# Suppress every Python warning from this point on
# NOTE(review): this also hides library deprecation notices — confirm that is intended
warnings.filterwarnings('ignore')


# Data Fixes for Metrics Evaluation Open World

In [None]:

# Load the three open-world data frames through the notebook utilities;
# each frame is requested by name with an empty string as its keyword value
frame_names_list = [
    'metrics_evaluation_open_world_csv_stats_df', 'metrics_evaluation_open_world_json_stats_df',
    'metrics_evaluation_open_world_scene_stats_df'
]
data_frames_dict = nu.load_data_frames(**{frame_name: '' for frame_name in frame_names_list})
csv_stats_df, json_stats_df, scene_stats_df = (data_frames_dict[frame_name] for frame_name in frame_names_list)

In [None]:

# Fix the encounter_layout column based on the set of patients in the scene
# NOTE(review): the return value is discarded, so this presumably updates json_stats_df
# in place (or persists the result itself) — confirm against the helper's implementation
fu.add_encounter_layout_column_to_json_stats(csv_stats_df, json_stats_df, verbose=True)

In [None]:

# Drop the logs columns already recorded in the scene stats data frames.
# Only the candidates named in intersection_columns are considered, and only
# when they are present in both the scene stats and the logs data frames.
scene_columns_set = set(scene_stats_df.columns)
logs_columns_set = set(csv_stats_df.columns)
intersection_columns = {'is_scene_aborted'}
drop_columns = sorted(scene_columns_set & logs_columns_set & intersection_columns)
print(drop_columns)

# Persist the slimmed-down logs data frame only when something was removed
if len(drop_columns) > 0:
    csv_stats_df = csv_stats_df.drop(columns=drop_columns)
    print(csv_stats_df.shape) # (171766, 107)
    frames_dict = {'metrics_evaluation_open_world_csv_stats_df': csv_stats_df}
    nu.store_objects(**frames_dict)
    nu.save_data_frames(**frames_dict)

In [None]:

logs_columns_set = set(csv_stats_df.columns)
file_columns_set = set(json_stats_df.columns)
intersection_columns = set(['logger_version', 'is_scene_aborted'])

# Drop the logs columns already recorded in the JSON stats data frame.
# Only the candidates in intersection_columns that appear in both frames are
# dropped; sorting keeps the printed list and the drop order deterministic,
# matching the other column-dropping cells in this notebook.
drop_columns = sorted(logs_columns_set.intersection(file_columns_set).intersection(intersection_columns))
print(drop_columns)

# Persist the logs data frame only when something was actually removed
if drop_columns:
    csv_stats_df = csv_stats_df.drop(columns=drop_columns)
    print(csv_stats_df.shape) # (171766, 124)
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

In [None]:

# Drop the JSON Stats columns that came with the process but are covered well enough in the logs data frame and add no value here.
# A column is removed only if it is named below and present in both data frames.
logs_columns_set = set(csv_stats_df.columns)
file_columns_set = set(json_stats_df.columns)
intersection_columns = {
    'injury_record_injury_treated',
    'injury_record_injury_treated_with_wrong_treatment',
    'patient_demoted_health_level',
    'patient_demoted_health_time_remaining',
    'patient_demoted_hearing',
    'patient_hearing',
    'patient_record_health_level',
    'patient_record_hearing',
    'player_location_left_hand_location',
    'player_location_right_hand_location',
    'bag_access_location',
    'patient_engaged_health_level',
    'voice_capture_command_description',
    'csv_file_name',
    'csv_file_subpath',
}
drop_columns = sorted(logs_columns_set & file_columns_set & intersection_columns)
print(drop_columns)

# Persist the slimmed-down JSON Stats data frame only when something was removed
if len(drop_columns) > 0:
    json_stats_df = json_stats_df.drop(columns=drop_columns)
    print(json_stats_df.shape) # (43, 3589)
    frames_dict = {'metrics_evaluation_open_world_json_stats_df': json_stats_df}
    nu.store_objects(**frames_dict)
    nu.save_data_frames(**frames_dict)

In [None]:

file_columns_set = set(json_stats_df.columns)
scene_columns_set = set(scene_stats_df.columns)
intersection_columns = set(['logger_version'])

# Drop the scene columns already recorded in the JSON Stats data frames.
# Only the candidates in intersection_columns that appear in both frames are dropped.
drop_columns = sorted(file_columns_set.intersection(scene_columns_set).intersection(intersection_columns))

# Report what will be dropped, as the other column-dropping cells do
print(drop_columns)

# Persist the scene stats data frame only when something was actually removed
if drop_columns:
    scene_stats_df = scene_stats_df.drop(columns=drop_columns)
    print(scene_stats_df.shape) # (76, 48)
    nu.store_objects(metrics_evaluation_open_world_scene_stats_df=scene_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_scene_stats_df=scene_stats_df)

In [None]:

# Remove duplicates from the JSON Stats data frame: rows flagged by duplicated()
# as repeats of an earlier session_uuid are filtered out
subset_columns = ['session_uuid']
mask_series = json_stats_df.duplicated(subset=subset_columns)
has_duplicates = mask_series.any()

# Nothing is rewritten or saved when the frame is already duplicate-free
if has_duplicates:
    is_kept_series = ~mask_series
    json_stats_df = json_stats_df.loc[is_kept_series]
    print(json_stats_df.shape)
    frames_dict = {'metrics_evaluation_open_world_json_stats_df': json_stats_df}
    nu.store_objects(**frames_dict)
    nu.save_data_frames(**frames_dict)

In [None]:

# Remove duplicates from the scene stats data frame: rows flagged by duplicated()
# as repeats of an earlier (session_uuid, scene_id) pair are filtered out
subset_columns = ['session_uuid', 'scene_id']
mask_series = scene_stats_df.duplicated(subset=subset_columns)
has_duplicates = mask_series.any()

# Nothing is rewritten or saved when the frame is already duplicate-free
if has_duplicates:
    is_kept_series = ~mask_series
    scene_stats_df = scene_stats_df.loc[is_kept_series]
    print(scene_stats_df.shape)
    frames_dict = {'metrics_evaluation_open_world_scene_stats_df': scene_stats_df}
    nu.store_objects(**frames_dict)
    nu.save_data_frames(**frames_dict)


### Get column and value descriptions

In [None]:

# Attempt to manufacture some better column names
file_path = '../data/xlsx/Metrics_Evaluation_Dataset_organization_for_BBAI.xlsx'
dataset_organization_df = read_excel(file_path)

# Fix the doubled up descriptions: a Labels cell holding several ";"-separated
# labels keeps its first label in place and gets one extra row per remaining label
# (empty fragments, e.g. from a trailing ";", are skipped)
extra_rows_list = []
mask_series = dataset_organization_df.Labels.map(lambda x: ';' in str(x))
for row_index, label in dataset_organization_df[mask_series].Labels.items():
    labels_list = re.split(' *; *', str(label))
    dataset_organization_df.loc[row_index, 'Labels'] = labels_list[0]
    for extra_label in labels_list[1:]:
        if not extra_label:
            continue
        
        # Copy the row as a one-row data frame and swap in the extra label
        new_row_df = dataset_organization_df.loc[[row_index]].copy()
        new_row_df['Labels'] = extra_label
        extra_rows_list.append(new_row_df)

# Append all the extra rows in one go (DataFrame.append no longer exists in pandas 2.x)
if extra_rows_list:
    dataset_organization_df = pd.concat([dataset_organization_df] + extra_rows_list, ignore_index=True)

# Get a copy of the AD_Del_Omni rows (including any rows added above)
mask_series = (dataset_organization_df.Variable == 'AD_Del_Omni')
new_row = dataset_organization_df.loc[mask_series].copy()

# Modify the desired column value
new_row['Variable'] = 'AD_Del_Omni_Text'

# Append the copied rows to the Data Frame so both variable names share the same labels
if not new_row.empty:
    dataset_organization_df = pd.concat([dataset_organization_df, new_row], ignore_index=True)

# Turn the data frame into value description getter:
# first list the variables that have at least one label containing "="
mask_series = dataset_organization_df.Labels.map(str).map(lambda label_str: '=' in label_str)
described_variables = dataset_organization_df.loc[mask_series, 'Variable']
value_descriptions_columns = described_variables.unique().tolist()
def get_value_description(column_name, column_value, organization_df=None):
    """
    Look up the description attached to a coded value of a variable.
    
    Labels are expected in "code = description" form; the code part is compared
    with the integer form of column_value.
    
    Parameters:
        column_name: the name to match against the Variable column.
        column_value: the coded value (int, float such as 1.0, or numeric string).
        organization_df: optional data frame with Variable and Labels columns;
            defaults to the notebook-level dataset_organization_df.
    
    Returns:
        str: the text following the first "=" of the first matching label, or an
        empty string when the value is missing, is not numeric, or has no
        matching "code = description" label.
    """
    if organization_df is None:
        organization_df = dataset_organization_df
    if isna(column_value):
        return ''
    
    # Normalise once, outside the row loop; text that is not a number has no coded label
    try:
        value_code = str(int(float(column_value)))
    except (TypeError, ValueError, OverflowError):
        return ''
    
    mask_series = (organization_df.Variable == column_name) & organization_df.Labels.notnull()
    for label in organization_df[mask_series].Labels:
        label_parts = re.split(' *= *', str(label))
        
        # Require an actual "code = description" pair and take the first match,
        # so duplicated or "="-less labels cannot raise or return garbage
        if (len(label_parts) > 1) and (label_parts[0] == value_code):
            return label_parts[1]
    
    return ''
# Try the getter on a missing MedRole value and display what it returns
column_name, column_value = 'MedRole', nan
get_value_description(column_name, column_value)


## Provide Correctly Grouped Responder Type Stats

In [None]:

# Add a column that correctly groups responder types
new_column_name = 'responder_category'

# Discard any previously computed version so the column is rebuilt below
if new_column_name in json_stats_df.columns:
    json_stats_df = json_stats_df.drop(columns=[new_column_name])
    print(json_stats_df.shape)

if new_column_name not in json_stats_df.columns:
    
    # Title-case every word of the MedRole description, then restore the upper-case "EM " prefix
    json_stats_df[new_column_name] = json_stats_df.MedRole.map(
        lambda x: ' '.join(r.title() for r in get_value_description('MedRole', x).split(' ')).replace('Em ', 'EM ')
    )
    
    # Store the results and show the new data frame shape
    frames_dict = {'metrics_evaluation_open_world_json_stats_df': json_stats_df}
    nu.store_objects(**frames_dict)
    nu.save_data_frames(**frames_dict)
    print(json_stats_df.shape) # (43, 3564)

# Show the five responder categories with the most records
record_counts_df = json_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'})
display(record_counts_df.sort_values('record_count', ascending=False).head(5))

In [None]:

# Add responder types subgrouping columns
# The columns are only (re)built when at least one of them is missing from json_stats_df
groupby_columns = ['overall_category', 'global_category', 'global_description', 'sub_category', 'sub_description', 'responder_type']
if any(map(lambda x: x not in json_stats_df.columns, groupby_columns)):
    file_path = osp.join(fu.data_folder, 'xlsx', 'Responder_Categories_and_Counts_DPW.xlsx')
    dpw_responder_categories_df = read_excel(file_path)

    # Get description data frame
    # The first fully-empty row is used as the separator: everything after it goes to where_df
    # NOTE(review): idx is an index label used as a position, which relies on the default
    # integer index from read_excel; it is NaN if the sheet has no fully-empty row — confirm
    mask_series = dpw_responder_categories_df.isna().all(axis='columns')
    idx = dpw_responder_categories_df[mask_series].index.min()
    where_df = dpw_responder_categories_df.iloc[idx+1:]
    
    # Get categories data frame
    # Rows before the separator, minus all-empty columns; exactly five columns must remain for the renaming to work
    dpw_responder_categories_df = dpw_responder_categories_df.iloc[:idx].dropna(axis='columns', how='all')
    dpw_responder_categories_df.columns = ['overall_category', 'responder_type', 'global_category', 'sub_category', 'record_count']
    
    # Create global description column
    # Columns at positions 1-3 of the lower table are read as (category, description, count)
    # NOTE(review): the positions are hard-coded to the spreadsheet layout — confirm if the file changes
    columns_list = ['global_category', 'global_description', 'record_count']
    df = where_df.iloc[:, 1:4].dropna(axis='index', how='all')
    df.columns = columns_list
    global_description_dict = df.set_index('global_category').global_description.to_dict()
    dpw_responder_categories_df['global_description'] = dpw_responder_categories_df.global_category.map(global_description_dict)
    
    # Create sub description column
    # Columns at positions 5-7 of the lower table are read as (category, description, count)
    columns_list = ['sub_category', 'sub_description', 'record_count']
    df = where_df.iloc[:, 5:8].dropna(axis='index', how='all')
    df.columns = columns_list
    sub_description_dict = df.set_index('sub_category').sub_description.to_dict()
    dpw_responder_categories_df['sub_description'] = dpw_responder_categories_df.sub_category.map(sub_description_dict)
    
    # Add columns to JSON Stats data frame
    # Sanity checks first: every combination of grouping values occurs once, and every responder type has a single combination
    # NOTE(review): groupby leaves out rows with a missing key by default, so responder types
    # lacking a category or description may not be covered by these checks — confirm
    df = dpw_responder_categories_df[groupby_columns].groupby(groupby_columns).size().reset_index(drop=False)
    assert not (df[0] != 1).any(), "You will have a problem with responder types"
    df = df.drop(columns=[0])
    assert not (df.groupby('responder_type').size() != 1).any(), "You have a problem with responder types"
    
    # Derive responder_type from the MedRole description: title-case each word, restore the "EM " prefix,
    # hyphenate "EM-Faculty" and turn a bare "Other" into "Other HP"
    # (presumably to match the spelling used in the spreadsheet — verify against the file)
    if 'responder_type' not in json_stats_df.columns:
        json_stats_df['responder_type'] = json_stats_df.MedRole.map(
            lambda x: re.sub('^Other$', 'Other HP', ' '.join(
                [r.title() for r in get_value_description('MedRole', x).split(' ')]
            ).replace('Em ', 'EM ').replace('EM Faculty', 'EM-Faculty'))
        )
    # print()
    
    # Map each remaining grouping column onto json_stats_df through a responder_type lookup dictionary
    overall_category_dict = dpw_responder_categories_df.set_index('responder_type').overall_category.to_dict()
    # print(f'overall_category_dict = {overall_category_dict}')
    json_stats_df['overall_category'] = json_stats_df.responder_type.map(overall_category_dict)

    global_category_dict = dpw_responder_categories_df.set_index('responder_type').global_category.to_dict()
    # print(f'global_category_dict = {global_category_dict}')
    json_stats_df['global_category'] = json_stats_df.responder_type.map(global_category_dict)

    # From here on global_description_dict is keyed by responder_type (it was keyed by global_category above)
    global_description_dict = dpw_responder_categories_df.set_index('responder_type').global_description.to_dict()
    # print(f'global_description_dict = {global_description_dict}')
    json_stats_df['global_description'] = json_stats_df.responder_type.map(global_description_dict)

    sub_category_dict = dpw_responder_categories_df.set_index('responder_type').sub_category.to_dict()
    # print(f'sub_category_dict = {sub_category_dict}')
    json_stats_df['sub_category'] = json_stats_df.responder_type.map(sub_category_dict)

    # Likewise sub_description_dict is re-keyed by responder_type here
    sub_description_dict = dpw_responder_categories_df.set_index('responder_type').sub_description.to_dict()
    # print(f'sub_description_dict = {sub_description_dict}')
    json_stats_df['sub_description'] = json_stats_df.responder_type.map(sub_description_dict)
    
    # Store the results and show the new data frame shape
    nu.store_objects(metrics_evaluation_open_world_json_stats_df=json_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_json_stats_df=json_stats_df)
    print(json_stats_df.shape) # (43, 3570)

# Summarize the grouping columns: distinct values per column, then the five largest groups
if all(map(lambda x: x in json_stats_df.columns, groupby_columns)):
    print(json_stats_df[groupby_columns].nunique()) 
    display(json_stats_df.groupby(groupby_columns).size().to_frame().rename(columns={0: 'record_count'}).sort_values(
        'record_count', ascending=False
    ).head(5))

In [14]:

# Rename total_actions to total_actions_count in the scene stats data frame.
# The frame is only re-pickled and re-saved when the old column is still present,
# so re-running this cell is a no-op (like the other guarded fix cells above).
rename_columns_dict = {'total_actions': 'total_actions_count'}
if any(old_name in scene_stats_df.columns for old_name in rename_columns_dict):
    scene_stats_df = scene_stats_df.rename(columns=rename_columns_dict)
    nu.store_objects(metrics_evaluation_open_world_scene_stats_df=scene_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_scene_stats_df=scene_stats_df)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_scene_stats_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/metrics_evaluation_open_world_scene_stats_df.csv
