In [None]:

%pprint
import sys
if (osp.join('..', 'py') not in sys.path): sys.path.insert(1, osp.join('..', 'py'))

In [None]:

from FRVRS import (nu, fu, warnings, DataFrame, osp, listdir, re, nan, concat, Series, remove, subprocess, notnull)
import json

# Install an 'ignore' filter for warnings for the rest of the session.
# NOTE(review): assuming the `warnings` re-exported by FRVRS is the standard-library module, this also hides
# pandas deprecation and chained-assignment warnings raised by the cells below — confirm that is intended.
warnings.filterwarnings('ignore')


# Dataset Built for Metrics Evaluation Open World

Dave, you should be ignoring all the files except the zip folder I sent you.
In the zip I sent you, there are 51 folders, (51 JSON, 51 CSV).
Zip file attached.
All the files are named appropriately: each folder, CSV, and JSON is named UUID_ParticipantID.
Some of the internal Participant IDs might be off because the moderator forgot to enter a Participant ID or didn't enter it correctly, so we needed to figure out which participant it was.
Please only utilize the UUID and Participant ID that is on the file name to identify and ignore the internal Participant IDs.
Maybe that will help.

In [None]:

# Get all the Open World logs into one data frame
logs_path = osp.join(fu.data_folder, 'logs', 'Human_Sim_Metrics_Data_4-12-2024')

# Sort the listing so the row order of the combined data frame does not depend on the file system
directories_list = sorted(listdir(logs_path))

# Collect one data frame per session folder and concatenate once at the end
# (concatenating inside the loop re-copies every previously ingested row on each pass)
session_dfs_list = []
for dir_name in directories_list:

    # Skip anything that is not a folder; only the UUID_ParticipantID folders hold logs
    folder_path = osp.join(logs_path, dir_name)
    if not osp.isdir(folder_path): continue

    # Add the CSVs to the data frame
    df = fu.concatonate_logs(logs_folder=folder_path)

    # Identify the session from the folder name only (the internal participant IDs are unreliable);
    # split on the last underscore so an underscore inside the first part cannot break the unpacking
    session_uuid, participant_id = dir_name.rsplit('_', 1)
    df['session_uuid'] = session_uuid
    df['participant_id'] = int(participant_id)

    # Remove numerically-named columns (copy so the assignments below write to an independent frame)
    columns_list = [x for x in df.columns if not re.search(r'\d+', str(x))]
    df = df[columns_list].copy()

    # Convert 'TRUE' and 'FALSE' to boolean values
    for cn in fu.boolean_columns_list:
        df[cn] = df[cn].map({'TRUE': True, 'FALSE': False, 'True': True, 'False': False})

    # Convert the nulls into NaNs
    df = df.replace(['null', 'nan'], nan)

    # Keep the data frame for the current subdirectory for the final concatenation
    session_dfs_list.append(df)

# Combine all the sessions in a single pass (concat raises on an empty list, so fall back to an empty frame)
csv_stats_df = concat(session_dfs_list, axis='index') if session_dfs_list else DataFrame([])
csv_stats_df = csv_stats_df.reset_index(drop=True)
csv_stats_df['csv_file_name'] = csv_stats_df.csv_file_subpath.map(lambda x: str(x).split('/')[-1])
nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
print(csv_stats_df.participant_id.nunique()) # 22
print(csv_stats_df.shape) # (171766, 112)


## Check for proper ingestion (duplicate file ingestion, et al)

In [None]:

# Sanity checks on the ingested data frame: it must have more than the handful of
# bookkeeping columns, and the expected number of distinct participants
assert len(csv_stats_df.columns) > 4, "Nothing ingested"
assert csv_stats_df.participant_id.nunique() == 26, f"Participant count should be 26, it's {csv_stats_df.participant_id.nunique()} instead"
print(csv_stats_df.shape) # (171766, 112)

# Record count per logger version
display(
    csv_stats_df.groupby('logger_version').size().to_frame('record_count')
) # 276926

In [None]:

# Flag every row whose session_uuid is associated with more than one distinct csv_file_subpath
subpath_counts_series = csv_stats_df.groupby('session_uuid').csv_file_subpath.transform('nunique')
mask_series = (subpath_counts_series > 1)
assert not mask_series.any(), "You have duplicate files"

# If the assertion above ever fails: for each flagged (session_uuid, csv_file_subpath) pair whose subpath
# does not start with 'Double runs removed/', delete that file from under fu.data_logs_folder and re-ingest

In [None]:

# Check that all your junk scenes are the last scenes
display(csv_stats_df.groupby('is_scene_aborted').size().to_frame().rename(columns={0: 'record_count'}))

# Compare against True rather than using the column as a mask directly: values the ingestion mapping
# did not recognize are null, and a boolean mask containing nulls cannot be used for indexing
aborted_mask_series = csv_stats_df.is_scene_aborted.eq(True)

# Compute the last scene of every session once instead of re-scanning the data frame per aborted scene
max_scene_ids_series = csv_stats_df.groupby('session_uuid').scene_id.max()

# NOTE(review): the unpacking assumes fu.scene_groupby_columns is (session_uuid, scene_id) — confirm
for (session_uuid, scene_id), scene_df in csv_stats_df[aborted_mask_series].groupby(fu.scene_groupby_columns):
    max_scene_id = max_scene_ids_series[session_uuid]
    assert max_scene_id == scene_id, "You've got junk scenes in strange places"


## Add new features according to your increasing domain knowledge


### Modalize separate columns into one

In [None]:

# Modalize the columns in fu.patient_id_columns_list into one patient_id column (skipped if it already exists)
new_column_name = 'patient_id'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.patient_id_columns_list, new_column_name)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    print(csv_stats_df.shape) # (171766, 98)

# Show the number of distinct values, then the five values with the most records
print(csv_stats_df[new_column_name].nunique()) # 39
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
    .sort_values('record_count', ascending=False).head(5)
)

In [None]:

# Modalize the columns in fu.injury_id_columns_list into one injury_id column (skipped if it already exists)
new_column_name = 'injury_id'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.injury_id_columns_list, new_column_name)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 99)

# Show the number of distinct values, then the five values with the most records
print(csv_stats_df[new_column_name].nunique()) # 34
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
    .sort_values('record_count', ascending=False).head(5)
)

In [None]:

# Modalize the columns in fu.location_id_columns_list into one location_id column (skipped if it already exists)
new_column_name = 'location_id'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.location_id_columns_list, new_column_name)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 100)

# Show the number of distinct values, then the five values with the most records
print(csv_stats_df[new_column_name].nunique()) # 9239
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
    .sort_values('record_count', ascending=False).head(5)
)

In [None]:

# Modalize the columns in fu.patient_sort_columns_list into one patient_sort column (skipped if it already exists)
new_column_name = 'patient_sort'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.patient_sort_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.sort_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.sort_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 101)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.pulse_columns_list into one patient_pulse column (skipped if it already exists)
new_column_name = 'patient_pulse'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.pulse_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.pulse_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.pulse_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 102)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.patient_salt_columns_list into one patient_salt column (skipped if it already exists)
new_column_name = 'patient_salt'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.patient_salt_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.salt_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.salt_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 103)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.hearing_columns_list into one patient_hearing column (skipped if it already exists)
new_column_name = 'patient_hearing'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.hearing_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.hearing_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.hearing_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 104)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.breath_columns_list into one patient_breath column (skipped if it already exists)
new_column_name = 'patient_breath'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.breath_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.breath_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.breath_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 105)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.mood_columns_list into one patient_mood column (skipped if it already exists)
new_column_name = 'patient_mood'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.mood_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.mood_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.mood_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 106)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.pose_columns_list into one patient_pose column (skipped if it already exists)
new_column_name = 'patient_pose'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.pose_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.pose_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.pose_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 107)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.injury_severity_columns_list into one injury_severity column (skipped if it already exists)
new_column_name = 'injury_severity'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.injury_severity_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.severity_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.severity_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 108)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.injury_required_procedure_columns_list into one injury_required_procedure column
# (skipped if it already exists)
new_column_name = 'injury_required_procedure'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.injury_required_procedure_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.required_procedure_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.required_procedure_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 109)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.body_region_columns_list into one injury_body_region column (skipped if it already exists)
new_column_name = 'injury_body_region'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.body_region_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.body_region_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.body_region_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 110)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)

In [None]:

# Modalize the columns in fu.tool_type_columns_list into one tool_type column (skipped if it already exists)
new_column_name = 'tool_type'
if new_column_name not in csv_stats_df.columns:
    csv_stats_df = nu.modalize_columns(csv_stats_df, fu.tool_type_columns_list, new_column_name)

    # Cast the new column to the dtype held in fu.tool_type_category_order
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.tool_type_category_order)

    # Persist the updated data frame and report its new shape
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    print(csv_stats_df.shape) # (171766, 111)

# Show the record count for each value of the new column
display(
    csv_stats_df.groupby(new_column_name).size().to_frame('record_count')
)


### Convert text columns to categorical

In [None]:

# Cast pulse_taken_pulse_name to the dtype held in fu.pulse_name_category_order, if the column is present
new_column_name = 'pulse_taken_pulse_name'
if (new_column_name in csv_stats_df.columns):
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.pulse_name_category_order)
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    # Show the record count per value; this lives inside the guard because grouping by a
    # column that is not in the data frame raises a KeyError
    display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}))
else: print(f'{new_column_name} is not in the data frame; nothing to convert')

In [None]:

# Cast tool_applied_data to the dtype held in fu.tool_applied_data_category_order, if the column is present
new_column_name = 'tool_applied_data'
if (new_column_name in csv_stats_df.columns):
    csv_stats_df[new_column_name] = csv_stats_df[new_column_name].astype(fu.tool_applied_data_category_order)
    nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
    nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)

    # Show the record count per value; this lives inside the guard because grouping by a
    # column that is not in the data frame raises a KeyError
    display(csv_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}))
else: print(f'{new_column_name} is not in the data frame; nothing to convert')


## Mask voice capture PII

In [None]:

# OSU screened all of the **VOICE_COMMAND** and **VOICE_CAPTURE** lines and
# replaced any names with either Max or Jane, regardless of whether the name was that of the responder.
# But, just to make sure, replace anything spaCy tags as a PERSON entity with '[PERSON]'.
columns_list = ['voice_command_command_description', 'voice_capture_message']

# Only run the masking when no '[PERSON]' placeholder is present yet in either column
if not csv_stats_df[columns_list].applymap(lambda x: '[PERSON]' in str(x), na_action='ignore').sum().sum():
    try:
        import spacy
        try: nlp = spacy.load('en_core_web_sm')
        except OSError as e:
            print(str(e).strip())

            # Pass the command as an argument list: splitting a command string on whitespace
            # breaks when the interpreter path itself contains spaces
            subprocess.run([sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm', '--quiet'])
            nlp = spacy.load('en_core_web_sm')

        def _mask_person_names(text):
            """Return text with every entity spaCy labels PERSON replaced by '[PERSON]'."""
            masked_text = text
            for entity in nlp(text).ents:
                if entity.label_ == 'PERSON':

                    # Escape the entity text so characters such as '.' or '(' are matched literally
                    # instead of being read as regex syntax (which can mis-match or raise re.error)
                    masked_text = re.sub(r'\b' + re.escape(entity.text) + r'\b', '[PERSON]', masked_text)

            return masked_text

        def mask_pii(srs):
            """Return a copy of the row srs with the non-null values of columns_list masked."""
            masked_srs = srs.copy()
            for idx in columns_list:
                if notnull(srs[idx]): masked_srs[idx] = _mask_person_names(srs[idx])

            return masked_srs

        # Only the rows that carry text in at least one of the two columns need processing
        mask_series = csv_stats_df.voice_command_command_description.isnull() & csv_stats_df.voice_capture_message.isnull()
        df = csv_stats_df[~mask_series]
        masked_df = df.apply(mask_pii, axis='columns')[columns_list]

        # Write the masked text back in one aligned assignment; nulls pass through mask_pii untouched,
        # so this stores the same values as copying the non-null cells one at a time
        csv_stats_df.loc[masked_df.index, columns_list] = masked_df

        # Store the results and show the new data frame shape
        nu.store_objects(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
        nu.save_data_frames(metrics_evaluation_open_world_csv_stats_df=csv_stats_df)
        print(csv_stats_df.shape) # (199476, 109)

    # Best effort: report the failure and leave the data frame as it was
    except Exception as e: print(f'{e.__class__.__name__} error in PII masking: {str(e).strip()}')