In [1]:

%pprint
import os
import os.path as osp
import sys

# Put the sibling "py" folder on the module search path (presumably where the
# FRVRS package imported below lives). os and osp are imported right here
# because nothing earlier in a freshly started kernel defines them.
py_folder = osp.join(os.pardir, 'py')
if (py_folder not in sys.path): sys.path.insert(1, py_folder)

Pretty printing has been turned OFF


In [2]:

from FRVRS import (nu, fu, read_excel, to_datetime, Series, np, osp, read_csv, csv, DataFrame, re, listdir)

In [3]:

# Load the saved logs data frame by name and report its dimensions
logs_name = 'frvrs_logs_df'
data_frames_dict = nu.load_data_frames(**{logs_name: logs_name})
frvrs_logs_df = data_frames_dict[logs_name]
print(frvrs_logs_df.shape) # (829116, 120)

Attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/frvrs_logs_df.pkl.
(829116, 120)



# Create a Data Frame of all the File Stats

In [4]:

# Reload the file stats if they were pickled earlier; otherwise build them from
# the logs (per-session constant columns) joined to the learner-type spreadsheet
if nu.pickle_exists('file_stats_df'): file_stats_df = nu.load_object('file_stats_df')
else:
    
    # Get the columns that consistently have only one value in them per session
    single_value_cols_set = set(frvrs_logs_df.columns)
    for session_uuid, session_df in frvrs_logs_df.groupby('session_uuid'):
        single_value_cols = set([col for col in session_df.columns if session_df[col].nunique() == 1])
        single_value_cols_set = single_value_cols_set.intersection(single_value_cols)
    
    # Get the learner types from the spreadsheet
    if nu.pickle_exists('learner_types_df'): learner_types_df = nu.load_object('learner_types_df')
    else:
        file_path = '../data/xlsx/uuid_Master_with_Site_and_Learner_Type20240213.xlsx'
        learner_types_df = read_excel(file_path)
        learner_types_df.columns = [
            'session_file_date', 'session_file_name', 'session_uuid', 'responder_name', 'responder_type', 'site_name', 'encounter_layout'
        ]
        
        # Columns with no values at all are discarded, which is why the date column is checked before use
        learner_types_df = learner_types_df.dropna(axis='columns', how='all')
        if ('session_file_date' in learner_types_df.columns):
            learner_types_df['session_file_date'] = to_datetime(learner_types_df['session_file_date'])
        print(learner_types_df.shape) # (346, 6)
        nu.store_objects(learner_types_df=learner_types_df)
        nu.save_data_frames(learner_types_df=learner_types_df)
    
    # Recent pandas rejects a set as a column indexer, so select with a list
    # that keeps the columns in their frvrs_logs_df order
    single_value_cols_list = [cn for cn in frvrs_logs_df.columns if cn in single_value_cols_set]
    file_stats_df = frvrs_logs_df[single_value_cols_list].merge(
        learner_types_df, how='left', on='session_uuid'
    ).drop_duplicates().reset_index(drop=True)
    
    # The date column is absent when the spreadsheet supplied none; start it
    # out empty so the back-fill below has something to test and write to
    if ('session_file_date' not in file_stats_df.columns):
        file_stats_df['session_file_date'] = to_datetime(Series(np.nan, index=file_stats_df.index))
    
    # Back-fill the dates the spreadsheet did not provide, one session at a time
    for session_uuid, session_df in file_stats_df.groupby('session_uuid'):
        mask_series = session_df.session_file_date.isnull()
        if mask_series.any():
            session_file_date = fu.get_session_file_date(frvrs_logs_df, session_uuid)
            file_stats_df.loc[session_df[mask_series].index, 'session_file_date'] = session_file_date
    nu.store_objects(file_stats_df=file_stats_df)
    nu.save_data_frames(file_stats_df=file_stats_df)
print(file_stats_df.shape) # (426, 14) when reloaded from the pickle

(426, 14)


In [6]:

# Back-fill any session file dates that are still missing
mask_series = file_stats_df.session_file_date.isnull()
if mask_series.any():
    print(f'I have {mask_series.sum()} scenes in my stats data frame without file dates.')
    for session_uuid, idx_df in file_stats_df[mask_series].groupby('session_uuid'):
        
        # Pull the session's full log history and use the date of its earliest event
        session_df = frvrs_logs_df[frvrs_logs_df.session_uuid == session_uuid]
        earliest_event_time = session_df.event_time.min()
        session_file_date = earliest_event_time.date()
        file_stats_df.loc[idx_df.index, 'session_file_date'] = session_file_date
    
    # Persist the repaired frame, then recount what is still missing
    nu.store_objects(file_stats_df=file_stats_df)
    nu.save_data_frames(file_stats_df=file_stats_df)
    mask_series = file_stats_df.session_file_date.isnull()
    print(f'I now have {mask_series.sum()} scenes in my stats data frame without file dates.')

In [7]:

# Add the first and last timestamps found in each log file as its start and stop times
if ('file_start_time' not in file_stats_df.columns) or ('file_stop_time' not in file_stats_df.columns):
    sub_directory = '../data/logs'
    for file_name, idx_df in file_stats_df.groupby('file_name'):
        
        # Construct the full path to the file
        file_path = osp.join(sub_directory, file_name)
        
        # Attempt to read CSV file using pandas
        try: file_df = read_csv(file_path, header=None, index_col=False)
        
        # If unsuccessful, try using a reader; catching Exception rather than
        # everything lets a keyboard interrupt still stop the loop
        except Exception:
            rows_list = []
            
            # newline='' is what the csv module asks for so quoted line breaks survive
            with open(file_path, 'r', newline='') as f:
                reader = csv.reader(f, delimiter=',', quotechar='"')
                for values_list in reader:
                    
                    # Blank lines come back as empty lists; skip them instead of indexing into them
                    if not values_list: continue
                    
                    # Drop a trailing empty field (presumably left by a trailing comma)
                    if (values_list[-1] == ''): values_list.pop(-1)
                    rows_list.append({i: v for i, v in enumerate(values_list)})
            file_df = DataFrame(rows_list)
        
        # Column 2 is parsed as the timestamp column
        ts_series = to_datetime(file_df[2])
        file_stats_df.loc[idx_df.index, 'file_start_time'] = ts_series.min().to_pydatetime()
        file_stats_df.loc[idx_df.index, 'file_stop_time'] = ts_series.max().to_pydatetime()
    nu.store_objects(file_stats_df=file_stats_df)
    nu.save_data_frames(file_stats_df=file_stats_df)

In [8]:

# Label the military files
if 'scenario_environment' not in file_stats_df.columns:
    
    # Compile the pattern once rather than on every patient ID
    military_regex = re.compile('(Military|Intelligence|Marine)', re.IGNORECASE)
    for session_uuid, idx_df in file_stats_df.groupby('session_uuid'):

        # Get the whole session history
        mask_series = (frvrs_logs_df.session_uuid == session_uuid)
        session_df = frvrs_logs_df[mask_series]

        # One matching patient ID anywhere in the session is enough to label it
        if any(military_regex.search(x) for x in session_df.patient_id.dropna()):
            file_stats_df.loc[idx_df.index, 'scenario_environment'] = 'jungle'
    
    # If no session matched, the column was never created; add it empty so the filter below still works
    if 'scenario_environment' not in file_stats_df.columns: file_stats_df['scenario_environment'] = np.nan
    nu.store_objects(file_stats_df=file_stats_df)
    nu.save_data_frames(file_stats_df=file_stats_df)
mask_series = (file_stats_df.scenario_environment == 'jungle')
df = file_stats_df[mask_series].drop_duplicates()
print(df.shape) # (15, 13)
display(df.dropna(axis='columns', how='all'))

(15, 13)


Unnamed: 0,is_scene_aborted,logger_version,session_uuid,is_a_one_triage_file,file_name,session_file_date,scenario_environment,file_start_time,file_stop_time
413,False,1.3,0b630a14-5acd-4e11-99df-8f59d804f807,False,v.1.3/0b630a14-5acd-4e11-99df-8f59d804f807.csv,2023-09-07,jungle,2023-09-07 14:41:36,2023-09-07 14:46:47
414,False,1.3,2a5106e4-984c-42a7-9edd-2fb3e6325d10,False,v.1.3/2a5106e4-984c-42a7-9edd-2fb3e6325d10.csv,2023-09-11,jungle,2023-09-11 12:45:45,2023-09-11 12:52:55
415,False,1.3,2faaa766-35d7-4fff-910c-f7b044bb913b,False,v.1.3/2faaa766-35d7-4fff-910c-f7b044bb913b.csv,2023-09-11,jungle,2023-09-11 09:41:43,2023-09-11 09:47:28
416,False,1.3,384462fc-969c-42cd-944f-726634faba4f,False,v.1.3/384462fc-969c-42cd-944f-726634faba4f.csv,2023-09-07,jungle,2023-09-07 12:43:11,2023-09-07 12:49:45
417,False,1.3,666ce61c-2ebc-40ee-902e-6f6aa42801ad,False,v.1.3/666ce61c-2ebc-40ee-902e-6f6aa42801ad.csv,2023-09-07,jungle,2023-09-07 15:27:29,2023-09-07 15:31:15
418,False,1.3,8b979d02-dbea-4d22-864b-7031425815cb,True,v.1.3/8b979d02-dbea-4d22-864b-7031425815cb.csv,2023-09-07,jungle,2023-09-07 01:00:01,2023-09-07 12:59:54
419,False,1.3,a3d6d913-7755-4e8d-a174-d5e491c4eac7,False,v.1.3/a3d6d913-7755-4e8d-a174-d5e491c4eac7.csv,2023-09-11,jungle,2023-09-11 14:17:12,2023-09-11 14:36:36
420,False,1.3,b06475e0-5ddd-4ff8-ba82-abf5331d6c9c,False,v.1.3/b06475e0-5ddd-4ff8-ba82-abf5331d6c9c.csv,2023-09-07,jungle,2023-09-07 13:49:49,2023-09-07 13:52:19
421,False,1.3,b0f6e371-e548-4e1d-adc1-92891a6ca6ca,True,v.1.3/b0f6e371-e548-4e1d-adc1-92891a6ca6ca.csv,2023-09-07,jungle,2023-09-07 12:07:26,2023-09-07 12:13:22
422,False,1.3,b11f14c2-8de9-4247-9753-1434cb392804,False,v.1.3/b11f14c2-8de9-4247-9753-1434cb392804.csv,2023-09-11,jungle,2023-09-11 10:10:23,2023-09-11 10:23:45


In [9]:

# Flag the files that hold exactly one non-aborted triage scene
new_column_name = 'is_a_one_triage_file'
# Uncomment the next line to drop the column so that the branch below rebuilds it
# if (new_column_name in file_stats_df.columns): file_stats_df = file_stats_df.drop(columns=new_column_name)
if (new_column_name not in file_stats_df.columns):
    file_stats_df[new_column_name] = False
    
    # NOTE(review): scene_type and scene_id are not among the file_stats_df columns
    # this notebook prints later - confirm this branch can still run against this frame
    completed_triage_series = (file_stats_df.scene_type == 'Triage') & (file_stats_df.is_scene_aborted == False)
    for file_name in file_stats_df.file_name.unique():
        
        # Every row of this file, and the subset that are completed triage rows
        base_mask_series = (file_stats_df.file_name == file_name)
        mask_series = base_mask_series & completed_triage_series
        
        # Count the distinct triage scenes; exactly one makes it a one-triage file
        triage_scene_count = file_stats_df[mask_series].scene_id.nunique()
        is_a_one_triage_file = bool(triage_scene_count == 1)
        
        file_stats_df.loc[base_mask_series, new_column_name] = is_a_one_triage_file
    
    # Persist the flagged frame and report its dimensions
    nu.store_objects(file_stats_df=file_stats_df)
    nu.save_data_frames(file_stats_df=file_stats_df)
    
    print(file_stats_df.shape)
display(file_stats_df.groupby(new_column_name).size().to_frame(name='record_count'))

Unnamed: 0_level_0,record_count
is_a_one_triage_file,Unnamed: 1_level_1
False,69
True,360


In [25]:

# Remove any duplicate session IDs, keeping the first row seen for each one
mask_series = file_stats_df.duplicated(subset='session_uuid')
if mask_series.any():
    
    # Show what is about to be discarded
    display(file_stats_df[mask_series].dropna(axis='columns', how='all'))
    
    # Take an explicit copy so later in-place column writes act on a frame of
    # their own rather than on a filtered selection of the old one
    file_stats_df = file_stats_df[~mask_series].copy()
    
    # Store the results and show the new data frame shape
    nu.store_objects(file_stats_df=file_stats_df)
    nu.save_data_frames(file_stats_df=file_stats_df)
    print(file_stats_df.shape)

In [24]:

# The 385 files in the "CSV Files Renamed by Date.zip" file that OSU uploaded
# to Data and Videos (on Sep 23, 2023) make up the "registry" for now
new_column_name = 'is_in_registry'
# if (new_column_name in file_stats_df.columns): file_stats_df = file_stats_df.drop(columns=new_column_name)
if (new_column_name not in file_stats_df.columns):
    file_stats_df[new_column_name] = False
    sub_directory = '../data/temp'
    file_names_list = listdir(sub_directory)
    session_uuids_list = []
    for file_name in file_names_list:

        # Construct the full path to the file, ignoring anything that is not a regular file
        file_path = osp.join(sub_directory, file_name)
        if not osp.isfile(file_path): continue

        # Attempt to read CSV file using pandas
        try: file_df = read_csv(file_path, header=None, index_col=False)

        # If unsuccessful, try using a reader; catching Exception rather than
        # everything lets a keyboard interrupt still stop the loop
        except Exception:
            rows_list = []
            
            # newline='' is what the csv module asks for so quoted line breaks survive
            with open(file_path, 'r', newline='') as f:
                reader = csv.reader(f, delimiter=',', quotechar='"')
                for values_list in reader:
                    
                    # Blank lines come back as empty lists; skip them instead of indexing into them
                    if not values_list: continue
                    if (values_list[-1] == ''): values_list.pop(-1)
                    rows_list.append({i: v for i, v in enumerate(values_list)})
            file_df = DataFrame(rows_list)
        
        # Column 3 is read as the session UUID column; collect every value it holds
        session_uuids_list.extend(file_df[3].unique())
    
    # Flag the matching sessions once, after every file has been read
    mask_series = file_stats_df.session_uuid.isin(session_uuids_list)
    file_stats_df.loc[mask_series, new_column_name] = True
    
    # Store the results and show the new data frame shape
    nu.store_objects(file_stats_df=file_stats_df)
    nu.save_data_frames(file_stats_df=file_stats_df)
    
    print(file_stats_df.shape) # (429, 14)
display(file_stats_df.groupby(new_column_name).size().to_frame().rename(columns={0: 'record_count'}))

Unnamed: 0_level_0,record_count
is_in_registry,Unnamed: 1_level_1
False,41
True,385



## Add new encounter layouts

In [5]:

# Read the learner-type spreadsheet (the file dated 20240220 in its name) and
# replace its headers with snake_case column names, in spreadsheet order
file_path = '../data/xlsx/uuid_Master_with_Site_and_Learner_Type20240220.xlsx'
learner_types_df = read_excel(file_path)
learner_types_df.columns = (
    ['file_start_date', 'file_end_date', 'session_file_name', 'session_uuid']
    + ['responder_name', 'responder_type', 'site_name', 'encounter_layout', 'file_notes']
)

In [22]:

# Copy learner metadata from the spreadsheet into registry rows that are missing any of it

# Keep only spreadsheet rows that have a session UUID, limited to the columns both frames share
mask_series = ~learner_types_df.session_uuid.isnull()
on_columns = sorted(set(learner_types_df.columns).intersection(set(file_stats_df.columns)))
df = learner_types_df[mask_series][on_columns].drop_duplicates()
assert df.session_uuid.is_unique, "You have duplicate session UUIDs"

# Target the registry rows with a null in at least one of these columns
missing_columns_list = ['responder_type', 'site_name', 'encounter_layout']
mask_series = file_stats_df[missing_columns_list].isnull().any(axis='columns')
mask_series &= file_stats_df.is_in_registry
fill_columns_list = ['encounter_layout', 'responder_name', 'responder_type', 'site_name']
for session_uuid, idx_df in file_stats_df[mask_series].groupby('session_uuid'):
    match_df = df[df.session_uuid == session_uuid]
    
    # A session that is not in the spreadsheet has no values to copy, so leave
    # its existing values alone instead of assigning an empty selection
    if match_df.empty: continue
    
    # The assert above guarantees a single spreadsheet row per session
    for cn in fill_columns_list: file_stats_df.loc[idx_df.index, cn] = match_df[cn].iloc[0]

In [25]:

# Copy learner metadata from the spreadsheet rows in df into the rows outside
# the registry that are missing any of it

# Target the non-registry rows with a null in at least one of these columns
missing_columns_list = ['responder_type', 'site_name', 'encounter_layout']
mask_series = file_stats_df[missing_columns_list].isnull().any(axis='columns')
mask_series &= ~file_stats_df.is_in_registry
fill_columns_list = ['encounter_layout', 'responder_name', 'responder_type', 'site_name']
for session_uuid, idx_df in file_stats_df[mask_series].groupby('session_uuid'):
    match_df = df[df.session_uuid == session_uuid]
    
    # A session that is not in the spreadsheet has no values to copy, so leave
    # its existing values alone instead of assigning an empty selection
    if match_df.empty: continue
    
    # Take the values from the first matching spreadsheet row
    for cn in fill_columns_list: file_stats_df.loc[idx_df.index, cn] = match_df[cn].iloc[0]

In [26]:

# Pickle the updated frame, write its CSV copy, then report its dimensions
for persist_fn in (nu.store_objects, nu.save_data_frames): persist_fn(file_stats_df=file_stats_df)
print(file_stats_df.shape) # (426, 14)

Pickling to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/file_stats_df.pkl
Saving to /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/csv/file_stats_df.csv
(426, 14)



# Visualize

In [50]:

# Show up to seven sessions, transposed so each column is one session; the fixed
# seed makes a rerun display the same rows, and the cap keeps small frames from failing
file_stats_df.sample(min(7, file_stats_df.shape[0]), random_state=0).T

Unnamed: 0,85,66,405,184,167,75,68
is_scene_aborted,False,False,False,False,False,False,False
logger_version,1.0,1.3,1.0,1.0,1.0,1.0,1.0
session_uuid,dd46bfb4-d2df-4b1c-923f-5f3568a14f29,6cc2fd2a-88a9-4ffa-9230-cfab1f206fce,6925e2d8-3f2b-443d-8e59-7bac6684464d,983344e9-c2d0-46b5-a8ba-da6e37d04f1d,54aaf31a-22bc-46f2-a810-8564161bf8d0,63b272d4-ae7c-4cc3-8e5b-d474b09a44c2,7407aba4-aef4-4012-be6c-a237d225809d
is_a_one_triage_file,True,True,True,True,True,True,True
file_name,All CSV files renamed by date/04.20.23.1358.csv,All CSV files renamed by date/04.19.23.1353.csv,v.1.0/clean-max2.csv,DCEMS Round 2 only triage sessions/983344e9-c2...,DCEMS Round 2 only triage sessions/54aaf31a-22...,All CSV files renamed by date/04.20.23.1008.csv,All CSV files renamed by date/04.19.23.1432.csv
session_file_date,2023-04-20 13:58:00,2023-04-19 01:53:00,2023-04-21 08:58:00,2023-08-02 10:12:00,2023-08-10 00:00:00,2023-04-20 10:08:00,2023-04-19 02:32:00
responder_name,Mike K,Anastacio R,Alex M,Sam Sk,,Jason M,Matthew C
responder_type,Paramedic,Paramedic,Paramedic,Paramedic,,Paramedic,Paramedic
site_name,Madison Twp,Madison Twp,Madison Twp,DCEMS-RND 2,,Madison Twp,Madison Twp
encounter_layout,First 11,First 11,First 11,First 11,,First 11,First 11


In [53]:

# Print each column name next to its count of distinct non-null values
for cn, unique_count in file_stats_df.nunique().items(): print(cn, unique_count)

is_scene_aborted 1
logger_version 2
session_uuid 426
is_a_one_triage_file 2
file_name 426
session_file_date 285
responder_name 234
responder_type 13
site_name 8
encounter_layout 2
scenario_environment 1
file_start_time 425
file_stop_time 426
