In [1]:

%pprint
import os.path as osp
import sys

# Make the ../py folder importable. `osp` is imported here explicitly because the FRVRS import
# that also provides it only runs in the next cell; on a fresh kernel this line would otherwise
# raise a NameError.
py_folder = osp.join('..', 'py')
if (py_folder not in sys.path): sys.path.insert(1, py_folder)

Pretty printing has been turned OFF


In [5]:

# load libraries
from FRVRS import (fu, nu, DataFrame, to_datetime, Series, concat, np, osp, walk, read_csv, csv, sep, nan)
from datetime import date, timedelta
import os
from IPython.display import HTML
import pandas as pd


# Find the Whereabouts of the 389 Files

In [39]:

# Request the logs, file stats and scene stats data frames by name
# NOTE(review): the empty-string values presumably mean "use the default source" - confirm in nu.load_data_frames
data_frames_dict = nu.load_data_frames(frvrs_logs_df='', file_stats_df='', scene_stats_df='', verbose=False)
ldf, fdf, sdf = (
    data_frames_dict[data_frame_name]
    for data_frame_name in ('frvrs_logs_df', 'file_stats_df', 'scene_stats_df')
)

# Report which of the three data frames carries each of the triage columns
triage_columns = ['file_name', 'scene_type', 'is_scene_aborted', 'logger_version']
named_data_frames = [('frvrs_logs_df', ldf), ('file_stats_df', fdf), ('scene_stats_df', sdf)]
for cn in triage_columns:
    for (df_name, df) in named_data_frames:
        if cn not in df.columns: continue
        print(f'The {cn} column is in the {df_name} Data Frame.')

The file_name column is in the file_stats_df DataFrame.
The scene_type column is in the scene_stats_df DataFrame.
The is_scene_aborted column is in the file_stats_df DataFrame.
The is_scene_aborted column is in the scene_stats_df DataFrame.
The logger_version column is in the file_stats_df DataFrame.


In [40]:

# Remove is_scene_aborted from the file stats when it is present there,
# so only one of the data frames supplies that column
file_columns_set = set(fdf.columns)
drop_columns = [column_name for column_name in {'is_scene_aborted'} if column_name in file_columns_set]
if drop_columns: fdf = fdf.drop(columns=drop_columns)

In [41]:

# Merge the latest FRVRS logs with the file and scene stats to get the original columns.
# The patient-count filter always returns True
# NOTE(review): presumably that means no scene is excluded on patient count - confirm in fu.get_elevens_dataframe
patient_count_filter_fn = lambda scene_df: True
merge_df = fu.get_elevens_dataframe(
    ldf, fdf, sdf, needed_columns=triage_columns, patient_count_filter_fn=patient_count_filter_fn
)

# Keep the rows that have a file name, then keep whole files that pass the one-triage-file test
mask_series = merge_df.file_name.notnull()
one_triage_file_filter_fn = fu.get_is_a_one_triage_file
df = merge_df.loc[mask_series].groupby('file_name').filter(one_triage_file_filter_fn)

# Show row count, distinct session count and column count
rows_count, columns_count = df.shape
print(rows_count, df.session_uuid.nunique(), columns_count)
frvrs_session_uuids_list = sorted(df.session_uuid.unique())

257704 357 122



## Try summing up the CSV folder counts

In [52]:

logs_folder = '../data/logs'

# The number of files we are trying to account for
target_sum = 389

# Record (subfolder path relative to the logs folder, CSV count) for every folder holding at least one CSV
subfolder_counts = []
for root, _, files in walk(logs_folder):
    csv_count = len([f for f in files if f.endswith(".csv")])
    
    # Folders without CSV files are not recorded
    if csv_count <= 0: continue
    subfolder_tuple = (root.replace(logs_folder, ''), csv_count)
    subfolder_counts.append(subfolder_tuple)

There exists a combination of subfolders that sums to 389.


In [55]:

# Hard-coded (subfolder, CSV count) pairs; this assignment replaces the list computed by the folder walk
# NOTE(review): presumably copied from an earlier run of the walk - confirm the counts are still current
subfolder_counts =  [
    ('/All CSV files renamed by date', 144), ('/DCEMS Round 2 only triage sessions', 81), ('/Disaster Day 2022', 29),
    ('/Double runs removed', 28),
    ('/v.1.0', 128), ('/v.1.3', 17)
]

# Function to check if a combination of subfolders sums to the target
def check_combination(counts, target, used=None):
    """Return True if some subset of the subfolder counts adds up exactly to target.
    
    Parameters:
        counts: list of (subfolder_name, count) tuples still available to choose from;
            the pruning on a negative target assumes the counts are not negative.
        target: the remaining amount that the chosen counts must add up to.
        used: the counts chosen so far (used for the printout only); defaults to none chosen.
    
    Returns:
        True as soon as one subset reaching the target is found (the chosen counts are
        printed), False if no subset does.
    """
    
    # Avoid a shared mutable default argument
    if used is None: used = []
    
    if target == 0:
        print(used)
        return True
    if target < 0 or not counts: return False
    for i, (subfolder_name, count) in enumerate(counts):
        
        # Recurse only on the entries after this one: every subset is tried exactly once (no
        # re-ordered duplicates), and two different subfolders that happen to hold the same
        # number of files can both be part of the answer
        if check_combination(counts[i+1:], target - count, used + [count]): return True
    return False

# Report whether any subset of the subfolder counts adds up to the target
combination_exists = check_combination(subfolder_counts, target_sum)
if not combination_exists:
    print(f"No combination of subfolders sums to {target_sum}.")
else:
    print(f"There exists a combination of subfolders that sums to {target_sum}.")

No combination of subfolders sums to 389.



## Get a dataset of all file names and session UUIDs

In [23]:

# Walk through the logs, getting only file names and session_uuids
important_columns_list = ['file_name', 'session_uuid']

# Collect one small data frame per CSV file and concatenate them once at the end instead of
# re-copying the growing frame on every file; the leading empty frame keeps the expected
# columns even when no file qualifies
file_dfs_list = [DataFrame([], columns=important_columns_list)]

# Iterate over the subdirectories, directories, and files in the logs folder
logs_folder = '../data/logs'
for sub_directory, directories_list, files_list in walk(logs_folder):
    
    # Iterate over the files in the current subdirectory
    for file_name in files_list:
        
        # Only CSV files are read
        if not file_name.endswith('.csv'): continue
        
        # Construct the full path to the file
        file_path = osp.join(sub_directory, file_name)
        
        # Attempt to read CSV file using pandas
        try: file_df = read_csv(file_path, header=None, index_col=False)
        
        # If unsuccessful, try using a reader. Catch Exception rather than everything so that
        # a keyboard interrupt still stops the walk
        except Exception:
            rows_list = []
            with open(file_path, 'r') as f:
                reader = csv.reader(f, delimiter=',', quotechar='"')
                for values_list in reader:
                    
                    # Drop an empty last field; a blank line yields an empty list, which has no last field
                    if values_list and (values_list[-1] == ''): values_list.pop(-1)
                    rows_list.append({i: v for i, v in enumerate(values_list)})
            file_df = DataFrame(rows_list)
        
        # Ignore small files (fewer than 16 columns)
        if (file_df.shape[1] < 16): continue
        
        # Add file name to the data frame, as a slash-separated path relative to the logs folder
        file_dir_suffix = osp.abspath(sub_directory).replace(osp.abspath(logs_folder) + sep, '')
        file_df['file_name'] = '/'.join(file_dir_suffix.split(sep)) + '/' + file_name
        
        # Name the global columns (the first four columns of every row)
        columns_list = ['action_type', 'action_tick', 'event_time', 'session_uuid']
        file_df.columns = columns_list + file_df.columns.tolist()[len(columns_list):]
        
        # Remove all but the file name and session columns
        file_df = file_df[important_columns_list].drop_duplicates()
        
        # Keep the data frame for the current file
        file_dfs_list.append(file_df)

# Combine the per-file data frames into the main data frame
logs_df = concat(file_dfs_list, axis='index')

# reset_index() keeps the previous row labels as an 'index' column, hence the three columns reported below
logs_df = logs_df.reset_index()
print(logs_df.shape[0], logs_df.session_uuid.nunique(), logs_df.shape[1])

867 518 3


In [13]:

# Load the first responder master registry (the triage paper data frame) and get its session UUIDs
data_frames_dict = nu.load_data_frames(first_responder_master_registry_df='', verbose=False)
triage_paper_df = data_frames_dict['first_responder_master_registry_df']

# Show row count, distinct session count and column count
rows_count, columns_count = triage_paper_df.shape
print(rows_count, triage_paper_df.session_uuid.nunique(), columns_count)

# Sort the distinct session UUIDs of the rows that have one
mask_series = triage_paper_df.session_uuid.notnull()
registry_session_uuids_list = sorted(triage_paper_df.loc[mask_series, 'session_uuid'].unique())

401177 332 111


In [31]:

# Tag each log file with its top-level subfolder (the part of file_name before the first slash)
logs_df['sub_folder'] = logs_df.file_name.map(lambda file_name_str: file_name_str.split('/')[0])

# Subfolders holding the subsequent file additions, which are left out of the comparison
added_sub_folders_list = [
    'Disaster Day 3.6.2024 ITM Files 405E', 'Disaster day 3.6.2024 ITM files 405F', 'Metrics Evaluation Open World'
]

# Get the session UUIDs not in the registry or subsequent file additions
is_outside_registry = ~logs_df.session_uuid.isin(registry_session_uuids_list)
is_outside_additions = ~logs_df.sub_folder.isin(added_sub_folders_list)
has_session_uuid = logs_df.session_uuid.notnull()
mask_series = is_outside_registry & is_outside_additions & has_session_uuid
df = logs_df[mask_series]
extra_session_uuids_list = sorted(df.session_uuid.unique())
extra_session_uuids_count = len(extra_session_uuids_list)
extra_session_uuids_count

94

In [27]:

# Do the extra sessions add up?
# Registry session count plus the extra sessions found in the log files, to compare with the 389 files sought
len(registry_session_uuids_list) + extra_session_uuids_count

426

In [28]:

# How many session UUIDs are missing from the registry?
# NOTE(review): 389 repeats the value assigned to target_sum in the folder-count cell - consider reusing that name
389 - len(registry_session_uuids_list)

57

In [None]:

# Restrict the merged logs to the sessions that are in the log files but not in the registry;
# the anomaly checks below all work on this frvrs_logs_df
mask_series = merge_df.session_uuid.isin(extra_session_uuids_list)
frvrs_logs_df = merge_df.loc[mask_series]


# Identify any Anomalous Files

In [30]:

# Start with an empty report: the set collects the flagged file names,
# the string collects the printed explanation of why each was flagged
anomalous_files_set, anomalous_files_str = set(), ''

In [43]:

# If all the patients in a file are all named Mike, that is a training simulation
files_list = []

# Group the Data Frame by 'file_name'
for file_name, file_name_df in frvrs_logs_df.groupby('file_name'):

    # Get a list of the unique patient IDs in the file
    patient_ids = file_name_df.patient_id.unique().tolist()

    # Add the file if every patient ID contains "mike" (case-insensitive)
    if all('mike' in str(patient_id_value).lower() for patient_id_value in patient_ids):
        files_list.append(file_name)

# If there are files with all patients named "Mike," print and update the results
if files_list:

    # List one file per line: commas between the names and ", and" before the last one.
    # The names are joined directly rather than rewriting " and " inside an already-built
    # sentence, which doubles the comma (",, and") and leaves the other names on one line
    # as soon as there are three or more files
    *leading_files_list, last_file = files_list
    if leading_files_list: files_str = ',\n\t'.join(leading_files_list) + ', and\n\t' + last_file
    else: files_str = last_file
    print_str = f'\n\nThese files have patients that are all named "Mike":\n\t{files_str}'
    
    # Print the results
    print(print_str)

    # Update the anomalous files string
    anomalous_files_str += print_str

    # Update the anomalous files set
    anomalous_files_set.update(files_list)

In [44]:

# Count the number of user actions
def add_scene_columns_to_row(scene_df, row_dict):
    """Store fu.get_logger_version(scene_df) and fu.get_total_actions_count(scene_df) in row_dict and return it."""
    row_dict['logger_version'] = fu.get_logger_version(scene_df)
    row_dict['total_actions'] = fu.get_total_actions_count(scene_df)
    
    return row_dict
rows_list = []
for (session_uuid, scene_id), scene_df in frvrs_logs_df.groupby(fu.scene_groupby_columns):
    
    # Pair the group-by column names with the group key directly instead of eval()-ing the names
    row_dict = dict(zip(fu.scene_groupby_columns, (session_uuid, scene_id)))
    row_dict = add_scene_columns_to_row(scene_df, row_dict)
    rows_list.append(row_dict)

# Create a data frame from the list of rows
total_actions_df = DataFrame(rows_list)

# Group by session UUID and sum total actions
df = total_actions_df.groupby('session_uuid').sum()

# Create a mask for sessions with zero total actions
mask_series = (df.total_actions == 0)

# Get a list of session UUIDs with no user actions
session_uuids_list = df[mask_series].index.tolist()

# Create a mask to filter for UUIDs with no user actions
mask_series = frvrs_logs_df.session_uuid.isin(session_uuids_list)

# Get unique file names associated with sessions with zero total actions
files_list = frvrs_logs_df[mask_series].file_name.unique().tolist()

# Check if there are files with zero actions
if files_list:
    
    # List one file per line: commas between the names and ", and" before the last one.
    # The names are joined directly rather than rewriting " and " inside an already-built
    # sentence, which doubles the comma (",, and") once there are three or more files
    *leading_files_list, last_file = files_list
    if leading_files_list: files_str = ',\n\t'.join(leading_files_list) + ', and\n\t' + last_file
    else: files_str = last_file
    print_str = f'\n\nThese files have no user action taken:\n\t{files_str}'

    # Print and update the anomalous files string and set
    print(print_str); anomalous_files_str += print_str
    anomalous_files_set.update(files_list)



These files have no user action taken:
	DCEMS Round 2 only triage sessions/7c2549d4-97a4-4389-bd03-029396714f59.csv, and
	v.1.0/Clean e78faf41-7bbd-410b-8750-e4e72b951216.csv


In [45]:

# Total Number of Teleports
def add_scene_columns_to_row(scene_df, row_dict):
    """Store fu.get_logger_version(scene_df) and fu.get_teleport_count(scene_df) in row_dict and return it."""
    row_dict['logger_version'] = fu.get_logger_version(scene_df)
    row_dict['teleport_count'] = fu.get_teleport_count(scene_df)
    
    return row_dict
rows_list = []
for (session_uuid, scene_id), scene_df in frvrs_logs_df.groupby(fu.scene_groupby_columns):
    
    # Pair the group-by column names with the group key directly instead of eval()-ing the names
    row_dict = dict(zip(fu.scene_groupby_columns, (session_uuid, scene_id)))
    row_dict = add_scene_columns_to_row(scene_df, row_dict)
    rows_list.append(row_dict)

# Create a data frame from the list of dictionaries
teleport_count_df = DataFrame(rows_list)

# Group the data frame by UUID and sum the teleport counts
df = teleport_count_df.groupby('session_uuid').sum()

# Create a mask to filter sessions with teleport count equal to 0
mask_series = (df.teleport_count == 0)

# Get a list of session UUIDs with no teleportation
session_uuids_list = df[mask_series].index.tolist()

# Create a mask to filter files associated with the identified session UUIDs
mask_series = frvrs_logs_df.session_uuid.isin(session_uuids_list)

# Get the file names with no teleportation being done
files_list = frvrs_logs_df[mask_series].file_name.unique().tolist()

# Check if there are files with no teleportation
if files_list:

    # Create a printable string of the file names
    print_str = f'\n\nThese files have no teleportation being done:\n\t{nu.conjunctify_nouns(files_list)}'
    
    # Put each file on its own line: break after every comma, then move the break from before "and" to after it
    print_str = print_str.replace(', ', ',\n\t').replace(',\n\tand ', ', and\n\t')
    
    # Print the list of files and update the anomalous_files_str and set
    print(print_str); anomalous_files_str += print_str
    anomalous_files_set.update(files_list)



These files have no teleportation being done:
	DCEMS Round 2 only triage sessions/1066671d-2a1d-4744-b66f-e4b48548701f.csv,
	DCEMS Round 2 only triage sessions/54aaf31a-22bc-46f2-a810-8564161bf8d0.csv,
	DCEMS Round 2 only triage sessions/7c2549d4-97a4-4389-bd03-029396714f59.csv,
	v.1.0/Clean 2310f107-d9d2-418e-a2d7-dd7a17924544.csv,
	v.1.0/Clean c6a48228-d864-4b20-93dd-8ad0d78d59c0.csv, and
	v.1.0/Clean e78faf41-7bbd-410b-8750-e4e72b951216.csv


In [46]:

def _format_file_names(file_names_list):
    """Return the (non-empty) list of file names one per line, tab-indented, with ', and' before the last name."""
    *leading_names_list, last_name = file_names_list
    if not leading_names_list: return last_name
    
    return ',\n\t'.join(leading_names_list) + ', and\n\t' + last_name

# Create a mask to filter rows where action_type is PULSE_TAKEN or INJURY_TREATED
mask_series = frvrs_logs_df.action_type.isin(['PULSE_TAKEN', 'INJURY_TREATED'])

# Initialize the lists of rows and of their index tuples
rows_list = []
indices_list = []

# Iterate over groups based on the specified columns
for (session_uuid, scene_id, patient_id), patient_df in frvrs_logs_df[mask_series].groupby(fu.patient_groupby_columns):

    # Create a tuple to store the index of the current row
    index_tuple = (session_uuid, scene_id, patient_id)
    indices_list.append(index_tuple)

    # Create a dictionary to store the results for the current row
    row_dict = {}

    # Add the logger_version for the current row
    row_dict['logger_version'] = fu.get_logger_version(patient_df)
    
    # Count the number of 'PULSE_TAKEN' actions in this group
    row_dict['pulses_count'] = fu.get_pulse_taken_count(patient_df)
    
    # Count the number of 'INJURY_TREATED' actions in this group
    row_dict['treated_count'] = fu.get_injury_treatments_count(patient_df)
    
    # Try to calculate the number of pulses taken per injury treated
    try: row_dict['pulses_by_treated'] = row_dict['pulses_count'] / row_dict['treated_count']

    # Handle the case where 'treated_count' is zero to avoid division by zero
    except ZeroDivisionError: row_dict['pulses_by_treated'] = nan
    
    # Add the row_dict to the rows_list
    rows_list.append(row_dict)

# Create a data frame from the rows and set a multi index based on the grouped columns
pulses_count_df = DataFrame(rows_list, index=pd.MultiIndex.from_tuples(tuples=indices_list, names=fu.patient_groupby_columns))

# Group the pulses count data frame by session_uuid and sum the values
df = pulses_count_df.groupby('session_uuid').sum()

# Check for sessions with 'treated_count' equal to zero
mask_series = (df.treated_count == 0)
session_uuids_list = df[mask_series].index.tolist()

# Filter logs for sessions with no 'INJURY_TREATED' actions
mask_series = frvrs_logs_df.session_uuid.isin(session_uuids_list)
files_list = frvrs_logs_df[mask_series].file_name.unique().tolist()

# Print files with no injury treatment being done. The names are joined directly: rewriting
# " and " inside an already-built sentence printed ",, and" and left the other names on one line
if files_list:
    print_str = f'\n\nThese files have no injury treatment being done:\n\t{_format_file_names(files_list)}'
    print(print_str); anomalous_files_str += print_str
    anomalous_files_set.update(files_list)

# Check for sessions with 'pulses_count' equal to zero
mask_series = (df.pulses_count == 0)
session_uuids_list = df[mask_series].index.tolist()

# Filter logs for sessions with no 'PULSE_TAKEN' actions
mask_series = frvrs_logs_df.session_uuid.isin(session_uuids_list)
files_list = frvrs_logs_df[mask_series].file_name.unique().tolist()

# Print files with no pulses being taken, in the same one-file-per-line layout
if files_list:
    print_str = f'\n\nThese files have no pulses being taken:\n\t{_format_file_names(files_list)}'
    print(print_str); anomalous_files_str += print_str
    anomalous_files_set.update(files_list)



These files have no injury treatment being done:
	All CSV files renamed by date/03.14.23.0919.csv, All CSV files renamed by date/11.30.20.0828.csv, DCEMS Round 2 only triage sessions/1066671d-2a1d-4744-b66f-e4b48548701f.csv, DCEMS Round 2 only triage sessions/54aaf31a-22bc-46f2-a810-8564161bf8d0.csv,, and
	v.1.0/Clean c6a48228-d864-4b20-93dd-8ad0d78d59c0.csv


These files have no pulses being taken:
	All CSV files renamed by date/03.14.23.1219.csv,
	All CSV files renamed by date/03.15.23.0944.csv,
	v.1.0/Clean 5fa79a8e-a2df-4bb9-b614-f3ce36a5edb0.csv, and
	v.1.0/Clean a6b7ff70-3b20-48c6-86e8-744bad19f7d7.csv


In [47]:

# Patient accuracy rate (how many patients correct / number of patients treated)

# Initialize an empty list to store dictionaries
rows_list = []

# Iterate over the scenes
for (session_uuid, scene_id), scene_df in frvrs_logs_df.groupby(fu.scene_groupby_columns):

    # Get the logger version for the group
    logger_version = fu.get_logger_version(scene_df)
    
    # Create a dictionary to store the results for the group; the group-by column names are
    # paired with the group key directly instead of being looked up with eval()
    row_dict = dict(zip(fu.scene_groupby_columns, (session_uuid, scene_id)))
    row_dict['logger_version'] = logger_version

    # Get the total number of patients treated
    total_treated = fu.get_injury_treatments_count(scene_df)

    # If there were any patients treated, calculate the patient accuracy rate
    if total_treated:
        
        # Get the number of injuries that were treated correctly
        correctly_treated = fu.get_injury_correctly_treated_count(scene_df)

        # Calculate the patient accuracy rate
        row_dict['injury_treated_total_treated'] = total_treated
        row_dict['injury_treated_correctly_treated'] = correctly_treated
        row_dict['injury_treated_patient_accuracy_rate'] = correctly_treated / total_treated
    
    # Filter the data frame to only include rows where the injury record was treated.
    total_mask = (scene_df.injury_record_injury_treated == True)
    df2 = scene_df[total_mask]

    # Get the total number of patients treated
    total_treated = df2.shape[0]

    # If there were any patients treated, calculate the patient accuracy rate
    if total_treated:
        
        # Filter the dataframe to only include rows where the injury record was treated correctly
        correct_mask = (df2.injury_record_injury_treated_with_wrong_treatment == False)
        correctly_treated = df2[correct_mask].shape[0]

        # Calculate the patient accuracy rate
        row_dict['injury_record_total_treated'] = total_treated
        row_dict['injury_record_correctly_treated'] = correctly_treated
        row_dict['injury_record_patient_accuracy_rate'] = correctly_treated / total_treated

    # Add the row dictionary to the list of results
    rows_list.append(row_dict)

# Create a data frame from the list of dictionaries
patient_accuracy_rate_df = DataFrame(rows_list)

# Modalize into one patient accuracy rate column if possible
if 'injury_record_patient_accuracy_rate' in patient_accuracy_rate_df.columns:

    # Get a list of columns that contain the patient accuracy rate
    columns_list = [
        'injury_treated_patient_accuracy_rate', 'injury_record_patient_accuracy_rate'
    ]
    
    # Mark the rows that have exactly one distinct non-null value across the two rate columns
    mask_series = (patient_accuracy_rate_df[columns_list].apply(Series.nunique, axis='columns') == 1)
    
    # Rows where the two columns disagree, or where both are null, get no combined rate
    patient_accuracy_rate_df.loc[~mask_series, 'patient_accuracy_rate'] = nan
    
    # Define a function to select the first valid value
    def f(srs):
        """Return the first non-null value of the row."""
        cn = srs.first_valid_index()
        
        return srs[cn]
    
    # Modalize the patient accuracy rate columns to get a single column
    patient_accuracy_rate = patient_accuracy_rate_df[mask_series][columns_list].apply(f, axis='columns')
    
    # Set the patient accuracy rate column for the rows identified by the mask series to the single patient accuracy rate column
    patient_accuracy_rate_df.loc[mask_series, 'patient_accuracy_rate'] = patient_accuracy_rate
    
    # Group by 'session_uuid' and sum the patient accuracy rates
    df = patient_accuracy_rate_df.groupby('session_uuid').sum()
    mask_series = (df.patient_accuracy_rate == 0)
else:
    
    # Group by 'session_uuid' and sum the patient accuracy rates for 'injury_treated' data
    df = patient_accuracy_rate_df.groupby('session_uuid').sum()
    mask_series = (df.injury_treated_patient_accuracy_rate == 0)

# Get a list of session_uuids with no patient accuracy rate above zero
session_uuids_list = df[mask_series].index.tolist()

# Create a mask to filter rows in 'frvrs_logs_df' based on session_uuids
mask_series = frvrs_logs_df.session_uuid.isin(session_uuids_list)

# Get a list of unique file names from the filtered rows
files_list = frvrs_logs_df[mask_series].file_name.unique().tolist()

# Print the list of files with no patient accuracy rate above zero
if files_list:
    print_str = f'\n\nThese files have no patient accuracy rate above zero:\n\t{nu.conjunctify_nouns(files_list)}'
    print_str = print_str.replace(', ', ',\n\t').replace(',\n\tand ', ', and\n\t')
    print(print_str); anomalous_files_str += print_str
    anomalous_files_set.update(files_list)



These files have no patient accuracy rate above zero:
	All CSV files renamed by date/03.14.23.0919.csv,
	All CSV files renamed by date/11.30.20.0828.csv,
	All CSV files renamed by date/12.06.22.1331.csv,
	DCEMS Round 2 only triage sessions/1066671d-2a1d-4744-b66f-e4b48548701f.csv,
	DCEMS Round 2 only triage sessions/54aaf31a-22bc-46f2-a810-8564161bf8d0.csv,
	DCEMS Round 2 only triage sessions/7c2549d4-97a4-4389-bd03-029396714f59.csv,
	DCEMS Round 2 only triage sessions/8ec8afba-8533-4915-898f-5769c1258c61.csv,
	DCEMS Round 2 only triage sessions/91f31664-43ad-4405-a763-0d58a8afc36a.csv,
	v.1.0/Clean 158e6365-673b-4030-8b36-6704be5996a2.csv,
	v.1.0/Clean 2310f107-d9d2-418e-a2d7-dd7a17924544.csv,
	v.1.0/Clean 845d87c5-7b8b-4bf3-bfc6-91c74e285243.csv,
	v.1.0/Clean c6a48228-d864-4b20-93dd-8ad0d78d59c0.csv,
	v.1.0/Clean d90f2d85-e91c-4f12-b070-5929d95be1c5.csv,
	v.1.0/Clean db948ce1-783d-4dff-a1f8-2be49570f327.csv, and
	v.1.0/Clean e78faf41-7bbd-410b-8750-e4e72b951216.csv


In [48]:

# Number of patients engaged

# Initialize an empty list to store rows
rows_list = []

# Iterate over the sessions, grouped by scene
for (session_uuid, scene_id), scene_df in frvrs_logs_df.groupby(fu.scene_groupby_columns):

    # Get the logger version
    logger_version = fu.get_logger_version(scene_df)

    # Get the number of patients in the scene
    patients_count = fu.get_patient_count(scene_df)

    # Scenes without patients contribute no row
    if not patients_count: continue

    # Count the distinct patients that have at least one 'PATIENT_ENGAGED' row in this scene
    # (rows without a patient ID are not counted, as with a group-by on patient_id)
    mask_series = (scene_df.action_type == 'PATIENT_ENGAGED')
    patients_engaged = scene_df[mask_series].patient_id.nunique()

    # Create a dictionary to store row data, starting with the session UUID and the scene; the
    # group-by column names are paired with the group key directly instead of using eval()
    row_dict = dict(zip(fu.scene_groupby_columns, (session_uuid, scene_id)))

    # Add the logger version to the dictionary
    row_dict['logger_version'] = logger_version

    # Add the number of patients to the dictionary
    row_dict['patients_count'] = patients_count

    # Add the number of patients engaged to the dictionary
    row_dict['patients_engaged'] = patients_engaged

    # Calculate the percentage of patients engaged
    row_dict['percentage_engaged'] = patients_engaged / patients_count

    # Add the row dictionary to the list of rows
    rows_list.append(row_dict)

# Create a data frame from the list of rows
percentage_engaged_df = DataFrame(rows_list)

# Group the Data Frame by UUID and sum the number of patients engaged
df = percentage_engaged_df.groupby('session_uuid').sum()

# Create a mask to identify UUIDs with no engaged patients
mask_series = (df.patients_engaged == 0)

# List the UUIDs that meet the condition
session_uuids_list = df[mask_series].index.tolist()

# Create a mask to filter rows in the main data frame based on UUID
mask_series = frvrs_logs_df.session_uuid.isin(session_uuids_list)

# Get unique file_names that match the UUIDs
files_list = frvrs_logs_df[mask_series].file_name.unique().tolist()

# Check if there are any files with no patients engaged
if files_list:

    # Create a formatted string to list files with no engaged patients
    print_str = f'\n\nThese files have no patients being engaged:\n\t{nu.conjunctify_nouns(files_list)}'
    
    # Format the string to have line breaks after commas
    print_str = print_str.replace(', ', ',\n\t').replace(',\n\tand ', ', and\n\t')

    # Print the results and add the results to the anomalous_files_str string
    print(print_str); anomalous_files_str += print_str

    # Add the files to the anomalous_files_set set
    anomalous_files_set.update(files_list)



These files have no patients being engaged:
	DCEMS Round 2 only triage sessions/1066671d-2a1d-4744-b66f-e4b48548701f.csv,
	DCEMS Round 2 only triage sessions/54aaf31a-22bc-46f2-a810-8564161bf8d0.csv,
	DCEMS Round 2 only triage sessions/7c2549d4-97a4-4389-bd03-029396714f59.csv,
	v.1.0/Clean 158e6365-673b-4030-8b36-6704be5996a2.csv,
	v.1.0/Clean 2310f107-d9d2-418e-a2d7-dd7a17924544.csv,
	v.1.0/Clean c6a48228-d864-4b20-93dd-8ad0d78d59c0.csv,
	v.1.0/Clean d90f2d85-e91c-4f12-b070-5929d95be1c5.csv, and
	v.1.0/Clean e78faf41-7bbd-410b-8750-e4e72b951216.csv


In [49]:

# Print the report: a greeting, the sorted flagged file names one per line,
# then the explanations accumulated by the checks above
report_parts_list = [
    "\nDoug,\n\nHere is the set of anomalous files I'm concerned about:",
    '\t' + '\n\t'.join(sorted(anomalous_files_set)),
    anomalous_files_str,
]
for report_part in report_parts_list: print(report_part)


Doug,

Here is the set of anomalous files I'm concerned about:
	All CSV files renamed by date/03.14.23.0919.csv
	All CSV files renamed by date/03.14.23.1219.csv
	All CSV files renamed by date/03.15.23.0944.csv
	All CSV files renamed by date/11.30.20.0828.csv
	All CSV files renamed by date/12.06.22.1331.csv
	DCEMS Round 2 only triage sessions/1066671d-2a1d-4744-b66f-e4b48548701f.csv
	DCEMS Round 2 only triage sessions/54aaf31a-22bc-46f2-a810-8564161bf8d0.csv
	DCEMS Round 2 only triage sessions/7c2549d4-97a4-4389-bd03-029396714f59.csv
	DCEMS Round 2 only triage sessions/8ec8afba-8533-4915-898f-5769c1258c61.csv
	DCEMS Round 2 only triage sessions/91f31664-43ad-4405-a763-0d58a8afc36a.csv
	v.1.0/Clean 158e6365-673b-4030-8b36-6704be5996a2.csv
	v.1.0/Clean 2310f107-d9d2-418e-a2d7-dd7a17924544.csv
	v.1.0/Clean 5fa79a8e-a2df-4bb9-b614-f3ce36a5edb0.csv
	v.1.0/Clean 845d87c5-7b8b-4bf3-bfc6-91c74e285243.csv
	v.1.0/Clean a6b7ff70-3b20-48c6-86e8-744bad19f7d7.csv
	v.1.0/Clean c6a48228-d864-4b20-93dd

In [28]:

# How many session UUIDs are missing from the registry?
# Shown again here to compare with the anomalous file count in the next cell
# NOTE(review): 389 repeats the value assigned to target_sum in the folder-count cell - consider reusing that name
389 - len(registry_session_uuids_list)

57

In [50]:

# Number of distinct files flagged by at least one of the anomaly checks
len(anomalous_files_set)

19