In [1]:

# Set up the notebook
# NOTE: %pprint is a toggle, so re-running this cell flips the setting back
# (the recorded output of this run shows pretty printing was turned OFF)
%pprint
import sys
# Put ../py on the module search path (once) so that modules in that folder can be imported
if ('../py' not in sys.path): sys.path.insert(1, '../py')

Pretty printing has been turned OFF


In [2]:

from FRVRS import fu, nu
from numpy import nan, isnan
from os import listdir as listdir, makedirs as makedirs, path as osp, remove as remove, sep as sep, walk as walk
from pandas import CategoricalDtype, DataFrame, Index, NaT, Series, concat, isna, notnull, read_csv, read_excel, read_pickle, to_datetime, to_numeric
import csv
import json
import math
import numpy as np
import re
import statsmodels.api as sm
import subprocess
import sys
import warnings
# Use the most capable pickling module that is installed: dill first, then
# pickle5, then the standard library. Only a failed import triggers the
# fallback, so interrupts and unrelated errors are no longer swallowed.
try: import dill as pickle
except ImportError:
    try: import pickle5 as pickle
    except ImportError: import pickle

# Silence every warning for the rest of the notebook
warnings.filterwarnings('ignore')

# get_ipython only exists inside IPython/Jupyter; calling it anywhere else
# raises NameError, in which case display degrades to a plain print
try:
    get_ipython()
    from IPython.display import display
except NameError:
    def display(message):
        print(message)


# Write Up the Steps Needed to Calculate the ANOVA Stats Columns

In [3]:

# Load the saved ANOVA data frame through the notebook utilities; the loader
# returns a dictionary keyed by the data frame names it was asked for
anova_df_name = 'metrics_evaluation_open_world_anova_df'
data_frames_dict = nu.load_data_frames(**{anova_df_name: ''})
anova_df = data_frames_dict[anova_df_name]

Attempting to load /mnt/c/Users/DaveBabbitt/Documents/GitHub/itm-analysis-reporting/saves/pkl/metrics_evaluation_open_world_anova_df.pkl.


In [4]:

# Get column and value descriptions
file_path = osp.join(fu.data_folder, 'xlsx', 'Metrics_Evaluation_Dataset_organization_for_BBAI.xlsx')
dataset_organization_df = read_excel(file_path)

# Fix the doubled up descriptions: a Labels cell that packs several labels
# together, separated by semicolons, is split so that every label gets a row
mask_series = dataset_organization_df.Labels.map(lambda x: ';' in str(x))
extra_rows_list = []
for row_index, label in dataset_organization_df[mask_series].Labels.items():
    
    # Ignore the empty pieces left by leading, trailing, or repeated semicolons
    labels_list = [label_str for label_str in re.split(' *; *', str(label)) if label_str.strip()]
    if not labels_list:
        continue
    
    # The original row keeps the first label
    dataset_organization_df.loc[row_index, 'Labels'] = labels_list[0]
    
    # Every remaining label gets a copy of the row. The copy is a one-row
    # DataFrame (not a Series) because concat would otherwise append the
    # Series as a new column instead of a new row
    for extra_label in labels_list[1:]:
        new_row = dataset_organization_df.loc[[row_index]].copy()
        new_row['Labels'] = extra_label
        extra_rows_list.append(new_row)

# Append all of the new rows to the DataFrame in one pass
if extra_rows_list:
    dataset_organization_df = concat([dataset_organization_df, *extra_rows_list], ignore_index=True)

# Duplicate the AD_Del_Omni row(s) under the AD_Del_Omni_Text variable name
mask_series = dataset_organization_df.Variable.eq('AD_Del_Omni')
new_row = dataset_organization_df[mask_series].assign(Variable='AD_Del_Omni_Text')

# Append the renamed copy to the end of the DataFrame
dataset_organization_df = concat([dataset_organization_df, new_row], ignore_index=True)

# Map every variable that has a description to that description
mask_series = dataset_organization_df.Description.notnull()
df = dataset_organization_df[mask_series]
value_description_dict = df.set_index('Variable').Description.to_dict()

# Give each non-_Text variable a _Text twin that shares its description,
# unless the twin already has a description of its own
new_description_dict = value_description_dict.copy()
for variable_name, description in value_description_dict.items():
    if variable_name.endswith('_Text'):
        continue
    new_description_dict.setdefault(f'{variable_name}_Text', description)
value_description_dict = new_description_dict.copy()

# Create the value description function
# Rows whose label text contains an equals sign, and the variables they belong to
numeric_categories_mask_series = dataset_organization_df.Labels.map(str).str.contains('=', regex=False)
value_descriptions_columns = dataset_organization_df.loc[numeric_categories_mask_series, 'Variable'].unique().tolist()
def get_value_description(column_name, column_value):
    """
    Look up the text description of a coded value in dataset_organization_df.
    
    The labels of a variable are expected to look like "<integer code> = <description>".
    
    Parameters:
        column_name: the Variable name whose labels are searched.
        column_value: the coded value; anything float() accepts (1, 1.0, '1').
    
    Returns:
        str: the description of the first label whose code matches the value,
        or an empty string when the value is missing, is not numeric, or has
        no matching label.
    """
    value_description = ''
    if isna(column_value):
        return value_description
    
    # Text values (for example from the *_Text columns) have no numeric code
    try:
        value_code = str(int(float(column_value)))
    except (TypeError, ValueError, OverflowError):
        return value_description
    
    mask_series = (dataset_organization_df.Variable == column_name) & ~dataset_organization_df.Labels.isnull()
    for label in dataset_organization_df[mask_series].Labels:
        
        # Split on the first equals sign only so a description may contain one
        label_parts = re.split(' *= *', str(label), maxsplit=1)
        if (len(label_parts) == 2) and (label_parts[0] == value_code):
            
            # Stop at the first match so duplicated labels cannot garble the result
            value_description = label_parts[1]
            break
    
    return value_description

In [7]:

def entitle_column_name(column_name):
    """
    Turn a data frame column name into a human-readable title.
    
    A mean_<variable> column whose variable has an entry in
    value_description_dict is titled with that description, prefixed with
    "Average " when it does not already start that way. Any other name is
    split on underscores and CamelCase humps, a few abbreviations are
    spelled out, and a leading "Mean" becomes "Average".
    
    Parameters:
        column_name (str): the column name to entitle.
    
    Returns:
        str: the title.
    """
    abbreviations_dict = {'Sim': 'Simulation', 'Yrs': 'Years of', 'Mil': 'Military', 'Exp': 'Experience'}
    
    # Averaged variables with a documented description use that description
    if column_name.startswith('mean_') and (column_name[5:] in value_description_dict):
        description = value_description_dict[column_name[5:]]
        if description.startswith('Average '):
            return description
        return 'Average ' + description
    
    title_words_list = []
    for word in filter(None, column_name.split('_')):
        
        # CamelCase words are broken into humps that are entitled one by one
        if re.search('[A-Z][a-z]+', word):
            for hump in filter(None, re.split('([A-Z][a-z]+)', word)):
                if hump == hump.lower():
                    title_words_list.append(hump.title())
                else:
                    title_words_list.append(abbreviations_dict.get(hump, hump))
            continue
        
        # All-lowercase words longer than two letters are capitalized; shorter
        # ones (and uuid) are treated as acronyms unless they are to, of, or per
        if word == word.lower():
            if (len(word) > 2) and (word != 'uuid'):
                word = word.title()
            elif word not in ['to', 'of', 'per']:
                word = word.upper()
        title_words_list.append(word)
    
    if title_words_list[0] == 'Mean':
        title_words_list[0] = 'Average'
    
    return ' '.join(title_words_list)

In [12]:

import inspect

# Captures the text of every whole-line "# comment" in a piece of source code
comment_regex = re.compile('^ *# ([^\r\n]+)', re.MULTILINE)

# Columns whose calculation function does not follow the fu.get_<name> pattern
function_call_dict = {'encounter_layout': 'fu.add_encounter_layout_column', 'medical_role': 'fu.add_medical_role_column'}

# Write, for every ANOVA column, the numbered steps needed to calculate it.
# The steps after the first are the comments of the fu function that computes
# the column, so the write-up stays in sync with the code.
file_path = '../saves/txt/how_to_do_calculations.txt'
makedirs(osp.dirname(file_path), exist_ok=True)
undocumented_columns_list = []
with open(file_path, mode='w', encoding=nu.encoding_type) as f:
    
    # The file starts with a blank line
    print('', file=f)
    
    for cn in anova_df.columns:
        print('', file=f)
        print(f'{cn} ({entitle_column_name(cn)})', file=f)
        print('Steps Needed to do Calculations:', file=f)
        
        # The identifier columns are read, not calculated, so they get a single step
        if cn == 'scene_id':
            print('1. The scene_id is derived from the CSV SESSION_START and SESSION_END entries.', file=f)
            continue
        if cn in ['participant_id', 'session_uuid']:
            print('1. The participant_id and session_uuid are found in both the CSV and the JSON data.', file=f)
            continue
        
        print('1. Group your dataset by participant_id, session_uuid, and scene_id.', file=f)
        if cn in ['mean_AD_KDMA_Sim', 'mean_AD_KDMA_Text', 'mean_PropTrust', 'mean_ST_KDMA_Sim', 'mean_ST_KDMA_Text', 'mean_YrsMilExp']:
            comments_list = [
                f'Find the {cn.replace("mean_", "")} column in the participant_data_0420 spreadsheet provided by CACI for that participant',
                f'The {cn.replace("mean_", "")} value is semi-continously numeric, and you can average it for whatever grouping you need'
            ]
        else:
            
            # Work out the name of the fu function; only a leading mean_ is
            # swapped for get_ so a mean_ elsewhere in the name is left alone
            if cn in function_call_dict:
                function_name = function_call_dict[cn].split('.', 1)[-1]
            elif cn.startswith('mean_'):
                function_name = 'get_' + cn[len('mean_'):]
            else:
                function_name = None
            
            # Look the function up on fu directly rather than eval-ing a string
            # built from the column name, which could resolve to an unrelated global
            calculation_function = getattr(fu, function_name, None) if function_name else None
            try:
                source_code = inspect.getsource(calculation_function)
            except (OSError, TypeError):
                
                # No such function, or its source is unavailable: leave the
                # column with its first step only and report it after the loop
                undocumented_columns_list.append(cn)
                continue
            comments_list = [comment_str for comment_str in comment_regex.findall(source_code) if comment_str and ('verbose' not in comment_str)]
        
        for i, comment_str in enumerate(comments_list):
            print(f'{i+2}. {comment_str}.', file=f)

# Say which columns were left incomplete instead of skipping them silently
if undocumented_columns_list:
    print(f'No calculation steps could be extracted for: {undocumented_columns_list}')