In [1]:
from unicodedata import numeric

"""
lib installation instructions:

conda create -n faiss_env python=3.9
conda activate faiss_env
conda install -c conda-forge faiss
pip install pandas==1.2.5
pip install numpy==1.25.1
pip install transformers
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install pycap==1.1.2
pip install scipy==1.13.1
pip install scikit-learn==1.5.2
pip install accelerate>=0.26.0
"""

'\nlib installation instructions:\n\nconda create -n faiss_env python=3.9\nconda activate faiss_env\nconda install -c conda-forge faiss\npip install pandas==1.2.5\npip install numpy==1.25.1\npip install transformers\npip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\npip install pycap==1.1.2\npip install scipy==1.13.1\npip install scikit-learn==1.5.2\npip install accelerate>=0.26.0\n'

In [2]:
import pickle
import os

from data_gen import *

In [3]:
"""Getting all CMR data from REDCap"""
rc_label = False
meta_data, demo, pt_hx, cath, cmr, echo, ex_str_tst, pt_stat_upd, cl_pt_level, cl_cmr_level = fetch_data(rc_label)

  dataframe = read_csv(buf, **df_kwargs)
  dat = dat.replace("NAVU", np.nan)
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  dat = dat.replace("NAVU", np.nan)
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, 

In [5]:
"""
Data cleaning
"""
# adding 0 for repeat instance for demo, pt_hx and pt_status_upd
demo['redcap_repeat_instance'] = 0
pt_hx['redcap_repeat_instance'] = 0
pt_stat_upd['redcap_repeat_instance'] = 0

# removing text data in numeric columns
pt_hx['surg_cardiac_total'] = pt_hx['surg_cardiac_total'].apply(lambda x: np.nan if isinstance(x, str) else x)
pt_stat_upd['hx_ecmo_cannulations'] = pt_stat_upd['hx_ecmo_cannulations'].apply(lambda x: np.nan if isinstance(x, str) else x)

# only retaining earliest/latest CMRs
earliest_latest_indicator = 'first' # 'first', 'last'
cmr['cmr_date'] = pd.to_datetime(cmr['cmr_date'])
cmr = cmr.sort_values(by=['force_id', 'cmr_date']).drop_duplicates(subset='force_id', keep=earliest_latest_indicator).reset_index(drop=True)
cmr['redcap_repeat_instance'] = 1

In [6]:
"""only using pts passed QA"""
not_passed_qa = [fid for fid, ready_pmt_date in zip(cl_cmr_level.force_id, cl_cmr_level.cl_payment_ready_date) if ready_pmt_date is np.nan]
passed_qa = [fid for fid in cl_cmr_level.force_id if fid not in not_passed_qa]

dfs = [demo, cmr, cath, pt_stat_upd, pt_hx, echo, ex_str_tst]
new_dfs = []
for df in dfs:
    # getting rid of mean/stdev cols
    std_cols = [col for col in df.columns if 'stdev' in col]
    mean_cols = [col.replace('_stdev', '_mean') for col in std_cols]
    df = df[[col for col in df.columns if col not in std_cols and col not in mean_cols]]
    new_dfs.append(df[df['force_id'].isin(passed_qa)])
    
demo = new_dfs[0]
cmr = new_dfs[1]
cath = new_dfs[2]
pt_stat_upd = new_dfs[3]
pt_hx = new_dfs[4]
echo = new_dfs[5]
ex_str_tst = new_dfs[6]


In [7]:
"""
looking for text vs numeric fields
"""
calc_map = {'cmr_sv_ef': "Single ventricle ejection fraction",
            "cmr_sv_sv": "Single ventricle systolic volume",
            "cmr_bmi": "Body mass index",
            "cmr_mass_height27": "Mass to height volume indexed on BSA (exp = 2.7)",
            'cmr_sv_esv': 'Single ventricle end systolic volume',
            'cmr_sv_esv_bsa1':'Single ventricle end systolic volume indexed on BSA (exp = 1)',
            'cmr_sv_esv_bsa13':'Single ventricle end systolic volume indexed on BSA (exp = 1.3)' ,
            'cmr_sv_edv': 'Single ventricle end diastolic volume',
            'cmr_sv_edv_bsa1':'Single ventricle end diastolic volume indexed on BSA (exp = 1)',
            'cmr_sv_edv_bsa13':'Single ventricle end diastolic volume indexed on BSA (exp = 1.3)' ,
            'cmr_sv_mass': 'Single ventricle mass',
            'cmr_sv_mass_bsa1':'Single ventricle mass indexed on BSA (exp = 1)',
            'cmr_sv_mass_bsa13': 'Single ventricle mass indexed on BSA (exp = 1.3)',
            'cmr_sv_mvr': 'Single ventricle mass to volume ratio',
            'cmr_sv_mvr_bsa1':'Single ventricle mass to volume ratio indexed on BSA (exp = 1)',
            'cmr_sv_mvr_bsa13':'Single ventricle mass to volume ratio indexed on BSA (exp = 1.3)'
           }


text_fields = meta_data[meta_data['field_type'] == 'text']
df_dict_map = {'demographic_data': demo, 'pt_clinical_surgical_history': pt_hx, 'catheterization': cath, 'cmr': cmr, 'echocardiogram': echo, 'exercise_stress_test': ex_str_tst, 'pt_status_event_update': pt_stat_upd}

text_cols = []
numeric_cols = []

field_names = meta_data.field_name.values
form_names = meta_data.form_name.values
field_types = meta_data.field_type.values
for field_name, form_name, field_type in zip(field_names, form_names, field_types):
    if field_type == 'text' and form_name in df_dict_map.keys():
        df_curr = df_dict_map[form_name]
        if field_name in df_curr.columns and df_curr.dropna(subset=[field_name]).shape[0] > 0:            
            series_curr = df_curr[field_name]
            try:
                series_curr.astype(float)
                numeric_cols.append(field_name)
            except:
                text_cols.append(field_name)

numeric_cols = numeric_cols + list(calc_map.keys())
year_cols = [col for col in numeric_cols if 'year' in col.lower()]
numeric_cols = [col for col in numeric_cols if col not in year_cols]

In [8]:
"""logic for coercing check box variables"""
chk_box_df = meta_data[meta_data['field_type'].isin(['checkbox'])]

chk_box_df_names = chk_box_df['form_name'].values
chk_box_cols = chk_box_df['field_name'].values

chk_cols = []
fail_cases = []
for col, dfn in zip(chk_box_cols, chk_box_df_names):
    if dfn in df_dict_map.keys():
        df_curr = df_dict_map[dfn]
        val_list=str(meta_data[meta_data['field_name']==col]['select_choices_or_calculations'].values)[2:].split('|')
        val_dic={",".join((v.split(',')[1:]))[1:-1].replace("'", ""): int(v.split(',')[0].replace(" ", "")) for v in val_list}
        for idx in val_dic.values():
            chk_col = col + '___' + str(idx)
            try:
                df_curr[chk_col]
                chk_cols.append(chk_col)
            except:
                fail_cases.append(chk_col)    

In [9]:
"""Getting rest of text/numeric fields"""
categorical_fields = meta_data[meta_data['field_type'].isin(['dropdown', 'radio', 'yesno'])]
categorical_fields = categorical_fields[categorical_fields['form_name'].isin(df_dict_map.keys())][
    'field_name'].values
categorical_fields = list(categorical_fields)
categorical_fields = categorical_fields + chk_cols
categorical_fields = [col for col in categorical_fields if col != 'site_name']

text_add_fields = meta_data[meta_data['field_type'].isin(['notes'])]
text_add_fields = text_add_fields[text_add_fields['form_name'].isin(df_dict_map.keys())]['field_name'].values
text_cols = text_cols + list(text_add_fields) + year_cols
text_cols = [col for col in text_cols if 'recorded' not in col]

In [10]:
"""
Pre-processing for data based on data type (numeric: standardize, text: create embeddings)
key references: 
df_dict_map: reference df based on form_name in metadata dictionary
text_cols: text based columns (reference col in metadata dictionary using field_name)
numeric_cols: numeric based freetext columns (reference col in metadata dictionary using field_name)
numeric_add_fields: categorical data (checkbox, yesno, dropdown, radio)` 
"""

'\nPre-processing for data based on data type (numeric: standardize, text: create embeddings)\nkey references: \ndf_dict_map: reference df based on form_name in metadata dictionary\ntext_cols: text based columns (reference col in metadata dictionary using field_name)\nnumeric_cols: numeric based freetext columns (reference col in metadata dictionary using field_name)\nnumeric_add_fields: categorical data (checkbox, yesno, dropdown, radio)` \n'

In [11]:
"""1. Normalizing numeric_cols"""
def noncategory_data_agg(df_dict_map, data_cols, meta_data, calc_map):
    # Dictionary to hold normalized data
    data_dict = {}

    # Normalizing numeric columns
    for col in data_cols:
        if col in calc_map.keys():
            form_nm_curr = 'cmr'
            df_curr = df_dict_map[form_nm_curr]
        else:
            form_nm_curr = meta_data[meta_data['field_name'] == col]['form_name'].values[0]
            df_curr = df_dict_map[form_nm_curr]

        df_curr[col] = df_curr[[col]]  #standard_scaler.fit_transform(df_curr[[col]])
        # Process each row in df_curr to store in the dictionary
        for _, row in df_curr.iterrows():
            force_id = row['force_id']
            repeat_instance = row['redcap_repeat_instance']

            if form_nm_curr not in data_dict.keys():
                data_dict[form_nm_curr] = {}

            # Initialize dictionary for each force_id if not exists
            if force_id not in data_dict[form_nm_curr].keys():
                data_dict[form_nm_curr][force_id] = {}

            if repeat_instance not in data_dict[form_nm_curr][force_id].keys():
                data_dict[form_nm_curr][force_id][repeat_instance] = {
                    "data": {},
                    "metadata": {
                        "site": force_id[0:3]
                    }
                }

            # Store normalized value in the dictionary
            if col in meta_data['field_name'].values:
                label_name = meta_data[meta_data['field_name'] == col]['field_label'].values[0]
            else:
                label_name = calc_map[col]
            data_dict[form_nm_curr][force_id][repeat_instance]['data'][label_name] = row[col]

    return data_dict


In [18]:
meta_data[meta_data['field_name'] == 'dx_vent_morph']

Unnamed: 0,field_name,form_name,section_header,field_type,field_label,select_choices_or_calculations,field_note,text_validation_type_or_show_slider_number,text_validation_min,text_validation_max,identifier,branching_logic,required_field,custom_alignment,question_number,matrix_group_name,matrix_ranking,field_annotation
16,dx_vent_morph,pt_clinical_surgical_history,,dropdown,Patients dominant ventricular morphology:,"1, Right | 2, Left | 3, Balanced or mixed | 99...","Use ""Balanced or mixed"" if the smaller ventric...",,,,,,,RH,,,,


In [12]:
def category_data_agg(ori_data_dict, df_dict_map, add_fields, meta_data):
    yn_dict = {1: 'yes', 0: 'no'}
    for col in add_fields:
        if '___' in col:
            form_nm_curr = meta_data[meta_data['field_name'] == col.split('___')[0]]['form_name'].values[0]
        else:
            form_nm_curr = meta_data[meta_data['field_name'] == col]['form_name'].values[0]

        df_curr = df_dict_map[form_nm_curr]
        # Process each row in df_curr to store in the dictionary
        for _, row in df_curr.iterrows():
            force_id = row['force_id']
            repeat_instance = row['redcap_repeat_instance']

            if form_nm_curr not in ori_data_dict.keys():
                ori_data_dict[form_nm_curr] = {}

            # Initialize dictionary for each force_id if not exists
            if force_id not in ori_data_dict[form_nm_curr].keys():
                ori_data_dict[form_nm_curr][force_id] = {}

            if repeat_instance not in ori_data_dict[form_nm_curr][force_id].keys():
                ori_data_dict[form_nm_curr][force_id][repeat_instance] = {
                    "data": {},
                    "metadata": {
                        "site": force_id[0:3]
                    }
                }
            else:
                if 'data' not in ori_data_dict[form_nm_curr][force_id][repeat_instance].keys():
                    ori_data_dict[form_nm_curr][force_id][repeat_instance]["data"] = {}

            # Store normalized value in the dictionary
            if str(row[col]) == 'nan':
                continue
                # ori_data_dict[form_nm_curr][force_id][repeat_instance]["categorical_data"][col] = "NaN"
            else:
                if '___' in col:
                    lab = meta_data[meta_data['field_name'] == col.split('___')[0]]['field_label'].iloc[0]
                    if '<p>' in lab:
                        lab = lab.split('<p>')[1].split('</p>')[0]

                    selects = \
                    meta_data[meta_data['field_name'] == col.split('___')[0]]['select_choices_or_calculations'].values[
                        0]
                    split_selects = selects.split(" | ")
                    selects_dict = {int(item.split(", ")[0]): item.split(", ")[1] for item in split_selects}
                    select_option = selects_dict[int(col.split('___')[1])]
                    ori_data_dict[form_nm_curr][force_id][repeat_instance]["data"][lab] = select_option
                else:
                    field_type = meta_data[meta_data['field_name'] == col]['field_type'].iloc[0]
                    lab = meta_data[meta_data['field_name'] == col]['field_label'].iloc[0]
                    if '<p>' in lab:
                        lab = lab.split('<p>')[1].split('</p>')[0]
                    if field_type in ['yesno', 'radio']:
                        if int(row[col]) in [0, 1]:
                            ori_data_dict[form_nm_curr][force_id][repeat_instance]["data"][lab] = yn_dict[int(row[col])]
                        else:
                            continue
                            # ori_data_dict[form_nm_curr][force_id][repeat_instance]["categorical_data"][lab] = 'NaN'
                    else:
                        selects = meta_data[meta_data['field_name'] == col]['select_choices_or_calculations'].values[0]
                        split_selects = selects.split(" | ")
                        selects_dict = {int(item.split(", ")[0]): item.split(", ")[1] for item in split_selects}
                        select_option = selects_dict[int(row[col])]
                        ori_data_dict[form_nm_curr][force_id][repeat_instance]["data"][lab] = select_option

    return ori_data_dict

In [13]:
noncategory_data_dict = noncategory_data_agg(df_dict_map, numeric_cols + text_cols, meta_data, calc_map)
data_dict = category_data_agg(noncategory_data_dict, df_dict_map, categorical_fields, meta_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_curr[col] = df_curr[[col]]  #standard_scaler.fit_transform(df_curr[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_curr[col] = df_curr[[col]]  #standard_scaler.fit_transform(df_curr[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_curr[col] = df_curr[[col]]  #standard_scaler.f

In [15]:
# Path to save the normalized numeric data dictionary
if not os.path.exists('./results'):
    os.mkdir('./results')

save_path = "report_data.pkl"

# Save the dictionary
with open('./results/' + save_path, "wb") as f:
    pickle.dump(data_dict, f)

In [16]:
[col for col in pt_stat_upd if 'other' in col]

['status_death_cod_other',
 'hx_thrombus_loc_other',
 'hx_ep_study_findings_other',
 'hx_ep_pacemaker_rsn_other',
 'hx_ep_defib_rsn_other',
 'meds_anticoag_other',
 'status_other_comments']

In [17]:
meta_data[meta_data['field_name'] == 'status_other_comments']

Unnamed: 0,field_name,form_name,section_header,field_type,field_label,select_choices_or_calculations,field_note,text_validation_type_or_show_slider_number,text_validation_min,text_validation_max,identifier,branching_logic,required_field,custom_alignment,question_number,matrix_group_name,matrix_ranking,field_annotation
635,status_other_comments,pt_status_event_update,,notes,"Other relevant comments or concerns (optional,...",,,,,,,,,,,,,


In [30]:
# """
# Saving groupings of pt. stat upd variables together
# """
psu_group_vars = pd.read_csv('Pt Status Update Table Associated Variables.csv')
psu_group_vars_t = psu_group_vars.T
all_groupings = {}
for col in psu_group_vars_t:
    all_groupings[col] = []
    for val in psu_group_vars_t[col].values:
        if str(val) != 'nan':
            all_groupings[col].append(meta_data[meta_data['field_name'] == val]['field_label'].iloc[0])
        else:
            all_groupings[col].append(np.nan)
label_psu_group_vars = pd.DataFrame(all_groupings).T
label_psu_group_vars.columns = psu_group_vars.columns
label_psu_group_vars = label_psu_group_vars.dropna(axis=1, thresh=1)
# label_psu_group_vars.to_csv('Labeled Pt Status Update Table Associated Variables.csv', index=False)

In [34]:
lab_chk_vals = label_psu_group_vars['Associated Variable 1'].values
lab_chk_vals = [val.replace('Does the patient have a ', '') for val in lab_chk_vals]
lab_chk_vals = [val[0].upper() + val[1:] for val in lab_chk_vals if val != 'nan']

In [26]:
demo['pt_sex'].value_counts()

pt_sex
1.0    2200
2.0    1544
Name: count, dtype: int64

In [35]:
psu_group_vars = pd.read_csv('Pt Status Update Table Associated Variables.csv')
chk_vars = psu_group_vars['Associated Variable 1'].values
for lab, var in zip(lab_chk_vals, chk_vars):
    chk_df = pt_stat_upd[var].replace(99, 0)
    print(lab, chk_df.value_counts())

Diagnosis of seizures requiring antiepileptic medications? hx_seizures
0.0    3512
1.0     245
Name: count, dtype: int64
History of Extra Corporeal Membrane Oxygenation (ECMO)? hx_ecmo
0.0    3566
1.0     185
Name: count, dtype: int64
History of renal dysfunction defined as a GFR < 60 or Dialysis; excluding peri-operative renal dysfunction? hx_renal_dysfxn
0.0    3623
1.0     122
Name: count, dtype: int64
History of cardiac arrest (peri-op)? hx_cardiacarrest_periop
0.0    3634
1.0     120
Name: count, dtype: int64
History of atrial tachyarrhythmias, atrial fibrillation, atrial flutter, or atrial tachycardia? hx_ep_afib
0.0    2940
1.0     811
Name: count, dtype: int64
History of pacemaker placement? hx_ep_pacemaker
0.0    3511
1.0     244
Name: count, dtype: int64
History of cardiac arrest (non peri-op)? hx_cardiacarrest_nonperiop
0.0    3679
1.0      75
Name: count, dtype: int64
History of post-Fontan electrophysiologic study? hx_ep_study
0.0    3408
1.0     348
Name: count, dtype: in

In [None]:
"""
N=3784
M/F 2200/1544
"""