In [1]:
"""
lib installation instructions:

conda create -n faiss_env python=3.9
conda activate faiss_env
conda install -c conda-forge faiss
pip install pandas==1.2.5
pip install numpy==1.25.1
pip install transformers
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install pycap==1.1.2
pip install scipy==1.13.1
pip install scikit-learn==1.5.2
pip install accelerate>=0.26.0
"""

'\nlib installation instructions:\n\nconda create -n faiss_env python=3.9\nconda activate faiss_env\nconda install -c conda-forge faiss\npip install pandas==1.2.5\npip install numpy==1.25.1\npip install transformers\npip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\npip install pycap==1.1.2\npip install scipy==1.13.1\npip install scikit-learn==1.5.2\npip install accelerate>=0.26.0\n'

In [2]:
import pickle
import os
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from data_gen import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
"""Getting all CMR data from REDCap"""
rc_label = False
meta_data, demo, pt_hx, cath, cmr, echo, ex_str_tst, pt_stat_upd, cl_pt_level, cl_cmr_level = fetch_data(rc_label)

  dataframe = read_csv(buf, **df_kwargs)
  dat = dat.replace("NAVU", np.nan)
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  dat = dat.replace("NAVU", np.nan)
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, errors='coerce')
  ret_instr_curr[ret_name]=pd.to_datetime(ret_instr_curr[ret_name], infer_datetime_format=True, 

In [4]:
"""
Data cleaning
"""
# adding 0 for repeat instance for demo, pt_hx and pt_status_upd
demo['redcap_repeat_instance'] = 0
pt_hx['redcap_repeat_instance'] = 0
pt_stat_upd['redcap_repeat_instance'] = 0

# removing text data in numeric columns
pt_hx['surg_cardiac_total'] = pt_hx['surg_cardiac_total'].apply(lambda x: np.nan if isinstance(x, str) else x)
pt_stat_upd['hx_ecmo_cannulations'] = pt_stat_upd['hx_ecmo_cannulations'].apply(lambda x: np.nan if isinstance(x, str) else x)

# only retaining earliest/latest CMRs
earliest_latest_indicator = 'first' # 'first', 'last'
cmr['cmr_date'] = pd.to_datetime(cmr['cmr_date'])
cmr = cmr.sort_values(by=['force_id', 'cmr_date']).drop_duplicates(subset='force_id', keep=earliest_latest_indicator).reset_index(drop=True)
cmr['redcap_repeat_instance'] = 1

In [5]:
"""only using pts passed QA"""
not_passed_qa = [fid for fid, ready_pmt_date in zip(cl_cmr_level.force_id, cl_cmr_level.cl_payment_ready_date) if ready_pmt_date is np.nan]
passed_qa = [fid for fid in cl_cmr_level.force_id if fid not in not_passed_qa]

dfs = [demo, cmr, cath, pt_stat_upd, pt_hx, echo, ex_str_tst]
new_dfs = []
for df in dfs:
    # getting rid of mean/stdev cols
    std_cols = [col for col in df.columns if 'stdev' in col]
    mean_cols = [col.replace('_stdev', '_mean') for col in std_cols]
    df = df[[col for col in df.columns if col not in std_cols and col not in mean_cols]]
    new_dfs.append(df[df['force_id'].isin(passed_qa)])
    
demo = new_dfs[0]
cmr = new_dfs[1]
cath = new_dfs[2]
pt_stat_upd = new_dfs[3]
pt_hx = new_dfs[4]
echo = new_dfs[5]
ex_str_tst = new_dfs[6]

In [6]:
"""
looking for text vs numeric fields
"""
calc_map = {'cmr_sv_ef': "Single ventricle ejection fraction",
            "cmr_sv_sv": "Single ventricle systolic volume",
            "cmr_bmi": "Body mass index",
            "cmr_mass_height27": "Mass to height volume indexed on BSA (exp = 2.7)",
            'cmr_sv_esv': 'Single ventricle end systolic volume',
            'cmr_sv_esv_bsa1':'Single ventricle end systolic volume indexed on BSA (exp = 1)',
            'cmr_sv_esv_bsa13':'Single ventricle end systolic volume indexed on BSA (exp = 1.3)' ,
            'cmr_sv_edv': 'Single ventricle end diastolic volume',
            'cmr_sv_edv_bsa1':'Single ventricle end diastolic volume indexed on BSA (exp = 1)',
            'cmr_sv_edv_bsa13':'Single ventricle end diastolic volume indexed on BSA (exp = 1.3)' ,
            'cmr_sv_mass': 'Single ventricle mass',
            'cmr_sv_mass_bsa1':'Single ventricle mass indexed on BSA (exp = 1)',
            'cmr_sv_mass_bsa13': 'Single ventricle mass indexed on BSA (exp = 1.3)',
            'cmr_sv_mvr': 'Single ventricle mass to volume ratio',
            'cmr_sv_mvr_bsa1':'Single ventricle mass to volume ratio indexed on BSA (exp = 1)',
            'cmr_sv_mvr_bsa13':'Single ventricle mass to volume ratio indexed on BSA (exp = 1.3)'
           }


text_fields = meta_data[meta_data['field_type'] == 'text']
df_dict_map = {'demographic_data': demo, 'pt_clinical_surgical_history': pt_hx, 'catheterization': cath, 'cmr': cmr, 'echocardiogram': echo, 'exercise_stress_test': ex_str_tst, 'pt_status_event_update': pt_stat_upd}

text_cols = []
numeric_cols = []

field_names = meta_data.field_name.values
form_names = meta_data.form_name.values
field_types = meta_data.field_type.values
for field_name, form_name, field_type in zip(field_names, form_names, field_types):
    if field_type == 'text' and form_name in df_dict_map.keys():
        df_curr = df_dict_map[form_name]
        if field_name in df_curr.columns and df_curr.dropna(subset=[field_name]).shape[0] > 0:            
            series_curr = df_curr[field_name]
            try:
                series_curr.astype(float)
                numeric_cols.append(field_name)
            except:
                text_cols.append(field_name)

numeric_cols = numeric_cols + list(calc_map.keys())
year_cols = [col for col in numeric_cols if 'year' in col.lower()]
numeric_cols = [col for col in numeric_cols if col not in year_cols]

In [7]:
"""logic for coercing check box variables"""
chk_box_df = meta_data[meta_data['field_type'].isin(['checkbox'])]

chk_box_df_names = chk_box_df['form_name'].values
chk_box_cols = chk_box_df['field_name'].values

chk_cols = []
fail_cases = []
for col, dfn in zip(chk_box_cols, chk_box_df_names):
    if dfn in df_dict_map.keys():
        df_curr = df_dict_map[dfn]
        val_list=str(meta_data[meta_data['field_name']==col]['select_choices_or_calculations'].values)[2:].split('|')
        val_dic={",".join((v.split(',')[1:]))[1:-1].replace("'", ""): int(v.split(',')[0].replace(" ", "")) for v in val_list}
        for idx in val_dic.values():
            chk_col = col + '___' + str(idx)
            try:
                df_curr[chk_col]
                chk_cols.append(chk_col)
            except:
                fail_cases.append(chk_col)  

In [8]:
"""Getting rest of text/numeric fields"""
categorical_fields = meta_data[meta_data['field_type'].isin(['dropdown', 'radio', 'yesno'])]
categorical_fields = categorical_fields[categorical_fields['form_name'].isin(df_dict_map.keys())][
    'field_name'].values
categorical_fields = list(categorical_fields)
categorical_fields = categorical_fields + chk_cols
categorical_fields = [col for col in categorical_fields if col != 'site_name']

text_add_fields = meta_data[meta_data['field_type'].isin(['notes'])]
text_add_fields = text_add_fields[text_add_fields['form_name'].isin(df_dict_map.keys())]['field_name'].values
text_cols = text_cols + list(text_add_fields) + year_cols

In [9]:
"""1. Normalizing numeric_cols"""
def normalize_numeric_data(df_dict_map, numeric_cols, meta_data, calc_map):
    # Initialize scalers
    standard_scaler = StandardScaler()
    min_max_scaler = MinMaxScaler()
    
    # Dictionary to hold normalized data
    normalized_numeric_data_dict = {}
    
    # Normalizing numeric columns
    for col in numeric_cols:
        if col in calc_map.keys():
            form_nm_curr = 'cmr'
            df_curr = df_dict_map[form_nm_curr]
        else:
            form_nm_curr = meta_data[meta_data['field_name'] == col]['form_name'].values[0]
            df_curr = df_dict_map[form_nm_curr]
        
        # Fit-transform or transform based on the scaler of choice
        # Here, we'll use z-score normalization as an example
        df_curr[col + "_norm"] = standard_scaler.fit_transform(df_curr[[col]])
        # Process each row in df_curr to store in the dictionary
        for _, row in df_curr.iterrows():
            force_id = row['force_id']
            repeat_instance = row['redcap_repeat_instance']
            
            if form_nm_curr not in normalized_numeric_data_dict.keys():
                normalized_numeric_data_dict[form_nm_curr] = {}
            
            # Initialize dictionary for each force_id if not exists
            if force_id not in normalized_numeric_data_dict[form_nm_curr].keys():
                normalized_numeric_data_dict[form_nm_curr][force_id] = {}
                
            if repeat_instance not in normalized_numeric_data_dict[form_nm_curr][force_id].keys():
                normalized_numeric_data_dict[form_nm_curr][force_id][repeat_instance] = {
                    "normalized_data": {},
                    "metadata": {
                        "site": force_id[0:3]
                    }
                }
                
            normalized_numeric_data_dict[form_nm_curr][force_id][repeat_instance]['normalized_data'][col] = row[col + "_norm"]
    
    return normalized_numeric_data_dict

In [10]:
"""Adding in data for categorical data"""
def data_agg(ori_data_dict, df_dict_map, add_fields, meta_data):
    yn_dict = {1: 'yes', 0: 'no'}
    for col in add_fields:
        if '___' in col:
            form_nm_curr = meta_data[meta_data['field_name'] == col.split('___')[0]]['form_name'].values[0]
        else:
            form_nm_curr = meta_data[meta_data['field_name'] == col]['form_name'].values[0]
        
        df_curr = df_dict_map[form_nm_curr]
        # Process each row in df_curr to store in the dictionary
        for _, row in df_curr.iterrows():    
            force_id = row['force_id']
            repeat_instance = row['redcap_repeat_instance']
            
            if form_nm_curr not in ori_data_dict.keys():
                ori_data_dict[form_nm_curr] = {}
            
            # Initialize dictionary for each force_id if not exists
            if force_id not in ori_data_dict[form_nm_curr].keys():
                ori_data_dict[form_nm_curr][force_id] = {}
                
            if repeat_instance not in ori_data_dict[form_nm_curr][force_id].keys():
                ori_data_dict[form_nm_curr][force_id][repeat_instance] = {
                    "normalized_data": {},
                    "categorical_data": {},
                    "metadata": {
                        "site": force_id[0:3]
                    }
                }
            else:
                if 'categorical_data' not in ori_data_dict[form_nm_curr][force_id][repeat_instance].keys():
                    ori_data_dict[form_nm_curr][force_id][repeat_instance]["categorical_data"] = {}
                    
            
            ori_data_dict[form_nm_curr][force_id][repeat_instance]["categorical_data"][col] = row[col]
                
    return ori_data_dict

In [11]:
normalized_numeric_data_dict = normalize_numeric_data(df_dict_map, numeric_cols, meta_data, calc_map)
numeric_data_dict = data_agg(normalized_numeric_data_dict, df_dict_map, categorical_fields, meta_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_curr[col + "_norm"] = standard_scaler.fit_transform(df_curr[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_curr[col + "_norm"] = standard_scaler.fit_transform(df_curr[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_curr[col + "_norm"] = standard_scaler.fit_transform(df_curr[

In [12]:
"""Extracting BERT embeddings for text data"""

'Extracting BERT embeddings for text data'

In [13]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Function to generate BERT embeddings for a single text input
def embed_text_bert(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

def extract_bert_embeddings(data_dict, meta_data, df_dict_map, text_cols):
    # Dictionary to store results    
    # Loop through each text column
    for col in text_cols:
        if col == 'force_id' or 'date' in col:
            continue

        # Get the correct form name for the column and retrieve the corresponding DataFrame
        form_name = meta_data[meta_data['field_name'] == col]['form_name'].values[0]
        df_curr = df_dict_map[form_name]
    
        # Apply BERT embeddings row-wise for the column in df_curr
        df_curr[f"{col}_bert_embedding"] = df_curr[col].apply(lambda x: embed_text_bert(x) if isinstance(x, str) else None)
    
        # Store embeddings in structured dictionary format
        for idx, row in df_curr.iterrows():
            force_id = row['force_id']  # Assuming you have a 'force_id' column
            repeat_instance = row.get('redcap_repeat_instance', 1)  # Default instance is 1 if no longitudinal data
    
            # Initialize structure if it doesn't exist
            if form_name not in data_dict.keys():
                data_dict[form_name] = {}
                
            if force_id not in data_dict[form_name].keys():
                data_dict[form_name][force_id] = {}
                
            if repeat_instance not in data_dict[form_name][force_id].keys():
                data_dict[form_name][force_id][repeat_instance] = {"text_embeddings": {}}
                
            elif 'text_embeddings' not in data_dict[form_name][force_id][repeat_instance].keys():
                data_dict[form_name][force_id][repeat_instance]["text_embeddings"] = {}
    
            # Add the BERT embedding for the specific text column
            data_dict[form_name][force_id][repeat_instance]["text_embeddings"][col] = row[f"{col}_bert_embedding"]

    return data_dict

data_dict = extract_bert_embeddings(numeric_data_dict, meta_data, df_dict_map, text_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_curr[f"{col}_bert_embedding"] = df_curr[col].apply(lambda x: embed_text_bert(x) if isinstance(x, str) else None)


In [14]:
# Path to save the normalized numeric data dictionary
if not os.path.exists('./results'):
    os.mkdir('./results')

save_path = "rag_dict.pkl"

# Save the dictionary
with open('./results/' + save_path, "wb") as f:
    pickle.dump(data_dict, f)

print(f"data w/embeddings saved to results/{save_path}")

data w/embeddings saved to results/rag_dict.pkl


In [None]:
data_dict = tr_numeric_data_dict.copy()
df_dict_map = tr_df_dict_map.copy()

# Dictionary to store results    
# Loop through each text column
for col in text_cols:
    if col == 'force_id' or 'date' in col:
        continue
    # Get the correct form name for the column and retrieve the corresponding DataFrame
    form_name = meta_data[meta_data['field_name'] == col]['form_name'].values[0]
    df_curr = df_dict_map[form_name]

    # Apply BERT embeddings row-wise for the column in df_curr
    df_curr[f"{col}_bert_embedding"] = df_curr[col].apply(lambda x: embed_text_bert(x) if isinstance(x, str) else None)

    # Store embeddings in structured dictionary format
    for idx, row in df_curr.iterrows():
        force_id = row['force_id']  # Assuming you have a 'force_id' column
        repeat_instance = row.get('redcap_repeat_instance', 1)  # Default instance is 1 if no longitudinal data

        # Initialize structure if it doesn't exist
        if force_id not in data_dict:
            data_dict[force_id] = {}
        if repeat_instance not in data_dict[force_id]:
            data_dict[force_id][repeat_instance] = {"text_embeddings": {}}
        elif 'text_embeddings' not in data_dict[force_id][repeat_instance].keys():
            data_dict[force_id][repeat_instance]["text_embeddings"] = {}

        # Add the BERT embedding for the specific text column
        data_dict[force_id][repeat_instance]["text_embeddings"][col] = row[f"{col}_bert_embedding"]
        break
    break

In [None]:
chk = df_curr[col].apply(lambda x: embed_text_bert(x) if isinstance(x, str) else None)

In [None]:
# Store embeddings in structured dictionary format
for idx, row in df_curr.iterrows():
    force_id = row['force_id']  # Assuming you have a 'force_id' column
    repeat_instance = row.get('redcap_repeat_instance', 1)  # Default instance is 1 if no longitudinal data

    # Initialize structure if it doesn't exist
    if force_id not in data_dict:
        data_dict[force_id] = {}
    if repeat_instance not in data_dict[force_id]:
        data_dict[force_id][repeat_instance] = {"text_embeddings": {}}
    elif 'text_embeddings' not in data_dict[force_id][repeat_instance].keys():
        data_dict[force_id][repeat_instance]["text_embeddings"] = {}

    # Add the BERT embedding for the specific text column
    data_dict[force_id][repeat_instance]["text_embeddings"][col] = row[f"{col}_bert_embedding"]
    print(row[f"{col}_bert_embedding"])

In [None]:
row[f"{col}_bert_embedding"]