# Merging Pretrained Embedding Datasets

In [None]:
import numpy as np
import pandas as pd

Load the PubMedBERT embeddings (chief complaint and past medical history) and the balanced test-set features

In [None]:
# PubMedBERT embedding tables, one for the chief-complaint fields and one for
# the past-medical-history fields. Both are read with pandas' default integer
# row index (no `index_col` is given).
cc_embeddings = pd.read_csv(
    '~/scratch/datasets/yale_new_haven/supplementary_info/pretrained_embeddings/PubMedBERT/chief_complaint_embeddings.csv'
)
pmh_embeddings = pd.read_csv(
    '~/scratch/datasets/yale_new_haven/supplementary_info/pretrained_embeddings/PubMedBERT/past_medical_hist_embeddings.csv'
)

In [None]:
# Feature table whose chief-complaint / past-medical-history columns will be
# swapped for the pretrained embeddings further down.
df_orig = pd.read_csv(
    '~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/features/normalized_preprocessing/regression_nn/yale_new_haven_balanced_test_features.csv'
)

Get the chief complaint and past medical history columns

In [None]:
# Split the columns of `df_orig` into named feature groups. Groups written as
# literals are fixed name lists; the remaining groups are matched against the
# columns actually present in `df_orig`.
disposition_var = {'disposition'}
demographic_vars = {
    'age', 'gender', 'ethnicity', 'race', 'lang', 'religion',
    'maritalstatus', 'employstatus', 'insurance_status',
}

# Fixed triage fields plus every column whose name contains 'triage_vital'.
triage_evaluation_vars = {
    'dep_name', 'esi', 'arrivalmode', 'arrivalmonth', 'arrivalday',
    'arrivalhour_bin',
} | {name for name in df_orig.columns if 'triage_vital' in name}

# NOTE: these are substring tests, not prefix tests -- any column whose name
# contains the token anywhere is picked up.
chief_complaint_vars = {name for name in df_orig.columns if "cc_" in name}
medication_vars = {name for name in df_orig.columns if 'meds_' in name}

hospital_usage_stats_vars = {
    'previousdispo', 'n_edvisits', 'n_admissions', 'n_surgeries',
}
imaging_ekg_vars = {
    'cxr_count', 'echo_count', 'ekg_count', 'otherxr_count', 'otherus_count',
    'headct_count', 'otherct_count', 'mri_count', 'otherimg_count',
}

# Every <vital>_<statistic> combination, e.g. 'dbp_last', 'o2_device_median',
# 'temp_min' (7 vitals x 4 statistics = 28 names).
historical_vital_vars = {
    vital + '_' + stat
    for vital in ('dbp', 'o2_device', 'pulse', 'resp', 'sbp', 'spo2', 'temp')
    for stat in ('last', 'max', 'median', 'min')
}

# Union of all groups identified so far.
curr = set().union(
    disposition_var,
    demographic_vars,
    triage_evaluation_vars,
    chief_complaint_vars,
    medication_vars,
    hospital_usage_stats_vars,
    imaging_ekg_vars,
    historical_vital_vars,
)

# Past-medical-history columns are found by elimination: whatever is left
# after removing the groups above, any name containing an underscore, and the
# 'ID' / 'previousdispo' columns.
past_medical_hist_vars = {
    name
    for name in df_orig.columns
    if not (name in curr or "_" in name or name in ['ID', 'previousdispo'])
}

In [None]:
# List copies of the two column-name sets (order follows set iteration order).
cc_cols = [*chief_complaint_vars]
pmh_cols = [*past_medical_hist_vars]

Get ids for the dataset

In [None]:
# The 'ID' column of the feature table, in row order; used below as row labels
# when selecting from the embedding tables.
ids = df_orig.loc[:, 'ID']

Prep the input dataframes

In [None]:
# NOTE: this cell is disabled (everything below is commented out). The same
# steps -- selecting the embedding rows by `ids`, resetting the index, and
# leaving out the original cc/pmh columns -- are performed inline in the
# concatenation cell below, which leaves the loaded dataframes unmodified.

# # re-sort and drop the index
# cc_embeddings = cc_embeddings.loc[ids].reset_index(drop=True)
# pmh_embeddings = pmh_embeddings.loc[ids].reset_index(drop=True)

# # drop the previous cc columns
# df_orig = df_orig[[col for col in df_orig.columns if col not in cc_cols and col not in pmh_cols]]

Concatenate the remaining features with the two embedding tables

In [None]:
# Build the merged frame column-wise: the original features without the
# chief-complaint / past-medical-history columns, followed by the two
# embedding tables with their rows selected in the order given by `ids`.
# Each embedding selection gets a fresh 0..n-1 index so that it lines up
# positionally with the rows of `df_orig`.
# NOTE(review): `.loc[ids]` looks rows up by index *label*, so this assumes
# the embedding tables' row index corresponds to the values in `ID` -- confirm.
df = pd.concat(
    [
        # errors='ignore' keeps this a pure "leave these columns out" filter.
        df_orig.drop(columns=cc_cols + pmh_cols, errors='ignore'),
        *(
            embeddings.loc[ids].reset_index(drop=True)
            for embeddings in (cc_embeddings, pmh_embeddings)
        ),
    ],
    axis=1,
)

Save the merged dataset to CSV

In [None]:
# Write the merged table to disk; the row index is not written to the file.
df.to_csv(
    path_or_buf='~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/features/pretrained_embeddings/PubMedBERT/regression_nn/balanced_test_set.csv',
    index=False,
)