In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Unpickle the crosstable lists that name the procedure categories and drug
# codes to keep as feature columns.
# NOTE: pickle.load can execute code embedded in the file; only load crosstables you trust.
with open("../../crosstables/procedure_list.txt", mode="rb") as procedure_file:
    procedure_list = pickle.load(procedure_file)

with open("../../crosstables/prescription_list.txt", mode="rb") as prescription_file:
    drug_list = pickle.load(prescription_file)

In [3]:
def _count_matrix(events, code_col, codes):
    """Count, per ICU stay, how often each code in `codes` occurs in `events[code_col]`.

    Returns a frame with an `icustay_id` column plus one float column per entry
    of `codes` (in the given order); codes that never occur get a column of 0.
    """
    # Keep only whitelisted codes that are tied to an ICU stay.
    kept = events[events[code_col].isin(codes) & events['icustay_id'].notna()]
    counts = (
        kept.groupby(['subject_id', 'icustay_id', code_col])
            .size()
            .unstack(code_col, fill_value=0)
            # Guarantee every requested code has a column, even if unseen in this data.
            .reindex(columns=codes, fill_value=0)
            .astype(float)
            .reset_index()
            .drop(columns=['subject_id'])
    )
    counts.columns.name = None
    return counts


def _categorize_diagnosis(diagnosis):
    """Map free-text diagnoses to coarse groups; the first matching rule wins."""
    # NOTE(review): "tia" is a plain substring match, so it also hits words such
    # as "DEMENTIA" -- confirm whether a whole-word match was intended.
    rules = [
        ("congestive heart failure", "CV Failure"),
        ("sepsis", "Sepsis"),
        ("seizure", "CNS Failure"),
        ("stroke", "CNS Failure"),
        ("tia", "CNS Failure"),
        ("ACUTE CHOLANGITIS", "Organ Failure"),
        ("GI BLEED", "Organ Failure"),
        ("lung failure", "Organ Failure"),
        ("liver failure", "Organ Failure"),
        ("MYOCARDIAL INFARCTION", "CV Failure"),
    ]
    # na=False: a missing diagnosis matches no rule and falls through to "Other"
    # (without it the NaN result is truthy and selects the first label).
    conditions = [
        diagnosis.str.contains(pattern, case=False, na=False).to_numpy(dtype=bool)
        for pattern, _ in rules
    ]
    labels = [label for _, label in rules]
    return np.select(conditions, labels, default="Other")


def feature_eng(admit, icu, proc, drug, procedures=None, drugs=None):
    """
    Build one modelling row per ICU stay from the four source tables.

    INPUT:
        admit, icu, proc, drug: anything `pd.read_csv` accepts (path, URL or
            file-like object) for the admissions, ICU-stay, procedure-event and
            prescription tables.
        procedures: optional list of `ordercategoryname` values to count;
            defaults to the notebook-level `procedure_list`.
        drugs: optional list of `formulary_drug_cd` values to count;
            defaults to the notebook-level `drug_list`.
    BEHAVIOR:
        - Left-joins every ICU stay to its admission record on `hadm_id`.
        - Sets `hospital_expire_flag` to the admission's flag when
          `discharge_location` is 'DEAD/EXPIRED' and to 0 otherwise.
        - Adds one count column per listed drug code and procedure category
          (0 where a stay has no matching records, including codes that never
          appear in the data).
        - Collapses the free-text `diagnosis` into coarse groups.
        - Drops identifier columns and casts the categorical columns.
    OUTPUT:
        DataFrame with one row per row of the ICU table, in the same order.
    """
    # Fall back to the notebook-level crosstables when no explicit lists are given.
    if procedures is None:
        procedures = procedure_list
    if drugs is None:
        drugs = drug_list
    # Sorted, de-duplicated: the column order a pivot of the data would produce.
    proc_cols = sorted(set(procedures))
    drug_cols = sorted(set(drugs))

    admit_fields = ["subject_id", "hadm_id", "admission_type", "admission_location",
                    "discharge_location", "insurance", "diagnosis", "hospital_expire_flag"]
    admitdf = pd.read_csv(admit, usecols=admit_fields)
    icu_fields = ["subject_id", "hadm_id", "icustay_id", "first_careunit", "last_careunit",
                  "first_wardid", "last_wardid", "los"]
    icudf = pd.read_csv(icu, usecols=icu_fields)

    # Join against ALL admissions (not only deaths) so survivors keep their
    # admission covariates; subject_id is dropped first to avoid _x/_y suffixes.
    icu_full = pd.merge(icudf, admitdf.drop(columns=['subject_id']),
                        how='left', on='hadm_id')
    died = icu_full['discharge_location'].eq('DEAD/EXPIRED')
    # Float keeps the outcome's category labels as 0.0 / 1.0.
    icu_full['hospital_expire_flag'] = (
        icu_full['hospital_expire_flag'].where(died, 0).fillna(0).astype(float)
    )

    proc_counts = _count_matrix(pd.read_csv(proc), 'ordercategoryname', proc_cols)
    drug_counts = _count_matrix(pd.read_csv(drug), 'formulary_drug_cd', drug_cols)

    analyticdf = (
        icu_full
        .merge(drug_counts, how='left', on='icustay_id')
        .merge(proc_counts, how='left', on='icustay_id')
    )
    # Stays without any matching drug/procedure records come out of the left join as NaN.
    count_cols = proc_cols + drug_cols
    if count_cols:
        analyticdf[count_cols] = analyticdf[count_cols].fillna(0)

    analyticdf = analyticdf.drop(columns=['subject_id', 'hadm_id', 'icustay_id',
                                          'discharge_location', 'last_careunit',
                                          'last_wardid'])
    analyticdf['diagnosis'] = _categorize_diagnosis(analyticdf['diagnosis'])

    to_categ = ['first_wardid', 'first_careunit', 'hospital_expire_flag',
                'admission_type', 'admission_location', 'insurance', 'diagnosis']
    analyticdf[to_categ] = analyticdf[to_categ].astype('category')
    return analyticdf

In [4]:
# Download locations of the MIMIC-III demo (v1.4) tables on PhysioNet.
# These names hold URL strings, not DataFrames.
admitdf, icudf, procdf, drugdf = (
    "https://physionet.org/files/mimiciii-demo/1.4/ADMISSIONS.csv?download",
    "https://physionet.org/files/mimiciii-demo/1.4/ICUSTAYS.csv?download",
    "https://physionet.org/files/mimiciii-demo/1.4/PROCEDUREEVENTS_MV.csv?download",
    "https://physionet.org/files/mimiciii-demo/1.4/PRESCRIPTIONS.csv?download",
)

In [5]:
# Build the analytic table from the four source files.
testdat = feature_eng(admit=admitdf, icu=icudf, proc=procdf, drug=drugdf)
# Optional: uncomment to pickle the result for the modeling notebook.
#testdat.to_pickle("../modeling/demo_training.pkl") 