# Combining all the variables and their values for the patients into a single CSV

In [51]:
# Import libraries
import pandas as pd
import os
import psycopg2
import getpass
import pandas as pd
import numpy as np

In [2]:
# Path to save the extracted values
export_dir = '../extracted_data'
# makedirs(exist_ok=True) creates the directory only when it is missing, so no
# separate existence check is needed (and none can go stale before the create).
os.makedirs(export_dir, exist_ok=True)

In [3]:
# Settings used to open the database connection: who connects and where,
# then which database and which schema inside it.
user, host = 'dhruv.sharma', 'localhost'
dbname, schema = 'mimic', 'mimiciii'

## Loading the data

In [4]:
# Open the database connection; the password is prompted for at run time
# rather than being written into the notebook.
con = psycopg2.connect(
    dbname=dbname,
    user=user,
    host=host,
    password=getpass.getpass(prompt='Password:'),
)
cur = con.cursor()
# Make the configured schema the default one for the queries on this connection.
cur.execute('SET search_path to {}'.format(schema))

Password:········


## Combining the static variables

<b>Demographics:</b> Subject_ID, age, gender, weight

<b>Comorbidities:</b> Congestive Heart Failure, Cardiac arrhythmias, valvular disease, pulmonary circulation, peripheral vascular, hypertension, paralysis, neurological disorder, chronic pulmonary disease, diabetes, hypothyroidism, renal failure, liver diseases, peptic ulcer, AIDS, lymphoma, metastatic cancer, rheumatoid arthritis, coagulopathy, obesity, fluid electrolyte, anemias, alcohol abuse, drug abuse, psychosis, depression 

In [7]:
# Locate the demographics and comorbidities CSV files inside export_dir ...
demographics_path, comorbidities_path = (
    os.path.join(export_dir, file_name)
    for file_name in ('demographics.csv', 'comorbidities.csv')
)

# ... and read each of them into its own DataFrame.
demographics, comorbidities = map(
    pd.read_csv, (demographics_path, comorbidities_path)
)

The function get_comorbidity() takes an ICD9 code and returns the corresponding comorbidity.

In [11]:
# Comorbidity name -> inclusive ICD9 code ranges that map to it.
# A single code is written as a range whose two ends are equal.
# The names match the keys produced by get_comorbidity_dict().
_COMORBIDITY_ICD9_RANGES = (
    ('congestive_Heart_Failure', ((4280, 4280),)),
    ('cardiac_arrhythmias', ((4279, 4279),)),
    ('valvular_disease', ((3969, 3969),)),
    ('pulmonary_circulation', ((4150, 4179), (41511, 41519))),
    ('peripheral_vascular', ((4439, 4439),)),
    ('hypertension', ((4019, 4019),)),
    ('paralysis', ((3449, 3449),)),
    ('neurological_disorder', ((3499, 3499),)),
    ('chronic_pulmonary_disease',
     ((4910, 4959), (49120, 49122), (49300, 49392), (490, 490), (496, 496))),
    ('diabetes', ((25000, 25093),)),
    ('hypothyroidism', ((2449, 2449),)),
    ('renal_failure', ((586, 586),)),
    ('liver_diseases', ((5710, 5719), (57140, 57149))),
    ('peptic_ulcer', ((53300, 53391),)),
    ('AIDS', ((42, 42),)),
    ('lymphoma', ((20280, 20288),)),
    ('metastatic_cancer', ((1991, 1991),)),
    ('rheumatoid_arthritis', ((7140, 7140),)),
    ('coagulopathy', ((2869, 2869),)),
    ('obesity', ((27800, 27800),)),
    ('fluid_electrolyte', ((2769, 2769),)),
    ('anemias', ((2809, 2809),)),
    ('alcohol_abuse', ((30500, 30503),)),
    ('drug_abuse', ((30590, 30593),)),
    ('psychosis', ((2989, 2989),)),
    ('depression', ((311, 311),)),
)


def get_comorbidity(icd):
    '''
    Map a numeric ICD9 code to the comorbidity it belongs to.

    Two defects of the earlier if-chain are fixed here:
      * the diabetes test used `or` (icd >= 25000 or icd <= 25093), which is
        true for every number, so all later comorbidities were unreachable
        and every unmatched code was reported as diabetes;
      * 'chronic_pulmonary_disease' was returned with a leading space, so it
        never matched the key created by get_comorbidity_dict().

    Args:
        icd: the ICD9 code, as a number
    Returns:
        The corresponding comorbidity name, or None when the code does not
        fall in any of the known ranges
    '''
    for comorbidity, code_ranges in _COMORBIDITY_ICD9_RANGES:
        for low, high in code_ranges:
            if low <= icd <= high:
                return comorbidity
    return None
        

In [19]:
def get_comorbidity_dict():
    '''
    Build a fresh dictionary with one entry per comorbidity, each set to 0.
    Args:
        None
    Returns:
        comorb_dict: a new dict mapping every comorbidity name to 0
    '''
    comorbidity_names = (
        'congestive_Heart_Failure',
        'cardiac_arrhythmias',
        'valvular_disease',
        'pulmonary_circulation',
        'peripheral_vascular',
        'hypertension',
        'paralysis',
        'neurological_disorder',
        'chronic_pulmonary_disease',
        'diabetes',
        'hypothyroidism',
        'renal_failure',
        'liver_diseases',
        'peptic_ulcer',
        'AIDS',
        'lymphoma',
        'metastatic_cancer',
        'rheumatoid_arthritis',
        'coagulopathy',
        'obesity',
        'fluid_electrolyte',
        'anemias',
        'alcohol_abuse',
        'drug_abuse',
        'psychosis',
        'depression',
    )
    # fromkeys keeps the order of the names above and gives each one the value 0.
    return dict.fromkeys(comorbidity_names, 0)

In [59]:
def _first_value(frame, column):
    '''
    Return the first value of `column` in `frame`, or NaN when the frame has
    no rows or does not have that column.
    '''
    if column not in frame.columns or frame.empty:
        return float('nan')
    return frame[column].values[0]


def get_age_weight_gender(demo_df, sub_id, hadm_id):
    '''
    This function gets the age, weight, and gender corresponding to the subject_id.

    Missing information is reported as NaN. The builtin float('nan') is used
    instead of np.float('nan'): the np.float alias was removed in NumPy 1.24,
    which made every fallback path raise AttributeError.

    Args:
        demo_df: the DataFrame containing the demographic information for all the patients
        sub_id: the subject ID corresponding to which we want the info
        hadm_id: the admission ID; it is copied into the result unchanged
    Returns:
        a dictionary with the keys 'subject_id', 'hadm_id', 'age', 'weight',
        'male' and 'female'. 'male'/'female' are 1/0 for gender 'M', 0/1 for
        gender 'F', and NaN for both when the gender is missing or has any
        other value.
    '''
    info = {'subject_id': sub_id, 'hadm_id': hadm_id}
    # When several rows share the subject_id, the first one is used.
    sub_info = demo_df[demo_df.subject_id == sub_id]

    info['age'] = _first_value(sub_info, 'age')
    info['weight'] = _first_value(sub_info, 'weight')

    gender = _first_value(sub_info, 'gender')
    if gender == 'M':
        info['male'] = 1
        info['female'] = 0
    elif gender == 'F':
        info['male'] = 0
        info['female'] = 1
    else:
        # Always emit both keys so every returned dict has the same shape.
        info['male'] = float('nan')
        info['female'] = float('nan')

    return info

In [75]:
# Build one row per admission: demographics plus a 0/1 flag per comorbidity.
# Consecutive rows of `comorbidities` that share a hadm_id are folded into the
# same output row.
#
# Changes from the earlier version of this cell:
#   * rows are collected in a list and turned into a DataFrame once;
#     DataFrame.append was removed in pandas 2.0 and copied the whole frame
#     on every call;
#   * the last admission is no longer dropped (it was never appended), and
#     the initial `None` placeholder is no longer appended;
#   * ICD9 codes that map to no comorbidity are skipped instead of creating
#     a `None` key.
static_rows = []

prev_hadm = None
prev_info = None
count = comorbidities.shape[0]
# Kept at module level because the column ordering further down iterates it.
comorb_dict = get_comorbidity_dict()

admission_codes = zip(comorbidities.subject_id,
                      comorbidities.hadm_id,
                      comorbidities.icd9_code)
for this_sub, this_hadm, this_icd in admission_codes:
    if prev_info is None or this_hadm != prev_hadm:
        # A new admission starts: demographics first, then all flags at 0.
        prev_info = get_age_weight_gender(demographics, this_sub, this_hadm)
        prev_info.update(get_comorbidity_dict())
        # The dict is stored by reference, so flags set below are still
        # reflected when the DataFrame is built after the loop.
        static_rows.append(prev_info)
        prev_hadm = this_hadm

    this_comorbidity = get_comorbidity(this_icd)
    if this_comorbidity is not None:
        prev_info[this_comorbidity] = 1

combined_static = pd.DataFrame(static_rows)


In [78]:
# Column order of the exported file: identifiers and demographics first,
# followed by one column per comorbidity in the order comorb_dict lists them.
columns = ['subject_id', 'hadm_id', 'age', 'weight', 'male', 'female']
columns.extend(comorb_dict)

# reindex both reorders the columns and drops any column that is not listed.
combined_static = combined_static.reindex(columns=columns)
combined_static.to_csv(os.path.join(export_dir, 'static_data.csv'), index=False)
combined_static.head(15)

Unnamed: 0,subject_id,hadm_id,age,weight,male,female,congestive_Heart_Failure,cardiac_arrhythmias,valvular_disease,pulmonary_circulation,...,metastatic_cancer,rheumatoid_arthritis,coagulopathy,obesity,fluid_electrolyte,anemias,alcohol_abuse,drug_abuse,psychosis,depression
0,3.0,145834.0,76.52,106.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,185777.0,47.84,53.599998,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9.0,150750.0,41.79,100.300003,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12.0,112213.0,72.37,81.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,13.0,143045.0,39.86,74.599998,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,17.0,161087.0,47.45,68.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,18.0,188822.0,50.84,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,19.0,109235.0,300.0,,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,20.0,157681.0,75.88,93.300003,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,21.0,109451.0,87.44,64.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Combining the vital_cv.csv and vital_mv.csv

In [89]:
def get_vital(itemid):
    '''
    Look up the name of the vital sign that an itemid stands for.
    Args:
        itemid: The itemid for the vital sign
    Returns:
        vital: a string naming the vital sign, or None when the itemid is not
               one of the ids listed below
    '''
    # NOTE(review): the sample output for the vital_mv data shows values such
    # as 95.8 and 97.2 in temperature_c, which look like Fahrenheit readings.
    # Confirm the unit recorded under itemid 223761 before treating it as Celsius.
    vital_itemids = (
        ('mean_arterial_bp', (52, 220052)),
        ('heart_rate', (211, 220045)),
        ('respiration_rate', (618, 220210)),
        ('temperature_c', (676, 223761)),
    )
    for vital, itemids in vital_itemids:
        if itemid in itemids:
            return vital
    return None

In [80]:
# Locate the two vital-sign CSV files inside export_dir ...
vitals_cv_path, vitals_mv_path = (
    os.path.join(export_dir, file_name)
    for file_name in ('vital_cv.csv', 'vital_mv.csv')
)

# ... and read each of them into its own DataFrame.
vitals_cv, vitals_mv = map(pd.read_csv, (vitals_cv_path, vitals_mv_path))

  interactivity=interactivity, compiler=compiler, result=result)


In [88]:
# Display one raw row (position 2) to see which fields vitals_cv provides.
vitals_cv.iloc[2]

subject_id              2
hadm_id            163353
icustay_id         243653
charttime     5.31869e+09
itemid                211
label          Heart Rate
value                 144
Name: 2, dtype: object

In [124]:
# Start from an empty frame that already has one column per vital sign.
# np.nan replaces the former np.float('nan'): the np.float alias was removed
# in NumPy 1.24 and raises AttributeError there.
vital_cv_new = pd.DataFrame()
for vital_name in ('mean_arterial_bp', 'heart_rate',
                   'respiration_rate', 'temperature_c'):
    vital_cv_new[vital_name] = np.nan

# Number of raw rows in vitals_cv.
count = vitals_cv.shape[0]

In [125]:
# Fold the long-format vitals_cv rows (one measurement per row) into wide rows
# with one column per vital sign. Consecutive rows that share a charttime are
# merged into a single output row; a later value for the same vital sign
# overwrites the earlier one.
#
# Changes from the earlier version of this cell:
#   * every row of vitals_cv is processed; the loop used to stop after a
#     hard-coded 100 rows, so the exported file was truncated (and inputs
#     shorter than 100 rows raised IndexError);
#   * rows are collected as plain dicts and the DataFrame is built once;
#     DataFrame.append was removed in pandas 2.0, and writing into a row
#     taken with .iloc triggered SettingWithCopyWarning;
#   * itemids that get_vital does not know are skipped instead of creating
#     a `None` key.
# NOTE(review): only charttime is compared, so adjacent rows of two different
# stays with an identical charttime would be merged — confirm the input order.
id_columns = ['subject_id', 'hadm_id', 'icustay_id', 'charttime']
vital_rows = []
prev_charttime = None
prev_df = None
for this_data in vitals_cv.itertuples(index=False):
    if prev_df is None or this_data.charttime != prev_charttime:
        # A new charttime starts: open a row that carries the identifiers.
        prev_df = {name: getattr(this_data, name) for name in id_columns}
        vital_rows.append(prev_df)
        prev_charttime = this_data.charttime

    vital = get_vital(this_data.itemid)
    if vital is not None:
        prev_df[vital] = this_data.value

columns = ['subject_id', 'hadm_id', 'icustay_id', 'charttime', 'mean_arterial_bp',
           'heart_rate', 'respiration_rate', 'temperature_c']
# reindex adds any vital-sign column that never occurred and fixes the order.
vital_cv_new = pd.DataFrame(vital_rows).reindex(columns=columns)
vital_cv_new.head(n=15)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,mean_arterial_bp,heart_rate,respiration_rate,temperature_c
0,2.0,163353.0,243653.0,5318684000.0,,148.0,,
1,2.0,163353.0,243653.0,5318685000.0,,131.0,,
2,2.0,163353.0,243653.0,5318687000.0,,144.0,,
3,2.0,163353.0,243653.0,5318690000.0,,140.0,,
4,3.0,145834.0,211552.0,4159277000.0,,95.0,16.0,
5,3.0,145834.0,211552.0,4159279000.0,259.0,,,
6,3.0,145834.0,211552.0,4159280000.0,,151.0,,
7,3.0,145834.0,211552.0,4159281000.0,,135.0,,
8,3.0,145834.0,211552.0,4159282000.0,60.0,143.0,,
9,3.0,145834.0,211552.0,4159282000.0,58.0,165.0,5.0,


In [126]:
# Start from an empty frame that already has one column per vital sign.
# np.nan replaces the former np.float('nan'): the np.float alias was removed
# in NumPy 1.24 and raises AttributeError there.
vital_mv_new = pd.DataFrame()
for vital_name in ('mean_arterial_bp', 'heart_rate',
                   'respiration_rate', 'temperature_c'):
    vital_mv_new[vital_name] = np.nan

# Number of raw rows in vitals_mv.
count = vitals_mv.shape[0]

In [127]:
# Fold the long-format vitals_mv rows (one measurement per row) into wide rows
# with one column per vital sign. Consecutive rows that share a charttime are
# merged into a single output row; a later value for the same vital sign
# overwrites the earlier one.
#
# Changes from the earlier version of this cell:
#   * every row of vitals_mv is processed; the loop used to stop after a
#     hard-coded 100 rows, so the exported file was truncated (and inputs
#     shorter than 100 rows raised IndexError);
#   * rows are collected as plain dicts and the DataFrame is built once;
#     DataFrame.append was removed in pandas 2.0, and writing into a row
#     taken with .iloc triggered SettingWithCopyWarning;
#   * itemids that get_vital does not know are skipped instead of creating
#     a `None` key.
# NOTE(review): only charttime is compared, so adjacent rows of two different
# stays with an identical charttime would be merged — confirm the input order.
id_columns = ['subject_id', 'hadm_id', 'icustay_id', 'charttime']
vital_rows = []
prev_charttime = None
prev_df = None
for this_data in vitals_mv.itertuples(index=False):
    if prev_df is None or this_data.charttime != prev_charttime:
        # A new charttime starts: open a row that carries the identifiers.
        prev_df = {name: getattr(this_data, name) for name in id_columns}
        vital_rows.append(prev_df)
        prev_charttime = this_data.charttime

    vital = get_vital(this_data.itemid)
    if vital is not None:
        prev_df[vital] = this_data.value

columns = ['subject_id', 'hadm_id', 'icustay_id', 'charttime', 'mean_arterial_bp',
           'heart_rate', 'respiration_rate', 'temperature_c']
# reindex adds any vital-sign column that never occurred and fixes the order.
vital_mv_new = pd.DataFrame(vital_rows).reindex(columns=columns)
vital_mv_new.head(n=15)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,subject_id,hadm_id,icustay_id,charttime,mean_arterial_bp,heart_rate,respiration_rate,temperature_c
0,23.0,124321.0,234044.0,5926565000.0,,,,95.8
1,23.0,124321.0,234044.0,5926565000.0,,77.0,14.0,
2,23.0,124321.0,234044.0,5926565000.0,,75.0,,
3,23.0,124321.0,234044.0,5926568000.0,86.0,84.0,18.0,
4,23.0,124321.0,234044.0,5926572000.0,86.0,78.0,11.0,97.2
5,23.0,124321.0,234044.0,5926576000.0,98.0,81.0,13.0,
6,23.0,124321.0,234044.0,5926580000.0,104.0,72.0,13.0,
7,23.0,124321.0,234044.0,5926583000.0,104.0,64.0,16.0,
8,23.0,124321.0,234044.0,5926586000.0,,69.0,15.0,
9,23.0,124321.0,234044.0,5926586000.0,225.0,,,


In [131]:
# Stack the two wide vital-sign frames and write them to a single CSV.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True renumbers the rows from 0 exactly as before.
vital_combined = pd.concat([vital_cv_new, vital_mv_new], ignore_index=True)
vital_combined.to_csv(os.path.join(export_dir,'vital_combined.csv'),index=False,sep=',')

## Combining the vaso_cv.csv and vaso_mv.csv

In [134]:
def get_vaso(itemid):
    '''
    Look up the name of the vasopressor that an itemid stands for.
    Args:
        itemid: The itemid for the vasopressor
    Returns:
        vaso: a string naming the vasopressor, or None when the itemid is not
              one of the ids listed below
    '''
    vaso_itemids = (
        ('dopamine', (30043, 221662)),
        ('levophed', (30047, 30120)),
        ('vasopressin', (30051, 222315)),
        ('epineprine', (30119, 221289)),
        ('neosynephrine', (30127, 30128)),
        ('dopamine_drip', (30307,)),
        ('phenylephrine', (221749,)),
        ('norepinephrine', (221906,)),
    )
    for vaso, itemids in vaso_itemids:
        if itemid in itemids:
            return vaso
    return None

In [135]:
# Locate the two vasopressor CSV files inside export_dir ...
vaso_cv_path, vaso_mv_path = (
    os.path.join(export_dir, file_name)
    for file_name in ('vaso_cv.csv', 'vaso_mv.csv')
)

# ... and read each of them into its own DataFrame.
vaso_cv, vaso_mv = map(pd.read_csv, (vaso_cv_path, vaso_mv_path))

In [136]:
# Display one raw row (position 2) to see which fields vaso_cv provides.
vaso_cv.iloc[2]

subject_id              3
hadm_id            145834
icustay_id         211552
charttime     4.15929e+09
itemid              30043
label            Dopamine
rate_std              0.1
Name: 2, dtype: object

In [137]:
# Start from an empty frame that already has one column per vasopressor name
# returned by get_vaso. np.nan replaces the former np.float('nan'): the
# np.float alias was removed in NumPy 1.24 and raises AttributeError there.
vaso_cv_new = pd.DataFrame()
for vaso_name in ('dopamine', 'levophed', 'vasopressin', 'epineprine',
                  'neosynephrine', 'dopamine_drip', 'phenylephrine',
                  'norepinephrine'):
    vaso_cv_new[vaso_name] = np.nan

# Number of raw rows in vaso_cv. This used to read vaso_cv_new.shape[0],
# which is always 0 because vaso_cv_new has just been created empty.
count = vaso_cv.shape[0]