### Goal: create a feature table that includes
- abnormal lab values
- min, max, and mean of selected lab values
- min, max, and mean of selected chart events
- sum of urine output
- demographic data
- diagnosis and prescription counts

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import statistics
import matplotlib.pyplot as plt

In [2]:
def _load_event_subset(csv_path, row_prefix):
    """Read one event-table CSV and normalise the columns used for time binning.

    HADM_ID is cast to int64, CHARTTIME is parsed into datetimes, and ROW_ID is
    converted to a string carrying `row_prefix` (presumably so that row ids from
    different tables stay distinct once the tables are stacked).
    """
    events = pd.read_csv(csv_path)
    events['HADM_ID'] = events['HADM_ID'].astype(np.int64)
    events['CHARTTIME'] = pd.to_datetime(events['CHARTTIME'])
    events['ROW_ID'] = row_prefix + events['ROW_ID'].astype(str)
    return events


labevents_subset = _load_event_subset('./subsets_tables_collection/labevents_subset.csv', 'lab')
chartevents_subset = _load_event_subset('./subsets_tables_collection/chartevents_subset.csv', 'chart')
outputevents_subset = _load_event_subset('./subsets_tables_collection/outputevents_subset.csv', 'output')

# Only these three columns are needed to assign every event to a time bin.
_time_key_cols = ['ROW_ID', 'CHARTTIME', 'HADM_ID']
lab_times = labevents_subset[_time_key_cols]
chart_times = chartevents_subset[_time_key_cols]
out_times = outputevents_subset[_time_key_cols]

# Stack all events, then group them per admission into 48-hour CHARTTIME windows.
lab_chart_out_times = pd.concat([lab_times, chart_times, out_times], sort=False)
grouped = lab_chart_out_times.groupby(
    ['HADM_ID', pd.Grouper(freq='48H', key='CHARTTIME')]
)
# Running state shared by successive `counter` calls.
count = 0        # number of groups already labelled for the admission in `past_hadm`
past_hadm = ''   # HADM_ID of the previously seen group; '' until the first call
def counter(x):
    """Return the ``"<HADM_ID>_<n>"`` label for one (HADM_ID, time-bin) group.

    Parameters
    ----------
    x : pandas.Series
        The HADM_ID values of a single group; only the first element is read.

    Returns
    -------
    str
        ``"<HADM_ID>_<n>"`` where ``n`` starts at 1 for the first group seen
        for an admission and increases by one for each further consecutive
        group of the same admission.

    Notes
    -----
    The numbering lives in the module-level ``count`` / ``past_hadm`` globals,
    so the result depends on call order: all groups of one admission must be
    passed consecutively and in chronological order for ``n`` to be a
    meaningful bin number.
    """
    global count, past_hadm
    curr_hadm = x.iloc[0]
    if past_hadm != curr_hadm:
        # A different admission than the previous call: restart the numbering.
        count = 0
        past_hadm = curr_hadm
    count += 1
    return f"{curr_hadm}_{count}"
# Label every event row with "<HADM_ID>_<bin number>": `transform` broadcasts the
# string that `counter` returns for a group to all rows of that group.
# NOTE(review): `counter` keeps its numbering in module-level globals, so the labels
# depend on the order in which the groups are visited — confirm before changing the groupby.
lab_chart_out_times['HADM_datebin_num'] = grouped['HADM_ID'].transform(counter)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Attach the bin label to each event table by joining on the prefixed ROW_ID.
# CHARTTIME and HADM_ID exist on both sides, so the merged frames carry them twice
# with the _x (event table) and _y (lab_chart_out_times) suffixes.
labevents_subset_binned = pd.merge(labevents_subset, lab_chart_out_times, on='ROW_ID')
chartevents_subset_binned = pd.merge(chartevents_subset, lab_chart_out_times, on='ROW_ID')
outputevents_subset_binned = pd.merge(outputevents_subset, lab_chart_out_times, on='ROW_ID')

In [4]:
import sqlite3  # standard-library SQLite driver

# Open the on-disk MIMIC database and pull the lab item dictionary.
conn = sqlite3.connect("./data/MIMIC.db")
labitems = pd.read_sql("SELECT * FROM d_labitems", conn)

# Join the dictionary onto the binned lab events by ITEMID, then discard the
# dictionary's own ROW_ID / index columns (suffixed _y by the merge).
cols_to_use = ['ITEMID']
labevents_subset_binned = (
    labevents_subset_binned
    .merge(labitems, on=cols_to_use)
    .drop(columns=['ROW_ID_y', 'index_y'])
)

### Make features out of abnormal lab values

In [5]:
# Split lab events by whether FLAG marks them as 'abnormal'; every other row
# (including rows with a missing FLAG) goes to the "normal" frame.
is_abnormal = labevents_subset_binned['FLAG'] == 'abnormal'
labevents_subset_abnormal = labevents_subset_binned[is_abnormal]
labevents_subset_normal = labevents_subset_binned[~is_abnormal]

In [6]:
# Keep only lab labels that were flagged abnormal more than 5 times;
# labels flagged 5 times or fewer are dropped.
num_abnormal_labels = labevents_subset_abnormal['LABEL'].value_counts()
abnormal_labels = num_abnormal_labels.loc[num_abnormal_labels.gt(5)].index
abnormal_labels

Index(['Glucose', 'Hematocrit', 'Hemoglobin', 'Red Blood Cells',
       'Urea Nitrogen', 'Creatinine', 'PT', 'RDW', 'Calcium, Total',
       'Chloride', 'pO2', 'PTT', 'Bicarbonate', 'Phosphate', 'MCH', 'Sodium',
       'Platelet Count', 'pCO2', 'White Blood Cells', 'INR(PT)', 'pH', 'MCHC',
       'MCV', 'Lactate', 'Troponin T', 'Free Calcium', 'Creatine Kinase (CK)',
       'Calculated Total CO2', 'Neutrophils', 'Lymphocytes', 'Potassium',
       'Albumin', 'Anion Gap', 'Magnesium', 'Bilirubin, Total',
       'Alkaline Phosphatase', 'Osmolality, Measured',
       'Asparate Aminotransferase (AST)', 'Creatine Kinase, MB Isoenzyme',
       'Lactate Dehydrogenase (LD)', 'Potassium, Whole Blood', 'RBC',
       'Alanine Aminotransferase (ALT)', 'CK-MB Index', 'Vancomycin',
       'Fibrinogen, Functional', 'Sodium, Whole Blood', 'Hyaline Casts',
       'Cortisol', 'WBC', 'Bands', 'Chloride, Whole Blood', 'Eosinophils',
       'Bilirubin, Direct', 'Gentamicin', 'Iron', 'Transferrin',
       'I

In [7]:
# Restrict the abnormal rows to the labels that passed the ">5 occurrences" threshold.
has_frequent_label = labevents_subset_abnormal['LABEL'].isin(abnormal_labels)
labevents_subset_abnormal = labevents_subset_abnormal.loc[has_frequent_label]

In [8]:
# Preview only the first rows instead of rendering the whole frame.
labevents_subset_abnormal.head(10)

Unnamed: 0.1,Unnamed: 0,index_x,ROW_ID_x,SUBJECT_ID,HADM_ID_x,ITEMID,CHARTTIME_x,VALUE,VALUENUM,VALUEUOM,FLAG,CHARTTIME_y,HADM_ID_y,HADM_datebin_num,LABEL,FLUID,CATEGORY,LOINC_CODE
0,0,204360,lab212234,338,194592,50912,2135-11-01 06:00:00,1.3,1.3,mg/dL,abnormal,2135-11-01 06:00:00,194592,194592_4,Creatinine,Blood,Chemistry,2160-0
1,40,204400,lab212274,338,194592,50912,2135-11-02 06:00:00,1.4,1.4,mg/dL,abnormal,2135-11-02 06:00:00,194592,194592_4,Creatinine,Blood,Chemistry,2160-0
2,68,204904,lab212302,338,194592,50912,2135-11-03 08:50:00,1.3,1.3,mg/dL,abnormal,2135-11-03 08:50:00,194592,194592_5,Creatinine,Blood,Chemistry,2160-0
3,87,209756,lab212001,338,194592,50912,2135-10-27 05:25:00,1.5,1.5,mg/dL,abnormal,2135-10-27 05:25:00,194592,194592_1,Creatinine,Blood,Chemistry,2160-0
4,121,210499,lab212035,338,194592,50912,2135-10-27 19:01:00,1.4,1.4,mg/dL,abnormal,2135-10-27 19:01:00,194592,194592_1,Creatinine,Blood,Chemistry,2160-0
5,144,210522,lab212058,338,194592,50912,2135-10-28 00:20:00,1.4,1.4,mg/dL,abnormal,2135-10-28 00:20:00,194592,194592_2,Creatinine,Blood,Chemistry,2160-0
6,191,211766,lab212105,338,194592,50912,2135-10-28 14:20:00,1.3,1.3,mg/dL,abnormal,2135-10-28 14:20:00,194592,194592_2,Creatinine,Blood,Chemistry,2160-0
8,206,211781,lab212120,338,194592,50912,2135-10-29 07:05:00,1.3,1.3,mg/dL,abnormal,2135-10-29 07:05:00,194592,194592_2,Creatinine,Blood,Chemistry,2160-0
9,232,212279,lab212146,338,194592,50912,2135-10-30 06:50:00,1.4,1.4,mg/dL,abnormal,2135-10-30 06:50:00,194592,194592_3,Creatinine,Blood,Chemistry,2160-0
10,272,213028,lab212186,338,194592,50912,2135-10-31 05:30:00,1.3,1.3,mg/dL,abnormal,2135-10-31 05:30:00,194592,194592_3,Creatinine,Blood,Chemistry,2160-0


In [9]:
# One-hot encode LABEL into abnormal_lab_<label> indicator columns.
# The dtype is pinned to uint8: pandas >= 2.0 defaults to bool, and bool columns
# turn into object dtype once these rows are concatenated with rows that lack the
# indicator columns, which breaks the numeric per-bin sum taken afterwards.
labevents_subset_abnormal = pd.get_dummies(
    labevents_subset_abnormal,
    prefix='abnormal_lab',
    columns=['LABEL'],
    dtype=np.uint8,
)

In [10]:
# Stack the one-hot-encoded abnormal rows on top of the non-abnormal rows.
# The abnormal_lab_* columns exist only on the abnormal rows, so they are NaN
# for every row coming from labevents_subset_normal.
labevents_subset_features = pd.concat([labevents_subset_abnormal, labevents_subset_normal], sort=False)

In [11]:
# LABEL is no longer needed once the indicator columns exist.
labevents_subset_features = labevents_subset_features.drop(columns=['LABEL'])

In [12]:
# Count abnormal results per bin: keep the abnormal* indicator columns plus the
# bin label, then sum the indicators within each bin.
abnormal_cols = [name for name in labevents_subset_features.columns if name.startswith('abnormal')]
filter_col = abnormal_cols + ['HADM_datebin_num']
labevents_subset_abnormal_features = (
    labevents_subset_features[filter_col]
    .groupby('HADM_datebin_num')
    .sum()
)

In [13]:
# Preview only the first rows instead of rendering the whole frame.
labevents_subset_abnormal_features.head(10)

Unnamed: 0_level_0,abnormal_lab_Alanine Aminotransferase (ALT),abnormal_lab_Albumin,abnormal_lab_Alkaline Phosphatase,abnormal_lab_Anion Gap,abnormal_lab_Asparate Aminotransferase (AST),abnormal_lab_Bands,abnormal_lab_Bicarbonate,"abnormal_lab_Bilirubin, Direct","abnormal_lab_Bilirubin, Total",abnormal_lab_CK-MB Index,...,abnormal_lab_Troponin I,abnormal_lab_Troponin T,abnormal_lab_Urea Nitrogen,abnormal_lab_Urobilinogen,abnormal_lab_Vancomycin,abnormal_lab_WBC,abnormal_lab_White Blood Cells,abnormal_lab_pCO2,abnormal_lab_pH,abnormal_lab_pO2
HADM_datebin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102390_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
102390_2,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,1.0,4.0
102390_3,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
102390_4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105348_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
105348_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105348_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105348_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105348_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116543_1,0.0,0.0,2.0,5.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,6.0,0.0,0.0,0.0,2.0,2.0,5.0,2.0


### Extract numeric features out of chartevents

In [14]:
# What ITEMID from chartevents do we care about?
# Each dictionary maps a human-readable measurement name to the list of ITEMIDs
# that record it. Only the ITEMID lists are consumed downstream (flattened into
# an `isin` filter); the keys serve as documentation.
chart_ids_we_care = {
    'heart_rate': [211, 220045],
    'oxygen': [646, 220277, 834],
    'respiratory rate': [618,220210,3603,224689],
    'systolic blood pressure': [51, 455, 220179, 220050, 3313, 225309],
    'diastolic blood pressure': [8368, 8441, 220180, 220051, 8502, 225310],
    'mean blood pressure': [52, 456, 220181, 220052, 3312, 225312],
    'Glascow': [184, 723, 454, 220739, 223900, 223901],
    'temp F': [678, 223761, 679, ]
}
# ITEMIDs from outputevents that are summed into the urine-output feature.
outputevents_ids_we_care = {
    'urine': [40055, 226559, 43175, 40069, 40094, 40065, 40061, 40715, 226627, 40473]
}
# ITEMIDs from labevents for which min / max / mean features are computed.
lab_ids_we_care = {
    'Hematocrit': [51221],
    'potassium': [50971],
    'sodium': [50983],
    'creatinine': [50912],
    'chloride': [50902],
    'platelets': [51265],
    'white blood cell': [51301],
    'hemoglobin': [51222],
    'glucose': [50931],
    'RBC count': [51279]
}

In [15]:
# Flatten the per-measurement ITEMID lists into one list of chart ITEMIDs.
flat_list = [itemid for itemids in chart_ids_we_care.values() for itemid in itemids]

# Keep only the chart events for those ITEMIDs.
is_selected_item = chartevents_subset_binned['ITEMID'].isin(flat_list)
chartevents_subset_selected = chartevents_subset_binned[is_selected_item]

# Map every ITEMID to its variable name (LEVEL2) and trim to the needed columns.
itemid_to_var_df = pd.read_csv('./resources/itemid_to_variable_map.csv')
cols_to_keep = [
    'SUBJECT_ID', 'HADM_ID_x', 'ITEMID', 'CHARTTIME_x', 'VALUE',
    'VALUENUM', 'VALUEUOM', 'LEVEL2', 'HADM_datebin_num',
]
chartevents_subset_selected = chartevents_subset_selected.merge(itemid_to_var_df, on='ITEMID')[cols_to_keep]

In [16]:
# Get rid of rows that have no numeric value.
chartevents_subset_selected = chartevents_subset_selected.dropna(subset=['VALUENUM'])

In [17]:
# Aggregate VALUENUM per (bin, variable) and pivot the variables into columns.
chart_value_groups = chartevents_subset_selected.groupby(['HADM_datebin_num', 'LEVEL2'])['VALUENUM']
chartevents_features_max = chart_value_groups.max().unstack()
chartevents_features_min = chart_value_groups.min().unstack()
chartevents_features_mean = chart_value_groups.mean().unstack()

In [18]:
# Tag each statistic's columns with its suffix, then place the three blocks side by side.
chartevents_features_max, chartevents_features_min, chartevents_features_mean = (
    stat_frame.add_suffix(suffix)
    for stat_frame, suffix in (
        (chartevents_features_max, '_max'),
        (chartevents_features_min, '_min'),
        (chartevents_features_mean, '_mean'),
    )
)
chartevents_features = pd.concat(
    [chartevents_features_max, chartevents_features_min, chartevents_features_mean],
    axis=1,
)

In [19]:
# Preview only the first rows instead of rendering the whole frame.
chartevents_features.head(10)

LEVEL2,Diastolic blood pressure_max,Glascow coma scale eye opening_max,Glascow coma scale motor response_max,Glascow coma scale verbal response_max,Heart Rate_max,Mean blood pressure_max,Oxygen saturation_max,Respiratory rate_max,Systolic blood pressure_max,Temperature_max,...,Diastolic blood pressure_mean,Glascow coma scale eye opening_mean,Glascow coma scale motor response_mean,Glascow coma scale verbal response_mean,Heart Rate_mean,Mean blood pressure_mean,Oxygen saturation_mean,Respiratory rate_mean,Systolic blood pressure_mean,Temperature_mean
HADM_datebin_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102390_1,63.0,1.0,4.0,1.0,84.0,89.000000,100.0,24.0,147.0,97.000000,...,54.000000,1.000000,4.000000,1.000000,77.666667,76.000000,100.000000,14.000000,131.800000,97.000000
102390_2,65.0,1.0,5.0,1.0,89.0,95.000000,100.0,26.0,160.0,101.500000,...,54.288462,1.000000,4.404255,1.000000,75.960784,79.557692,100.000000,15.984375,131.807692,99.342857
102390_3,76.0,1.0,4.0,1.0,102.0,116.000000,100.0,38.0,173.0,100.900000,...,60.551020,1.000000,4.000000,1.000000,70.408163,89.040816,99.816327,16.682540,146.122449,99.600000
102390_4,110.0,1.0,4.0,1.0,128.0,117.000000,100.0,39.0,186.0,101.600000,...,68.106383,1.000000,4.000000,1.000000,99.042553,91.936170,93.382979,18.183333,136.787234,100.930000
105348_1,79.0,4.0,6.0,5.0,147.0,86.000000,100.0,21.0,132.0,100.000000,...,57.000000,3.666667,6.000000,5.000000,118.739130,70.571429,98.478261,13.086957,114.238095,98.085714
105348_2,63.0,4.0,6.0,5.0,136.0,77.000000,100.0,17.0,128.0,101.100000,...,54.066667,3.750000,6.000000,5.000000,125.733333,69.000000,99.066667,13.666667,120.600000,99.725000
116543_1,85.0,4.0,6.0,5.0,114.0,133.000000,99.0,27.0,112.0,100.100000,...,50.833333,3.000000,4.250000,3.500000,85.081081,60.166667,95.363636,17.918919,87.250000,98.612500
116543_2,64.0,4.0,6.0,5.0,88.0,73.000000,100.0,34.0,116.0,98.100000,...,47.046512,2.800000,5.200000,3.600000,80.844444,56.697674,97.444444,18.977778,89.465116,96.422222
117382_1,100.0,4.0,6.0,5.0,96.0,126.000000,100.0,25.0,184.0,98.599998,...,67.285714,3.600000,6.000000,5.000000,78.071429,98.238093,98.071429,17.285714,160.142857,97.750000
117382_2,104.0,4.0,6.0,5.0,108.0,117.000000,100.0,26.0,172.0,100.800003,...,56.872340,3.911765,6.000000,5.000000,71.183673,84.978743,98.775510,19.729167,141.191489,98.981818


### Extract numeric features from lab events

In [20]:
# Flatten the per-measurement ITEMID lists into one list of lab ITEMIDs.
flat_list = [itemid for itemids in lab_ids_we_care.values() for itemid in itemids]

# Keep the selected lab events, attach the variable name (LEVEL2), trim the
# columns, and discard rows without a numeric value.
cols_to_keep = [
    'SUBJECT_ID', 'HADM_ID_x', 'ITEMID', 'CHARTTIME_x', 'VALUE',
    'VALUENUM', 'VALUEUOM', 'LEVEL2', 'HADM_datebin_num',
]
is_selected_lab = labevents_subset_binned['ITEMID'].isin(flat_list)
labevents_subset_selected = labevents_subset_binned[is_selected_lab]
labevents_subset_selected = labevents_subset_selected.merge(itemid_to_var_df, on='ITEMID')[cols_to_keep]
labevents_subset_selected = labevents_subset_selected.dropna(subset=['VALUENUM'])

# Max / min / mean of VALUENUM per (bin, variable), one column per variable and statistic.
lab_value_groups = labevents_subset_selected.groupby(['HADM_datebin_num', 'LEVEL2'])['VALUENUM']
labevents_features_max = lab_value_groups.max().unstack().add_suffix('_max')
labevents_features_min = lab_value_groups.min().unstack().add_suffix('_min')
labevents_features_mean = lab_value_groups.mean().unstack().add_suffix('_mean')
labevents_features = pd.concat(
    [labevents_features_max, labevents_features_min, labevents_features_mean],
    axis=1,
)

### Extract numeric features from outputevents

In [21]:
# Flatten the per-measurement ITEMID lists into one list of output ITEMIDs.
flat_list = [itemid for itemids in outputevents_ids_we_care.values() for itemid in itemids]

# Keep the selected output events, attach the variable name (LEVEL2), trim the
# columns, and discard rows without a VALUE.
cols_to_keep = ['SUBJECT_ID', 'HADM_ID_x', 'ITEMID', 'CHARTTIME_x', 'VALUE', 'VALUEUOM', 'LEVEL2', 'HADM_datebin_num']
is_selected_output = outputevents_subset_binned['ITEMID'].isin(flat_list)
outputevents_subset_selected = outputevents_subset_binned[is_selected_output]
outputevents_subset_selected = outputevents_subset_selected.merge(itemid_to_var_df, on='ITEMID')[cols_to_keep]
outputevents_subset_selected = outputevents_subset_selected.dropna(subset=['VALUE'])

# Total VALUE per (bin, variable), one "<variable>_sum" column per variable.
# No imputation of missing sums is applied here.
outputevents_features_sum = (
    outputevents_subset_selected
    .groupby(['HADM_datebin_num', 'LEVEL2'])['VALUE']
    .sum()
    .unstack()
    .add_suffix('_sum')
)

In [22]:
# Preview only the first rows instead of rendering the whole frame.
outputevents_features_sum.head(10)

LEVEL2,Urine output_sum
HADM_datebin_num,Unnamed: 1_level_1
102390_2,5390.0
102390_3,4775.0
102390_4,3350.0
105348_1,1870.0
105348_2,1600.0
116543_1,205.0
116543_2,95.0
117382_1,980.0
117382_2,2160.0
117382_3,2640.0


### Merge Everything

In [23]:
# Outer-join all per-bin feature blocks on their shared HADM_datebin_num index so
# that a bin present in any block is kept (absent features become NaN).
feature_table = (
    labevents_features
    .merge(chartevents_features, left_index=True, right_index=True, how='outer')
    .merge(labevents_subset_abnormal_features, left_index=True, right_index=True, how='outer')
    .merge(outputevents_features_sum, left_index=True, right_index=True, how='outer')
)


### Add HADM_ID back into the feature table and create bin_num to mark which bin each row is for that admission. Both will be useful for assigning death labels.

In [24]:
# The index holds "<HADM_ID>_<bin number>" strings; split them into two numeric columns.
id_parts = feature_table.index.to_series().str.split('_', expand=True)
feature_table['HADM_ID'] = pd.to_numeric(id_parts[0])
feature_table['bin_num'] = pd.to_numeric(id_parts[1])

In [25]:
# Number of distinct admissions in the feature table.
feature_table['HADM_ID'].nunique()

43

### Obtain the number of diagnoses for each admission (HADM_ID)

In [26]:
# Load the diagnoses subset; the next step reads its HADM_ID and ICD9_CODE columns.
diagnoses_subset = pd.read_csv('./subsets_tables_collection/diagnoses_icd_subset.csv')

In [27]:
# Number of non-null ICD9_CODE entries per admission (Series indexed by HADM_ID).
diagnoses_count = diagnoses_subset.groupby('HADM_ID')['ICD9_CODE'].count()

### Obtain the number of medications for each admission (HADM_ID)

In [28]:
prescriptions_subset = pd.read_csv('./subsets_tables_collection/prescriptions_subset.csv')

# Count DRUG entries per admission and drug type, one "<DRUG_TYPE>_drug" column
# per type; admission/type combinations that never occur count as 0.
prescriptions_count = (
    prescriptions_subset
    .groupby(['HADM_ID', 'DRUG_TYPE'])['DRUG']
    .count()
    .unstack()
    .fillna(0)
    .add_suffix('_drug')
)

In [29]:
# Turn the HADM_ID index into a regular column so it can serve as a merge key.
prescriptions_count = prescriptions_count.reset_index(level=0)
prescriptions_count.head()

DRUG_TYPE,HADM_ID,ADDITIVE_drug,BASE_drug,MAIN_drug
0,102390,0.0,27.0,52.0
1,105348,0.0,25.0,49.0
2,116543,0.0,25.0,44.0
3,117382,0.0,11.0,46.0
4,122406,0.0,32.0,79.0


In [30]:
# Number of distinct admissions that have prescription data.
prescriptions_count['HADM_ID'].nunique()

35

In [31]:
# Convert the per-admission Series into a two-column frame (HADM_ID, diag_count).
diagnoses_count = diagnoses_count.rename_axis('HADM_ID').reset_index(name='diag_count')
diagnoses_count.head()

Unnamed: 0,HADM_ID,diag_count
0,102390,14
1,105348,12
2,116543,26
3,117382,8
4,118776,9


In [32]:
# Number of distinct admissions that have diagnosis data.
diagnoses_count['HADM_ID'].nunique()

43

In [33]:
# Number of distinct admissions that have prescription data.
prescriptions_count['HADM_ID'].nunique()

35

In [34]:
# Outer-join diagnosis and prescription counts per admission; an admission that is
# missing from one side gets 0 for that side's counts.
prescription_diag_merge = (
    diagnoses_count
    .merge(prescriptions_count, how='outer', on='HADM_ID')
    .fillna(0)
)

In [35]:
# Number of distinct admissions after combining diagnoses and prescriptions.
prescription_diag_merge['HADM_ID'].nunique()

43

Merge diagnoses and medications

### Incorporate demographics info

In [36]:
admissions_subset = pd.read_csv('./subsets_tables_collection/admissions_subset.csv')
patients = pd.read_sql("""SELECT subject_id, gender
                            FROM patients
                            """, conn)

# Encode gender numerically (M -> 0, F -> 1) and name the column after what 1 means.
patients = patients.replace(['M','F'], [0,1]).rename(columns={'GENDER': 'is_female'})

# Attach gender to each admission and keep only the demographic columns of interest.
demographic_cols = ['HADM_ID', 'INSURANCE', 'ETHNICITY', 'is_female']
admissions_with_gender = admissions_subset.merge(patients, on='SUBJECT_ID')[demographic_cols]

# One-hot encode the categorical demographics, then add the diagnosis/prescription counts.
admissions_features = pd.get_dummies(
    admissions_with_gender,
    prefix=['insurance','ethnicity'],
    columns=['INSURANCE', 'ETHNICITY'],
)
patient_features = admissions_features.merge(prescription_diag_merge, on='HADM_ID')

In [37]:
# Number of distinct admissions in the feature table before the demographic merge.
feature_table['HADM_ID'].nunique()

43

In [38]:
# Add the per-admission features to every bin row of that admission
# (inner join on HADM_ID; the result gets a fresh integer index).
feature_table = pd.merge(feature_table, patient_features, on='HADM_ID')
feature_table.dtypes

Chloride_max                            float64
Creatinine_max                          float64
Glucose_max                             float64
Hematocrit_max                          float64
Hemoglobin_max                          float64
Platelets_max                           float64
Potassium_max                           float64
Red blood cell count_max                float64
Sodium_max                              float64
White blood cell count_max              float64
Chloride_min                            float64
Creatinine_min                          float64
Glucose_min                             float64
Hematocrit_min                          float64
Hemoglobin_min                          float64
Platelets_min                           float64
Potassium_min                           float64
Red blood cell count_min                float64
Sodium_min                              float64
White blood cell count_min              float64
Chloride_mean                           

In [41]:
# Read the full admissions table from the on-disk MIMIC SQLite database.
# NOTE(review): this rebinds `conn` to a newly opened connection; a connection
# previously bound to that name is not closed here, and this one is never closed either.
import sqlite3
conn = sqlite3.connect("./data/MIMIC.db")
admissions_df=pd.read_sql("""
    SELECT * FROM admissions
""",conn)

In [42]:
# Remove the final bin (largest bin_num) of every admission.
# - `idx` is True on rows whose bin_num equals the admission's maximum.
# - The mask is inverted with `~`; unary minus on a boolean mask raises a
#   TypeError in current numpy/pandas.
# - The string alias 'max' replaces the builtin `max`, which newer pandas
#   deprecates as a transform argument.
# - `.copy()` makes the result an independent frame so that adding columns to it
#   afterwards does not trigger SettingWithCopyWarning.
# NOTE(review): this cell is not idempotent — each re-run drops one more bin per
# admission. Presumably the last bin is dropped so the label refers to the window
# preceding the outcome — TODO confirm.
idx = feature_table.groupby(['HADM_ID'], sort=False)['bin_num'].transform('max') == feature_table['bin_num']
feature_table = feature_table[~idx].copy()

In [47]:
# HADM_IDs of admissions whose HOSPITAL_EXPIRE_FLAG equals 1.
died_in_hospital = admissions_df['HOSPITAL_EXPIRE_FLAG'] == 1
dead_patient_HADM_ID = admissions_df.loc[died_in_hospital, 'HADM_ID']

In [45]:
# Label = 1 only on the last remaining bin (largest bin_num) of an admission whose
# HADM_ID is in dead_patient_HADM_ID; every other row gets 0.
# The string alias 'max' is used because passing the builtin `max` to transform
# is deprecated in newer pandas.
bool_max = feature_table.groupby(['HADM_ID'], sort=False)['bin_num'].transform('max')
is_last_bin = bool_max == feature_table['bin_num']
is_dead_admission = feature_table['HADM_ID'].isin(dead_patient_HADM_ID)
feature_table['label'] = np.where(is_last_bin & is_dead_admission, 1, 0)
feature_table.head(n=30)

Unnamed: 0,Chloride_max,Creatinine_max,Glucose_max,Hematocrit_max,Hemoglobin_max,Platelets_max,Potassium_max,Red blood cell count_max,Sodium_max,White blood cell count_max,...,ethnicity_MULTI RACE ETHNICITY,ethnicity_PATIENT DECLINED TO ANSWER,ethnicity_UNABLE TO OBTAIN,ethnicity_UNKNOWN/NOT SPECIFIED,ethnicity_WHITE,diag_count,ADDITIVE_drug,BASE_drug,MAIN_drug,label
0,108.0,0.8,160.0,35.4,12.4,210.0,2.9,3.8,143.0,10.6,...,0,0,0,0,0,14,0.0,27.0,52.0,0
1,120.0,0.7,207.0,30.3,10.6,211.0,3.3,3.28,148.0,12.7,...,0,0,0,0,0,14,0.0,27.0,52.0,0
2,118.0,0.6,137.0,25.0,8.9,165.0,4.0,2.7,144.0,9.8,...,0,0,0,0,0,14,0.0,27.0,52.0,1
4,110.0,0.9,148.0,32.1,11.7,312.0,4.7,3.61,140.0,16.1,...,1,0,0,0,0,12,0.0,25.0,49.0,0
5,105.0,0.7,122.0,30.3,7.2,154.0,4.3,2.26,137.0,7.7,...,1,0,0,0,0,12,0.0,25.0,49.0,0
6,106.0,0.5,97.0,28.1,9.8,157.0,4.0,3.18,140.0,7.6,...,1,0,0,0,0,12,0.0,25.0,49.0,0
7,105.0,0.5,117.0,28.0,10.1,239.0,4.0,3.3,139.0,7.0,...,1,0,0,0,0,12,0.0,25.0,49.0,0
9,105.0,3.7,206.0,30.2,9.3,256.0,5.0,3.88,135.0,12.8,...,0,0,0,0,1,26,0.0,25.0,44.0,0
10,101.0,4.7,131.0,31.5,9.4,236.0,5.5,3.96,130.0,15.4,...,0,0,0,0,1,26,0.0,25.0,44.0,1
12,106.0,1.1,112.0,37.8,13.7,197.0,4.0,4.23,143.0,7.1,...,0,0,0,0,1,8,0.0,11.0,46.0,0


In [48]:
# Persist the labelled feature table: header row written, row index omitted.
feature_table.to_csv("./data/feature_with_label.csv", index=False, header=True)