### Goal: create a feature for each abnormal lab result

In [43]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import statistics
import matplotlib.pyplot as plt

In [44]:
# Load the pre-extracted subset of lab events from CSV.
LABEVENTS_SUBSET_CSV = './subsets_tables_collection/labevents_subset.csv'
labevents_subset = pd.read_csv(LABEVENTS_SUBSET_CSV)

In [45]:
# Check the column dtypes of the freshly loaded lab-events subset.
labevents_subset.dtypes

Unnamed: 0      int64
index           int64
ROW_ID          int64
SUBJECT_ID      int64
HADM_ID       float64
ITEMID          int64
CHARTTIME      object
VALUE          object
VALUENUM      float64
VALUEUOM       object
FLAG           object
dtype: object

In [46]:
import sqlite3  # standard-library driver for SQLite databases

# Read the whole d_labitems table from the on-disk MIMIC database into a DataFrame.
# NOTE(review): the connection is left open here; close it once no later cell needs `conn`.
conn = sqlite3.connect("./data/MIMIC.db")
labitems = pd.read_sql(sql="SELECT * FROM d_labitems", con=conn)

In [47]:
# Check the column dtypes of the d_labitems table.
labitems.dtypes

index          int64
ROW_ID         int64
ITEMID         int64
LABEL         object
FLUID         object
CATEGORY      object
LOINC_CODE    object
dtype: object

In [48]:
# Join every lab event to its d_labitems row on ITEMID (default inner join).
# Columns present in both frames (index, ROW_ID) come back with _x / _y suffixes.
# This overwrites labevents_subset, so re-running the cell would merge a second time.
labevents_subset = pd.merge(labevents_subset, labitems, on='ITEMID')
labevents_subset.head()

Unnamed: 0.1,Unnamed: 0,index_x,ROW_ID_x,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,index_y,ROW_ID_y,LABEL,FLUID,CATEGORY,LOINC_CODE
0,0,204360,212234,338,194592.0,50912,2135-11-01 06:00:00,1.3,1.3,mg/dL,abnormal,239,113,Creatinine,Blood,Chemistry,2160-0
1,40,204400,212274,338,194592.0,50912,2135-11-02 06:00:00,1.4,1.4,mg/dL,abnormal,239,113,Creatinine,Blood,Chemistry,2160-0
2,68,204904,212302,338,194592.0,50912,2135-11-03 08:50:00,1.3,1.3,mg/dL,abnormal,239,113,Creatinine,Blood,Chemistry,2160-0
3,87,209756,212001,338,194592.0,50912,2135-10-27 05:25:00,1.5,1.5,mg/dL,abnormal,239,113,Creatinine,Blood,Chemistry,2160-0
4,121,210499,212035,338,194592.0,50912,2135-10-27 19:01:00,1.4,1.4,mg/dL,abnormal,239,113,Creatinine,Blood,Chemistry,2160-0


In [49]:
# Split the lab events by FLAG: rows flagged 'abnormal' versus every other row
# (any other flag value, including a missing flag, lands in the "normal" frame).
is_abnormal = labevents_subset['FLAG'] == 'abnormal'
labevents_subset_abnormal = labevents_subset[is_abnormal]
labevents_subset_normal = labevents_subset[~is_abnormal]

In [50]:
# Keep only the labs that were flagged abnormal more than 5 times;
# labs with 5 or fewer abnormal results are dropped.
num_abnormal_labels = labevents_subset_abnormal['LABEL'].value_counts()
abnormal_labels = num_abnormal_labels.loc[lambda counts: counts > 5].index
abnormal_labels

Index(['Glucose', 'Hematocrit', 'Hemoglobin', 'Red Blood Cells',
       'Urea Nitrogen', 'Creatinine', 'PT', 'RDW', 'Calcium, Total',
       'Chloride', 'pO2', 'PTT', 'Bicarbonate', 'Phosphate', 'Sodium', 'MCH',
       'Platelet Count', 'pCO2', 'White Blood Cells', 'INR(PT)', 'pH', 'MCHC',
       'MCV', 'Lactate', 'Troponin T', 'Free Calcium', 'Creatine Kinase (CK)',
       'Calculated Total CO2', 'Lymphocytes', 'Neutrophils', 'Potassium',
       'Albumin', 'Anion Gap', 'Magnesium', 'Bilirubin, Total',
       'Alkaline Phosphatase', 'Osmolality, Measured',
       'Asparate Aminotransferase (AST)', 'Creatine Kinase, MB Isoenzyme',
       'Lactate Dehydrogenase (LD)', 'Potassium, Whole Blood', 'RBC',
       'Alanine Aminotransferase (ALT)', 'CK-MB Index', 'Vancomycin',
       'Fibrinogen, Functional', 'Hyaline Casts', 'Sodium, Whole Blood',
       'Cortisol', 'WBC', 'Bands', 'Chloride, Whole Blood', 'Eosinophils',
       'Gentamicin', 'Iron', 'Bilirubin, Direct', 'Metamyelocytes',
      

In [51]:
# Restrict the abnormal rows to the labs that passed the frequency threshold.
has_frequent_label = labevents_subset_abnormal['LABEL'].isin(abnormal_labels)
labevents_subset_abnormal = labevents_subset_abnormal[has_frequent_label]

In [52]:
# One-hot encode LABEL: each lab becomes its own abnormal_lab_<label> indicator
# column and the LABEL column itself is removed from this frame.
# The dtype is pinned to uint8 (the historical default) because pandas >= 2.0
# defaults to bool, which would stop the indicators from being plain numeric
# columns once they are concatenated with rows that lack them.
# This cell consumes LABEL, so it cannot be re-run without re-running the cells above.
labevents_subset_abnormal = pd.get_dummies(
    labevents_subset_abnormal,
    prefix='abnormal_lab',
    columns=['LABEL'],
    dtype=np.uint8,
)

In [55]:
# Stack the one-hot-encoded abnormal rows and the non-abnormal rows into one frame.
# The non-abnormal rows carry no abnormal_lab_* columns, so those cells are NaN for them.
labevents_subset_features = pd.concat([labevents_subset_abnormal, labevents_subset_normal], sort=False)

In [88]:
# Check the column dtypes of the combined feature frame.
labevents_subset_features.dtypes

Unnamed: 0                                                   int64
index_x                                                      int64
ROW_ID_x                                                     int64
SUBJECT_ID                                                   int64
HADM_ID                                                      int64
ITEMID                                                       int64
CHARTTIME                                           datetime64[ns]
VALUE                                                       object
VALUENUM                                                   float64
VALUEUOM                                                    object
FLAG                                                        object
index_y                                                      int64
ROW_ID_y                                                     int64
FLUID                                                       object
CATEGORY                                                    ob

In [60]:
# Remove the LABEL column that came in with the non-abnormal rows; the abnormal
# labs are already represented by the abnormal_lab_* indicator columns.
labevents_subset_features = labevents_subset_features.drop(columns='LABEL')

In [62]:
# The DataFrame now has one indicator column per abnormal lab. Next step: bin the chart times into fixed windows.

0.0    6446
1.0      51
Name: abnormal_lab_Albumin, dtype: int64

In [84]:
# Assign every lab event to a 48-hour window within its hospital admission, then
# count per (admission, window) how many times each lab came back abnormal.
labevents_subset_features['CHARTTIME'] = pd.to_datetime(labevents_subset_features['CHARTTIME'])
labevents_subset_features['HADM_ID'] = labevents_subset_features['HADM_ID'].astype(np.int64)
grouped = labevents_subset_features.groupby(['HADM_ID', pd.Grouper(freq='48H', key='CHARTTIME')])

# Label each row '<HADM_ID>_<n>', where n is the 1-based position of the row's
# window among that admission's windows in time order.
# ngroup() numbers the (HADM_ID, window) groups 0..n-1 in sorted key order, so the
# windows of one admission receive consecutive numbers; subtracting the admission's
# smallest group number turns that into a per-admission index. Windows without any
# lab event do not form a group and are therefore not counted.
# This replaces a stateful callback that depended on being called once per group,
# in order, through module-level globals.
has_charttime = labevents_subset_features['CHARTTIME'].notna()
# Rows without a CHARTTIME belong to no window; they are left with a missing bin_date.
group_number = grouped.ngroup()[has_charttime].astype(np.int64)
hadm_ids = labevents_subset_features.loc[has_charttime, 'HADM_ID']
first_group_number = group_number.groupby(hadm_ids).transform('min')
window_number = group_number - first_group_number + 1
labevents_subset_features['bin_date'] = hadm_ids.astype(str) + '_' + window_number.astype(str)

# Sum the abnormal_lab_* indicators per bin_date; NaN indicators (rows that were
# not abnormal) are skipped by sum(), so they contribute nothing to the counts.
filter_col = [col for col in labevents_subset_features if col.startswith('abnormal')]
filter_col.append('bin_date')
labevents_subset_abnormal_cols = labevents_subset_features[filter_col]
total_abnormal_by_bindate = labevents_subset_abnormal_cols.groupby('bin_date').sum()

In [85]:
# Show the per-bin abnormal-lab counts (one row per '<HADM_ID>_<n>' bin label).
total_abnormal_by_bindate

Unnamed: 0_level_0,abnormal_lab_Alanine Aminotransferase (ALT),abnormal_lab_Albumin,abnormal_lab_Alkaline Phosphatase,abnormal_lab_Anion Gap,abnormal_lab_Asparate Aminotransferase (AST),abnormal_lab_Bands,abnormal_lab_Bicarbonate,"abnormal_lab_Bilirubin, Direct","abnormal_lab_Bilirubin, Total",abnormal_lab_CK-MB Index,...,abnormal_lab_Troponin I,abnormal_lab_Troponin T,abnormal_lab_Urea Nitrogen,abnormal_lab_Urobilinogen,abnormal_lab_Vancomycin,abnormal_lab_WBC,abnormal_lab_White Blood Cells,abnormal_lab_pCO2,abnormal_lab_pH,abnormal_lab_pO2
bin_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102390_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
102390_2,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,1.0,4.0
102390_3,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
102390_4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105348_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
105348_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105348_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105348_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105348_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116543_1,0.0,0.0,2.0,5.0,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,6.0,0.0,0.0,0.0,2.0,2.0,5.0,2.0


In [86]:
# Check the column dtypes of the feature frame after the binning step.
labevents_subset_features.dtypes

Unnamed: 0                                                   int64
index_x                                                      int64
ROW_ID_x                                                     int64
SUBJECT_ID                                                   int64
HADM_ID                                                      int64
ITEMID                                                       int64
CHARTTIME                                           datetime64[ns]
VALUE                                                       object
VALUENUM                                                   float64
VALUEUOM                                                    object
FLAG                                                        object
index_y                                                      int64
ROW_ID_y                                                     int64
FLUID                                                       object
CATEGORY                                                    ob