# eICU data exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from dataExtraction import get_query_result, extract_csv

In [2]:
# Sanity check: count the rows of the `patient` table.
# The result is indexed as [row][column], so [0][0] is the single COUNT value.
count_rows = get_query_result(
    user="dtank",
    database="eicu",
    query="SELECT count(*) FROM patient;",
)
qr = count_rows[0][0]
print(qr)

200859


In [None]:
# Disabled (commented-out) export: if re-enabled, this would call extract_csv
# on the `note` table for the three listed columns and save them to noteTRY.csv.
# extract_csv(saveto="/home/dtank/data/volume_2/eicu_csv/noteTRY.csv",
#             variablestring="patientUnitStayID, notePath, noteText", 
#             table="note")

## Basic dataset characteristics


In [4]:
# Headline counts for the eICU `patient` table.
# Each entry maps the printed label to a query returning a single value; the
# labels are kept exactly as before so the printed output is unchanged.
# NOTE(review): "ICU units" is COUNT(DISTINCT unitType), i.e. the number of
# distinct unit *types* — confirm the label wording is what is intended.
summary_count_queries = {
    "Number of patients in the dataset: ": "SELECT COUNT(DISTINCT uniquePid) FROM patient;",
    "Number of admissions in the dataset: ": "SELECT COUNT(*) FROM patient;",
    "Number of hospitals in the dataset: ": "SELECT COUNT(DISTINCT hospitalID) FROM patient;",
    "Number of wards in the dataset: ": "SELECT COUNT(DISTINCT wardID) FROM patient;",
    "Number of admission diagnoses in the dataset: ": "SELECT COUNT(DISTINCT apacheAdmissionDx) FROM patient;",
    "Number of ICU units in the dataset: ": "SELECT COUNT(DISTINCT unitType) FROM patient;",
}

for description, sql in summary_count_queries.items():
    # [0][0] picks the single value out of the one-row, one-column result.
    print(description, get_query_result(user="dtank", database="eicu", query=sql)[0][0])

# The years are printed as the full result (a list of one-element rows).
# ORDER BY makes the order deterministic; SELECT DISTINCT alone guarantees none.
print("Years in the dataset: ",
      get_query_result(user="dtank", database="eicu",
                       query="SELECT DISTINCT hospitalDischargeYear FROM patient ORDER BY hospitalDischargeYear;"))

# TODO: also include country, year, version of the dataset

Number of patients in the dataset:  139367
Number of admissions in the dataset:  200859
Number of hospitals in the dataset:  208
Number of wards in the dataset:  335
Number of admission diagnoses in the dataset:  393
Number of ICU units in the dataset:  8
Years in the dataset:  [(2014,), (2015,)]


## Patient characteristics

In [5]:
# Export the demographic and outcome columns of the `patient` table to a CSV
# file, which is then read back with pandas.
extract_csv(
    table="patient",
    variablestring="patientunitstayid, uniquePid, gender, age, ethnicity, admissionHeight, hospitalDischargeStatus, unitType, admissionWeight, unitDischargeStatus", 
    saveto="/home/dtank/data/volume_2/eicu_csv/patientCharacteristics.csv",
)

COPY 200859
data extracted


In [6]:
# Read the extracted patient characteristics (one row per ICU stay) and show
# the frame as the cell's rich output.
patientChar = pd.read_csv(
    "/home/dtank/data/volume_2/eicu_csv/patientCharacteristics.csv"
)
patientChar

Unnamed: 0,patientunitstayid,uniquepid,gender,age,ethnicity,admissionheight,hospitaldischargestatus,unittype,admissionweight,unitdischargestatus
0,141168,002-34851,Female,70,Caucasian,152.4,Expired,Med-Surg ICU,84.3,Expired
1,141178,002-33870,Female,52,Caucasian,162.6,Alive,Med-Surg ICU,54.4,Alive
2,141179,002-33870,Female,52,Caucasian,162.6,Alive,Med-Surg ICU,,Alive
3,141194,002-5276,Male,68,Caucasian,180.3,Alive,CTICU,73.9,Alive
4,141196,002-37665,Male,71,Caucasian,162.6,Alive,Med-Surg ICU,,Alive
...,...,...,...,...,...,...,...,...,...,...
200854,3353235,035-16382,Male,50,Caucasian,175.3,Alive,Cardiac ICU,90.0,Alive
200855,3353237,035-751,Female,79,Caucasian,162.6,Alive,MICU,78.4,Alive
200856,3353251,035-5166,Male,73,African American,177.8,Alive,Cardiac ICU,102.0,Alive
200857,3353254,035-19511,Male,81,Caucasian,185.4,Alive,Med-Surg ICU,83.9,Alive


In [7]:
# Age: entries recorded as the string "> 89" are capped at 89, after which the
# whole column is converted from strings to floats.
patientChar['age'] = patientChar['age'].replace("> 89", "89").astype('float')

# Naming conventions: merge the two residual gender categories into one label.
patientChar['gender'] = patientChar['gender'].replace(
    {"Other": "Other/Unknown", "Unknown": "Other/Unknown"}
)

# Spell out the hospital discharge status so the table's group headers are self-explanatory.
patientChar['hospitaldischargestatus'] = patientChar['hospitaldischargestatus'].replace(
    {'Alive': 'Alive at hospital discharge', 'Expired': 'Dead at hospital discharge'}
)

# Keep one row per patient: the first row encountered for each uniquepid.
# NOTE(review): the retained row decides the patient's discharge-status group —
# confirm that "first row" is the intended stay.
patientChar = patientChar.drop_duplicates(subset='uniquepid')
patientChar
# np.where(patientChar['hospitaldischargestatus'] == patientChar['unitdischargestatus'])[0].size # hospitaldischargestatus and unitdischargestatus are not the same

Unnamed: 0,patientunitstayid,uniquepid,gender,age,ethnicity,admissionheight,hospitaldischargestatus,unittype,admissionweight,unitdischargestatus
0,141168,002-34851,Female,70.0,Caucasian,152.4,Dead at hospital discharge,Med-Surg ICU,84.3,Expired
1,141178,002-33870,Female,52.0,Caucasian,162.6,Alive at hospital discharge,Med-Surg ICU,54.4,Alive
3,141194,002-5276,Male,68.0,Caucasian,180.3,Alive at hospital discharge,CTICU,73.9,Alive
4,141196,002-37665,Male,71.0,Caucasian,162.6,Alive at hospital discharge,Med-Surg ICU,,Alive
6,141203,002-23234,Female,77.0,Caucasian,160.0,Alive at hospital discharge,Med-Surg ICU,70.2,Alive
...,...,...,...,...,...,...,...,...,...,...
200852,3353216,035-2734,Female,50.0,African American,165.1,Alive at hospital discharge,CTICU,55.4,Alive
200854,3353235,035-16382,Male,50.0,Caucasian,175.3,Alive at hospital discharge,Cardiac ICU,90.0,Alive
200855,3353237,035-751,Female,79.0,Caucasian,162.6,Alive at hospital discharge,MICU,78.4,Alive
200857,3353254,035-19511,Male,81.0,Caucasian,185.4,Alive at hospital discharge,Med-Surg ICU,83.9,Alive


In [8]:
# !pip install tableone
from tableone import TableOne, load_dataset

In [9]:
# Variables to summarise, and which of them are categorical.
columns = ['gender', 'age', 'ethnicity', 'admissionheight', 'admissionweight', 'unittype']
categorical = ['gender', 'ethnicity', 'unittype']

# Display names for the raw column names.
labels = {
    'gender': 'Gender',
    'age': 'Age',
    'ethnicity': 'Ethnicity',
    'admissionheight': 'Height',
    'admissionweight': 'Weight',
    'unittype': 'Hospital Unit',
}

# Summary table grouped by hospital discharge status, with p-values.
# Age is declared non-normal; the rendered table shows it as median [Q1,Q3].
patientCharTable = TableOne(
    patientChar,
    columns=columns,
    categorical=categorical,
    nonnormal=['age'],
    groupby='hospitaldischargestatus',
    rename=labels,
    label_suffix=True,
    pval=True,
)

In [10]:
# Show the grouped summary table as this cell's rich output.
patientCharTable

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by hospitaldischargestatus,Grouped by hospitaldischargestatus,Grouped by hospitaldischargestatus,Grouped by hospitaldischargestatus,Grouped by hospitaldischargestatus
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,Alive at hospital discharge,Dead at hospital discharge,P-Value
n,,,139367,124981,13129,
"Gender, n (%)",Female,111.0,64038 (46.0),57289 (45.9),6144 (46.8),<0.001
"Gender, n (%)",Male,,75176 (54.0),67579 (54.1),6963 (53.1),
"Gender, n (%)",Other/Unknown,,42 (0.0),26 (0.0),16 (0.1),
"Age, median [Q1,Q3]",,82.0,"65.0 [53.0,77.0]","64.0 [52.0,76.0]","72.0 [61.0,82.0]",<0.001
"Ethnicity, n (%)",African American,1780.0,14672 (10.7),13255 (10.7),1303 (10.1),0.141
"Ethnicity, n (%)",Asian,,2338 (1.7),2073 (1.7),232 (1.8),
"Ethnicity, n (%)",Caucasian,,107724 (78.3),96567 (78.3),10198 (78.7),
"Ethnicity, n (%)",Hispanic,,5233 (3.8),4685 (3.8),523 (4.0),
"Ethnicity, n (%)",Native American,,964 (0.7),871 (0.7),87 (0.7),


In [None]:
# Render the summary table as LaTeX source and print it.
latex_source = patientCharTable.tabulate(tablefmt='latex')
print(latex_source)

In [None]:
# Ethnicity breakdown of rows with hospital discharge status 'Expired' and an
# age of 15, 16 or 17.
# `age` is compared as text here (the values are quoted, and the column also
# holds entries such as "> 89"), so BETWEEN '15' AND '17' would be a
# lexicographic range. Listing the ages explicitly selects exactly these three
# values and does not depend on string ordering.
query = """SELECT ethnicity, COUNT(*) 
        FROM patient 
        WHERE hospitalDischargeStatus='Expired' AND age IN ('15', '16', '17') 
        GROUP BY ethnicity;"""
get_query_result(user="dtank", database="eicu", query=query)

In [None]:
# Grouped bar chart: three series of four hard-coded values each.
# NOTE(review): the numbers look like placeholder values — confirm before
# relying on this figure.
data = [
    [30, 25, 50, 20],
    [40, 23, 51, 17],
    [35, 22, 45, 19],
]
X = np.arange(4)
bar_width = 0.25

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])  # axes span the whole figure area

# Shift each series by one bar width so the bars sit side by side per group.
for position, (heights, colour) in enumerate(zip(data, ('b', 'g', 'r'))):
    ax.bar(X + position * bar_width, heights, color=colour, width=bar_width)

ax.legend(labels=['Men', 'Women', 'Other'])
plt.show()