In [2]:
!pip install PyAthena
from pyathena import connect
from pyathena.pandas.util import as_pandas


# Import libraries
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import boto3
from botocore.client import ClientError
from IPython.display import display, HTML
import pickle
%matplotlib inline


# AWS handles and identifiers shared by the rest of the notebook.
s3 = boto3.resource('s3')
client = boto3.client("sts")
account_id = client.get_caller_identity()["Account"]
my_session = boto3.session.Session()
region = my_session.region_name
# Bucket that receives Athena query results; the name is derived from account and region.
athena_query_results_bucket = 'aws-athena-query-results-'+account_id+'-'+region

try:
    # head_bucket raises ClientError when the bucket cannot be reached.
    s3.meta.client.head_bucket(Bucket=athena_query_results_bucket)
except ClientError:
    print('Creating bucket '+athena_query_results_bucket)
    if region == 'us-east-1':
        # us-east-1 is the S3 default region and must not be sent as a LocationConstraint.
        bucket = s3.create_bucket(Bucket=athena_query_results_bucket)
    else:
        # Every other region has to be named explicitly, otherwise S3 rejects the request.
        bucket = s3.create_bucket(
            Bucket=athena_query_results_bucket,
            CreateBucketConfiguration={'LocationConstraint': region},
        )
# Cursor used by every query in the notebook; results are staged under athena/temp in the bucket.
cursor = connect(s3_staging_dir='s3://'+athena_query_results_bucket+'/athena/temp').cursor()



## Diabetic patients cohort

In [100]:
# Load the whole diabetic-patients cohort table, ordered by subject_id, into a DataFrame.
query='select * from default.diabetic_patients_cohort order by subject_id'
cursor.execute(query)
patients = as_pandas(cursor)
# Preview the first rows only.
patients.head()

Unnamed: 0,subject_id,admit_time,discharge_time,mortality_flag
0,13,2167-01-08 18:43:00,2167-01-15 15:15:00,0
1,18,2167-10-02 11:18:00,2167-10-04 16:15:00,0
2,20,2183-04-28 09:45:00,2183-05-03 14:45:00,0
3,21,2134-09-11 12:17:00,2135-02-08 02:08:00,1
4,24,2139-06-06 16:14:00,2139-06-09 12:48:00,0


## Selected item codes (matched against `itemid`; stored in the `icd9code` column of `featurescodes`)

In [5]:
# Item codes selected from chart events. Each row is
# [feature name, item code, item code, ...]; create_codes_table() stores every
# code as one featurescodes row tagged 'chartevents', and the events query
# matches those codes against chartevents.itemid.
# NOTE(review): "Glascow" is presumably a misspelling of "Glasgow"; the label is
# written to the featurescodes table verbatim, so fix it only together with the table.
chartevents_codes = [
    ['Capillary refill rate', 3348, 224308, 223951, 8377, 115],
    ['Diastolic blood pressure', 8364, 225310, 228151, 8555, 8368, 220051, 8502, 8503, 8504, 8505, 8506, 8507, 8508, 153, 8440, 224643, 227242, 8441, 220180, 8444, 8445, 8446, 8448, 220060],
    ['Fraction inspired oxygen', 7146, 226767, 227035, 228192, 228193, 228232],
    ['Glascow coma scale eye opening', 184, 220739],
    ['Glascow coma scale motor response', 223901, 226757],
    ['Glascow coma scale total', 198],
    ['Glascow coma scale verbal response', 223900, 226758],
    ['Glucose', 3744, 3745, 1310, 807, 1529, 811, 220621, 226537, 3447, 225664],
    ['Heart Rate', 211, 220045],
    ['Height', 226730],
    ['Mean blood pressure', 225312, 52, 6702, 220052, 6927, 3312, 3314, 3316, 7618, 3318, 3320, 3322, 7620, 7622, 3324, 5702, 443, 456, 220181],
    # NOTE(review): 0 looks like a placeholder rather than a real item code — confirm.
    ['Oxygen saturation', 0],
    ['Respiratory rate', 220210, 618, 224688, 224690, 224689, 619],
    ['Systolic blood pressure', 51, 225309, 220050, 3313, 3315, 3317, 3319, 3321, 3323, 3325, 442, 224167, 227243, 455, 220179, 480, 482, 484 ],
    ['Temperature', 224027, 645, 8537, 676, 677, 223762, 678, 679, 223761],
    ['Weight', 581],
    ['pH', 1126, 780, 223830, 220274, 220734, 4753, 4202, 1365, 7717, 3839]
]

# Item codes selected from lab events, same row layout as above; stored in
# featurescodes tagged 'labevents' and matched against labevents.itemid.
labevents_codes = [
    ['Oxygen Saturation', 50817],
    ['Temperature', 50825],
    ['pH', 50820],
    ['% Hemoglobin A1c', 50852, 50854],
    ['Blood Glucose', 50931, 51529],
    ['Serum Creatinine', 50912]    
]

In [6]:
def create_codes_table():
    """Create and populate ``default.featurescodes`` unless it already exists.

    Every item code listed in ``labevents_codes`` and ``chartevents_codes``
    becomes one row ``(code, mimiciiitable, feature, icd9code)``:

    * ``code`` is a running index starting at 0 (lab events first, then
      chart events),
    * ``mimiciiitable`` is ``'labevents'`` or ``'chartevents'``,
    * ``feature`` is the feature name (first element of each list),
    * ``icd9code`` holds the item code itself.

    The table data is stored under ``featurescodes`` in the bucket named by
    ``athena_query_results_bucket``, so the notebook works in any account
    and region rather than only in the one it was first written for.

    All rows are written with a single multi-row ``INSERT`` instead of one
    Athena query per code.

    Returns:
        None. Progress and any error raised by Athena are printed; errors
        are not propagated to the caller.
    """
    cursor.execute("SHOW TABLES LIKE 'featurescodes'")
    if cursor.fetchone() is not None:
        print ("featurescodes table already exists.")
        return

    # Build one "(code,'table','feature',itemcode)" tuple per item code.
    # len(rows) doubles as the running code index.
    rows = []
    for table_name, features in (('labevents', labevents_codes),
                                 ('chartevents', chartevents_codes)):
        for feature in features:
            # Double embedded single quotes so the name stays a valid SQL literal.
            feature_name = feature[0].replace("'", "''")
            for item_code in feature[1:]:
                rows.append("(" + str(len(rows)) + ",'" + table_name + "','"
                            + feature_name + "'," + str(int(item_code)) + ")")

    try:
        location = "s3://" + athena_query_results_bucket + "/featurescodes"
        query = ("create external table default.featurescodes  (code int, mimiciiitable string, "
                 "feature string, icd9code int) stored as PARQUET location '" + location + "'")
        cursor.execute(query)
        if rows:
            cursor.execute("insert into featurescodes values " + ", ".join(rows))
        print ("featurescodes table created!")
    except Exception as e:
        print (e)
            

In [7]:
# Build the feature-code lookup table; the function only reports when the table already exists.
create_codes_table()

featurescodes table created!


Check that item codes are unique across chart and lab events, i.e. no item code is assigned to two different features.

In [8]:
# Self-join featurescodes: a row pair that shares an item code but names
# different features means the same code was assigned twice.
cursor.execute("select count(*) from featurescodes f, featurescodes g where g.icd9code=f.icd9code and g.feature <> f.feature")
assert cursor.fetchone()[0] == 0, "Different features have same code in featurescodes table!!!"

## Create joint events table

In [9]:
def create_joint_events():
    """Create the Athena table ``events`` from lab and chart measurements.

    The statement unions two selections, one over ``mimiciii.labevents`` and
    one over ``mimiciii.chartevents``. Each keeps only rows that

    * belong to a patient in ``default.diabetic_patients_cohort``,
    * carry an item code listed in ``default.featurescodes`` for that source
      table, and
    * were charted more than 48 hours before the patient's ``discharge_time``.

    ``UNION`` (not ``UNION ALL``) is used, so identical rows are collapsed.
    """
    joint_events_ddl = """CREATE TABLE events AS 
                    SELECT p.subject_id,
                             itemid,
                             charttime,
                             valuenum,
                             p.admit_time,
                             p.discharge_time,
                             p.mortality_flag
                    FROM mimiciii.labevents l
                    INNER JOIN default.diabetic_patients_cohort p
                        ON l.subject_id = p.subject_id
                    INNER JOIN default.featurescodes f
                        ON f.icd9code = l.itemid
                    WHERE f.mimiciiitable='labevents'
                        and l.charttime < p.discharge_time - INTERVAL '48' HOUR
                    UNION
                    SELECT p.subject_id,
                             itemid,
                             charttime,
                             valuenum,
                             p.admit_time,
                             p.discharge_time,
                             p.mortality_flag
                    FROM mimiciii.chartevents c
                    INNER JOIN default.diabetic_patients_cohort p
                        ON c.subject_id = p.subject_id
                    INNER JOIN default.featurescodes f
                        ON f.icd9code = c.itemid
                    WHERE f.mimiciiitable='chartevents'
                        and c.charttime < p.discharge_time - INTERVAL '48' HOUR
                    ORDER BY  subject_id, itemid, charttime"""
    cursor.execute(joint_events_ddl)
    

In [10]:
# Create the joint events table only when SHOW TABLES returns no row for it.
cursor.execute("SHOW TABLES LIKE 'events'")
if cursor.fetchone() is None:
    create_joint_events()

In [11]:
def create_events_daystodischarge():
    """Create the Athena table ``events_daystodischarge`` from ``events``.

    Each event keeps its patient, item code, value and mortality flag; the
    timestamps are replaced by ``daystodischarge``, the negated
    ``date_diff('day', discharge_time, charttime)``.
    """
    days_to_discharge_ddl = """CREATE TABLE events_daystodischarge AS SELECT subject_id,
                             -date_diff('day', discharge_time, charttime) daystodischarge, itemid, valuenum, mortality_flag
                    FROM events e
                    ORDER BY  subject_id, daystodischarge desc, itemid"""
    cursor.execute(days_to_discharge_ddl)

In [12]:
# Create events_daystodischarge only when SHOW TABLES returns no row for it.
cursor.execute("SHOW TABLES LIKE 'events_daystodischarge'")
if cursor.fetchone() is None:
    create_events_daystodischarge()

#### We take the average of `valuenum` for each patient and each item code on each day

In [28]:
def create_events_features():
    """Create the Athena table ``events_features`` with daily feature averages.

    For every ``(subject_id, daystodischarge, code, mortality_flag)`` group in
    ``events_daystodischarge`` the table stores ``avg(valuenum)`` as ``value``.
    ``code`` comes from a LEFT JOIN with ``featurescodes`` on the item code.

    The table data is written under ``events_features`` in the bucket named by
    ``athena_query_results_bucket``, so the notebook is not tied to the
    account and region it was first written for.
    """
    location = "s3://" + athena_query_results_bucket + "/events_features"
    cursor.execute("""CREATE TABLE events_features WITH (external_location = '""" + location + """')
                    AS SELECT subject_id,
                             daystodischarge,
                             f.code,
                             avg(valuenum) value,
                             mortality_flag
                    FROM events_daystodischarge e
                    LEFT JOIN featurescodes f ON e.itemid=f.icd9code
                    GROUP BY  subject_id, daystodischarge, f.code, mortality_flag
                    ORDER BY  subject_id, daystodischarge desc, code""")


In [29]:
# Create events_features only when SHOW TABLES returns no row for it.
cursor.execute("SHOW TABLES LIKE 'events_features'")
if cursor.fetchone() is None:
    create_events_features()

## Sanity check

In [31]:
# Per-feature summary (mean, standard deviation, min, max, count) of the daily
# averages in events_features, with feature names looked up through featurescodes.
# Only values strictly between 0 and 1000 are included; rows are sorted by
# standard deviation, largest first.
cursor.execute("""SELECT feature,
                         avg(value) avg,
                         stddev(value) dev,
                         min(value) min,
                         max(value) max,
                         count(value) cnt
                FROM events_features AS e
                INNER JOIN featurescodes AS f
                    ON e.code=f.code
                WHERE value>0 and value<1000
                GROUP BY  feature
                ORDER BY  dev DESC """)
df = as_pandas(cursor)
df

Unnamed: 0,feature,avg,dev,min,max,cnt
0,Blood Glucose,146.786787,66.943482,9.0,981.0,163775
1,Glucose,152.473772,55.603106,3.0,952.5,176479
2,Temperature,69.520109,30.686679,0.8,300.02,140187
3,Weight,90.05712,26.362641,0.3,300.0,25210
4,Systolic blood pressure,122.299468,20.378424,0.3,964.375,95847
5,Diastolic blood pressure,55.629337,17.317948,0.380952,914.0,105113
6,Mean blood pressure,78.258917,14.859039,0.333333,891.857143,95618
7,Heart Rate,84.96913,14.8387,31.8,230.566667,75725
8,Height,168.541773,13.483901,13.0,284.0,3678
9,Oxygen Saturation,90.09224,11.69514,1.4,542.5,20731


In [32]:
# Range, mean and standard deviation of daystodischarge over all events_features rows.
cursor.execute("""
                SELECT min(daystodischarge) minDays, max(daystodischarge) maxDays, avg(daystodischarge) avgDays, stddev(daystodischarge) stddevDays FROM events_features""")
df = as_pandas(cursor)
df

Unnamed: 0,minDays,maxDays,avgDays,stddevDays
0,2,4039,250.830126,551.782531


In [33]:
# For each patient, count the distinct daystodischarge values present in events_features.
cursor.execute("""
                SELECT subject_id, count(distinct daystodischarge) cntDays FROM events_features GROUP BY subject_id""")
df = as_pandas(cursor)

In [34]:
# Summarise the cntDays column of df for all patients, then separately for
# patients whose cntDays exceeds 180.
print("Statistics of days to discharge\n", df['cntDays'].describe())
print("\n\nMore than 180 days\n", df.loc[df['cntDays'] > 180, ['cntDays']].describe())

Statistics of days to discharge
 count    9806.000000
mean       17.771976
std        29.193836
min         1.000000
25%         5.000000
50%         9.000000
75%        19.000000
max      1007.000000
Name: cntDays, dtype: float64


More than 180 days
            cntDays
count    38.000000
mean    284.842105
std     141.450120
min     181.000000
25%     211.000000
50%     250.000000
75%     297.500000
max    1007.000000


In [35]:
# Number of distinct patients in the cohort table for each mortality_flag value.
cursor.execute("""
                SELECT mortality_flag, count(distinct subject_id) cnt FROM diabetic_patients_cohort group by mortality_flag""")
df = as_pandas(cursor)
df

Unnamed: 0,mortality_flag,cnt
0,0,8605
1,1,1217


In [36]:
# Number of events_features rows for each mortality_flag value.
cursor.execute("""
                SELECT mortality_flag, count(*) cnt FROM events_features group by mortality_flag""")
df = as_pandas(cursor)
df

Unnamed: 0,mortality_flag,cnt
0,1,329054
1,0,1250447


## Construction of Events dataframe

In [39]:
# Pull the whole events_features table into memory, ordered by patient, then by
# daystodischarge (largest first), then by code.
cursor.execute("select * from events_features order by subject_id, daystodischarge desc, code")
events = as_pandas(cursor)

In [40]:
# Display the events frame as this cell's output.
events

Unnamed: 0,subject_id,daystodischarge,code,value,mortality_flag
0,13,6,2,7.430000,0
1,13,6,5,166.500000,0
2,13,6,7,0.700000,0
3,13,6,17,64.571429,0
4,13,6,26,9.000000,0
...,...,...,...,...,...
1579496,99999,3,103,128.476190,0
1579497,99999,3,107,,0
1579498,99999,3,115,98.575000,0
1579499,99999,2,5,139.000000,0


In [43]:
# Collapse events to one row per (patient, day): events_item holds the list of
# codes seen that day, events_values the matching list of values.
events_item = (
    events
    .groupby(['subject_id', 'daystodischarge'])['code']
    .apply(list)
    .reset_index(name='codes')
)
events_values = (
    events
    .groupby(['subject_id', 'daystodischarge'])['value']
    .apply(list)
    .reset_index(name='values')
)

In [101]:

# Persist the per-day code lists, per-day value lists and the cohort table.
# Each file is opened in a context manager so it is flushed and closed as soon
# as the dump finishes, instead of relying on garbage collection.
with open("events_item.p", "wb") as item_file:
    pickle.dump(events_item, item_file)
with open("events_value.p", "wb") as value_file:
    pickle.dump(events_values, value_file)
with open("patients.p", "wb") as patients_file:
    pickle.dump(patients, patients_file)

In [111]:
# Round-trip check: reload each pickle file and compare it with the in-memory frame.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("events_item.p", "rb") as f:
    ei = pickle.load(f)

assert len(ei[ei['subject_id']==13]['codes']) == len(events_item[events_item['subject_id']==13]['codes']), "Wrong serialization!!"
assert len(ei)==len(events_item), "Wrong serialization!!"

with open("events_value.p", "rb") as f:
    ei = pickle.load(f)
assert ei[ei['subject_id']==13]['values'][4] == events_values[events_values['subject_id']==13]['values'][4], "Wrong serialization!!"
# Compare with events_values, the frame this file was written from.
assert len(ei)==len(events_values), "Wrong serialization!!"

with open('patients.p', 'rb') as f:
    ei = pickle.load(f)
# .iloc[0] extracts the scalar explicitly; calling int() on a one-element
# Series is deprecated in recent pandas releases.
assert int(ei[ei['subject_id']==2511]['mortality_flag'].iloc[0]) == int(patients[patients['subject_id']==2511]['mortality_flag'].iloc[0]), "Wrong serialization!!"

In [4]:
# Largest value of the code column in featurescodes.
cursor.execute('select max(code) from featurescodes')
max_code = int(cursor.fetchone()[0])

In [6]:
# Persist the largest feature code; the context manager closes the file right after the dump.
with open("events_maxcode.p", "wb") as maxcode_file:
    pickle.dump(max_code, maxcode_file)