In [1]:
!pip install PyAthena
from pyathena import connect
from pyathena.pandas.util import as_pandas


# Import libraries
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import boto3
from botocore.client import ClientError
from IPython.display import display, HTML
%matplotlib inline


# Resolve the account id and region of the current credentials; both are part
# of the name of the bucket used for Athena query results.
s3 = boto3.resource('s3')
client = boto3.client("sts")
account_id = client.get_caller_identity()["Account"]
my_session = boto3.session.Session()
region = my_session.region_name
athena_query_results_bucket = 'aws-athena-query-results-'+account_id+'-'+region

# Make sure the results bucket exists before handing it to PyAthena.
try:
    s3.meta.client.head_bucket(Bucket=athena_query_results_bucket)
except ClientError as err:
    # Only a "not found" answer means the bucket has to be created. Anything
    # else (403 on a bucket owned by someone else, throttling, ...) is a real
    # error and must not be hidden behind a create attempt.
    error_code = err.response.get('Error', {}).get('Code')
    if error_code not in ('404', 'NoSuchBucket'):
        raise
    print('Creating bucket '+athena_query_results_bucket)
    if region == 'us-east-1':
        # us-east-1 is the default location and rejects an explicit constraint.
        bucket = s3.create_bucket(Bucket=athena_query_results_bucket)
    else:
        # Every other region requires the location constraint to be spelled out.
        bucket = s3.create_bucket(
            Bucket=athena_query_results_bucket,
            CreateBucketConfiguration={'LocationConstraint': region},
        )

# All queries below run through this cursor; results are staged under athena/temp.
cursor = connect(s3_staging_dir='s3://'+athena_query_results_bucket+'/athena/temp').cursor()

Collecting PyAthena
  Using cached PyAthena-2.2.0-py3-none-any.whl (37 kB)
Collecting tenacity>=4.1.0
  Using cached tenacity-7.0.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: tenacity, PyAthena
Successfully installed PyAthena-2.2.0 tenacity-7.0.0


## Diabetic patients cohort

In [2]:
# Read the whole diabetic cohort table into a DataFrame and preview it.
query = "select * from default.diabetic_patients_cohort"
cursor.execute(query)
patients = as_pandas(cursor)
patients.head()

Unnamed: 0,subject_id,admit_time,discharge_time,mortality_flag
0,1454,2108-01-04 08:00:00,2108-01-08 13:49:00,0
1,1337,2137-03-25 12:26:00,2137-04-03 11:55:00,0
2,2511,2191-10-12 21:55:00,2191-10-26 17:57:00,1
3,3435,2148-04-27 21:03:00,2148-05-02 14:55:00,0
4,3868,2125-05-03 17:03:00,2125-11-05 16:02:00,1


## Selected item codes (matched against `itemid`, stored in the `icd9code` column)

In [3]:
# Codes to pull from mimiciii.chartevents. Each inner list has the form
# [feature_name, code, code, ...]: create_codes_table() takes element 0 as the
# feature label and inserts one featurescodes row per remaining element.
# NOTE(review): 'Glascow' looks like a misspelling of 'Glasgow'; the names are
# written to the featurescodes table, so correcting them changes stored data.
# NOTE(review): 'Oxygen saturation' lists only code 0, which looks like a
# placeholder rather than a real item code - confirm.
chartevents_codes = [
    ['Capillary refill rate', 3348, 224308, 223951, 8377, 115],
    ['Diastolic blood pressure', 8364, 225310, 228151, 8555, 8368, 220051, 8502, 8503, 8504, 8505, 8506, 8507, 8508, 153, 8440, 224643, 227242, 8441, 220180, 8444, 8445, 8446, 8448, 220060],
    ['Fraction inspired oxygen', 7146, 226767, 227035, 228192, 228193, 228232],
    ['Glascow coma scale eye opening', 184, 220739],
    ['Glascow coma scale motor response', 223901, 226757],
    ['Glascow coma scale total', 198],
    ['Glascow coma scale verbal response', 223900, 226758],
    ['Glucose', 3744, 3745, 1310, 807, 1529, 811, 220621, 226537, 3447, 225664],
    ['Heart Rate', 211, 220045],
    ['Height', 226730],
    ['Mean blood pressure', 225312, 52, 6702, 220052, 6927, 3312, 3314, 3316, 7618, 3318, 3320, 3322, 7620, 7622, 3324, 5702, 443, 456, 220181],
    ['Oxygen saturation', 0],
    ['Respiratory rate', 220210, 618, 224688, 224690, 224689, 619],
    ['Systolic blood pressure', 51, 225309, 220050, 3313, 3315, 3317, 3319, 3321, 3323, 3325, 442, 224167, 227243, 455, 220179, 480, 482, 484 ],
    ['Temperature', 224027, 645, 8537, 676, 677, 223762, 678, 679, 223761],
    ['Weight', 581],
    ['pH', 1126, 780, 223830, 220274, 220734, 4753, 4202, 1365, 7717, 3839]
]

# Codes to pull from mimiciii.labevents, in the same
# [feature_name, code, code, ...] layout as chartevents_codes.
labevents_codes = [
    ['Oxygen Saturation', 50817],
    ['Temperature', 50825],
    ['pH', 50820],
    ['% Hemoglobin A1c', 50852, 50854],
    ['Blood Glucose', 50931, 51529],
    ['Serum Creatinine', 50912]    
]

In [4]:
def create_codes_table():
    """Create and populate the ``featurescodes`` lookup table if it is missing.

    The table holds one row per (source table, feature name, item code),
    built from the module-level ``labevents_codes`` and ``chartevents_codes``
    lists. When the table already exists nothing is changed.

    Side effects:
        Runs Athena statements through the module-level ``cursor`` and prints
        a one-line status message.

    Raises:
        Exception: whatever the cursor raised while creating or filling the
            table. The error is reported and re-raised so that later cells do
            not silently run against a missing or half-filled table.
    """
    cursor.execute("SHOW TABLES LIKE 'featurescodes'")
    if cursor.fetchone() is not None:
        print ("featurescodes table already exists.")
        return

    def values_rows(source_table, feature_codes):
        # One "(table, feature, code)" SQL tuple per code; single quotes in a
        # feature name are doubled so they cannot terminate the string literal.
        return [
            "('" + source_table + "','" + name.replace("'", "''") + "'," + str(code) + ")"
            for name, *codes in feature_codes
            for code in codes
        ]

    try:
        # Keep the table data in this account's own results bucket instead of
        # a bucket name hard-coded for one specific account and region.
        location = "s3://" + athena_query_results_bucket + "/events"
        query = ("create external table default.featurescodes  (mimiciiitable string, feature string, icd9code int) "
                 "stored as PARQUET location '" + location + "'")
        cursor.execute(query)

        # One multi-row INSERT per source table: each Athena statement is a
        # separate query execution, so inserting row by row is needlessly slow
        # and leaves a wide window for a partially filled table.
        for source_table, feature_codes in (('labevents', labevents_codes),
                                            ('chartevents', chartevents_codes)):
            rows = values_rows(source_table, feature_codes)
            if rows:
                cursor.execute("insert into featurescodes values " + ", ".join(rows))

        print ("featurescodes table created!")
    except Exception as e:
        print ("Failed to create featurescodes table: " + str(e))
        raise
            

In [5]:
# Build the featurescodes lookup table on first run; if the table is already
# present the function only prints a notice and leaves it untouched.
create_codes_table()

featurescodes table already exists.


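Sanity check: every item code must map to exactly one feature across the chart-event and lab-event code lists. The assertion below fails if the same code appears under two different feature names.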

In [6]:
# Self-join featurescodes on the code column and count the pairs whose feature
# names differ; any such pair means one code is assigned to two features.
cursor.execute("select count(*) from featurescodes f, featurescodes g where g.icd9code=f.icd9code and g.feature <> f.feature")
conflicting_pairs = cursor.fetchone()[0]
assert conflicting_pairs == 0, "Different features have same code in featurescodes table!!!"

## Create joint events table

In [7]:
def create_joint_events():
    """Materialise the ``events`` table from lab and chart events.

    Both halves of the UNION join the source events to the diabetic cohort on
    ``subject_id`` and to ``featurescodes`` on the item code, keep only the
    codes registered for that source table, and keep only events charted
    earlier than 48 hours before the cohort row's ``discharge_time``.

    Runs a single CREATE TABLE AS statement through the module-level
    ``cursor``; returns nothing.
    """
    joint_events_ctas = """CREATE TABLE events AS 
                    SELECT p.subject_id,
                             itemid,
                             charttime,
                             valuenum,
                             p.admit_time,
                             p.discharge_time,
                             p.mortality_flag
                    FROM mimiciii.labevents l
                    INNER JOIN default.diabetic_patients_cohort p
                        ON l.subject_id = p.subject_id
                    INNER JOIN default.featurescodes f
                        ON f.icd9code = l.itemid
                    WHERE f.mimiciiitable='labevents'
                        and l.charttime < p.discharge_time - INTERVAL '48' HOUR
                    UNION
                    SELECT p.subject_id,
                             itemid,
                             charttime,
                             valuenum,
                             p.admit_time,
                             p.discharge_time,
                             p.mortality_flag
                    FROM mimiciii.chartevents c
                    INNER JOIN default.diabetic_patients_cohort p
                        ON c.subject_id = p.subject_id
                    INNER JOIN default.featurescodes f
                        ON f.icd9code = c.itemid
                    WHERE f.mimiciiitable='chartevents'
                        and c.charttime < p.discharge_time - INTERVAL '48' HOUR
                    ORDER BY  subject_id, itemid, charttime"""
    cursor.execute(joint_events_ctas)
    

In [8]:
# The CTAS would fail if the table were already there, so build it only once.
cursor.execute("SHOW TABLES LIKE 'events'")
events_table_missing = cursor.fetchone() is None
if events_table_missing:
    create_joint_events()

In [9]:
def create_events_daystodischarge():
    """Materialise ``events_daystodischarge`` from the ``events`` table.

    Each event keeps its subject, item code, numeric value and mortality flag;
    the chart time is replaced by ``daystodischarge``, computed as the negated
    ``date_diff('day', discharge_time, charttime)``.

    Runs a single CREATE TABLE AS statement through the module-level
    ``cursor``; returns nothing.
    """
    days_to_discharge_ctas = """CREATE TABLE events_daystodischarge AS SELECT subject_id,
                             -date_diff('day', discharge_time, charttime) daystodischarge, itemid, valuenum, mortality_flag
                    FROM events
                    ORDER BY  subject_id, daystodischarge desc, itemid"""
    cursor.execute(days_to_discharge_ctas)

In [10]:
# Derive the days-to-discharge table only when it has not been built yet.
cursor.execute("SHOW TABLES LIKE 'events_daystodischarge'")
daystodischarge_table_missing = cursor.fetchone() is None
if daystodischarge_table_missing:
    create_events_daystodischarge()

In [11]:
def create_events_features():
    """Materialise ``events_features`` from ``events_daystodischarge``.

    Rows are grouped by subject, days to discharge, item code and mortality
    flag, and ``value`` is the average of ``valuenum`` within each group.

    Runs a single CREATE TABLE AS statement through the module-level
    ``cursor``; returns nothing.
    """
    features_ctas = """CREATE TABLE events_features AS SELECT subject_id,
                             daystodischarge,
                             itemid,
                             avg(valuenum) value,
                             mortality_flag
                    FROM events_daystodischarge
                    GROUP BY  subject_id, daystodischarge, itemid, mortality_flag
                    ORDER BY  subject_id, daystodischarge desc, itemid"""
    cursor.execute(features_ctas)


In [12]:
# Aggregate into the per-day feature table only when it has not been built yet.
cursor.execute("SHOW TABLES LIKE 'events_features'")
features_table_missing = cursor.fetchone() is None
if features_table_missing:
    create_events_features()

## Sanity check

In [14]:
# Per-feature summary statistics of the aggregated values, restricted to
# values strictly between 0 and 1000 and sorted by standard deviation.
feature_stats_sql = """SELECT feature,
                         avg(value) avg,
                         stddev(value) dev,
                         min(value) min,
                         max(value) max,
                         count(value) cnt
                FROM events_features AS e
                INNER JOIN featurescodes AS f
                    ON e.itemid=f.icd9code
                WHERE value>0 and value<1000
                GROUP BY  feature
                ORDER BY  dev DESC """
cursor.execute(feature_stats_sql)
df = as_pandas(cursor)
df

Unnamed: 0,feature,avg,dev,min,max,cnt
0,Blood Glucose,146.786787,66.943482,9.0,981.0,163775
1,Glucose,152.473772,55.603106,3.0,952.5,176479
2,Temperature,69.520109,30.686679,0.8,300.02,140187
3,Weight,90.05712,26.362641,0.3,300.0,25210
4,Systolic blood pressure,122.299468,20.378424,0.3,964.375,95847
5,Diastolic blood pressure,55.629337,17.317948,0.380952,914.0,105113
6,Mean blood pressure,78.258917,14.859039,0.333333,891.857143,95618
7,Heart Rate,84.96913,14.8387,31.8,230.566667,75725
8,Height,168.541773,13.483901,13.0,284.0,3678
9,Oxygen Saturation,90.09224,11.69514,1.4,542.5,20731


In [15]:
# Range, mean and spread of daystodischarge over all aggregated events.
days_stats_sql = """
                SELECT min(daystodischarge) minDays, max(daystodischarge) maxDays, avg(daystodischarge) avgDays, stddev(daystodischarge) stddevDays FROM events_features"""
cursor.execute(days_stats_sql)
df = as_pandas(cursor)
df

Unnamed: 0,minDays,maxDays,avgDays,stddevDays
0,2,4039,250.830126,551.782531


In [16]:
# Number of distinct cohort subjects per mortality flag.
cohort_mortality_sql = """
                SELECT mortality_flag, count(distinct subject_id) cnt FROM diabetic_patients_cohort group by mortality_flag"""
cursor.execute(cohort_mortality_sql)
df = as_pandas(cursor)
df

Unnamed: 0,mortality_flag,cnt
0,1,1217
1,0,8605


In [18]:
# Number of events_features rows per mortality flag.
feature_mortality_sql = """
                SELECT mortality_flag, count(*) cnt FROM events_features group by mortality_flag"""
cursor.execute(feature_mortality_sql)
df = as_pandas(cursor)
df

Unnamed: 0,mortality_flag,cnt
0,1,329054
1,0,1250447
