In [65]:
# Simulating data for the data lake

from collections import OrderedDict
import datetime
import json
import os
import random
import sqlite3
import string
import struct

import names
import pandas as pd

In [86]:
# Simulated demographics for 400 patients.
def _draw_each(options, n=400):
    """Return a list of ``n`` independent uniform draws from ``options``."""
    return [random.choice(options) for _ in range(n)]


# Gender is drawn first so that every generated name can be matched to it.
genders = _draw_each(['male', 'female'])

patient_columns = OrderedDict()
patient_columns['patient_id'] = range(1, 401)
patient_columns['name'] = [names.get_full_name(gender=g) for g in genders]
# Every date of birth is 1 January, 01:00, of a random year in 1950-1999.
patient_columns['dob'] = [
    datetime.datetime(year, 1, 1, 1) for year in _draw_each(range(1950, 2000))
]
patient_columns['gender'] = genders
patient_columns['race'] = _draw_each(
    ['asian', 'black', 'white', 'hispanic', 'native_american'])
patient_columns['postal_code'] = _draw_each(range(40000, 45000))
patient_columns['occupation'] = _draw_each(
    ['builder', 'teacher', 'firefighter', 'engineer', 'doctor'])
patient_columns['phone'] = _draw_each(range(17810000000, 17819999999))

df_patients = pd.DataFrame(patient_columns)

In [204]:
# Every patient has one index admission over a fixed date range
# (2018-01-01 01:00 to 2018-02-01 01:00) during which their procedure happens.
# A random 100 patients get a second admission on 2018-02-20, i.e. less than
# 30 days after discharge. Note that not all of these 100 had the carotid
# endarterectomy procedure.
# NOTE(review): readmissions use the same timestamp for intime and outtime
# (zero-length stay) - confirm this is intended.

readmitted_pids = random.sample(range(1, 401), 100)

index_intime = datetime.datetime(2018, 1, 1, 1)
index_outtime = datetime.datetime(2018, 2, 1, 1)
readmission_time = datetime.datetime(2018, 2, 20, 1)

admission_columns = OrderedDict()
admission_columns['hadm_id'] = range(50001, 50501)
# Rows 0-399 are the index admissions, rows 400-499 the readmissions.
admission_columns['patient_id'] = list(range(1, 401)) + readmitted_pids
admission_columns['intime'] = [index_intime] * 400 + [readmission_time] * 100
admission_columns['outtime'] = [index_outtime] * 400 + [readmission_time] * 100

df_admissions = pd.DataFrame(admission_columns)

In [91]:
# One procedure row per index admission (hadm_id 50001-50400). The code is a
# coin flip between 3000 (carotid endarterectomy) and 3001, so around half of
# the admissions get 3000. Clinician ids are drawn from 1-9.
procedure_columns = OrderedDict()
procedure_columns['hadm_id'] = range(50001, 50401)
procedure_columns['icd9_code'] = [
    random.choice([3000, 3001]) for _ in range(400)
]
procedure_columns['clinician_id'] = [
    random.choice(range(1, 10)) for _ in range(400)
]

df_procedures = pd.DataFrame(procedure_columns)

In [121]:
# 0-3 diagnoses per index admission (hadm_id 50001-50400). A random 3-bit mask
# decides which of the codes 10000, 10001 and 10002 are recorded; all diagnoses
# of one admission share the same clinician (id 1-9). Admissions whose mask is
# 0 contribute no rows.
hadm_ids, icd9_codes, clinician_ids = [], [], []

for h in range(50001, 50401):
    clinician = random.choice(range(1, 10))
    # The mask must span 0-7: with range(7) the value 7 (all three bits set)
    # is unreachable and no admission could ever get three diagnoses.
    mask = random.choice(range(8))

    # Bit i of the mask switches on the i-th diagnosis code.
    for bit, code in enumerate((10000, 10001, 10002)):
        if mask & (1 << bit):
            hadm_ids.append(h)
            icd9_codes.append(code)
            clinician_ids.append(clinician)

df_diagnoses = pd.DataFrame(OrderedDict([
    ('hadm_id', hadm_ids),
    ('icd9_code', icd9_codes),
    ('clinician_id', clinician_ids),
]))

In [168]:
# Three chart events per index admission (hadm_id 50001-50400): one weight
# (item 2000) followed by two blood-pressure readings (item 2001), charted at
# 02:00, 03:00 and 04:00 on 2018-01-01.
chart_hadm_ids, chart_values = [], []

for hadm_id in range(50001, 50401):
    chart_hadm_ids.extend([hadm_id] * 3)
    chart_values.append(random.choice(range(40, 120)))  # weight
    chart_values.append(random.choice(range(40, 70)))   # first bp reading
    chart_values.append(random.choice(range(70, 110)))  # second bp reading

chart_times = [datetime.datetime(2018, 1, 1, hour) for hour in (2, 3, 4)]

chart_columns = OrderedDict()
chart_columns['hadm_id'] = chart_hadm_ids
chart_columns['item_id'] = [2000, 2001, 2001] * 400
chart_columns['value'] = chart_values
chart_columns['datetime'] = chart_times * 400

df_chartevents = pd.DataFrame(chart_columns)

In [207]:
# Write the simulated EHR tables into a fresh SQLite database.
EHR_DB_PATH = 'data/example-data-lake/ehr/ehr.db'

# sqlite3.connect() creates the database file but not its parent directory,
# so make sure the directory exists before connecting.
os.makedirs(os.path.dirname(EHR_DB_PATH), exist_ok=True)

# Always start from an empty database so re-running the cell does not append
# duplicate rows or fail on CREATE TABLE.
if os.path.isfile(EHR_DB_PATH):
    os.remove(EHR_DB_PATH)

# The connection object represents the database
conn = sqlite3.connect(EHR_DB_PATH)
try:
    c = conn.cursor()

    # Create tables
    c.execute('''CREATE TABLE patients
                 (patient_id integer primary key, name text, dob date, gender text,
                  race text, postal_code integer, occupation text, phone integer);''')

    c.execute('''CREATE TABLE admissions
                 (hadm_id integer primary key, patient_id integer,
                  intime datetime, outtime datetime);''')

    c.execute('''CREATE TABLE procedures
                 (hadm_id integer, icd9_code integer, clinician_id integer);''')

    c.execute('''CREATE TABLE diagnoses
                 (hadm_id integer, icd9_code integer, clinician_id integer);''')

    c.execute('''CREATE TABLE chartevents
                 (hadm_id integer, item_id integer, value integer, datetime datetime);''')

    # Insert data. if_exists='append' keeps the schemas declared above instead
    # of letting pandas recreate the tables.
    for table_name, frame in [
            ('patients', df_patients),
            ('admissions', df_admissions),
            ('procedures', df_procedures),
            ('diagnoses', df_diagnoses),
            ('chartevents', df_chartevents)]:
        frame.to_sql(table_name, conn, index=False, if_exists='append')

    # Save (commit) the changes
    conn.commit()
finally:
    # Close the connection even if a statement above failed, so the file is
    # not left open (which would get in the way of removing it on a re-run).
    conn.close()

In [176]:
# Numerics of bp in bp-stream/<patient_id>-<date>.csv.
# Each file contains 1 day of aggregated hourly data. Each patient has two
# files (2017-12-20 and 2018-01-15), meant as recordings both before and after
# their procedure.
#
# File layout (one row per hour, 0-23):
#   hour,bp_min,bp_max
#   0,70,72
#   1,...
#   ...
#   23,...
BP_STREAM_DIR = 'data/example-data-lake/bp-stream'

# to_csv() does not create missing directories; make sure the target exists.
os.makedirs(BP_STREAM_DIR, exist_ok=True)

for pid in range(1, 401):
    for datestr in ['2017-12-20', '2018-01-15']:
        df = pd.DataFrame(OrderedDict([
            ('hour', range(24)),
            ('bp_min', [random.choice(range(40, 70)) for _ in range(24)]),
            ('bp_max', [random.choice(range(70, 110)) for _ in range(24)])
        ]))

        csv_path = os.path.join(BP_STREAM_DIR, '{}-{}.csv'.format(pid, datestr))
        df.to_csv(csv_path, index=False)

In [180]:
# Create pharmacy data in pharmacy/<claim_num>.claim
#
# Each claim file has four "key:value" lines followed by a trailing newline:
#   Date:
#   Patient ID:
#   Pharmacy:
#   Drug:
PHARMACY_DIR = 'data/example-data-lake/pharmacy'

# open(..., 'w') does not create missing directories; make sure the target exists.
os.makedirs(PHARMACY_DIR, exist_ok=True)

# Bit i of the per-patient mask decides whether drug i is dispensed.
# NOTE(review): 'Paracetemol' is a misspelling of 'Paracetamol'; it is kept
# byte-for-byte because it is written into the claim files - confirm before
# changing, in case anything downstream matches on it.
DRUGS = ['Dexamethasone', 'Erlotinib', 'Paracetemol']

# Each patient may get any combination of the three drugs (including none),
# all from one randomly chosen pharmacy. Claim numbers are consecutive.
claim_num = 3849500
for pid in range(1, 401):
    pharmacy = random.choice(['CVS', 'Kroger', 'Rite Aid'])
    # The mask must span 0-7: with range(7) the value 7 is unreachable and no
    # patient could ever receive all three drugs.
    mask = random.choice(range(8))

    for bit, drug in enumerate(DRUGS):
        if not mask & (1 << bit):
            continue

        claim_path = os.path.join(PHARMACY_DIR, '{}.claim'.format(claim_num))
        with open(claim_path, 'w') as f:
            f.write('\n'.join(
                ['Date:2018-02-10', 'Patient ID:{}'.format(pid),
                 'Pharmacy:{}'.format(pharmacy), 'Drug:{}'.format(drug), '']))
        claim_num += 1