In [65]:
# Simulating data for the data lake

from collections import OrderedDict
import datetime
import json
import os
import random
import sqlite3
import string
import struct

import names
import pandas as pd

In [63]:
# [random.choice(range(17810000000, 17819999999)) for _ in range(100)]

In [None]:
# Planned column names for the patients table.
# Parenthesised so the continuation line is part of the same tuple; without the
# parentheses the indented second line is an IndentationError.
('patient_id', 'name', 'dob', 'gender', 'race', 'postal_code', 'occupation',
 'phone', 'email')

In [86]:
# Simulated patient demographics: one row per patient_id in 1..400.
patient_ids = range(1, 401)
n_patients = len(patient_ids)

# Gender is drawn first because the generated name is conditioned on it.
genders = [random.choice(['male', 'female']) for _ in range(n_patients)]

# The remaining columns are generated in table order, so the sequence of
# random draws is the same as building them inline in the frame constructor.
full_names = [names.get_full_name(gender=g) for g in genders]

# Every date of birth is 1 January, 01:00, of a year drawn from 1950..1999.
birth_dates = [
    datetime.datetime(random.choice(range(1950, 2000)), 1, 1, 1)
    for _ in range(n_patients)
]

races = [
    random.choice(['asian', 'black', 'white', 'hispanic', 'native_american'])
    for _ in range(n_patients)
]

postal_codes = [random.choice(range(40000, 45000)) for _ in range(n_patients)]

occupations = [
    random.choice(['builder', 'teacher', 'firefighter', 'engineer', 'doctor'])
    for _ in range(n_patients)
]

# 11-digit numbers starting with 1781; range() excludes the upper bound itself.
phones = [random.choice(range(17810000000, 17819999999)) for _ in range(n_patients)]

df_patients = pd.DataFrame(OrderedDict([
    ('patient_id', patient_ids),
    ('name', full_names),
    ('dob', birth_dates),
    ('gender', genders),
    ('race', races),
    ('postal_code', postal_codes),
    ('occupation', occupations),
    ('phone', phones),
]))

In [88]:
# Exactly one admission per patient: hadm_id 50001..50400 lines up with
# patient_id 1..400, and every stay shares the same admit/discharge timestamps.
admission_ids = range(50001, 50401)
admit_time = datetime.datetime(2018, 1, 1, 1)
discharge_time = datetime.datetime(2018, 2, 1, 1)

df_admissions = pd.DataFrame(OrderedDict([
    ('hadm_id', admission_ids),
    ('patient_id', range(1, 401)),
    ('intime', [admit_time] * len(admission_ids)),
    ('outtime', [discharge_time] * len(admission_ids)),
]))

In [91]:
# One procedure per admission. The code is a coin flip between 3000 and 3001,
# so roughly half of the rows carry 3000 (carotid endarterectomy).
procedure_admission_ids = range(50001, 50401)

# All procedure codes are drawn first, then all clinician ids (1..9).
procedure_codes = [random.choice([3000, 3001]) for _ in procedure_admission_ids]
procedure_clinicians = [random.choice(range(1, 10)) for _ in procedure_admission_ids]

df_procedures = pd.DataFrame(OrderedDict([
    ('hadm_id', procedure_admission_ids),
    ('icd9_code', procedure_codes),
    ('clinician_id', procedure_clinicians),
]))

In [121]:
# 0-3 diagnoses per admission (and therefore per patient, since each patient
# has exactly one admission).
#
# Each admission draws a 3-bit mask; a set bit means the admission carries the
# diagnosis code paired with that bit. All diagnoses of one admission share the
# same clinician.
diagnosis_bits = ((1, 10000), (2, 10001), (4, 10002))
hadm_ids, icd9_codes, clinician_ids = [], [], []

for hadm_id in range(50001, 50401):
    clinician_id = random.choice(range(1, 10))
    # The mask must span 0..7: 7 (0b111) is the only value that yields all
    # three diagnoses, so range(7) would cap every admission at two.
    mask = random.choice(range(8))

    for bit, code in diagnosis_bits:
        if mask & bit:
            hadm_ids.append(hadm_id)
            icd9_codes.append(code)
            clinician_ids.append(clinician_id)

df_diagnoses = pd.DataFrame(OrderedDict([
    ('hadm_id', hadm_ids),
    ('icd9_code', icd9_codes),
    ('clinician_id', clinician_ids),
]))

In [148]:
# Three chart events per admission, in this order: one weight (item 2000)
# followed by two blood-pressure readings (item 2001), time-stamped 02:00,
# 03:00 and 04:00 on 2018-01-01.
chart_admission_ids = range(50001, 50401)
event_item_ids = [2000, 2001, 2001]
event_times = [datetime.datetime(2018, 1, 1, hour) for hour in (2, 3, 4)]

# One (weight, first bp, second bp) triple per admission, drawn left to right.
# NOTE(review): the lower bound of 4 on the first bp reading looks like it may
# be a typo for 40 - confirm before relying on these values.
value_triples = [
    (
        random.choice(range(40, 120)),
        random.choice(range(4, 70)),
        random.choice(range(70, 110)),
    )
    for _ in chart_admission_ids
]

df_chartevents = pd.DataFrame(OrderedDict([
    ('hadm_id', [hadm_id for hadm_id in chart_admission_ids for _ in event_item_ids]),
    ('item_id', event_item_ids * len(chart_admission_ids)),
    ('value', [value for triple in value_triples for value in triple]),
    ('datetime', event_times * len(chart_admission_ids)),
]))

In [153]:
# Number of columns in the patients table (second entry of the frame's shape).
df_patients.shape[1]

8

In [157]:
# Preview the first rows of the simulated patients table.
df_patients.head()

Unnamed: 0,patient_id,name,dob,gender,race,postal_code,occupation,phone
0,1,Tammy Tomasino,1974-01-01 01:00:00,female,black,42956,builder,17815298080
1,2,Leo Olmstead,1965-01-01 01:00:00,male,hispanic,40066,builder,17814384900
2,3,Joel Felix,1993-01-01 01:00:00,male,asian,40277,engineer,17816534216
3,4,Elbert Rivera,1988-01-01 01:00:00,male,black,43453,builder,17814220451
4,5,Evan Quero,1985-01-01 01:00:00,male,white,40369,engineer,17816405750


In [None]:
# 5 in binary is 101

In [110]:
# Scratch check of bitwise AND: 2 & 2 evaluates to 2, which is truthy.
2 & 2

2

In [166]:
# Write the simulated EHR tables to a fresh SQLite database file.
db_path = 'data/example-data-lake/ehr/ehr.db'

# Start from a clean file so the CREATE TABLE statements below cannot collide
# with tables left by a previous run. On a first run neither the directory nor
# the file exists yet, so create the former and only remove the latter if present.
os.makedirs(os.path.dirname(db_path), exist_ok=True)
if os.path.exists(db_path):
    os.remove(db_path)

# The connection object represents the database
conn = sqlite3.connect(db_path)
try:
    c = conn.cursor()

    # Create tables
    c.execute('''CREATE TABLE patients
                 (patient_id integer primary key, name text, dob date, gender text,
                  race text, postal_code integer, occupation text, phone integer);''')

    c.execute('''CREATE TABLE admissions
                 (hadm_id integer primary key, patient_id integer,
                  intime datetime, outtime datetime);''')

    c.execute('''CREATE TABLE procedures
                 (hadm_id integer, icd9_code integer, clinician_id integer);''')

    c.execute('''CREATE TABLE diagnoses
                 (hadm_id integer, icd9_code integer, clinician_id integer);''')

    c.execute('''CREATE TABLE chartevents
                 (hadm_id integer, item_id integer, value integer, datetime datetime);''')

    # Insert data. if_exists='append' makes pandas insert into the tables
    # created above instead of replacing them with its own inferred schema.
    for table_name, frame in (
        ('patients', df_patients),
        ('admissions', df_admissions),
        ('procedures', df_procedures),
        ('diagnoses', df_diagnoses),
        ('chartevents', df_chartevents),
    ):
        frame.to_sql(table_name, conn, index=False, if_exists='append')

    # Save (commit) the changes
    conn.commit()
finally:
    # Always release the file handle, even if a statement above failed.
    conn.close()

In [167]:
# Preview the first rows of the simulated patients table.
df_patients.head()

Unnamed: 0,patient_id,name,dob,gender,race,postal_code,occupation,phone
0,1,Tammy Tomasino,1974-01-01 01:00:00,female,black,42956,builder,17815298080
1,2,Leo Olmstead,1965-01-01 01:00:00,male,hispanic,40066,builder,17814384900
2,3,Joel Felix,1993-01-01 01:00:00,male,asian,40277,engineer,17816534216
3,4,Elbert Rivera,1988-01-01 01:00:00,male,black,43453,builder,17814220451
4,5,Evan Quero,1985-01-01 01:00:00,male,white,40369,engineer,17816405750


In [3]:
# Initial content: planned layout of the simulated data lake.

# icd = International Classification of Diseases

# 1. EHR
# 1a. patients
# NOTE(review): 'email' is planned here, but the simulated patients table built
# in this notebook has no email column - confirm whether it is still wanted.
df1 = ['patient_id', 'name', 'dob', 'gender', 'race', 'postal_code', 'occupation',
       'phone', 'email']

# 1b. admissions
df = ['patient_id', 'hadm_id', 'intime', 'outtime']

# 1c. procedures
df = ['hadm_id', 'icd9_code', 'clinician_id']
d_procedures = {3000:'Carotid endarterectomy'}

# 1d. diagnoses_icd
df = ['hadm_id', 'icd9_code', 'clinician_id']
# Keys are the icd9_code values generated for the diagnoses table (10000-10002);
# the 5xxxx range is used for hadm_id.
d_diagnoses = {10000:'diabetes', 10001:'hiv', 10002:'cancer'}

# 1e. chartevents (column order matches the chartevents table)
df = ['hadm_id', 'item_id', 'value', 'datetime']
d_chartevents = {2000:'weight', 2001:'abp'}


# 2. Numerics of bp in blood-pressure/<patient_id>-<date>.csv. Both before and after surg.
# Aggregated hourly data. One for each day. May not contain all 24 hours.
"""
hour,bp_min,bp_max
0,70,72
1
2
...
23
...
"""

# 3. Medications claims (files) in claims/NNNNNN.claim
"""
Datetime: 
Patient ID: 
Pharmacy: 
Drug: dexamethasone/erlotinib/other
"""

# 4. Patient followup in secure-chat/<patient_id>-<clinician_id>-<datetime>.txt
"""
Patient feels limb pain, limb coldness, and vision changes
"""



'\nPatient feels limb pain, limb coldness, and vision changes\n'