In [65]:
# Simulating data for the data lake

from collections import OrderedDict
import datetime
import json
import os
import random
import sqlite3
import string
import struct

import names
import pandas as pd

In [63]:
# [random.choice(range(17810000000, 17819999999)) for _ in range(100)]

In [None]:
# Planned column layout for the patients table.
# The original spelled this as a bare tuple continued on an indented second
# line, which Python rejects with "unexpected indent"; the parentheses make
# the continuation legal.
patient_columns = (
    'patient_id', 'name', 'dob', 'gender', 'race', 'postal_code', 'occupation',
    'phone', 'email',
)
patient_columns

In [86]:
# Simulate the patients table: one row per patient with randomly drawn
# demographics. The row count lives in one place instead of being repeated in
# every comprehension.
N_PATIENTS = 400

# Seed the stdlib RNG so the random.choice draws below repeat on a re-run.
# NOTE(review): whether names.get_full_name also draws from this RNG is not
# visible here -- confirm if fully reproducible names are required.
random.seed(42)

genders = [random.choice(['male', 'female']) for _ in range(N_PATIENTS)]
patient_ids = range(1, N_PATIENTS + 1)
patient_names = [names.get_full_name(gender=g) for g in genders]

df_patients = pd.DataFrame(OrderedDict([
    ('patient_id', patient_ids),
    ('name', patient_names),
    # Every date of birth is January 1st, 01:00, of a random year in 1950-1999.
    ('dob', [datetime.datetime(random.choice(range(1950, 2000)), 1, 1, 1) for _ in range(N_PATIENTS)]),
    ('gender', genders),
    ('race', [random.choice(['asian', 'black', 'white', 'hispanic', 'native_american']) for _ in range(N_PATIENTS)]),
    ('postal_code', [random.choice(range(40000, 45000)) for _ in range(N_PATIENTS)]),
    ('occupation', [random.choice(['builder', 'teacher', 'firefighter', 'engineer', 'doctor']) for _ in range(N_PATIENTS)]),
    ('phone', [random.choice(range(17810000000, 17819999999)) for _ in range(N_PATIENTS)]),
    # The patients schema lists an email column, which this frame previously
    # lacked. Build a synthetic address from the name; the patient_id suffix
    # keeps addresses unique when two patients share a name.
    ('email', ['{}.{}@example.com'.format(full_name.lower().replace(' ', '.'), pid)
               for pid, full_name in zip(patient_ids, patient_names)]),
]))

In [83]:
# Simulated admissions: one hospital stay per patient, with every stay sharing
# the same admission and discharge timestamps.
n_admissions = 400
admit_time = datetime.datetime(2018, 1, 1, 1)
discharge_time = datetime.datetime(2018, 2, 1, 1)

# Build the columns one at a time; the OrderedDict fixes the column order.
admission_columns = OrderedDict()
admission_columns['hadm_id'] = range(50001, 50001 + n_admissions)
admission_columns['patient_id'] = range(1, n_admissions + 1)
admission_columns['intime'] = [admit_time] * n_admissions
admission_columns['outtime'] = [discharge_time] * n_admissions

df_admissions = pd.DataFrame(admission_columns)

In [None]:
# Simulated procedures: one procedure for each of the first 100 admissions.
n_procedures = 100

df_procedures = pd.DataFrame(OrderedDict([
    ('hadm_id', range(50001, 50001 + n_procedures)),
    ('icd9_code', range(1, n_procedures + 1)),
    # clinician_id is an integer key in the procedures table; the original
    # filled it with datetime objects copied from the admissions cell.
    # NOTE(review): the 1-20 id range is a placeholder -- no clinicians table
    # is defined in this notebook; adjust once one exists.
    ('clinician_id', [random.choice(range(1, 21)) for _ in range(n_procedures)]),
]))

# Columns: 'hadm_id', 'icd9_code', 'clinician_id'

In [84]:
# Preview the first rows of the simulated admissions frame.
df_admissions.head()

Unnamed: 0,hadm_id,patient_id,intime,outtime
0,50001,1,2018-01-01 01:00:00,2018-02-01 01:00:00
1,50002,2,2018-01-01 01:00:00,2018-02-01 01:00:00
2,50003,3,2018-01-01 01:00:00,2018-02-01 01:00:00
3,50004,4,2018-01-01 01:00:00,2018-02-01 01:00:00
4,50005,5,2018-01-01 01:00:00,2018-02-01 01:00:00


In [17]:
# Preview the row-list form that gets handed to sqlite3's executemany.
# Only the first few rows are shown; dumping all 400 patients floods the cell
# output without adding information.
df_patients.head().values.tolist()

[[1, 'jack'], [2, 'jack'], [3, 'jack']]

In [11]:
# Build the example EHR SQLite database and load the simulated patients into it.
db_path = 'data/example-data-lake/ehr/ehr.db'

# Start from an empty file so the CREATE TABLE statements below do not collide
# with tables left by an earlier run. Both steps are guarded so the cell also
# works on a fresh checkout where the directory or the file does not exist yet
# (an unconditional os.remove raises in that case).
db_dir = os.path.dirname(db_path)
if not os.path.isdir(db_dir):
    os.makedirs(db_dir)
if os.path.exists(db_path):
    os.remove(db_path)

# The connection object represents the database
conn = sqlite3.connect(db_path)
try:
    c = conn.cursor()

    # Create tables
    c.execute('''CREATE TABLE patients
                 (patient_id integer primary key, name text, dob date, gender text,
                  race text, postal_code integer, occupation text, phone integer,
                  email text);''')

    c.execute('''CREATE TABLE admissions
                 (hadm_id integer primary key, patient_id integer,
                  intime datetime, outtime datetime);''')

    c.execute('''CREATE TABLE procedures
                 (hadm_id integer, icd9_code integer, clinician_id integer);''')

    c.execute('''CREATE TABLE diagnoses
                 (hadm_id, icd9_code, clinician_id);''')

    c.execute('''CREATE TABLE chartevents
                 (hadm_id, item_id, datetime);''')

    # Insert data
    # sqlite3 has no adapter for pandas Timestamp values, so datetime columns
    # are rendered as text before binding.
    patients_to_load = df_patients.copy()
    for col in patients_to_load.select_dtypes(include=['datetime']).columns:
        patients_to_load[col] = patients_to_load[col].astype(str)

    # Name the target columns explicitly and derive the placeholder count from
    # the frame, so the statement stays valid when the frame carries fewer
    # columns than the table (unnamed columns are stored as NULL). The original
    # statement targeted a nonexistent "employees" table with four placeholders.
    patient_columns = list(patients_to_load.columns)
    insert_patients_sql = 'INSERT INTO patients ({}) VALUES ({})'.format(
        ', '.join(patient_columns),
        ', '.join('?' for _ in patient_columns),
    )
    c.executemany(insert_patients_sql, patients_to_load.values.tolist())

    # Save (commit) the changes
    conn.commit()
finally:
    # Close the connection even when a statement above fails, so the database
    # file is not left open. Uncommitted changes are lost on close.
    conn.close()

In [3]:
# Initial content: planned layout of the example data lake.
#
# "icd" stands for International Classification of Diseases.

# ---- 1. EHR ----
# 1a. patients
df1 = [
    'patient_id',
    'name',
    'dob',
    'gender',
    'race',
    'postal_code',
    'occupation',
    'phone',
    'email',
]

# 1b. admissions
df = [
    'patient_id',
    'hadm_id',
    'intime',
    'outtime',
]

# 1c. procedures (column list plus an example code -> description lookup)
df = [
    'hadm_id',
    'icd9_code',
    'clinician_id',
]
d_procedures = {
    3000: 'Carotid endarterectomy',
}

# 1d. diagnoses_icd
df = [
    'hadm_id',
    'icd9_code',
    'clinician_id',
]
d_diagnoses = {
    50000: 'diabetes',
    50001: 'hiv',
    50002: 'cancer',
}

# 1e. chartevents
df = [
    'hadm_id',
    'item_id',
    'datetime',
]
d_chartevents = {
    2000: 'weight',
    2001: 'abp',
}


# ---- 2. Blood-pressure numerics ----
# Stored as blood-pressure/<patient_id>-<date>.csv, covering both before and
# after surgery. Data is aggregated hourly, one file per day, and a file may
# not contain all 24 hours. Example file body:
"""
hour,bp_min,bp_max
0,70,72
1
2
...
23
...
"""

# ---- 3. Medication claims ----
# One file per claim, stored as claims/NNNNNN.claim. Example file body:
"""
Datetime: 
Patient ID: 
Pharmacy: 
Drug: dexamethasone/erlotinib/other
"""

# ---- 4. Patient follow-up ----
# Stored as secure-chat/<patient_id>-<clinician_id>-<datetime>.txt.
# Example file body (also the value this cell displays):
"""
Patient feels limb pain, limb coldness, and vision changes
"""



'\nPatient feels limb pain, limb coldness, and vision changes\n'