In [2]:
import datetime
import os
import sqlite3

import pandas as pd

# Data Harmonization


In [None]:
# Collected Data
#
# Reference description of the three raw sources that are combined below.
# The lists name the columns of each table; nothing here touches the data.

# 1. EHR tables in a relational database. Stored in `ehr/ehr.db`.
patients = [
    'patient_id',
    'name',
    'dob',
    'gender',
    'race',
    'postal_code',
    'occupation',
    'phone',
]

admissions = [
    'patient_id',
    'hadm_id',
    'intime',
    'outtime',
]

procedures = [
    'hadm_id',
    'icd9_code',
    'clinician_id',
]

diagnoses = [
    'hadm_id',
    'icd9_code',
    'clinician_id',
]

chartevents = [
    'hadm_id',
    'item_id',
    'value',
    'datetime',
]

# Lookup dictionaries: numeric code -> label.
# NOTE(review): presumably keyed by `icd9_code` (procedures, diagnoses) and
# `item_id` (chartevents) - confirm against the EHR database.
d_procedures = {
    3000: 'Carotid endarterectomy',
}
d_diagnoses = {
    10000: 'diabetes',
    10001: 'hiv',
    10002: 'cancer',
}
d_chartevents = {
    2000: 'weight',
    2001: 'abp',
}


# 2. Streamed blood pressure numerics from a wearable device.
# Stored in csv files: `bp-stream/<patient_id>-<YYYY>-<MM>-<DD>.csv`.
# Aggregated hourly data. Each patient has recordings both before and after
# their procedure.

# Example format:
"""
hour,bp_min,bp_max
0,70,72
...
23,45,60
"""

# 3. Pharmacy orders. Stored in text files: `pharmacy/NNNNNN.txt`.

# Example format:
"""
Date:YYYY-MM-DD
Patient ID: 
Pharmacy: 
Drug: dexamethasone/erlotinib/other
"""



In [None]:
# Research database tables
#
# Target schema of the shareable research database. Compared with the raw
# sources, directly identifying columns are dropped and the two file-based
# sources become ordinary tables.

# EHR `patients` without `name` and `phone`.
patients = [
    'patient_id',
    'dob',
    'gender',
    'race',
    'postal_code',
    'occupation',
]

# Carried over unchanged.
admissions = [
    'patient_id',
    'hadm_id',
    'intime',
    'outtime',
]

# EHR `procedures` without `clinician_id`.
procedures = [
    'hadm_id',
    'icd9_code',
]

# EHR `diagnoses` without `clinician_id`.
diagnoses = [
    'hadm_id',
    'icd9_code',
]

# Carried over unchanged.
chartevents = [
    'hadm_id',
    'item_id',
    'value',
    'datetime',
]

# Blood pressure csv files converted to a relational table.
bpstream = [
    'patient_id',
    'datetime',
    'bp_min',
    'bp_max',
]

# Pharmacy text files converted to a relational table.
medications = [
    'patient_id',
    'datetime',
    'drug',
]


## Task: Combine the disparate data sources to create a single, usable, shareable research database

In [3]:
# Step 1 - Get the EHR data

# Read every EHR table into its own dataframe. The connection is closed in a
# `finally` block so that a failing query (missing table, locked file, ...)
# does not leave the database handle open in the kernel.
conn_ehr = sqlite3.connect('data/example-data-lake/ehr/ehr.db')
try:
    df_patients = pd.read_sql_query('select * from patients;', conn_ehr)
    df_admissions = pd.read_sql_query('select * from admissions;', conn_ehr)
    df_procedures = pd.read_sql_query('select * from procedures;', conn_ehr)
    df_diagnoses = pd.read_sql_query('select * from diagnoses;', conn_ehr)
    df_chartevents = pd.read_sql_query('select * from chartevents;', conn_ehr)
finally:
    conn_ehr.close()

# Remove the columns that must not reach the research database.
# `drop` raises KeyError if a column is missing, so a schema change in the
# EHR is noticed here instead of silently leaking or losing a column.
df_patients = df_patients.drop(columns=['name', 'phone'])
df_procedures = df_procedures.drop(columns=['clinician_id'])
df_diagnoses = df_diagnoses.drop(columns=['clinician_id'])


In [7]:
# Step 2 - Get the hourly blood pressure stream
file_dir = 'data/example-data-lake/bp-stream'

# Holds all the dataframes from the bp files
bp_dataframes = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the combined table reproducible across runs and machines.
for file in sorted(os.listdir(file_dir)):
    # Split off the extension properly. `str.strip('.csv')` removes any of the
    # characters '.', 'c', 's', 'v' from BOTH ends of the name, which only
    # works by accident when the name starts and ends with digits.
    stem, ext = os.path.splitext(file)
    if ext.lower() != '.csv':
        # Skip anything that is not a recording (hidden files, checkpoints, ...)
        continue

    # The file name contains information: <patient_id>-<year>-<month>-<day>
    patient_id, year, month, day = stem.split('-')
    year, month, day = int(year), int(month), int(day)

    # Read the csv file
    df = pd.read_csv(os.path.join(file_dir, file))

    # Add the necessary columns. Building each timestamp through the datetime
    # constructor also rejects an `hour` value outside 0-23.
    df['datetime'] = [datetime.datetime(year, month, day, h) for h in df['hour']]
    df['patient_id'] = int(patient_id)
    df = df.loc[:, ['patient_id', 'datetime', 'bp_min', 'bp_max']]

    bp_dataframes.append(df)

# Concatenate all the streamed bp data
df_bpstream = pd.concat(bp_dataframes, ignore_index=True)

In [5]:
# Step 3 - Pharmacy data

def read_pharm_file(filename):
    """
    Read a pharmacy claim file.

    The file is expected to hold one `Key: value` field per line, in this
    fixed order: date (YYYY-MM-DD), patient id, pharmacy, drug. Fields are
    located by line position, not by key name.

    Parameters
    ----------
    filename : str
        Path of the pharmacy file to read.

    Returns
    -------
    tuple
        `(patient_id, datetime, drug)` where `patient_id` is an int,
        `datetime` is a `datetime.datetime` at midnight of the order date and
        `drug` is the drug name without surrounding whitespace.

    Raises
    ------
    IndexError
        If the file has fewer than four lines.
    ValueError
        If the patient id or the date cannot be parsed.
    """
    with open(filename) as f:
        lines = [line.strip() for line in f]

    def field_value(index):
        # Split on the FIRST colon only, so a value that itself contains ':'
        # is kept whole, and strip the blank that follows the colon in
        # `Key: value` lines so it does not end up in the stored value.
        return lines[index].split(':', 1)[-1].strip()

    pid = int(field_value(1))
    year, month, day = field_value(0).split('-')
    dt = datetime.datetime(int(year), int(month), int(day))
    drug = field_value(3)

    return pid, dt, drug
        
file_dir = 'data/example-data-lake/pharmacy'

# Holds one (patient_id, datetime, drug) tuple per pharmacy file
pharm_data = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the medications table reproducible across runs and machines.
for file in sorted(os.listdir(file_dir)):
    path = os.path.join(file_dir, file)
    if not os.path.isfile(path):
        # Sub-directories (e.g. notebook checkpoints) cannot be opened as a file
        continue
    # Read the file. The filename contains no information.
    data = read_pharm_file(path)
    pharm_data.append(data)

df_medications = pd.DataFrame(pharm_data, columns=['patient_id', 'datetime', 'drug'])

In [8]:
# Now we have all the tables we need. Write them to the research database.
db_path = 'data/allhealth.db'

# Start from an empty file: `to_sql` is called with its default `if_exists`
# behaviour, which refuses to write into a table that already exists, so a
# leftover database from an earlier run would make this cell fail.
if os.path.isfile(db_path):
    os.remove(db_path)

# Table name in the research database -> dataframe holding its rows
research_tables = {
    'patients': df_patients,
    'admissions': df_admissions,
    'procedures': df_procedures,
    'diagnoses': df_diagnoses,
    'chartevents': df_chartevents,
    'bpstream': df_bpstream,
    'medications': df_medications,
}

conn = sqlite3.connect(db_path)
try:
    # Insert data (pandas drives the connection itself; no cursor is needed)
    for table_name, table_df in research_tables.items():
        table_df.to_sql(table_name, conn, index=False)

    # Save (commit) the changes
    conn.commit()
finally:
    # Always release the file handle, even if one of the inserts failed.
    conn.close()
