In [None]:
# Install dependencies - this only needs to be run once per environment.

# Install the Python packages listed in requirements.txt
# (-q keeps pip's output quiet).
%pip install -q -r requirements.txt

# Install required command line tools: uncomment the line that matches your system.
# apt-based systems (e.g. Debian, Ubuntu, Linux Mint):
# !apt-get install -y wget

# macOS (requires Homebrew from https://brew.sh/):
# !brew install wget

In [3]:
# Import packages
# import duckdb
import pandas as pd
from os import listdir, getenv
from dotenv import load_dotenv

# Load variables from a .env file (python-dotenv) so that the getenv()
# calls below can read them.
load_dotenv()

# Physionet credentials and the data directory, read from the environment.
# getenv() returns None for any variable that is not set.
phys_pass = getenv('PHYSIONET_PASSWORD')
phys_user = getenv('PHYSIONET_USERNAME')
data_dir  = getenv('DATA_DIR')

# NOTE(review): this cell does not open a DuckDB connection - the duckdb
# import above is commented out and no connection string is passed to %sql -
# yet later cells run %sql queries. Confirm where the connection is set up.


# Load the SQL magic extension and configure it.
%load_ext sql
# Presumably makes %sql return pandas DataFrames (later cells call .shape
# and index columns on the results) - TODO confirm.
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [10]:
# List the schema and name of every table in information_schema.tables and
# store the result in db_shape, then display it.
# This cell only runs a query; it does not open a connection itself.
%sql db_shape << select table_schema, table_name from information_schema.tables
display(db_shape)

Unnamed: 0,table_schema,table_name
0,mimiciv_hosp,emar
1,mimiciv_hosp,poe_detail
2,mimiciv_hosp,pharmacy
3,mimiciv_hosp,services
4,mimiciv_hosp,diagnoses_icd
5,mimiciv_hosp,drgcodes
6,mimiciv_hosp,omr
7,mimiciv_hosp,d_hcpcs
8,mimiciv_hosp,prescriptions
9,mimiciv_hosp,emar_detail


In [24]:
# Export to parquet

# DEBUG WITH ONE TABLE
db_shape = %sql select table_schema, table_name from information_schema.tables LIMIT 1

# if db_shape.shape[0] == 35:
if db_shape.shape[0] == 1:
    for schema in db_shape['table_schema'].unique():
        # Create directory for schema
        !mkdir -p $data_dir/parquet/$schema
        for table in db_shape[db_shape['table_schema'] == schema]['table_name']:
            print("Exporting " + schema + "." + table + " to parquet")
            %sql COPY {{schema}}.{{table}} TO '{{data_dir}}/parquet/{{schema}}/{{table}}.parquet' (FORMAT PARQUET);
else:
    print("Expected 35 tables, found " + str(db_shape.shape[0]) + ". Skipping export to parquet.")
    exit(1)



Exporting mimiciv_hosp.emar to parquet


FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

In [None]:
# STOP!

## If the cells above ran successfully, you can skip the data import steps below

In [None]:
# Download data into data/ folder
# Base URL of each Physionet dataset, keyed by dataset name.
source = dict(
    mimiciv='https://physionet.org/files/mimiciv/2.2/',
    mimiciv_note='https://physionet.org/files/mimic-iv-note/2.2/',
    phenotype_annotations='https://physionet.org/files/phenotype-annotations-mimic/1.20.03/',
)

# for url in source.values():
#     !wget -r -N -c -np -P data/ --user "$phys_user" --password "$phys_pass" $url

In [None]:
# Extract columns and data_types for each table (run on a previous manual data load)
# %sql columns << SELECT table_schema, table_name, column_name, data_type FROM information_schema.columns;
# columns.to_csv('data/physionet_schema.csv', index=False)

In [None]:
# Read the saved column data types: a tab-separated file with (at least) the
# columns 'schema', 'table', 'column' and 'data_type'.
# NOTE(review): the commented-out export cell would write a comma-separated
# file with columns table_schema/table_name/column_name/data_type - confirm
# that the saved file really has the layout read here.
raw_col = pd.read_csv('data/physionet_schema.csv', sep='\t')

# One "'<column>': '<data_type>'" fragment per row
raw_col['param'] = "'" + raw_col['column'] + "': '" + raw_col['data_type'] + "'"

# Join the fragments of each (schema, table) pair and wrap them as columns={...}
df_col = (
    raw_col
    .groupby(['schema', 'table'])['param']
    .agg(', '.join)
    .reset_index()
)
df_col['param'] = 'columns={' + df_col['param'] + '}'

# Lookup dict: (schema, table) tuple -> "columns={...}" string
col = df_col.set_index(['schema', 'table'])['param'].to_dict()

In [None]:
# Define a function to create a schema for each data source
# and a table for each file (*.csv.gz) in its directory

def create_schema_and_tables(schema, path):
    files = listdir(path)
    %sql CREATE SCHEMA IF NOT EXISTS {{schema}};
    for file in files:
        if file.endswith('.csv.gz'):
            table = file.split('.')[0]
            param = col.get((schema, table), 'AUTO_DETECT=TRUE')
            %sql DROP TABLE IF EXISTS {{schema}}.{{table}};
            %sql CREATE TABLE {{schema}}.{{table}} AS SELECT * FROM read_csv('{{path}}{{file}}', header=True, {{param}} );
    %sql result << SELECT table_schema, table_name FROM information_schema.tables WHERE table_schema = '{{schema}}';
    return result

In [None]:
# Import MIMIC-IV Notes: target schema and the directory holding its files
schema, path = 'mimiciv_note', 'data/physionet.org/files/mimic-iv-note/2.2/note/'

# create_schema_and_tables(schema, path)

In [None]:
# Import MIMIC-IV ICU: target schema and the directory holding its files
schema, path = 'mimiciv_icu', 'data/physionet.org/files/mimiciv/2.2/icu/'

# create_schema_and_tables(schema, path)

In [None]:
# Import MIMIC-IV Hospital: target schema and the directory holding its files
schema, path = 'mimiciv_hosp', 'data/physionet.org/files/mimiciv/2.2/hosp/'

# create_schema_and_tables(schema, path)

In [None]:
# Schemas to (re)load, mapped to the directory holding their *.csv.gz files.
# Uncomment an entry to include that dataset in the run.
datasets = dict(
    # mimiciv_hosp='data/physionet.org/files/mimiciv/2.2/hosp/',
    # mimiciv_icu='data/physionet.org/files/mimiciv/2.2/icu/',
    mimiciv_note='data/physionet.org/files/mimic-iv-note/2.2/note/',
)

# Create each schema and its tables in turn
for schema in datasets:
    path = datasets[schema]
    create_schema_and_tables(schema, path)