In [1]:
# Install required packages (the data download happens in a separate cell)
%pip install -q -r requirements.txt

# Load environment variables with python-dotenv (presumably from a local .env file)
from dotenv import load_dotenv
load_dotenv()

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
data_get.sh: 5: wget: not found
data_get.sh: 8: wget: not found
data_get.sh: 11: wget: not found


In [None]:
# Download data
# NOTE: data_get.sh invokes `wget`; if it is not installed the script prints
# "wget: not found". `!` does not raise on a failed command, so check the
# output of this cell before running the import cells.
!sh data_get.sh

In [None]:
# Import required packages
import duckdb
import pandas as pd
from os import listdir 

# Load SQL extension
%load_ext sql
# SqlMagic options (meanings per the ipython-sql docs; verify against the installed version):
#   autopandas - return query results as pandas DataFrames
#   feedback   - print the rows-affected message after a statement
#   displaycon - echo the connection string after each statement
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect to DuckDB (database file cerulean.db; presumably resolved relative to the working directory)
%sql duckdb:///cerulean.db

In [None]:
# Extract the column names and data_types for each table from manual load (not shown)
# %sql columns << SELECT table_schema, table_name, column_name, data_type FROM information_schema.columns;

In [None]:
# Read the column data types that were saved from the manual load
raw_col = pd.read_csv('column_schema.csv', sep='\t')

# Render every column as a `'name': 'type'` entry
raw_col['param'] = "'" + raw_col['column'] + "': '" + raw_col['data_type'] + "'"

# Collapse the entries into a single `columns={...}` argument per schema.table
df_col = (
    raw_col
    .groupby(['schema', 'table'])['param']
    .agg(", ".join)
    .reset_index()
)
df_col['param'] = 'columns={' + df_col['param'] + '}'

# Build the lookup dict; keys are (schema, table) tuples
col = df_col.set_index(['schema', 'table'])['param'].to_dict()

# Use `columns={...}` when the table is known, otherwise fall back to AUTO_DETECT
# e.g.: col.get(('mimiciv_hosp', 'pharmacy'), 'AUTO_DETECT=TRUE')

In [None]:
# Define a function to create a schema for each data source
# and a table for each file (*.csv.gz) in its directory

def create_schema_and_tables(schema, path):
    files = listdir(path)
    %sql CREATE SCHEMA IF NOT EXISTS {schema};
    for file in files:
        if file.endswith('.csv.gz'):
            table = file.split('.')[0]
            param = col.get((schema, table), 'AUTO_DETECT=TRUE')
            %sql DROP TABLE IF EXISTS {schema}.{table};
            %sql CREATE TABLE {schema}.{table} AS SELECT * FROM read_csv('{path}{file}', header=True, {param} );
    %sql result << SELECT table_schema, table_name FROM information_schema.tables WHERE table_schema = '{schema}';
    return result

In [None]:
# Import MIMIC-IV Hospital
# Loads every *.csv.gz file under `path` into the `mimiciv_hosp` schema;
# the returned table listing is shown as the cell output.
schema = 'mimiciv_hosp'
path = 'physionet.org/files/mimiciv/2.2/hosp/'

create_schema_and_tables(schema, path)

In [None]:
# Import MIMIC-IV ICU
# Loads every *.csv.gz file under `path` into the `mimiciv_icu` schema;
# the returned table listing is shown as the cell output.
schema = 'mimiciv_icu'
path = 'physionet.org/files/mimiciv/2.2/icu/'

create_schema_and_tables(schema, path)

In [None]:
# Import MIMIC-IV Notes
# Loads every *.csv.gz file under `path` into the `mimiciv_note` schema;
# the returned table listing is shown as the cell output.
schema = 'mimiciv_note'
path = 'physionet.org/files/mimic-iv-note/2.2/note/'

create_schema_and_tables(schema, path)