# Data Preparation - MIMIC-IV Demo Dataset (Community Care)

This notebook reads MIMIC-IV Demo CSV files from MinIO and converts them to Parquet format for community care integration.

**Purpose**: Prepare PhysioNet MIMIC-IV Demo data for Option C (Concurrent Care) integration  
**Source**: `med-sandbox/mimic-data/hosp/*.csv`  
**Destination**: `med-data/v1_raw/mimic/*.parquet`

## MIMIC-IV Tables

- **prescriptions.csv** - Medication orders (analogous to RxOut)
- **pharmacy.csv** - Pharmacy dispensing records
- **emar.csv** - Medication administrations (analogous to BCMA)
- **patients.csv** - Patient demographics
- **admissions.csv** - Hospital admissions

**Note**: emar_detail.csv was corrupt and is excluded from this integration.

In [None]:
# Import dependencies

import io
import os
import sys
import logging
import time
import boto3
import pandas as pd
import s3fs
import pyarrow as pa
import pyarrow.parquet as pq
from dotenv import load_dotenv
from importlib.metadata import version
from config import *

In [None]:
# Verify that dependencies are available for use

def print_version():
    """Print the installed version of each key dependency, one per line."""
    # Each lookup is deferred so the versions are resolved (and printed)
    # strictly in order, exactly one dependency at a time.
    version_lookups = {
        "boto3:": lambda: boto3.__version__,
        "pandas:": lambda: pd.__version__,
        "s3fs:": lambda: s3fs.__version__,
        "pyarrow:": lambda: pa.__version__,
        "dotenv:": lambda: version("python-dotenv"),
    }
    for label, lookup in version_lookups.items():
        print(label, lookup())


print_version()

In [None]:
# Set up logging

# force=True makes basicConfig remove *and close* every handler already
# attached to the root logger before installing the new one. Re-running this
# cell therefore never produces duplicate log lines and never leaves a stale
# handler open.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",  # timestamp, level, message
    force=True,
)

# Emit one record so the reader can confirm the level and format took effect
logging.info("Logging configured successfully")

In [None]:
# Report the settings provided by the config module (endpoint, source and
# destination locations, community care identifiers)

config_summary = (
    f"Configuration loaded: MinIO endpoint={MINIO_ENDPOINT}",
    f"Source: s3://{SOURCE_BUCKET}/{SOURCE_MIMIC_PATH}",
    f"Destination: s3://{DEST_BUCKET}/{V1_RAW_MIMIC_PREFIX}",
    f"Community care Sta3n: {COMMUNITY_CARE_STA3N}",
    f"Community care source system: {COMMUNITY_CARE_SOURCE}",
)
for summary_line in config_summary:
    logging.info(summary_line)

In [None]:
# Create S3 client for MinIO

def create_s3_client():
    """
    Build a boto3 S3 client pointed at the MinIO server.

    The endpoint is taken from MINIO_ENDPOINT (reached over plain HTTP) and
    the credentials from MINIO_ACCESS_KEY / MINIO_SECRET_KEY.

    Returns:
        The boto3 S3 client bound to the MinIO endpoint.
    """
    logging.info(f"Creating MinIO S3 client at {MINIO_ENDPOINT}")
    client_settings = {
        'endpoint_url': f"http://{MINIO_ENDPOINT}",
        'aws_access_key_id': MINIO_ACCESS_KEY,
        'aws_secret_access_key': MINIO_SECRET_KEY,
        'region_name': 'us-east-1',  # Required but not used by MinIO
    }
    return boto3.client('s3', **client_settings)


# Instantiate the client that later cells use for object metadata lookups
s3 = create_s3_client()
logging.info(f"S3 client created successfully")
logging.info(f"Client type: {type(s3)}")

In [None]:
# Build the s3fs filesystem object handed to pandas/pyarrow for Parquet I/O

logging.info(f"Initializing S3FileSystem for MinIO at {MINIO_ENDPOINT}")

# Route every request to the MinIO endpoint instead of AWS
minio_client_kwargs = {'endpoint_url': f"http://{MINIO_ENDPOINT}"}

fs = s3fs.S3FileSystem(
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    anon=False,  # authenticate with the key/secret above
    client_kwargs=minio_client_kwargs,
)

logging.info("S3FileSystem created successfully")

## Load MIMIC-IV CSV Files

Load all MIMIC-IV Demo CSV files from med-sandbox bucket.  
**Note**: emar_detail.csv was corrupt and is excluded from this integration.

In [None]:
# MIMIC-IV Demo tables to convert: table name -> source CSV file name
# (emar_detail.csv was corrupt, so it is left out)

mimic_files = {
    "prescriptions": "prescriptions.csv",
    "pharmacy": "pharmacy.csv",
    "emar": "emar.csv",
    "patients": "patients.csv",
    "admissions": "admissions.csv",
}

logging.info(f"Processing {len(mimic_files)} MIMIC-IV Demo files")
for table_name in mimic_files:
    filename = mimic_files[table_name]
    logging.info(f"  - {table_name}: {filename}")

In [None]:
# Load each MIMIC CSV file into a DataFrame

# The connection settings are the same for every file, so build them once
# instead of on every loop iteration.
csv_storage_options = {
    'key': MINIO_ACCESS_KEY,
    'secret': MINIO_SECRET_KEY,
    'client_kwargs': {'endpoint_url': f"http://{MINIO_ENDPOINT}"}
}

mimic_data = {}  # table name -> DataFrame with that table's rows

# perf_counter() is a monotonic clock intended for measuring durations;
# unlike time.time() it cannot jump if the system clock is adjusted.
total_start_time = time.perf_counter()

for table_name, filename in mimic_files.items():
    csv_uri = f"s3://{SOURCE_BUCKET}/{SOURCE_MIMIC_PATH}{filename}"
    logging.info(f"Reading {table_name} from {csv_uri}...")

    start_time = time.perf_counter()

    # pandas resolves the s3:// URI through s3fs using these storage options
    mimic_data[table_name] = pd.read_csv(
        csv_uri,
        storage_options=csv_storage_options
    )

    elapsed = time.perf_counter() - start_time
    rows = len(mimic_data[table_name])
    cols = len(mimic_data[table_name].columns)
    logging.info(f"  ✓ Loaded {rows:,} rows, {cols} columns in {elapsed:.2f}s")

total_elapsed = time.perf_counter() - total_start_time
logging.info(f"All {len(mimic_files)} files loaded in {total_elapsed:.2f}s")

## Data Quality Summary

Display overview of loaded MIMIC-IV Demo data.

In [None]:
# Display MIMIC-IV Demo dataset summary

print("\n" + "=" * 80)
print("MIMIC-IV DEMO DATASET SUMMARY")
print("=" * 80)

# One line per loaded table: name, row count, column count
for table_name, df in mimic_data.items():
    print(f"{table_name.upper():15} {len(df):>8,} rows  {len(df.columns):>3} columns")

print("=" * 80)

# Display key statistics
logging.info("\nKey Statistics:")
logging.info(f"  Unique patients with prescriptions: {mimic_data['prescriptions']['subject_id'].nunique()}")
logging.info(f"  Unique patients in demographics: {mimic_data['patients']['subject_id'].nunique()}")
logging.info(f"  Total hospital admissions: {len(mimic_data['admissions']):,}")

# Display date ranges (MIMIC uses shifted dates 2100-2200)
if 'starttime' in mimic_data['prescriptions'].columns:
    # The CSV was read without date parsing, so the column holds text.
    # Convert it first so min/max compare real timestamps and skip missing
    # or unparseable values (coerced to NaT) instead of comparing strings.
    # NOTE(review): string min/max may raise when NaN is mixed in - confirm
    # against the pandas version in use.
    start_times = pd.to_datetime(mimic_data['prescriptions']['starttime'], errors='coerce')
    start_date = start_times.min()
    end_date = start_times.max()
    logging.info(f"  Prescription date range: {start_date} to {end_date}")
    logging.info("  Note: MIMIC-IV uses shifted dates (2100-2200) for privacy")

In [None]:
# Preview the first rows of the prescriptions table under a banner heading

banner = "=" * 80
print("\n" + banner)
print("SAMPLE PRESCRIPTION DATA (First 5 rows)")
print(banner)

prescriptions_preview = mimic_data['prescriptions'].head()
display(prescriptions_preview)

In [None]:
# Preview the first rows of the patients table under a banner heading

banner = "=" * 80
print("\n" + banner)
print("SAMPLE PATIENT DEMOGRAPHICS (First 5 rows)")
print(banner)

patients_preview = mimic_data['patients'].head()
display(patients_preview)

## Write Parquet Files to v1_raw/mimic/

Convert all MIMIC CSV files to Parquet format and write to med-data bucket.

In [None]:
# Write each DataFrame to Parquet format

logging.info("\nWriting MIMIC data to Parquet format...")

# perf_counter() is a monotonic clock intended for measuring durations
total_start_time = time.perf_counter()

for table_name, df in mimic_data.items():
    # Destination object: <prefix><table name>.parquet in the destination bucket
    parquet_filename = f"{table_name}.parquet"
    parquet_uri = f"s3://{DEST_BUCKET}/{V1_RAW_MIMIC_PREFIX}{parquet_filename}"

    logging.info(f"Writing {table_name} to {parquet_uri}...")
    start_time = time.perf_counter()

    # Write to Parquet with compression
    df.to_parquet(
        parquet_uri,
        engine='pyarrow',
        filesystem=fs,
        compression='snappy',
        index=False  # Don't write DataFrame index
    )

    elapsed = time.perf_counter() - start_time
    logging.info(f"  ✓ Written {len(df):,} rows in {elapsed:.2f}s")

total_elapsed = time.perf_counter() - total_start_time
# Report the number of frames actually written (mimic_data), not the number
# of files that were merely configured.
logging.info(f"\n✅ All {len(mimic_data)} Parquet files written in {total_elapsed:.2f}s")
logging.info(f"Data location: s3://{DEST_BUCKET}/{V1_RAW_MIMIC_PREFIX}")

## Verify Parquet Writes

Read back each Parquet file and confirm that its structure (row and column counts) matches the DataFrame it was written from.

In [None]:
# Verify each Parquet file by reading back

logging.info("\nVerifying Parquet files...")

# One tuple per table: (table name, passed?, rows read back, columns read back)
verification_results = []

for table_name, original_df in mimic_data.items():
    parquet_filename = f"{table_name}.parquet"
    parquet_uri = f"s3://{DEST_BUCKET}/{V1_RAW_MIMIC_PREFIX}{parquet_filename}"

    logging.info(f"Verifying {table_name}...")
    # perf_counter() is a monotonic clock intended for measuring durations
    start_time = time.perf_counter()

    # Read back from Parquet
    df_verify = pd.read_parquet(parquet_uri, filesystem=fs)
    elapsed = time.perf_counter() - start_time

    # Compare row counts, and compare column *names and order* rather than
    # just the column count, so a renamed or reordered column is caught too.
    rows_match = len(df_verify) == len(original_df)
    cols_match = list(df_verify.columns) == list(original_df.columns)

    if rows_match and cols_match:
        logging.info(f"  ✓ Verified: {len(df_verify):,} rows, {len(df_verify.columns)} columns ({elapsed:.2f}s)")
        verification_results.append((table_name, True, len(df_verify), len(df_verify.columns)))
    else:
        logging.error(f"  ✗ Mismatch detected for {table_name}!")
        logging.error(f"    Original: {len(original_df)} rows, {len(original_df.columns)} cols")
        logging.error(f"    Verified: {len(df_verify)} rows, {len(df_verify.columns)} cols")
        if not cols_match:
            logging.error(f"    Original columns: {list(original_df.columns)}")
            logging.error(f"    Verified columns: {list(df_verify.columns)}")
        verification_results.append((table_name, False, len(df_verify), len(df_verify.columns)))

# Summary: True only when every table passed
all_verified = all(passed for _, passed, _, _ in verification_results)
if all_verified:
    logging.info("\n✅ All files verified successfully!")
else:
    logging.error("\n❌ Verification failed for some files!")

## File Size Comparison

Compare CSV vs Parquet file sizes to show compression benefits.

In [None]:
# Compare CSV vs Parquet file sizes

def _object_size_mb(bucket, key):
    """Return the size of the object at bucket/key in MB (1 MB = 1024**2 bytes)."""
    # head_object fetches metadata only; the object body is not downloaded
    return s3.head_object(Bucket=bucket, Key=key)['ContentLength'] / (1024**2)


print("\n" + "=" * 80)
print("FILE SIZE COMPARISON (CSV vs Parquet)")
print("=" * 80)
print(f"{'Table':<15} {'CSV (MB)':>10} {'Parquet (MB)':>12} {'Compression':>12}")
print("-" * 80)

# Running totals in MB; the final summary cell reads these
total_csv_size = 0
total_parquet_size = 0

for table_name, filename in mimic_files.items():
    # Get CSV file size
    csv_size_mb = _object_size_mb(SOURCE_BUCKET, f"{SOURCE_MIMIC_PATH}{filename}")
    total_csv_size += csv_size_mb

    # Get Parquet file size
    parquet_size_mb = _object_size_mb(DEST_BUCKET, f"{V1_RAW_MIMIC_PREFIX}{table_name}.parquet")
    total_parquet_size += parquet_size_mb

    # Percentage by which Parquet is smaller than CSV (0 for an empty CSV,
    # which also avoids dividing by zero)
    compression_ratio = (1 - parquet_size_mb / csv_size_mb) * 100 if csv_size_mb > 0 else 0

    print(f"{table_name:<15} {csv_size_mb:>10.2f} {parquet_size_mb:>12.2f} {compression_ratio:>11.1f}%")

# Calculate total compression
total_compression = (1 - total_parquet_size / total_csv_size) * 100 if total_csv_size > 0 else 0

print("-" * 80)
print(f"{'TOTAL':<15} {total_csv_size:>10.2f} {total_parquet_size:>12.2f} {total_compression:>11.1f}%")
print("=" * 80)

logging.info(f"\nStorage savings: {total_compression:.1f}% ({total_csv_size:.2f} MB → {total_parquet_size:.2f} MB)")

## Summary

Data preparation complete. MIMIC-IV Demo data is now available in Parquet format for community care integration.

In [None]:
# Final summary

print("\n" + "=" * 80)
print("DATA PREPARATION SUMMARY - MIMIC-IV DEMO")
print("=" * 80)
print(f"Source:       s3://{SOURCE_BUCKET}/{SOURCE_MIMIC_PATH}")
print(f"Destination:  s3://{DEST_BUCKET}/{V1_RAW_MIMIC_PREFIX}")
print()
print("Files processed:")
for table_name, df in mimic_data.items():
    print(f"  {table_name:<15} {len(df):>8,} rows  {len(df.columns):>3} columns")
print()
print(f"Total CSV size:     {total_csv_size:.2f} MB")
print(f"Total Parquet size: {total_parquet_size:.2f} MB ({total_compression:.1f}% reduction)")
print()
# Report the outcome of the read-back verification (all_verified) instead of
# claiming success unconditionally.
if all_verified:
    print("Status:       ✅ Complete")
else:
    print("Status:       ❌ Verification failed")
print("=" * 80)
print("\n📋 Next Steps:")
print("   1. Run 01e_mimic_patient_selection.ipynb to integrate community care with VA medications")
print("   2. This will create concurrent care scenarios (Option C)")
print("   3. Then re-run notebooks 02-06 for complete analysis with dual-source data")