# Data Preparation - DDI Dataset
This notebook reads the raw drug-drug interaction (DDI) CSV file from MinIO and writes it, with no transformations applied, as a Parquet file to `v1_raw`.

**Source**: `med-sandbox/kaggle-data/ddi/db_drug_interactions.csv`  
**Destination**: `med-data/v1_raw/ddi/db_drug_interactions.parquet`

In [None]:
# Import dependencies

import io
import os
import sys
import logging
import time
import boto3
import pandas as pd
import s3fs
import pyarrow as pa
import pyarrow.parquet as pq
from dotenv import load_dotenv
from importlib.metadata import version
from config import *

In [35]:
# Verify that dependencies are available for use

def print_version():
    """Print one "<label> <version>" line for each library the notebook relies on."""
    installed = (
        ("boto3:", boto3.__version__),
        ("pandas:", pd.__version__),
        ("s3fs:", s3fs.__version__),
        ("pyarrow:", pa.__version__),
        # python-dotenv is looked up through importlib.metadata rather than a module attribute
        ("dotenv:", version("python-dotenv")),
    )
    for label, release in installed:
        print(label, release)


print_version()

boto3: 1.41.5
pandas: 2.3.3
s3fs: 2025.10.0
pyarrow: 22.0.0
dotenv: 1.2.1


In [36]:
# Set up logging

# force=True removes *and closes* any handlers already attached to the root
# logger before applying this configuration, so re-running the cell neither
# duplicates log lines nor leaves detached handlers open.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    force=True,
)

# Test logging
logging.info("Logging configured successfully")

2025-11-28 07:22:23,161 INFO Logging configured successfully


In [37]:
# Log the settings provided by the config module (imported in the first code cell)

config_summary = (
    f"Configuration loaded: MinIO endpoint={MINIO_ENDPOINT}",
    f"Source: s3://{SOURCE_BUCKET}/{SOURCE_DDI_PATH}",
    f"Destination: s3://{DEST_BUCKET}/{V1_RAW_DDI_PREFIX}",
    f"Processing: chunk_size={CHUNK_SIZE}",
)
for summary_line in config_summary:
    logging.info(summary_line)

2025-11-28 07:22:25,458 INFO Configuration loaded: MinIO endpoint=localhost:9000
2025-11-28 07:22:25,459 INFO Source: s3://med-sandbox/kaggle-data/ddi/
2025-11-28 07:22:25,459 INFO Destination: s3://med-data/v1_raw/ddi/
2025-11-28 07:22:25,459 INFO Processing: chunk_size=100000


In [38]:
# Create S3 client for MinIO

def create_s3_client():
    """
    Build a boto3 S3 client pointed at the MinIO endpoint.

    Reads MINIO_ENDPOINT, MINIO_ACCESS_KEY and MINIO_SECRET_KEY from the
    notebook namespace. The endpoint is addressed over plain HTTP and the
    region is fixed to 'us-east-1'.

    Returns:
        The object returned by boto3.client('s3', ...).
    """
    logging.info(f"Creating MinIO S3 client at {MINIO_ENDPOINT}")
    connection_settings = {
        'endpoint_url': f"http://{MINIO_ENDPOINT}",
        'aws_access_key_id': MINIO_ACCESS_KEY,
        'aws_secret_access_key': MINIO_SECRET_KEY,
        'region_name': 'us-east-1',
    }
    return boto3.client('s3', **connection_settings)


# Instantiate the client used by later cells and log its concrete type
s3 = create_s3_client()
logging.info("S3 client created successfully")
logging.info("Client type: %s", type(s3))

2025-11-28 07:22:29,148 INFO Creating MinIO S3 client at localhost:9000
2025-11-28 07:22:29,153 INFO S3 client created successfully
2025-11-28 07:22:29,154 INFO Client type: <class 'botocore.client.S3'>


In [39]:
# Build the s3fs filesystem object that later cells pass to pandas as `filesystem=`

logging.info(f"Initializing S3FileSystem for MinIO at {MINIO_ENDPOINT}")

# Same plain-HTTP endpoint URL that the boto3 client uses
s3fs_client_kwargs = {'endpoint_url': f"http://{MINIO_ENDPOINT}"}
fs = s3fs.S3FileSystem(
    anon=False,
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    client_kwargs=s3fs_client_kwargs,
)

logging.info("S3FileSystem created successfully")

2025-11-28 07:22:31,841 INFO Initializing S3FileSystem for MinIO at localhost:9000
2025-11-28 07:22:31,842 INFO S3FileSystem created successfully


In [40]:
# Load the source CSV from the source bucket into a DataFrame and time the read

csv_filename = "db_drug_interactions.csv"
csv_uri = f"s3://{SOURCE_BUCKET}/{SOURCE_DDI_PATH}{csv_filename}"
logging.info(f"Reading CSV: {csv_uri}")

read_started = time.time()

# Credentials and endpoint that pandas forwards to s3fs via storage_options
minio_storage_options = {
    'key': MINIO_ACCESS_KEY,
    'secret': MINIO_SECRET_KEY,
    'client_kwargs': {'endpoint_url': f"http://{MINIO_ENDPOINT}"},
}
# NOTE(review): read_csv runs with default dtype/NA inference here; confirm that
# is acceptable for a copy that is meant to be "unmodified".
df_ddi = pd.read_csv(csv_uri, storage_options=minio_storage_options)

read_seconds = time.time() - read_started
logging.info(f"Successfully loaded {len(df_ddi):,} rows, {len(df_ddi.columns)} columns in {read_seconds:.2f}s")

2025-11-28 07:22:34,083 INFO Reading CSV: s3://med-sandbox/kaggle-data/ddi/db_drug_interactions.csv
2025-11-28 07:22:34,248 INFO Successfully loaded 191,541 rows, 3 columns in 0.16s


In [41]:
# Preview the first 10 rows of the loaded DataFrame

df_ddi.head(10)

Unnamed: 0,Drug 1,Drug 2,Interaction Description
0,Trioxsalen,Verteporfin,Trioxsalen may increase the photosensitizing activities of Verteporfin.
1,Aminolevulinic acid,Verteporfin,Aminolevulinic acid may increase the photosensitizing activities of Verteporfin.
2,Titanium dioxide,Verteporfin,Titanium dioxide may increase the photosensitizing activities of Verteporfin.
3,Tiaprofenic acid,Verteporfin,Tiaprofenic acid may increase the photosensitizing activities of Verteporfin.
4,Cyamemazine,Verteporfin,Cyamemazine may increase the photosensitizing activities of Verteporfin.
5,Temoporfin,Verteporfin,Temoporfin may increase the photosensitizing activities of Verteporfin.
6,Methoxsalen,Verteporfin,Methoxsalen may increase the photosensitizing activities of Verteporfin.
7,Hexaminolevulinate,Verteporfin,Hexaminolevulinate may increase the photosensitizing activities of Verteporfin.
8,Benzophenone,Verteporfin,Benzophenone may increase the photosensitizing activities of Verteporfin.
9,Riboflavin,Verteporfin,Riboflavin may increase the photosensitizing activities of Verteporfin.


In [42]:
# Preview the last 10 rows of the loaded DataFrame

df_ddi.tail(10)

Unnamed: 0,Drug 1,Drug 2,Interaction Description
191531,Ulipristal,Dydrogesterone,The therapeutic efficacy of Dydrogesterone can be decreased when used in combination with Ulipristal.
191532,Ganciclovir,Entecavir,The risk or severity of adverse effects can be increased when Ganciclovir is combined with Entecavir.
191533,Valganciclovir,Entecavir,The risk or severity of adverse effects can be increased when Valganciclovir is combined with Entecavir.
191534,Clindamycin,Kaolin,Clindamycin can cause a decrease in the absorption of Kaolin resulting in a reduced serum concentration and potentia...
191535,Lincomycin,Kaolin,Lincomycin can cause a decrease in the absorption of Kaolin resulting in a reduced serum concentration and potential...
191536,Ascorbic acid,Deferoxamine,The risk or severity of adverse effects can be increased when Ascorbic acid is combined with Deferoxamine.
191537,Tenofovir disoproxil,Cidofovir,Tenofovir disoproxil may decrease the excretion rate of Cidofovir which could result in a higher serum level.
191538,Tenofovir disoproxil,Ganciclovir,The serum concentration of Ganciclovir can be increased when it is combined with Tenofovir disoproxil.
191539,Tenofovir disoproxil,Valganciclovir,The serum concentration of Valganciclovir can be increased when it is combined with Tenofovir disoproxil.
191540,L-Glutamine,Lactulose,The therapeutic efficacy of Lactulose can be decreased when used in combination with L-Glutamine.


In [43]:
# Print the DataFrame summary produced by DataFrame.info()

df_ddi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191541 entries, 0 to 191540
Data columns (total 3 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Drug 1                   191541 non-null  object
 1   Drug 2                   191541 non-null  object
 2   Interaction Description  191541 non-null  object
dtypes: object(3)
memory usage: 4.4+ MB


In [44]:
# Report the DataFrame's dimensions and its deep (object-inclusive) memory footprint

row_count, column_count = len(df_ddi), len(df_ddi.columns)
deep_memory_mb = df_ddi.memory_usage(deep=True).sum() / 1024**2

for report_line in (
    f"Shape: {df_ddi.shape}",
    f"Rows: {row_count:,}",
    f"Columns: {column_count}",
    f"Memory usage: {deep_memory_mb:.2f} MB",
):
    print(report_line)

Shape: (191541, 3)
Rows: 191,541
Columns: 3
Memory usage: 51.75 MB


In [45]:
# Write the DataFrame as a Snappy-compressed Parquet object to the destination bucket

parquet_filename = "db_drug_interactions.parquet"
parquet_uri = f"s3://{DEST_BUCKET}/{V1_RAW_DDI_PREFIX}{parquet_filename}"
logging.info(f"Writing Parquet: {parquet_uri}")

write_started = time.time()

parquet_write_options = {
    'engine': 'pyarrow',
    'filesystem': fs,
    'compression': 'snappy',
    'index': False,  # do not write the pandas index to the file
}
df_ddi.to_parquet(parquet_uri, **parquet_write_options)

write_seconds = time.time() - write_started
logging.info(f"Successfully wrote {len(df_ddi):,} rows in {write_seconds:.2f}s")

2025-11-28 07:22:51,783 INFO Writing Parquet: s3://med-data/v1_raw/ddi/db_drug_interactions.parquet
2025-11-28 07:22:51,854 INFO Successfully wrote 191,541 rows in 0.07s


In [46]:
# Verify the write by reading the Parquet object back and comparing it with df_ddi

logging.info("Verifying write by reading back from v1_raw...")

start_time = time.time()
df_verify = pd.read_parquet(parquet_uri, filesystem=fs)
elapsed = time.time() - start_time

logging.info(f"Verification: Read {len(df_verify):,} rows in {elapsed:.2f}s")

# Check row count matches
assert len(df_verify) == len(df_ddi), f"Row count mismatch! Original: {len(df_ddi)}, Read back: {len(df_verify)}"
logging.info("✓ Verification successful - row counts match")

# Check column count matches
assert len(df_verify.columns) == len(df_ddi.columns), (
    f"Column count mismatch! Original: {len(df_ddi.columns)}, Read back: {len(df_verify.columns)}"
)
logging.info("✓ Verification successful - column counts match")

# Matching counts cannot detect renamed or reordered columns, so compare the labels too
assert list(df_verify.columns) == list(df_ddi.columns), (
    f"Column mismatch! Original: {list(df_ddi.columns)}, Read back: {list(df_verify.columns)}"
)
logging.info("✓ Verification successful - column names match")

# Finally compare the data itself, so a write that altered cell values cannot pass unnoticed
assert df_verify.equals(df_ddi), "Content mismatch! Read-back data differs from the original DataFrame"
logging.info("✓ Verification successful - contents match")

2025-11-28 07:23:14,982 INFO Verifying write by reading back from v1_raw...
2025-11-28 07:23:15,046 INFO Verification: Read 191,541 rows in 0.06s
2025-11-28 07:23:15,047 INFO ✓ Verification successful - row counts match
2025-11-28 07:23:15,047 INFO ✓ Verification successful - column counts match


In [47]:
# Preview the first rows of the DataFrame that was read back from the Parquet file

df_verify.head()

Unnamed: 0,Drug 1,Drug 2,Interaction Description
0,Trioxsalen,Verteporfin,Trioxsalen may increase the photosensitizing activities of Verteporfin.
1,Aminolevulinic acid,Verteporfin,Aminolevulinic acid may increase the photosensitizing activities of Verteporfin.
2,Titanium dioxide,Verteporfin,Titanium dioxide may increase the photosensitizing activities of Verteporfin.
3,Tiaprofenic acid,Verteporfin,Tiaprofenic acid may increase the photosensitizing activities of Verteporfin.
4,Cyamemazine,Verteporfin,Cyamemazine may increase the photosensitizing activities of Verteporfin.


In [48]:
# Compare the stored object sizes of the source CSV and the Parquet copy

bytes_per_mb = 1024**2

# Object keys inside their respective buckets
csv_key = f"{SOURCE_DDI_PATH}{csv_filename}"
parquet_key = f"{V1_RAW_DDI_PREFIX}{parquet_filename}"

# head_object returns object metadata; ContentLength is the size in bytes
csv_response = s3.head_object(Bucket=SOURCE_BUCKET, Key=csv_key)
parquet_response = s3.head_object(Bucket=DEST_BUCKET, Key=parquet_key)
csv_size_mb = csv_response['ContentLength'] / bytes_per_mb
parquet_size_mb = parquet_response['ContentLength'] / bytes_per_mb

# Percentage saved relative to the CSV; an empty CSV would divide by zero, so report 0
if csv_size_mb > 0:
    compression_ratio = (1 - parquet_size_mb / csv_size_mb) * 100
else:
    compression_ratio = 0

size_rule = "=" * 60
for size_line in (
    size_rule,
    "FILE SIZE COMPARISON",
    size_rule,
    f"CSV size:        {csv_size_mb:.2f} MB",
    f"Parquet size:    {parquet_size_mb:.2f} MB",
    f"Compression:     {compression_ratio:.1f}% reduction",
    size_rule,
):
    print(size_line)

logging.info(f"CSV: {csv_size_mb:.2f} MB, Parquet: {parquet_size_mb:.2f} MB ({compression_ratio:.1f}% compression)")

2025-11-28 07:23:23,425 INFO CSV: 21.06 MB, Parquet: 2.71 MB (87.1% compression)


FILE SIZE COMPARISON
CSV size:        21.06 MB
Parquet size:    2.71 MB
Compression:     87.1% reduction


In [53]:
# Print a recap of the run: source, destination, dimensions and file sizes

summary_rule = "=" * 80
summary_lines = [
    "\n" + summary_rule,
    "DATA PREPARATION SUMMARY",
    summary_rule,
    f"Source:       s3://{SOURCE_BUCKET}/{SOURCE_DDI_PATH}{csv_filename}",
    f"Destination:  s3://{DEST_BUCKET}/{V1_RAW_DDI_PREFIX}{parquet_filename}",
    f"Rows:         {len(df_ddi):,}",
    f"Columns:      {len(df_ddi.columns)}",
    f"CSV size:     {csv_size_mb:.2f} MB",
    f"Parquet size: {parquet_size_mb:.2f} MB ({compression_ratio:.1f}% reduction)",
    "Status:       ✓ Complete",
    summary_rule,
    "\nNext: Run remaining 01x_dataprep notebooks",
    "Then: Run 02_explore.ipynb for exploratory data analysis",
]
for summary_line in summary_lines:
    print(summary_line)


DATA PREPARATION SUMMARY
Source:       s3://med-sandbox/kaggle-data/ddi/db_drug_interactions.csv
Destination:  s3://med-data/v1_raw/ddi/db_drug_interactions.parquet
Rows:         191,541
Columns:      3
CSV size:     21.06 MB
Parquet size: 2.71 MB (87.1% reduction)
Status:       ✓ Complete

Next: Run remaining 01x_dataprep notebooks
Then: Run 02_explore.ipynb for exploratory data analysis
