# Data Preparation - Patient Demographics
This notebook extracts patient demographic data from the SQL Server CDWWork database.

**Source**: SQL Server CDWWork database (`SPatient.SPatient` table)  
**Destination**: `med-data/v1_raw/demographics/patient_demographics.parquet`

**Demographics Extracted**:
- Age (calculated from DateOfBirth)
- Gender

**Note**: Only patients who appear in the medication data (RxOut or BCMA) are extracted.

In [1]:
# Import dependencies

import os
import sys
import logging
import time
from datetime import datetime, timedelta
import pyodbc
import boto3
import pandas as pd
import s3fs
import pyarrow as pa
import pyarrow.parquet as pq
from dotenv import load_dotenv
from importlib.metadata import version
from config import *

In [2]:
# Verify that dependencies are available for use

def print_version():
    """Print the installed version of every library this notebook depends on."""
    installed = (
        ("boto3:", boto3.__version__),
        ("pandas:", pd.__version__),
        ("s3fs:", s3fs.__version__),
        ("pyarrow:", pa.__version__),
        ("pyodbc:", pyodbc.version),
        # python-dotenv exposes no __version__, so ask the package metadata
        ("dotenv:", version("python-dotenv")),
    )
    for label, release in installed:
        print(label, release)


print_version()

boto3: 1.41.5
pandas: 2.3.3
s3fs: 2025.10.0
pyarrow: 22.0.0
pyodbc: 5.3.0
dotenv: 1.2.1


In [3]:
# Set up logging

# Reconfigure the root logger. force=True makes basicConfig remove (and close)
# any handlers already attached to the root logger before installing the new
# one, so re-running this cell never produces duplicated log lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    force=True,
)

# Emit one record to confirm the handler and format are active
logging.info("Logging configured successfully")

2025-11-28 17:37:58,126 INFO Logging configured successfully


In [4]:
# Load configuration from config module

# Echo the settings pulled in from the config module so the run is traceable
config_messages = (
    f"Configuration loaded: SQL Server={SQLSERVER_SERVER}/{SQLSERVER_DATABASE}",
    f"MinIO endpoint: {MINIO_ENDPOINT}",
    f"Destination: s3://{DEST_BUCKET}/v1_raw/demographics/",
)
for config_message in config_messages:
    logging.info(config_message)

2025-11-28 17:38:02,507 INFO Configuration loaded: SQL Server=127.0.0.1,1433/CDWWork
2025-11-28 17:38:02,507 INFO MinIO endpoint: localhost:9000
2025-11-28 17:38:02,508 INFO Destination: s3://med-data/v1_raw/demographics/


In [5]:
# Create SQL Server connection

def create_sqlserver_connection():
    """
    Open a pyodbc connection to the configured SQL Server database.

    The ODBC connection string is assembled from the SQLSERVER_* settings
    (driver, server, database, user, password, TrustServerCertificate).

    Returns:
        The connection object returned by ``pyodbc.connect``.
    """
    logging.info(f"Creating SQL Server connection to {SQLSERVER_SERVER}/{SQLSERVER_DATABASE}")

    # Insertion order is preserved, so the keywords are emitted in this order
    odbc_settings = {
        "DRIVER": f"{{{SQLSERVER_DRIVER}}}",  # driver name must be wrapped in braces
        "SERVER": SQLSERVER_SERVER,
        "DATABASE": SQLSERVER_DATABASE,
        "UID": SQLSERVER_USER,
        "PWD": SQLSERVER_PASSWORD,
        "TrustServerCertificate": SQLSERVER_TRUST_CERT,
    }
    odbc_string = "".join(f"{keyword}={value};" for keyword, value in odbc_settings.items())

    return pyodbc.connect(odbc_string)


# Open the connection that the extraction query will run on
conn = create_sqlserver_connection()
for connection_message in (
    "SQL Server connection created successfully",
    f"Connection type: {type(conn)}",
):
    logging.info(connection_message)

2025-11-28 17:38:04,978 INFO Creating SQL Server connection to 127.0.0.1,1433/CDWWork
2025-11-28 17:38:05,039 INFO SQL Server connection created successfully
2025-11-28 17:38:05,040 INFO Connection type: <class 'pyodbc.Connection'>


In [6]:
# Define SQL query for patient demographics

# The query returns PatientSID, PatientIEN, Sta3n, BirthDateTime (aliased to
# DateOfBirth) and Gender from SPatient.SPatient, restricted to PatientSIDs
# that occur in RxOut.RxOutpat or BCMA.BCMAMedicationLog, ordered by PatientSID.
# The text between the triple quotes is sent to SQL Server verbatim.
sql_query = """
-- Patient demographics for patients in medication data
SELECT DISTINCT
    p.PatientSID,
    p.PatientIEN,
    p.Sta3n,
    p.BirthDateTime AS DateOfBirth,
    p.Gender
FROM SPatient.SPatient p
WHERE p.PatientSID IN (
    -- Patients from RxOut
    SELECT DISTINCT PatientSID FROM RxOut.RxOutpat
    UNION
    -- Patients from BCMA
    SELECT DISTINCT PatientSID FROM BCMA.BCMAMedicationLog
)
ORDER BY p.PatientSID;
"""

# Only the definition is logged here; the query is executed in a later cell
logging.info("SQL query defined")
logging.info("Query: Extract demographics for patients with medication records")

2025-11-28 17:38:07,969 INFO SQL query defined
2025-11-28 17:38:07,969 INFO Query: Extract demographics for patients with medication records


In [7]:
# Execute query and load into DataFrame

logging.info("Executing SQL query...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()

# Run the query on the open connection and load the full result set
df_demographics = pd.read_sql(sql_query, conn)

elapsed = time.perf_counter() - start_time
logging.info(f"Successfully loaded {len(df_demographics):,} patient records in {elapsed:.2f}s")

2025-11-28 17:38:11,428 INFO Executing SQL query...
  df_demographics = pd.read_sql(sql_query, conn)
2025-11-28 17:38:11,455 INFO Successfully loaded 15 patient records in 0.03s


In [8]:
# Close SQL Server connection
# The query result was already loaded into df_demographics by pd.read_sql,
# so nothing below this cell uses `conn`.

conn.close()
logging.info("SQL Server connection closed")

2025-11-28 17:38:14,469 INFO SQL Server connection closed


In [None]:
# Preview the extracted demographics (at most the first 20 rows)

df_demographics.head(20)

Unnamed: 0,PatientSID,PatientIEN,Sta3n,DateOfBirth,Gender
0,1001,PtIEN1001,508,1980-01-02,M
1,1002,PtIEN1002,508,1975-01-02,F
2,1003,PtIEN1003,508,1990-01-02,F
3,1004,PtIEN1004,508,1954-01-02,F
4,1005,PtIEN1005,508,1981-05-15,M
5,1006,PtIEN1006,508,1981-05-15,F
6,1007,PtIEN1007,516,1940-01-02,M
7,1008,PtIEN1008,516,1961-01-02,M
8,1009,PtIEN1009,516,1960-01-02,F
9,1010,PtIEN1010,552,1965-07-15,M


In [None]:
# Print the DataFrame summary via DataFrame.info() (columns, non-null counts, dtypes)

df_demographics.info()

In [None]:
# Calculate age from DateOfBirth

logging.info("Calculating age from DateOfBirth...")

# Convert DateOfBirth to datetime if not already
df_demographics['DateOfBirth'] = pd.to_datetime(df_demographics['DateOfBirth'])

# Age in completed years as of today's date. This compares calendar fields
# instead of dividing elapsed days by 365.25: the division truncates to the
# wrong year on or near a birthday (365 days / 365.25 -> 0 on a first birthday).
today = pd.Timestamp.now().normalize()
birth_dates = df_demographics['DateOfBirth']
birthday_reached = (
    (birth_dates.dt.month < today.month)
    | ((birth_dates.dt.month == today.month) & (birth_dates.dt.day <= today.day))
)
completed_years = today.year - birth_dates.dt.year - (~birthday_reached).astype(int)

# Nullable Int64 keeps a missing DateOfBirth as <NA>; a plain int cast would
# raise as soon as a single birth date is missing.
df_demographics['Age'] = completed_years.astype('Int64')

logging.info("Age calculated successfully")

print("\nAge statistics:")
print(df_demographics['Age'].describe())

print("\nAge distribution:")
# Left-closed bins so the intervals agree with the labels:
# [0, 18) -> '<18', [18, 40) -> '18-39', ..., [80, 121) -> '80+'.
# With pandas' default right-closed bins an 18-year-old was labelled '<18' and
# age 0 fell outside every bin. The last edge is 121 so that age 120 is still
# covered; ages above 120 remain unbinned (NaN), as before.
age_bins = [0, 18, 40, 65, 80, 121]
age_labels = ['<18', '18-39', '40-64', '65-79', '80+']
df_demographics['AgeGroup'] = pd.cut(
    df_demographics['Age'],
    bins=age_bins,
    labels=age_labels,
    right=False,
)
print(df_demographics['AgeGroup'].value_counts().sort_index())

In [None]:
# Display DataFrame shape and memory usage

# Gather the size figures first, then report them
row_count = len(df_demographics)
column_count = len(df_demographics.columns)
# deep=True also measures the contents of object columns; bytes -> MiB
memory_mb = df_demographics.memory_usage(deep=True).sum() / 1024**2

print(f"Shape: {df_demographics.shape}")
print(f"Rows: {row_count:,}")
print(f"Columns: {column_count}")
print(f"Memory usage: {memory_mb:.2f} MB")

In [None]:
# Data quality checks

section_rule = "=" * 60
print(section_rule)
print("DATA QUALITY CHECKS")
print(section_rule)

# Null count for every column
null_counts = df_demographics.isnull().sum()
print("\nMissing values:")
print(null_counts)

# Number of rows whose PatientSID repeats an earlier row
duplicates = df_demographics['PatientSID'].duplicated().sum()
print(f"\nDuplicate PatientSIDs: {duplicates}")

# Row count per Gender value
gender_counts = df_demographics['Gender'].value_counts()
print("\nGender distribution:")
print(gender_counts)

# Report the age range and warn (without stopping) on implausible values
youngest_age = df_demographics['Age'].min()
oldest_age = df_demographics['Age'].max()
print(f"\nAge range: {youngest_age} to {oldest_age}")
if youngest_age < 0:
    print("⚠ WARNING: Negative ages found!")
if oldest_age > 120:
    print("⚠ WARNING: Ages > 120 found!")

print(section_rule)

In [None]:
# Create S3FileSystem for MinIO (pandas/pyarrow I/O)

# MinIO is addressed through an explicit endpoint URL
minio_url = f"http://{MINIO_ENDPOINT}"

logging.info(f"Initializing S3FileSystem for MinIO at {MINIO_ENDPOINT}")
fs = s3fs.S3FileSystem(
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    anon=False,  # authenticate with the key/secret above
    client_kwargs=dict(endpoint_url=minio_url),
)
logging.info("S3FileSystem created successfully")

In [None]:
# Write demographics DataFrame to v1_raw as Parquet

# Target object inside the destination bucket
parquet_filename = "patient_demographics.parquet"
parquet_uri = f"s3://{DEST_BUCKET}/v1_raw/demographics/{parquet_filename}"
logging.info(f"Writing Parquet: {parquet_uri}")

# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()

# Write through the s3fs filesystem; the DataFrame index is not stored
df_demographics.to_parquet(
    parquet_uri,
    engine='pyarrow',
    filesystem=fs,
    compression='snappy',
    index=False
)

elapsed = time.perf_counter() - start_time
logging.info(f"Successfully wrote {len(df_demographics):,} records in {elapsed:.2f}s")

In [None]:
# Verify write by reading back from v1_raw

logging.info("Verifying write by reading back from v1_raw...")

# perf_counter() is a monotonic clock intended for measuring durations
start_time = time.perf_counter()
df_verify = pd.read_parquet(parquet_uri, filesystem=fs)
elapsed = time.perf_counter() - start_time

logging.info(f"Verification: Read {len(df_verify):,} records in {elapsed:.2f}s")

# Check row count matches
assert len(df_verify) == len(df_demographics), (
    f"Row count mismatch! Original: {len(df_demographics)}, Read back: {len(df_verify)}"
)
logging.info("✓ Verification successful - row counts match")

# Check column count matches
assert len(df_verify.columns) == len(df_demographics.columns), (
    f"Column count mismatch! Original: {len(df_demographics.columns)}, "
    f"Read back: {len(df_verify.columns)}"
)
logging.info("✓ Verification successful - column counts match")

# Equal counts do not rule out renamed or reordered columns, so compare names too
assert list(df_verify.columns) == list(df_demographics.columns), (
    f"Column name mismatch! Original: {list(df_demographics.columns)}, "
    f"Read back: {list(df_verify.columns)}"
)
logging.info("✓ Verification successful - column names match")

In [None]:
# Show the first rows of df_verify, the frame read back from the Parquet file

df_verify.head()

In [None]:
# Summary

# Compute the reported figures once, then print the report
summary_rule = "=" * 80
age_column = df_demographics['Age']
gender_breakdown = df_demographics['Gender'].value_counts().to_dict()

print("\n" + summary_rule)
print("DATA PREPARATION SUMMARY - PATIENT DEMOGRAPHICS")
print(summary_rule)
print(f"Source:        SQL Server {SQLSERVER_SERVER}/{SQLSERVER_DATABASE}")
print("Schema:        SPatient.SPatient")
print(f"Destination:   s3://{DEST_BUCKET}/v1_raw/demographics/{parquet_filename}")
print(f"Patients:      {len(df_demographics):,}")
print(f"Columns:       {len(df_demographics.columns)}")
print(f"Age Range:     {age_column.min()} to {age_column.max()} years")
print(f"Mean Age:      {age_column.mean():.1f} years")
print(f"Gender:        {gender_breakdown}")
print("Status:        ✓ Complete")
print(summary_rule)

# Pointers to the notebooks that follow this one
print("\nNext steps:")
for next_step in (
    "  1. Run 02_explore.ipynb (optional: add demographic exploration)",
    "  2. Run 03_clean.ipynb (add demographics cleaning)",
    "  3. Run 04_features.ipynb (demographics will be joined into patient features)",
):
    print(next_step)