# Feature Engineering - DDI Risk Analysis
This notebook creates features for patient-level DDI risk analysis and clustering.

**Approach:** Dual feature sets  
- **Patient-level features**: For clustering and risk scoring  
- **DDI pair-level features**: For detailed interaction analysis

**Input**:  
- `med-data/v2_clean/ddi/db_drug_interactions_clean.parquet`  
- `med-data/v2_clean/medications/medications_clean.parquet`  
- `med-data/v2_clean/demographics/patient_demographics_clean.parquet`

**Output**:  
- `med-data/v3_features/patients_features.parquet` (one row per patient)  
- `med-data/v3_features/ddi_pairs_features.parquet` (one row per patient DDI pair)

**Future Enhancement**: After adding PhysioNet MIMIC-IV data, analyze care coordination  
risks between VA and non-VA settings (fragmented care DDI analysis).

In [None]:
# Import dependencies

import os
import sys
import logging
import time
import re
from datetime import datetime, timedelta
from itertools import combinations
import numpy as np
import pandas as pd
import s3fs
import pyarrow as pa
from scipy.stats import entropy
from importlib.metadata import version
from config import *

In [None]:
# Verify dependencies

def print_version():
    """Print the installed version of each library this notebook relies on."""
    installed = [
        ("pandas:", pd.__version__),
        ("numpy:", np.__version__),
        # scipy is only imported as `scipy.stats.entropy`, so its version is
        # looked up through package metadata instead of a module attribute.
        ("scipy:", version("scipy")),
        ("s3fs:", s3fs.__version__),
        ("pyarrow:", pa.__version__),
    ]
    for label, release in installed:
        print(label, release)

print_version()

In [None]:
# Set up logging

# basicConfig() is a no-op when the root logger already has handlers, so any
# handlers left over (e.g. from an earlier run of this cell) are removed first.
root_logger = logging.root
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)

logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

logging.info("Logging configured successfully")

In [None]:
# Load configuration

# Echo the storage settings (names come from the `config` star-import) so the
# run log records where data is read from and written to.
for config_message in (
    f"MinIO endpoint: {MINIO_ENDPOINT}",
    f"Source: {DEST_BUCKET}/v2_clean/",
    f"Destination: {DEST_BUCKET}/v3_features/",
):
    logging.info(config_message)

In [None]:
# Create S3FileSystem for MinIO

logging.info(f"Initializing S3FileSystem for MinIO at {MINIO_ENDPOINT}")

# Point the S3 client at the MinIO endpoint (plain http) instead of AWS.
minio_client_kwargs = {'endpoint_url': f"http://{MINIO_ENDPOINT}"}
fs = s3fs.S3FileSystem(
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    anon=False,
    client_kwargs=minio_client_kwargs,
)

logging.info("S3FileSystem created successfully")

---
## Part 1: Load Clean Data

In [None]:
# Load DDI clean dataset from v2_clean

ddi_uri = f"s3://{DEST_BUCKET}/{V2_CLEAN_DDI_PREFIX}db_drug_interactions_clean.parquet"
logging.info(f"Reading DDI clean data: {ddi_uri}")

# Time only the read itself so the logged duration reflects I/O cost.
start_time = time.time()
df_ddi = pd.read_parquet(ddi_uri, filesystem=fs)
elapsed = time.time() - start_time
logging.info(f"Loaded {len(df_ddi):,} DDI records in {elapsed:.2f}s")

print(
    f"DDI Data Shape: {df_ddi.shape}",
    f"Columns: {list(df_ddi.columns)}",
    sep="\n",
)
# Last expression of the cell: rendered as the cell output.
df_ddi.head(3)

In [None]:
# Load medications clean dataset from v2_clean

# --- Medications --------------------------------------------------------------
meds_uri = f"s3://{DEST_BUCKET}/{V2_CLEAN_MEDICATIONS_PREFIX}medications_clean.parquet"
logging.info(f"Reading medications clean data: {meds_uri}")

start_time = time.time()
df_meds = pd.read_parquet(meds_uri, filesystem=fs)
elapsed = time.time() - start_time

logging.info(f"Loaded {len(df_meds):,} medication records in {elapsed:.2f}s")

print(f"\nMedications Data Shape: {df_meds.shape}")
print(f"Columns: {list(df_meds.columns)}")
# Jupyter only renders a bare expression when it is the LAST statement of a
# cell. This preview sits mid-cell, so it must be printed explicitly or it is
# silently discarded.
print(df_meds.head(3))

# --- Demographics -------------------------------------------------------------
# Load demographics clean dataset from v2_clean

demo_uri = f"s3://{DEST_BUCKET}/v2_clean/demographics/patient_demographics_clean.parquet"
logging.info(f"Reading demographics clean data: {demo_uri}")

start_time = time.time()
df_demo = pd.read_parquet(demo_uri, filesystem=fs)
elapsed = time.time() - start_time

logging.info(f"Loaded {len(df_demo):,} patient demographics in {elapsed:.2f}s")

print(f"\nDemographics Data Shape: {df_demo.shape}")
print(f"Columns: {list(df_demo.columns)}")
# Last expression of the cell: rendered as the cell output.
df_demo.head(3)

---
## Part 2: Patient-Level Features

Create aggregated features for each patient to support clustering and risk scoring.

In [None]:
# Calculate medication profile features per patient

logging.info("Calculating patient medication profile features...")

# One row per patient: record counts, distinct drug count, first/last
# medication timestamps and the number of records from each source system.
meds_by_patient = df_meds.groupby('PatientSID')
patient_features = meds_by_patient.agg(
    medication_count=('DrugName_Normalized', 'count'),
    unique_medications=('DrugName_Normalized', 'nunique'),
    first_medication_date=('MedicationDateTime', 'min'),
    last_medication_date=('MedicationDateTime', 'max'),
    rxout_count=('SourceSystem', lambda source: source.eq('RxOut').sum()),
    bcma_count=('SourceSystem', lambda source: source.eq('BCMA').sum())
).reset_index()

# Whole days between the first and last medication record
medication_span = (
    patient_features['last_medication_date'] - patient_features['first_medication_date']
)
patient_features['medication_timespan_days'] = medication_span.dt.days

# Medication burden: records per day. The +1 keeps the denominator non-zero
# for patients whose records all fall on the same day.
days_observed = patient_features['medication_timespan_days'] + 1
patient_features['avg_medications_per_day'] = (
    patient_features['medication_count'] / days_observed
)

# Number of source systems (RxOut, BCMA) with at least one record:
# 1 = single source, 2 = both (0 only when neither label occurs).
has_rxout = (patient_features['rxout_count'] > 0).astype(int)
has_bcma = (patient_features['bcma_count'] > 0).astype(int)
patient_features['source_diversity'] = has_rxout + has_bcma

logging.info(f"Created medication profile features for {len(patient_features)} patients")

print("\nPatient Medication Profile Features:")
patient_features.head()

In [None]:
# Calculate medication diversity using Shannon entropy

logging.info("Calculating medication diversity scores...")

def calculate_medication_diversity(patient_meds):
    """Return the Shannon entropy (in bits) of one patient's drug distribution.

    Args:
        patient_meds: DataFrame of a single patient's medication records; only
            the 'DrugName_Normalized' column is read.

    Returns:
        0.0 when the patient has at most one distinct drug, otherwise the
        base-2 entropy of the per-drug record counts.
    """
    drug_frequencies = patient_meds['DrugName_Normalized'].value_counts()
    has_several_drugs = len(drug_frequencies) > 1
    # scipy normalises the raw counts to probabilities before computing entropy.
    return entropy(drug_frequencies, base=2) if has_several_drugs else 0.0

# One entropy score per patient, returned as a two-column frame
# (PatientSID, medication_diversity).
diversity_scores = (
    df_meds.groupby('PatientSID')
    .apply(calculate_medication_diversity)
    .reset_index(name='medication_diversity')
)

# Left join keeps every patient already present in patient_features
patient_features = pd.merge(
    patient_features, diversity_scores, how='left', on='PatientSID'
)

logging.info("Medication diversity scores added")

print("\nMedication diversity distribution:")
print(patient_features['medication_diversity'].describe())

In [None]:
# Identify all DDI pairs for each patient

logging.info("Identifying DDI pairs for each patient...")

def find_patient_ddi_pairs(patient_id, patient_meds_df, ddi_df):
    """
    Find all DDI pairs for a given patient.

    Args:
        patient_id: Value matched against the 'PatientSID' column.
        patient_meds_df: Medication records with 'PatientSID' and
            'DrugName_Normalized' columns.
        ddi_df: Interaction table with 'Drug1_Normalized', 'Drug2_Normalized',
            'Severity' and 'Interaction Description' columns.

    Returns:
        List of dicts (PatientSID, Drug1, Drug2, Severity, Interaction), one
        per matching DDI row. Drug1/Drug2 follow the order in which the
        patient's drugs were first seen, and a pair matches regardless of the
        column order used in the DDI table.
    """
    # Get patient's unique medications (missing names cannot match anything)
    is_patient = patient_meds_df['PatientSID'] == patient_id
    meds = patient_meds_df.loc[is_patient, 'DrugName_Normalized'].dropna().unique()
    if len(meds) < 2:
        return []

    # Narrow the DDI table ONCE to rows whose two drugs are both taken by this
    # patient, instead of scanning the full table for every drug pair.
    candidates = ddi_df[
        ddi_df['Drug1_Normalized'].isin(meds) & ddi_df['Drug2_Normalized'].isin(meds)
    ]
    if candidates.empty:
        return []

    # Index the candidate rows by unordered drug pair; row order is preserved
    # within each pair so results match a sequential scan of ddi_df.
    details_by_pair = {}
    for ddi_drug1, ddi_drug2, severity, description in zip(
        candidates['Drug1_Normalized'],
        candidates['Drug2_Normalized'],
        candidates['Severity'],
        candidates['Interaction Description'],
    ):
        pair_key = frozenset((ddi_drug1, ddi_drug2))
        details_by_pair.setdefault(pair_key, []).append((severity, description))

    interactions = []

    # Check all pairs of patient's medications
    for drug1, drug2 in combinations(meds, 2):
        for severity, description in details_by_pair.get(frozenset((drug1, drug2)), ()):
            interactions.append({
                'PatientSID': patient_id,
                'Drug1': drug1,
                'Drug2': drug2,
                'Severity': severity,
                'Interaction': description
            })

    return interactions

# Gather the interaction records of every patient into one flat list
all_patient_ddis = [
    interaction
    for patient_id in df_meds['PatientSID'].unique()
    for interaction in find_patient_ddi_pairs(patient_id, df_meds, df_ddi)
]

# One row per (patient, matching DDI row); no rows and no columns when the
# list is empty.
df_patient_ddis = pd.DataFrame(all_patient_ddis)

logging.info(f"Found {len(df_patient_ddis)} total DDI pairs across all patients")

if len(df_patient_ddis) == 0:
    print("\n⚠ No DDI pairs found in current patient data")
else:
    print(f"\nDDI pairs found: {len(df_patient_ddis)}")
    print(f"Patients with DDIs: {df_patient_ddis['PatientSID'].nunique()}")
    print("\nSeverity distribution:")
    print(df_patient_ddis['Severity'].value_counts())
    print("\nSample DDI pairs:")
    print(df_patient_ddis.head())

In [None]:
# Calculate DDI risk features per patient

logging.info("Calculating patient DDI risk features...")

if len(df_patient_ddis) > 0:
    # Count DDI pairs by severity for each patient: one row per patient and
    # one 'ddi_severity_<label>' column per severity label present in the data.
    ddi_counts = df_patient_ddis.groupby(['PatientSID', 'Severity']).size().unstack(fill_value=0)
    ddi_counts = ddi_counts.add_prefix('ddi_severity_').reset_index()

    # Total DDI pair count
    ddi_total = df_patient_ddis.groupby('PatientSID').size().reset_index(name='ddi_pair_count')

    # Merge DDI features
    ddi_features = ddi_total.merge(ddi_counts, on='PatientSID', how='left')

    # Calculate weighted DDI risk score (High=3, Moderate=2, Low=1).
    # Severity labels outside this mapping contribute nothing to the score.
    severity_weights = {'High': 3, 'Moderate': 2, 'Low': 1}
    ddi_features['total_ddi_risk_score'] = 0
    for severity, weight in severity_weights.items():
        col_name = f'ddi_severity_{severity}'
        # A severity column only exists if that label occurred in the data
        if col_name in ddi_features.columns:
            ddi_features['total_ddi_risk_score'] += ddi_features[col_name] * weight

    def get_max_severity(row):
        """Return the highest severity level in a row (3=High, 2=Moderate, 1=Low, 0=None)."""
        for level, severity in ((3, 'High'), (2, 'Moderate'), (1, 'Low')):
            # .get() tolerates severity columns that are absent from the data
            if row.get(f'ddi_severity_{severity}', 0) > 0:
                return level
        return 0

    ddi_features['max_severity_level'] = ddi_features.apply(get_max_severity, axis=1)

    # Merge with patient features; patients without any DDI get NaN here
    patient_features = patient_features.merge(ddi_features, on='PatientSID', how='left')

    logging.info(f"Added DDI risk features for {len(ddi_features)} patients with DDIs")
else:
    # Add empty DDI columns if no DDIs found
    patient_features['ddi_pair_count'] = 0
    patient_features['total_ddi_risk_score'] = 0
    patient_features['max_severity_level'] = 0
    logging.warning("No DDI pairs found - added zero-value DDI risk features")

# Fill NaN values with 0 for patients without DDIs.
# 'max_severity_level' has no 'ddi' in its name, so it must be listed
# explicitly; otherwise it stays NaN after the left merge instead of the
# documented 0 (= no interaction).
ddi_cols = [col for col in patient_features.columns if 'ddi' in col.lower()]
risk_cols = ddi_cols + ['max_severity_level']
patient_features[risk_cols] = patient_features[risk_cols].fillna(0)

print("\nPatient DDI Risk Features:")
print(patient_features[['PatientSID', 'unique_medications', 'ddi_pair_count', 
                         'total_ddi_risk_score', 'max_severity_level']].head())

In [None]:
# Calculate DDI density (proportion of possible pairs that are DDIs)

logging.info("Calculating DDI density scores...")

def calculate_ddi_density(row):
    """Return the patient's DDI pair count divided by the number of possible drug pairs.

    Args:
        row: Mapping/Series with 'unique_medications' and 'ddi_pair_count'.

    Returns:
        0.0 when fewer than two medications are present (no pair can exist) or
        when the pair total is not positive; otherwise
        ddi_pair_count / C(unique_medications, 2).
    """
    med_total = row['unique_medications']
    if med_total < 2:
        return 0.0

    # n choose 2 = n*(n-1)/2
    possible_pairs = med_total * (med_total - 1) / 2
    observed_pairs = row['ddi_pair_count']

    if possible_pairs > 0:
        return observed_pairs / possible_pairs
    return 0.0

# Row-wise because the density needs two columns of the same patient
ddi_density_scores = patient_features.apply(calculate_ddi_density, axis=1)
patient_features['ddi_density'] = ddi_density_scores

logging.info("DDI density scores calculated")

print("\nDDI density distribution:", patient_features['ddi_density'].describe(), sep="\n")

In [None]:
# Add polypharmacy indicator (commonly defined as 5+ medications)

logging.info("Adding polypharmacy indicators...")

# Polypharmacy: at least 5 distinct medications
patient_features['is_polypharmacy'] = (
    patient_features['unique_medications'].ge(5).astype(int)
)
# High DDI risk: worst interaction is Moderate (2) or High (3)
patient_features['is_high_ddi_risk'] = (
    patient_features['max_severity_level'].ge(2).astype(int)
)

polypharmacy_count = patient_features['is_polypharmacy'].sum()
high_risk_count = patient_features['is_high_ddi_risk'].sum()

for indicator_message in (
    f"Polypharmacy patients: {polypharmacy_count}",
    f"High DDI risk patients: {high_risk_count}",
):
    logging.info(indicator_message)

print(
    f"\nPolypharmacy patients (5+ medications): {polypharmacy_count} / {len(patient_features)}",
    f"High DDI risk patients (moderate/high severity): {high_risk_count} / {len(patient_features)}",
    sep="\n",
)

In [None]:
# Merge demographics into patient features

logging.info("Merging demographics into patient features...")

# Only these demographic attributes are carried into the feature table
demo_cols = ['PatientSID', 'Age', 'AgeGroup', 'IsElderly', 'Gender']
df_demo_subset = df_demo.loc[:, demo_cols]

# Left join: patients without a demographics row are kept with NaN values
patient_features = pd.merge(
    patient_features, df_demo_subset, how='left', on='PatientSID'
)

# A missing Age is used as the marker for "no demographics row matched"
missing_demo = patient_features['Age'].isna().sum()
if missing_demo == 0:
    logging.info("All patients have demographics data")
else:
    logging.warning(f"{missing_demo} patients without demographics data")

logging.info("Demographics merged successfully")

print(f"\nPatient features now include demographics:")
print(f"Shape: {patient_features.shape}")
print("\nDemographics columns added: Age, AgeGroup, IsElderly, Gender")
print("\nSample data:")
sample_cols = ['PatientSID', 'Age', 'Gender', 'IsElderly', 'unique_medications', 'ddi_pair_count']
print(patient_features[sample_cols].head())

### Join Demographics into Patient Features

In [None]:
# Patient-level features summary

print("\n" + "="*80, "PATIENT-LEVEL FEATURES SUMMARY", "="*80, sep="\n")

print(f"\nTotal patients: {len(patient_features)}")
print(f"Total features: {len(patient_features.columns)}")

# Static description of the feature groups built in the cells above
print("\nFeature groups:")
for feature_group_line in (
    "  - Medication profile: medication_count, unique_medications, medication_diversity",
    "  - Temporal: first/last_medication_date, medication_timespan_days, avg_medications_per_day",
    "  - Source system: rxout_count, bcma_count, source_diversity",
    "  - DDI risk: ddi_pair_count, severity counts, total_ddi_risk_score, max_severity_level",
    "  - DDI metrics: ddi_density",
    "  - Indicators: is_polypharmacy, is_high_ddi_risk",
):
    print(feature_group_line)

print("\nFeature statistics:")
summary_cols = ['unique_medications', 'ddi_pair_count', 'total_ddi_risk_score',
                'ddi_density', 'medication_diversity']
print(patient_features[summary_cols].describe())

print("="*80)

# Last expression of the cell: rendered as the cell output.
patient_features.head()

---
## Part 3: DDI Pair-Level Features

Create detailed features for each patient-specific DDI pair.

In [None]:
# Create DDI pair-level feature dataset

logging.info("Creating DDI pair-level features...")

if len(df_patient_ddis) == 0:
    # Keep the name defined so later cells can test len(df_ddi_pairs)
    df_ddi_pairs = pd.DataFrame()
    logging.warning("No DDI pairs found - creating empty DDI pair features dataset")
    print("\n⚠ No DDI pairs to create features for")
else:
    # Patient-level context attached to every one of the patient's DDI pairs
    patient_context = patient_features[['PatientSID', 'unique_medications', 'medication_count',
                                         'total_ddi_risk_score', 'is_polypharmacy']]

    # Copy the raw pairs, join the context, then prefix the context columns
    # with 'patient_' so they are not mistaken for pair-level values.
    df_ddi_pairs = (
        df_patient_ddis.copy()
        .merge(patient_context, on='PatientSID', how='left')
        .rename(columns={
            'unique_medications': 'patient_medication_count',
            'medication_count': 'patient_total_records',
            'total_ddi_risk_score': 'patient_total_risk_score',
            'is_polypharmacy': 'patient_is_polypharmacy'
        })
    )

    logging.info(f"Created DDI pair features for {len(df_ddi_pairs)} interactions")

    print(f"\nDDI Pair Features Shape: {df_ddi_pairs.shape}")
    print("\nSample DDI pair features:")
    print(df_ddi_pairs.head())

In [None]:
# Calculate temporal overlap for DDI pairs

if len(df_ddi_pairs) > 0:
    logging.info("Calculating temporal overlap for DDI pairs...")

    def calculate_temporal_overlap(row, meds_df):
        """Compare the record date ranges of a DDI pair's two drugs for one patient.

        Args:
            row: DDI pair row with 'PatientSID', 'Drug1' and 'Drug2'.
            meds_df: Medication records with 'PatientSID',
                'DrugName_Normalized' and 'MedicationDateTime'.

        Returns:
            Series with 'temporal_overlap' (1 when the first-to-last record
            ranges of the two drugs intersect, else 0),
            'first_occurrence_date' (the later of the two first dates) and
            each drug's first date. Dates are None when either drug has no
            records for the patient.
        """
        same_patient = meds_df['PatientSID'] == row['PatientSID']
        drug_names = meds_df['DrugName_Normalized']
        drug1_records = meds_df[same_patient & (drug_names == row['Drug1'])]
        drug2_records = meds_df[same_patient & (drug_names == row['Drug2'])]

        if len(drug1_records) == 0 or len(drug2_records) == 0:
            return pd.Series({
                'temporal_overlap': 0,
                'first_occurrence_date': None,
                'drug1_first_date': None,
                'drug2_first_date': None
            })

        # First and last record timestamp of each drug
        drug1_dates = drug1_records['MedicationDateTime']
        drug2_dates = drug2_records['MedicationDateTime']
        drug1_first, drug1_last = drug1_dates.min(), drug1_dates.max()
        drug2_first, drug2_last = drug2_dates.min(), drug2_dates.max()

        # The ranges intersect when the later start is not after the earlier end
        overlap_start = max(drug1_first, drug2_first)
        overlap_end = min(drug1_last, drug2_last)
        has_overlap = int(overlap_start <= overlap_end)

        # The later of the two first dates is reported whether or not the
        # ranges overlap.
        return pd.Series({
            'temporal_overlap': has_overlap,
            'first_occurrence_date': overlap_start,
            'drug1_first_date': drug1_first,
            'drug2_first_date': drug2_first
        })

    # One temporal feature row per DDI pair, appended column-wise
    temporal_features = df_ddi_pairs.apply(
        calculate_temporal_overlap, axis=1, args=(df_meds,)
    )
    df_ddi_pairs = pd.concat([df_ddi_pairs, temporal_features], axis=1)

    # Absolute gap, in whole days, between the two drugs' first records
    start_gap = df_ddi_pairs['drug2_first_date'] - df_ddi_pairs['drug1_first_date']
    df_ddi_pairs['days_between_drug_starts'] = start_gap.dt.days.abs()

    overlap_count = df_ddi_pairs['temporal_overlap'].sum()
    logging.info(f"DDI pairs with temporal overlap: {overlap_count} / {len(df_ddi_pairs)}")

    print(f"\nTemporal overlap: {overlap_count} / {len(df_ddi_pairs)} DDI pairs have concurrent use")
    print("\nDays between drug starts:")
    print(df_ddi_pairs['days_between_drug_starts'].describe())

In [None]:
# Extract interaction type from description

if len(df_ddi_pairs) > 0:
    logging.info("Extracting interaction types...")

    def extract_interaction_type(description):
        """Map an interaction description to a coarse category by keyword.

        Returns 'Unknown' for a missing description, the label of the first
        rule with a keyword found in the lower-cased text, or 'Other' when no
        rule matches.
        """
        if pd.isna(description):
            return 'Unknown'

        text = description.lower()

        # Ordered rules: the first matching entry wins, so order matters
        keyword_rules = (
            (('bleeding', 'anticoagulant'), 'Bleeding Risk'),
            (('hyperkalemia', 'potassium'), 'Hyperkalemia'),
            (('serotonin',), 'Serotonin Syndrome'),
            (('nephrotoxic', 'kidney'), 'Nephrotoxicity'),
            (('hepatotoxic', 'liver'), 'Hepatotoxicity'),
            (('qtc', 'qt prolong'), 'QT Prolongation'),
            (('serum concentration',), 'Altered Drug Levels'),
            (('adverse effect',), 'Additive Adverse Effects'),
        )
        for keywords, label in keyword_rules:
            if any(keyword in text for keyword in keywords):
                return label
        return 'Other'

    interaction_types = df_ddi_pairs['Interaction'].apply(extract_interaction_type)
    df_ddi_pairs['interaction_type'] = interaction_types

    logging.info("Interaction types extracted")

    print("\nInteraction type distribution:")
    print(df_ddi_pairs['interaction_type'].value_counts())

In [None]:
# DDI pair-level features summary

print("\n" + "="*80, "DDI PAIR-LEVEL FEATURES SUMMARY", "="*80, sep="\n")

if len(df_ddi_pairs) == 0:
    print("\n⚠ No DDI pairs in dataset")
else:
    print(f"\nTotal DDI pairs: {len(df_ddi_pairs)}")
    print(f"Total features: {len(df_ddi_pairs.columns)}")
    print(f"Patients with DDIs: {df_ddi_pairs['PatientSID'].nunique()}")

    # Static description of the pair-level feature groups
    print("\nFeature groups:")
    for feature_group_line in (
        "  - Identification: PatientSID, Drug1, Drug2",
        "  - Interaction: Severity, interaction_type, Interaction (description)",
        "  - Temporal: temporal_overlap, first_occurrence_date, days_between_drug_starts",
        "  - Patient context: patient_medication_count, patient_total_risk_score, patient_is_polypharmacy",
    ):
        print(feature_group_line)

    print("\nSeverity distribution:")
    print(df_ddi_pairs['Severity'].value_counts())

    print("\nColumns:")
    print(list(df_ddi_pairs.columns))

print("="*80)

---
## Part 4: Feature Validation

In [None]:
# Validate patient-level features

print("="*80, "PATIENT-LEVEL FEATURES VALIDATION", "="*80, sep="\n")

# Each check prints only the offending columns, or "None" when all are clean.

# Missing values per column
print("\nMissing values:")
missing = patient_features.isna().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("None")

# +/- infinity in numeric columns
numeric_cols = patient_features.select_dtypes(include=[np.number]).columns
inf_check = patient_features[numeric_cols].isin([np.inf, -np.inf]).sum()
print("\nInfinite values:")
if inf_check.sum() > 0:
    print(inf_check[inf_check > 0])
else:
    print("None")

# Count-like columns (name contains 'count') should never be negative
count_cols = [col for col in patient_features.columns if 'count' in col.lower()]
negative_counts = (patient_features[count_cols] < 0).sum()
print("\nNegative count values:")
if negative_counts.sum() > 0:
    print(negative_counts[negative_counts > 0])
else:
    print("None")

# Distribution summary
print("\nKey feature distributions:")
key_cols = ['unique_medications', 'ddi_pair_count', 'total_ddi_risk_score']
print(patient_features[key_cols].describe())

print("="*80)

In [None]:
# Validate DDI pair-level features

if len(df_ddi_pairs) > 0:
    print("="*80, "DDI PAIR-LEVEL FEATURES VALIDATION", "="*80, sep="\n")

    # Missing values per column; only offending columns are listed
    print("\nMissing values:")
    missing = df_ddi_pairs.isna().sum()
    if missing.sum() > 0:
        print(missing[missing > 0])
    else:
        print("None")

    # Severity must be one of the labels accepted below
    valid_severities = ['High', 'Moderate', 'Low', 'Unknown']
    invalid_severity = ~df_ddi_pairs['Severity'].isin(valid_severities)
    print(f"\nInvalid severity values: {invalid_severity.sum()}")

    # Number of pairs flagged as overlapping in time
    print(f"\nTemporal overlap: {df_ddi_pairs['temporal_overlap'].sum()} / {len(df_ddi_pairs)} pairs")

    print("="*80)

In [None]:
# Check correlations between key patient features

print("\nCorrelation analysis (patient-level features):", "="*80, sep="\n")

# Pairwise correlation (DataFrame.corr default method) of the key features
corr_features = [
    'unique_medications',
    'ddi_pair_count',
    'total_ddi_risk_score',
    'ddi_density',
    'medication_diversity',
]
corr_matrix = patient_features.loc[:, corr_features].corr()

print(corr_matrix)
print("\nNote: High correlation (>0.8) between features may indicate redundancy")
print("="*80)

---
## Part 5: Write Features to v3_features

In [None]:
# Write patient-level features to v3_features

# The filename and URI are reused by the verification and summary cells below
patient_features_filename = "patients_features.parquet"
patient_features_uri = f"s3://{DEST_BUCKET}/{V3_FEATURES_DDI_PREFIX}{patient_features_filename}"
logging.info(f"Writing patient features: {patient_features_uri}")

# Time only the write so the logged duration reflects I/O cost
start_time = time.time()
patient_features.to_parquet(
    patient_features_uri,
    index=False,
    compression='snappy',
    engine='pyarrow',
    filesystem=fs,
)
elapsed = time.time() - start_time

logging.info(f"Successfully wrote {len(patient_features):,} patient records in {elapsed:.2f}s")

print(f"✓ Patient features written to: {patient_features_uri}")

In [None]:
# Write DDI pair-level features to v3_features

if len(df_ddi_pairs) == 0:
    # Nothing is written, so ddi_pairs_filename / ddi_pairs_uri stay undefined
    logging.warning("No DDI pairs to write")
    print("⚠ No DDI pair features to write (no interactions found)")
else:
    ddi_pairs_filename = "ddi_pairs_features.parquet"
    ddi_pairs_uri = f"s3://{DEST_BUCKET}/{V3_FEATURES_DDI_PREFIX}{ddi_pairs_filename}"
    logging.info(f"Writing DDI pair features: {ddi_pairs_uri}")

    # Time only the write so the logged duration reflects I/O cost
    start_time = time.time()
    df_ddi_pairs.to_parquet(
        ddi_pairs_uri,
        index=False,
        compression='snappy',
        engine='pyarrow',
        filesystem=fs,
    )
    elapsed = time.time() - start_time

    logging.info(f"Successfully wrote {len(df_ddi_pairs):,} DDI pair records in {elapsed:.2f}s")

    print(f"✓ DDI pair features written to: {ddi_pairs_uri}")

---
## Part 6: Verification and Summary

In [None]:
# Verify patient features by reading back

logging.info("Verifying patient features...")

# Read the file back from object storage and time the round trip
start_time = time.time()
df_patient_verify = pd.read_parquet(patient_features_uri, filesystem=fs)
elapsed = time.time() - start_time

# Structural check only: row and column counts must match what was written
# (cell values are not compared).
written_rows, written_cols = len(patient_features), len(patient_features.columns)
assert len(df_patient_verify) == written_rows, "Row count mismatch!"
assert len(df_patient_verify.columns) == written_cols, "Column count mismatch!"

logging.info(f"✓ Patient features verification successful: {len(df_patient_verify):,} rows in {elapsed:.2f}s")

print("\nPatient Features (first 3 rows):")
print(df_patient_verify.head(3))

In [None]:
# Final feature engineering summary

print("\n" + "="*80, "FEATURE ENGINEERING SUMMARY", "="*80, sep="\n")

print("\nPATIENT-LEVEL FEATURES:")
print(f"  Output: s3://{DEST_BUCKET}/{V3_FEATURES_DDI_PREFIX}{patient_features_filename}")
print(f"  Patients: {len(patient_features):,}")
print(f"  Features: {len(patient_features.columns)}")
print(f"  Polypharmacy patients: {patient_features['is_polypharmacy'].sum()}")
print(f"  High DDI risk patients: {patient_features['is_high_ddi_risk'].sum()}")
print(f"  Status: ✓ Complete")

# Pair-level output exists only when at least one DDI pair was found;
# ddi_pairs_filename is defined by the write cell under the same condition.
print("\nDDI PAIR-LEVEL FEATURES:")
if len(df_ddi_pairs) == 0:
    print(f"  Status: ⚠ No DDI pairs found (skipped)")
else:
    print(f"  Output: s3://{DEST_BUCKET}/{V3_FEATURES_DDI_PREFIX}{ddi_pairs_filename}")
    print(f"  DDI pairs: {len(df_ddi_pairs):,}")
    print(f"  Features: {len(df_ddi_pairs.columns)}")
    print(f"  Patients affected: {df_ddi_pairs['PatientSID'].nunique()}")
    print(f"  Temporal overlap: {df_ddi_pairs['temporal_overlap'].sum()} pairs")
    print(f"  Status: ✓ Complete")

print("\nFEATURE CATEGORIES:")
for category_line in (
    "  ✓ Medication profile (count, diversity, burden)",
    "  ✓ Temporal patterns (timespan, frequency)",
    "  ✓ Source system (RxOut, BCMA, diversity)",
    "  ✓ DDI risk (pair count, severity, risk score, density)",
    "  ✓ Clinical indicators (polypharmacy, high risk flags)",
):
    print(category_line)
if len(df_ddi_pairs) > 0:
    print("  ✓ Interaction details (type, temporal overlap)")

print("\nUSE CASES SUPPORTED:")
for use_case_line in (
    "  → Patient risk stratification and clustering (05_clustering.ipynb)",
    "  → DDI risk scoring and analysis (06_analysis.ipynb)",
    "  → Future: Predictive modeling, clinical decision support",
    "  → Future: Care coordination analysis (after PhysioNet integration)",
):
    print(use_case_line)

print("\nNEXT STEPS:")
print("  → Run 05_clustering.ipynb to identify patient risk groups")

print("="*80)