### Block 1: Environment Setup

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

print("Environment setup complete!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

Environment setup complete!
Pandas version: 2.2.3
NumPy version: 2.2.2


### Block 2: Load Target Dataset

In [26]:
# Load target dataset from Phase C
import os

features_path = "data/features/"
processed_path = "data/processed/"

print("Loading target dataset from Phase C...")

# Load main target dataset
df_target = pd.read_parquet(f"{features_path}readmission_target_dataset.parquet")

# Load clean inpatient data for historical features
df_inpatient = pd.read_parquet(f"{processed_path}inpatient_master_clean.parquet")

print("✓ Datasets loaded successfully!")
print(f"├── Target dataset: {len(df_target):,} records")
print(f"├── Features available: {len(df_target.columns)} columns")
print(f"├── Inpatient claims: {len(df_inpatient):,} records")
print(f"└── Target variable: READMISSION_30DAY ({df_target['READMISSION_30DAY'].mean()*100:.2f}% positive)")

Loading target dataset from Phase C...
✓ Datasets loaded successfully!
├── Target dataset: 66,773 records
├── Features available: 46 columns
├── Inpatient claims: 66,773 records
└── Target variable: READMISSION_30DAY (10.13% positive)


### Block 3: Baseline Feature Assessment

In [27]:
def assess_baseline_features(df):
    """
    Assess the quality and completeness of baseline features
    """
    print("BASELINE FEATURE ASSESSMENT")
    print("="*40)
    
    # Categorize existing features
    demographic_features = [col for col in df.columns if any(x in col for x in 
                           ['AGE', 'SEX', 'RACE', 'STATE', 'BENE_'])]
    
    clinical_features = [col for col in df.columns if any(x in col for x in 
                        ['LOS', 'DRG', 'ICD9', 'CLM_'])]
    
    chronic_conditions = [col for col in df.columns if col.startswith('SP_') and col != 'SP_STATE_CODE']
    
    temporal_features = [col for col in df.columns if any(x in col for x in 
                        ['ADMISSION', 'DISCHARGE', 'SEASON', 'WEEKEND'])]
    
    print(f"FEATURE CATEGORIES:")
    print(f"├── Demographic features: {len(demographic_features)}")
    print(f"├── Clinical features: {len(clinical_features)}")
    print(f"├── Chronic conditions: {len(chronic_conditions)}")
    print(f"├── Temporal features: {len(temporal_features)}")
    print(f"└── Total features: {len(df.columns)}")
    
    # Check data quality
    print(f"\nDATA QUALITY ASSESSMENT:")
    missing_counts = df.isnull().sum()
    high_missing = missing_counts[missing_counts > len(df) * 0.1]  # >10% missing
    
    print(f"├── Complete records: {(missing_counts == 0).sum()}")
    print(f"├── Features with >10% missing: {len(high_missing)}")
    
    if len(high_missing) > 0:
        print(f"  High missing features:")
        for feature, count in high_missing.items():
            pct = count / len(df) * 100
            print(f"    - {feature}: {pct:.1f}% missing")
    
    # Key feature completeness
    key_features = ['AGE_AT_ADMISSION', 'BENE_SEX_IDENT_CD', 'LOS_CALCULATED', 
                   'CHRONIC_CONDITION_COUNT']
    
    print(f"\nKEY FEATURE COMPLETENESS:")
    for feature in key_features:
        if feature in df.columns:
            completeness = (1 - df[feature].isnull().mean()) * 100
            print(f"├── {feature}: {completeness:.1f}% complete")
    
    return {
        'demographic': demographic_features,
        'clinical': clinical_features, 
        'chronic': chronic_conditions,
        'temporal': temporal_features
    }

# Assess baseline features
baseline_features = assess_baseline_features(df_target)

BASELINE FEATURE ASSESSMENT
FEATURE CATEGORIES:
├── Demographic features: 9
├── Clinical features: 6
├── Chronic conditions: 11
├── Temporal features: 14
└── Total features: 46

DATA QUALITY ASSESSMENT:
├── Complete records: 23
├── Features with >10% missing: 2
  High missing features:
    - DAYS_TO_READMISSION: 89.9% missing
    - READMISSION_DATE: 89.9% missing

KEY FEATURE COMPLETENESS:
├── AGE_AT_ADMISSION: 99.7% complete
├── BENE_SEX_IDENT_CD: 99.7% complete
├── LOS_CALCULATED: 100.0% complete
├── CHRONIC_CONDITION_COUNT: 100.0% complete


### checking block

In [28]:
# DIAGNOSTIC CODE - Run this first to understand the ESRD column
print("DIAGNOSING BENE_ESRD_IND COLUMN")
print("="*35)

if 'BENE_ESRD_IND' in df_target.columns:
    print(f"Column data type: {df_target['BENE_ESRD_IND'].dtype}")
    print(f"Unique values: {df_target['BENE_ESRD_IND'].unique()}")
    print(f"Value counts:")
    print(df_target['BENE_ESRD_IND'].value_counts(dropna=False))
    print(f"Null count: {df_target['BENE_ESRD_IND'].isnull().sum()}")
    print(f"Non-null count: {df_target['BENE_ESRD_IND'].notna().sum()}")
    
    # Check for None values specifically
    none_count = (df_target['BENE_ESRD_IND'] == None).sum()
    print(f"None values: {none_count}")
else:
    print("BENE_ESRD_IND column not found!")

DIAGNOSING BENE_ESRD_IND COLUMN
Column data type: object
Unique values: [False True None]
Value counts:
BENE_ESRD_IND
False    49707
True     16840
None       226
Name: count, dtype: int64
Null count: 226
Non-null count: 66547
None values: 0


### checking Age coloum

In [31]:
# DIAGNOSE AGE COLUMN
print("DIAGNOSING AGE_AT_ADMISSION COLUMN")
print("="*40)

if 'AGE_AT_ADMISSION' in df_target.columns:
    print(f"Column exists: ✓")
    print(f"Data type: {df_target['AGE_AT_ADMISSION'].dtype}")
    print(f"Non-null count: {df_target['AGE_AT_ADMISSION'].notna().sum():,}")
    print(f"Null count: {df_target['AGE_AT_ADMISSION'].isnull().sum():,}")
    
    if df_target['AGE_AT_ADMISSION'].notna().sum() > 0:
        print(f"Min age: {df_target['AGE_AT_ADMISSION'].min()}")
        print(f"Max age: {df_target['AGE_AT_ADMISSION'].max()}")
        print(f"Mean age: {df_target['AGE_AT_ADMISSION'].mean():.1f}")
        print(f"Sample values: {df_target['AGE_AT_ADMISSION'].dropna().head().tolist()}")
    else:
        print("All age values are null!")
        
    # Check if ages are in reasonable range
    if df_target['AGE_AT_ADMISSION'].notna().sum() > 0:
        reasonable_ages = df_target['AGE_AT_ADMISSION'].between(0, 120)
        print(f"Ages in reasonable range (0-120): {reasonable_ages.sum():,}")
else:
    print("AGE_AT_ADMISSION column not found!")
    print("Available columns with 'AGE' in name:")
    age_cols = [col for col in df_target.columns if 'AGE' in col.upper()]
    print(age_cols)

DIAGNOSING AGE_AT_ADMISSION COLUMN
Column exists: ✓
Data type: float64
Non-null count: 66,547
Null count: 226
Min age: 24.58590006844627
Max age: 101.71115674195757
Mean age: 73.8
Sample values: [86.86379192334017, 66.27789185489391, 66.66392881587953, 66.71047227926078, 67.48254620123203]
Ages in reasonable range (0-120): 66,547


### Block 4: Enhanced Demographic Features

In [32]:
def create_demographic_features(df):
    """
    Create enhanced demographic features for modeling
    """
    print("CREATING ENHANCED DEMOGRAPHIC FEATURES")
    print("="*45)
    
    df_enhanced = df.copy()
    
    # 1. Age group categorization
    if 'AGE_AT_ADMISSION' in df_enhanced.columns:
        print("Creating age groups...")
        
        # Medicare-relevant age groups
        df_enhanced['AGE_GROUP'] = pd.cut(
            df_enhanced['AGE_AT_ADMISSION'],
            bins=[0, 65, 70, 75, 80, 85, 120],
            labels=['Under_65', '65-69', '70-74', '75-79', '80-84', '85_Plus'],
            right=False
        )
        
        # Age categories for analysis
        df_enhanced['AGE_CATEGORY'] = pd.cut(
            df_enhanced['AGE_AT_ADMISSION'],
            bins=[0, 65, 75, 85, 120],
            labels=['Under_65', 'Young_Senior', 'Old_Senior', 'Very_Old'],
            right=False
        )
        
        # High-risk age flag (80+)
        df_enhanced['HIGH_RISK_AGE'] = (df_enhanced['AGE_AT_ADMISSION'] >= 80).astype(int)
        
        print(f"  ✓ Age groups created")
        print(f"    Age distribution:")
        age_dist = df_enhanced['AGE_GROUP'].value_counts().sort_index()
        for group, count in age_dist.items():
            pct = count / len(df_enhanced) * 100
            print(f"      {group}: {count:,} ({pct:.1f}%)")
    
    # 2. Gender encoding
    if 'BENE_SEX_IDENT_CD' in df_enhanced.columns:
        print("\nCreating gender features...")
        df_enhanced['GENDER'] = df_enhanced['BENE_SEX_IDENT_CD'].map({1: 'Male', 2: 'Female'})
        df_enhanced['IS_MALE'] = (df_enhanced['BENE_SEX_IDENT_CD'] == 1).astype(int)
        df_enhanced['IS_FEMALE'] = (df_enhanced['BENE_SEX_IDENT_CD'] == 2).astype(int)
        
        gender_dist = df_enhanced['GENDER'].value_counts()
        print(f"  ✓ Gender distribution:")
        for gender, count in gender_dist.items():
            pct = count / len(df_enhanced) * 100
            print(f"      {gender}: {count:,} ({pct:.1f}%)")
    
    # 3. Race/Ethnicity grouping
    if 'BENE_RACE_CD' in df_enhanced.columns:
        print("\nCreating race/ethnicity features...")
        
        # CMS race codes: 1=White, 2=Black, 3=Other, 5=Hispanic
        race_mapping = {1: 'White', 2: 'Black', 3: 'Other', 5: 'Hispanic'}
        df_enhanced['RACE_ETHNICITY'] = df_enhanced['BENE_RACE_CD'].map(race_mapping)
        
        # Binary indicators for modeling
        df_enhanced['IS_WHITE'] = (df_enhanced['BENE_RACE_CD'] == 1).astype(int)
        df_enhanced['IS_BLACK'] = (df_enhanced['BENE_RACE_CD'] == 2).astype(int)
        df_enhanced['IS_HISPANIC'] = (df_enhanced['BENE_RACE_CD'] == 5).astype(int)
        df_enhanced['IS_MINORITY'] = (df_enhanced['BENE_RACE_CD'] != 1).astype(int)
        
        race_dist = df_enhanced['RACE_ETHNICITY'].value_counts()
        print(f"  ✓ Race/ethnicity distribution:")
        for race, count in race_dist.items():
            pct = count / len(df_enhanced) * 100
            print(f"      {race}: {count:,} ({pct:.1f}%)")
    
    # 4. Geographic features
    if 'SP_STATE_CODE' in df_enhanced.columns:
        print("\nCreating geographic features...")
        
        # State frequency encoding (states with high Medicare utilization)
        state_counts = df_enhanced['SP_STATE_CODE'].value_counts()
        df_enhanced['STATE_FREQUENCY'] = df_enhanced['SP_STATE_CODE'].map(state_counts)
        
        # High-volume states (top 10)
        top_states = state_counts.head(10).index
        df_enhanced['HIGH_VOLUME_STATE'] = df_enhanced['SP_STATE_CODE'].isin(top_states).astype(int)
        
        print(f"  ✓ Geographic features created")
        print(f"    Top 5 states: {list(state_counts.head().index)}")
    
    # 5. ESRD (End-Stage Renal Disease) indicator - SPECIFIC FIX FOR YOUR DATA
    if 'BENE_ESRD_IND' in df_enhanced.columns:
        print("\nProcessing ESRD indicator...")
        print(f"  Original ESRD values: {df_enhanced['BENE_ESRD_IND'].value_counts(dropna=False)}")
        
        # Handle the specific case: False, True, None as object type
        # Convert None to False, keep True as True, keep False as False
        df_enhanced['HAS_ESRD'] = df_enhanced['BENE_ESRD_IND'].fillna(False)  # Convert None to False
        df_enhanced['HAS_ESRD'] = (df_enhanced['HAS_ESRD'] == True).astype(int)  # Convert True to 1, False to 0
        
        esrd_count = df_enhanced['HAS_ESRD'].sum()
        esrd_pct = esrd_count / len(df_enhanced) * 100
        print(f"  ✓ ESRD processing complete:")
        print(f"    - True values converted to 1: {esrd_count:,}")
        print(f"    - False/None values converted to 0: {len(df_enhanced) - esrd_count:,}")
        print(f"    - ESRD prevalence: {esrd_pct:.1f}%")
        
        # Verify the conversion worked
        print(f"  ✓ Verification - HAS_ESRD values: {df_enhanced['HAS_ESRD'].value_counts()}")
    
    print(f"\n✓ Enhanced demographic features created")
    
    return df_enhanced

# Create enhanced demographic features
df_with_demographics = create_demographic_features(df_target)

CREATING ENHANCED DEMOGRAPHIC FEATURES
Creating age groups...
  ✓ Age groups created
    Age distribution:
      Under_65: 11,596 (17.4%)
      65-69: 10,468 (15.7%)
      70-74: 11,250 (16.8%)
      75-79: 11,066 (16.6%)
      80-84: 9,889 (14.8%)
      85_Plus: 12,278 (18.4%)

Creating gender features...
  ✓ Gender distribution:
      Female: 37,622 (56.3%)
      Male: 28,925 (43.3%)

Creating race/ethnicity features...
  ✓ Race/ethnicity distribution:
      White: 56,186 (84.1%)
      Black: 6,978 (10.5%)
      Other: 2,087 (3.1%)
      Hispanic: 1,296 (1.9%)

Creating geographic features...
  ✓ Geographic features created
    Top 5 states: [5.0, 10.0, 45.0, 33.0, 14.0]

Processing ESRD indicator...
  Original ESRD values: BENE_ESRD_IND
False    49707
True     16840
None       226
Name: count, dtype: int64
  ✓ ESRD processing complete:
    - True values converted to 1: 16,840
    - False/None values converted to 0: 49,933
    - ESRD prevalence: 25.2%
  ✓ Verification - HAS_ESRD valu

### Block 5: Advanced Clinical Features

In [33]:
def create_clinical_features(df):
    """
    Create advanced clinical features from admission data
    """
    print("CREATING ADVANCED CLINICAL FEATURES")
    print("="*40)
    
    df_clinical = df.copy()
    
    # 1. Enhanced Length of Stay features
    if 'LOS_CALCULATED' in df_clinical.columns:
        print("Creating length of stay features...")
        
        # LOS categories
        df_clinical['LOS_CATEGORY'] = pd.cut(
            df_clinical['LOS_CALCULATED'],
            bins=[0, 1, 3, 7, 14, 30, 999],
            labels=['Same_Day', 'Short_2-3', 'Medium_4-7', 'Long_8-14', 'Very_Long_15-30', 'Extended_30+'],
            right=False
        )
        
        # LOS risk indicators
        df_clinical['SHORT_STAY'] = (df_clinical['LOS_CALCULATED'] <= 2).astype(int)
        df_clinical['LONG_STAY'] = (df_clinical['LOS_CALCULATED'] >= 7).astype(int)
        df_clinical['VERY_LONG_STAY'] = (df_clinical['LOS_CALCULATED'] >= 14).astype(int)
        
        # LOS percentile within dataset
        df_clinical['LOS_PERCENTILE'] = df_clinical['LOS_CALCULATED'].rank(pct=True)
        
        print(f"  ✓ LOS features created")
        los_dist = df_clinical['LOS_CATEGORY'].value_counts()
        for category, count in los_dist.items():
            pct = count / len(df_clinical) * 100
            print(f"    {category}: {count:,} ({pct:.1f}%)")
    
    # 2. DRG (Diagnosis Related Group) features
    if 'CLM_DRG_CD' in df_clinical.columns:
        print("\nCreating DRG features...")
        
        # DRG frequency encoding
        drg_counts = df_clinical['CLM_DRG_CD'].value_counts()
        df_clinical['DRG_FREQUENCY'] = df_clinical['CLM_DRG_CD'].map(drg_counts)
        
        # Common vs rare DRGs
        common_drgs = drg_counts[drg_counts >= 100].index  # DRGs with 100+ cases
        df_clinical['COMMON_DRG'] = df_clinical['CLM_DRG_CD'].isin(common_drgs).astype(int)
        
        # High-risk DRGs (top readmission rates by DRG)
        drg_readmission_rates = df_clinical.groupby('CLM_DRG_CD')['READMISSION_30DAY'].mean()
        high_risk_drgs = drg_readmission_rates[drg_readmission_rates > drg_readmission_rates.quantile(0.75)].index
        df_clinical['HIGH_RISK_DRG'] = df_clinical['CLM_DRG_CD'].isin(high_risk_drgs).astype(int)
        
        print(f"  ✓ DRG features created")
        print(f"    Unique DRGs: {df_clinical['CLM_DRG_CD'].nunique()}")
        print(f"    Common DRGs (≥100 cases): {len(common_drgs)}")
        print(f"    High-risk DRGs: {len(high_risk_drgs)}")
    
    # 3. Primary diagnosis features
    if 'ICD9_DGNS_CD_1' in df_clinical.columns:
        print("\nCreating primary diagnosis features...")
        
        # Diagnosis frequency
        dx_counts = df_clinical['ICD9_DGNS_CD_1'].value_counts()
        df_clinical['PRIMARY_DX_FREQUENCY'] = df_clinical['ICD9_DGNS_CD_1'].map(dx_counts)
        
        # Major diagnostic categories (first 3 digits of ICD-9)
        df_clinical['ICD9_3DIGIT'] = df_clinical['ICD9_DGNS_CD_1'].astype(str).str[:3]
        
        # High-risk diagnosis groups
        # Common high-readmission diagnoses
        high_readmission_dx = ['428', '250', '584', '038', '486', '507', '599', '996']  # CHF, DM, AKI, sepsis, pneumonia, etc.
        df_clinical['HIGH_READMISSION_DX'] = df_clinical['ICD9_3DIGIT'].isin(high_readmission_dx).astype(int)
        
        print(f"  ✓ Primary diagnosis features created")
        print(f"    Unique primary diagnoses: {df_clinical['ICD9_DGNS_CD_1'].nunique()}")
        print(f"    High-readmission diagnoses: {df_clinical['HIGH_READMISSION_DX'].sum():,}")
    
    # 4. Admission timing features (already created in target creation, enhance here)
    print("\nEnhancing admission timing features...")
    
    # Weekend vs weekday admission risk
    if 'WEEKEND_ADMISSION' in df_clinical.columns:
        weekend_readmission_rate = df_clinical[df_clinical['WEEKEND_ADMISSION'] == True]['READMISSION_30DAY'].mean()
        weekday_readmission_rate = df_clinical[df_clinical['WEEKEND_ADMISSION'] == False]['READMISSION_30DAY'].mean()
        
        print(f"  Weekend readmission rate: {weekend_readmission_rate*100:.1f}%")
        print(f"  Weekday readmission rate: {weekday_readmission_rate*100:.1f}%")
    
    # Holiday admissions (approximation)
    if 'ADMISSION_MONTH' in df_clinical.columns and 'ADMISSION_DAY_OF_WEEK' in df_clinical.columns:
        # Major holidays approximation (December, January for winter holidays)
        df_clinical['HOLIDAY_PERIOD'] = df_clinical['ADMISSION_MONTH'].isin([12, 1]).astype(int)
        
        # End of year admissions
        df_clinical['END_OF_YEAR'] = (df_clinical['ADMISSION_MONTH'] == 12).astype(int)
    
    print(f"\n✓ Advanced clinical features created")
    
    return df_clinical

# Create clinical features
df_with_clinical = create_clinical_features(df_with_demographics)

CREATING ADVANCED CLINICAL FEATURES
Creating length of stay features...
  ✓ LOS features created
    Medium_4-7: 35,377 (53.0%)
    Long_8-14: 16,528 (24.8%)
    Short_2-3: 8,661 (13.0%)
    Very_Long_15-30: 5,181 (7.8%)
    Extended_30+: 1,026 (1.5%)
    Same_Day: 0 (0.0%)

Creating DRG features...
  ✓ DRG features created
    Unique DRGs: 739
    Common DRGs (≥100 cases): 243
    High-risk DRGs: 185

Creating primary diagnosis features...
  ✓ Primary diagnosis features created
    Unique primary diagnoses: 2740
    High-readmission diagnoses: 13,841

Enhancing admission timing features...
  Weekend readmission rate: 9.8%
  Weekday readmission rate: 10.3%

✓ Advanced clinical features created


### Block 6: Chronic Condition Features

In [34]:
def create_chronic_condition_features(df):
    """
    Create comprehensive chronic condition features
    """
    print("CREATING CHRONIC CONDITION FEATURES")
    print("="*40)
    
    df_chronic = df.copy()
    
    # Get chronic condition columns
    chronic_conditions = [col for col in df_chronic.columns if col.startswith('SP_') and col != 'SP_STATE_CODE']
    
    if not chronic_conditions:
        print("No chronic condition columns found!")
        return df_chronic
    
    print(f"Processing {len(chronic_conditions)} chronic conditions...")
    
    # 1. Individual condition analysis
    print("\nAnalyzing individual conditions:")
    condition_readmission_impact = {}
    
    for condition in chronic_conditions:
        if condition in df_chronic.columns:
            # Calculate readmission rate for each condition
            condition_rate = df_chronic[df_chronic[condition] == True]['READMISSION_30DAY'].mean()
            overall_rate = df_chronic['READMISSION_30DAY'].mean()
            relative_risk = condition_rate / overall_rate if overall_rate > 0 else 1.0
            
            condition_readmission_impact[condition] = {
                'rate': condition_rate,
                'relative_risk': relative_risk,
                'count': df_chronic[condition].sum()
            }
            
            print(f"  {condition}: {condition_rate*100:.1f}% readmission rate (RR: {relative_risk:.2f})")
    
    # 2. High-impact chronic conditions
    print("\nIdentifying high-impact conditions...")
    
    # Conditions with relative risk > 1.1 (10% higher than average)
    high_impact_conditions = [
        condition for condition, stats in condition_readmission_impact.items()
        if stats['relative_risk'] > 1.1 and stats['count'] >= 100  # At least 100 patients
    ]
    
    print(f"High-impact conditions: {high_impact_conditions}")
    
    # Create high-impact condition count
    if high_impact_conditions:
        df_chronic['HIGH_IMPACT_CONDITIONS'] = df_chronic[high_impact_conditions].sum(axis=1)
    else:
        df_chronic['HIGH_IMPACT_CONDITIONS'] = 0
    
    # 3. Condition severity categories
    print("\nCreating condition severity categories...")
    
    # Cardiovascular conditions
    cardio_conditions = ['SP_CHF', 'SP_ISCHMCHT']
    available_cardio = [col for col in cardio_conditions if col in df_chronic.columns]
    if available_cardio:
        df_chronic['CARDIOVASCULAR_CONDITIONS'] = df_chronic[available_cardio].sum(axis=1)
        df_chronic['HAS_CARDIOVASCULAR'] = (df_chronic['CARDIOVASCULAR_CONDITIONS'] > 0).astype(int)
    
    # Metabolic conditions
    metabolic_conditions = ['SP_DIABETES', 'SP_CHRNKIDN']
    available_metabolic = [col for col in metabolic_conditions if col in df_chronic.columns]
    if available_metabolic:
        df_chronic['METABOLIC_CONDITIONS'] = df_chronic[available_metabolic].sum(axis=1)
        df_chronic['HAS_METABOLIC'] = (df_chronic['METABOLIC_CONDITIONS'] > 0).astype(int)
    
    # Mental health conditions
    mental_conditions = ['SP_DEPRESSN', 'SP_ALZHDMTA']
    available_mental = [col for col in mental_conditions if col in df_chronic.columns]
    if available_mental:
        df_chronic['MENTAL_HEALTH_CONDITIONS'] = df_chronic[available_mental].sum(axis=1)
        df_chronic['HAS_MENTAL_HEALTH'] = (df_chronic['MENTAL_HEALTH_CONDITIONS'] > 0).astype(int)
    
    # Respiratory conditions
    respiratory_conditions = ['SP_COPD']
    available_respiratory = [col for col in respiratory_conditions if col in df_chronic.columns]
    if available_respiratory:
        df_chronic['RESPIRATORY_CONDITIONS'] = df_chronic[available_respiratory].sum(axis=1)
        df_chronic['HAS_RESPIRATORY'] = (df_chronic['RESPIRATORY_CONDITIONS'] > 0).astype(int)
    
    # 4. Comorbidity burden indicators
    print("\nCreating comorbidity burden indicators...")
    
    # Enhanced chronic condition count (already exists, but validate)
    if 'CHRONIC_CONDITION_COUNT' in df_chronic.columns:
        # Comorbidity categories
        df_chronic['NO_COMORBIDITIES'] = (df_chronic['CHRONIC_CONDITION_COUNT'] == 0).astype(int)
        df_chronic['LOW_COMORBIDITY'] = (df_chronic['CHRONIC_CONDITION_COUNT'].between(1, 2)).astype(int)
        df_chronic['MODERATE_COMORBIDITY'] = (df_chronic['CHRONIC_CONDITION_COUNT'].between(3, 5)).astype(int)
        df_chronic['HIGH_COMORBIDITY'] = (df_chronic['CHRONIC_CONDITION_COUNT'] >= 6).astype(int)
        
        # Complex patient indicator (multiple body systems affected)
        system_conditions = []
        if available_cardio:
            system_conditions.append('HAS_CARDIOVASCULAR')
        if available_metabolic:
            system_conditions.append('HAS_METABOLIC')
        if available_mental:
            system_conditions.append('HAS_MENTAL_HEALTH')
        if available_respiratory:
            system_conditions.append('HAS_RESPIRATORY')
        
        if len(system_conditions) >= 2:
            df_chronic['MULTI_SYSTEM_DISEASE'] = (df_chronic[system_conditions].sum(axis=1) >= 2).astype(int)
        
        # Print comorbidity distribution
        print(f"Comorbidity distribution:")
        comorbidity_dist = df_chronic['CHRONIC_CONDITION_COUNT'].value_counts().sort_index()
        for count, patients in comorbidity_dist.head(8).items():
            pct = patients / len(df_chronic) * 100
            print(f"  {count} conditions: {patients:,} patients ({pct:.1f}%)")
    
    # 5. Specific high-risk combinations
    print("\nIdentifying high-risk condition combinations...")
    
    # Diabetes + CHF (common high-risk combination)
    if 'SP_DIABETES' in df_chronic.columns and 'SP_CHF' in df_chronic.columns:
        df_chronic['DIABETES_CHF_COMBO'] = (
            (df_chronic['SP_DIABETES'] == True) & (df_chronic['SP_CHF'] == True)
        ).astype(int)
        
        combo_count = df_chronic['DIABETES_CHF_COMBO'].sum()
        if combo_count > 0:
            combo_rate = df_chronic[df_chronic['DIABETES_CHF_COMBO'] == 1]['READMISSION_30DAY'].mean()
            print(f"  Diabetes + CHF: {combo_count:,} patients ({combo_rate*100:.1f}% readmission rate)")
    
    # CKD + CHF (kidney-heart syndrome)
    if 'SP_CHRNKIDN' in df_chronic.columns and 'SP_CHF' in df_chronic.columns:
        df_chronic['CKD_CHF_COMBO'] = (
            (df_chronic['SP_CHRNKIDN'] == True) & (df_chronic['SP_CHF'] == True)
        ).astype(int)
        
        combo_count = df_chronic['CKD_CHF_COMBO'].sum()
        if combo_count > 0:
            combo_rate = df_chronic[df_chronic['CKD_CHF_COMBO'] == 1]['READMISSION_30DAY'].mean()
            print(f"  CKD + CHF: {combo_count:,} patients ({combo_rate*100:.1f}% readmission rate)")
    
    print(f"\n✓ Chronic condition features created")
    
    return df_chronic

# Create chronic condition features
df_with_chronic = create_chronic_condition_features(df_with_clinical)

CREATING CHRONIC CONDITION FEATURES
Processing 11 chronic conditions...

Analyzing individual conditions:
  SP_ALZHDMTA: 12.7% readmission rate (RR: 1.25)
  SP_CHF: 12.1% readmission rate (RR: 1.19)
  SP_CHRNKIDN: 13.1% readmission rate (RR: 1.29)
  SP_CNCR: 13.8% readmission rate (RR: 1.36)
  SP_COPD: 13.8% readmission rate (RR: 1.36)
  SP_DEPRESSN: 12.4% readmission rate (RR: 1.22)
  SP_DIABETES: 11.5% readmission rate (RR: 1.13)
  SP_ISCHMCHT: 11.2% readmission rate (RR: 1.10)
  SP_OSTEOPRS: 12.0% readmission rate (RR: 1.19)
  SP_RA_OA: 12.3% readmission rate (RR: 1.22)
  SP_STRKETIA: 15.2% readmission rate (RR: 1.50)

Identifying high-impact conditions...
High-impact conditions: ['SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA']

Creating condition severity categories...

Creating comorbidity burden indicators...
Comorbidity distribution:
  0 conditions: 889 patients (1.3%)
  1 condit

### Block 7: Prior Admission History Features

In [35]:
def create_prior_admission_features(df_target, df_inpatient_all):
    """
    Create features based on prior admission history
    """
    print("CREATING PRIOR ADMISSION HISTORY FEATURES")
    print("="*45)
    
    df_history = df_target.copy()
    
    # Ensure date columns are datetime
    if 'CLM_ADMSN_DT_DATE' not in df_inpatient_all.columns:
        df_inpatient_all['CLM_ADMSN_DT_DATE'] = pd.to_datetime(
            df_inpatient_all['CLM_ADMSN_DT'].astype(str), 
            format='%Y%m%d', 
            errors='coerce'
        )
    
    if 'NCH_BENE_DSCHRG_DT_DATE' not in df_inpatient_all.columns:
        df_inpatient_all['NCH_BENE_DSCHRG_DT_DATE'] = pd.to_datetime(
            df_inpatient_all['NCH_BENE_DSCHRG_DT'].astype(str), 
            format='%Y%m%d', 
            errors='coerce'
        )
    
    # Add LOS to inpatient data if not present
    if 'LOS_CALCULATED' not in df_inpatient_all.columns:
        df_inpatient_all['LOS_CALCULATED'] = (
            df_inpatient_all['NCH_BENE_DSCHRG_DT_DATE'] - df_inpatient_all['CLM_ADMSN_DT_DATE']
        ).dt.days + 1
    
    print(f"Analyzing prior admissions for {len(df_history):,} index admissions...")
    
    # Initialize prior admission features
    df_history['PRIOR_ADMISSIONS_30D'] = 0
    df_history['PRIOR_ADMISSIONS_90D'] = 0
    df_history['PRIOR_ADMISSIONS_180D'] = 0
    df_history['PRIOR_ADMISSIONS_365D'] = 0
    df_history['DAYS_SINCE_LAST_ADMISSION'] = np.nan
    df_history['PRIOR_TOTAL_LOS_365D'] = 0
    df_history['PRIOR_AVG_LOS_365D'] = np.nan
    df_history['FREQUENT_FLYER'] = 0
    df_history['PRIOR_ICU_ADMISSIONS_365D'] = 0  # Proxy using very long stays
    
    # Process in batches for memory efficiency
    batch_size = 5000
    processed = 0
    
    for i in range(0, len(df_history), batch_size):
        batch_end = min(i + batch_size, len(df_history))
        batch = df_history.iloc[i:batch_end].copy()
        
        print(f"  Processing batch {i//batch_size + 1}: records {i:,} to {batch_end:,}")
        
        for idx, admission in batch.iterrows():
            beneficiary_id = admission['DESYNPUF_ID']
            index_admission_date = admission['CLM_ADMSN_DT_DATE']
            
            # Find all prior admissions for this beneficiary
            prior_admissions = df_inpatient_all[
                (df_inpatient_all['DESYNPUF_ID'] == beneficiary_id) &
                (df_inpatient_all['NCH_BENE_DSCHRG_DT_DATE'] < index_admission_date) &
                (df_inpatient_all['CLM_ADMSN_DT_DATE'].notna())
            ].copy()
            
            if len(prior_admissions) > 0:
                # Calculate days between discharge and current admission
                prior_admissions['DAYS_BETWEEN'] = (
                    index_admission_date - prior_admissions['NCH_BENE_DSCHRG_DT_DATE']
                ).dt.days
                
                # Prior admissions in different time windows
                df_history.loc[idx, 'PRIOR_ADMISSIONS_30D'] = len(
                    prior_admissions[prior_admissions['DAYS_BETWEEN'] <= 30]
                )
                df_history.loc[idx, 'PRIOR_ADMISSIONS_90D'] = len(
                    prior_admissions[prior_admissions['DAYS_BETWEEN'] <= 90]
                )
                df_history.loc[idx, 'PRIOR_ADMISSIONS_180D'] = len(
                    prior_admissions[prior_admissions['DAYS_BETWEEN'] <= 180]
                )
                df_history.loc[idx, 'PRIOR_ADMISSIONS_365D'] = len(
                    prior_admissions[prior_admissions['DAYS_BETWEEN'] <= 365]
                )
                
                # Days since last admission
                df_history.loc[idx, 'DAYS_SINCE_LAST_ADMISSION'] = prior_admissions['DAYS_BETWEEN'].min()
                
                # Prior length of stay metrics (last 365 days)
                prior_365d = prior_admissions[prior_admissions['DAYS_BETWEEN'] <= 365]
                if len(prior_365d) > 0:
                    df_history.loc[idx, 'PRIOR_TOTAL_LOS_365D'] = prior_365d['LOS_CALCULATED'].sum()
                    df_history.loc[idx, 'PRIOR_AVG_LOS_365D'] = prior_365d['LOS_CALCULATED'].mean()
                    
                    # Proxy for ICU admissions (very long stays ≥ 14 days)
                    df_history.loc[idx, 'PRIOR_ICU_ADMISSIONS_365D'] = len(
                        prior_365d[prior_365d['LOS_CALCULATED'] >= 14]
                    )
                
                # Frequent flyer indicator (≥3 admissions in past 365 days)
                if df_history.loc[idx, 'PRIOR_ADMISSIONS_365D'] >= 3:
                    df_history.loc[idx, 'FREQUENT_FLYER'] = 1
        
        processed += len(batch)
        if processed % 10000 == 0:
           print(f"    Processed {processed:,} records...")
   
    # Create additional derived features
    print("\nCreating derived prior admission features...")
   
    # Recent admission indicators
    df_history['RECENT_ADMISSION_30D'] = (df_history['PRIOR_ADMISSIONS_30D'] > 0).astype(int)
    df_history['RECENT_ADMISSION_90D'] = (df_history['PRIOR_ADMISSIONS_90D'] > 0).astype(int)
   
    # High utilization indicators
    df_history['HIGH_UTILIZER_90D'] = (df_history['PRIOR_ADMISSIONS_90D'] >= 2).astype(int)
    df_history['HIGH_UTILIZER_365D'] = (df_history['PRIOR_ADMISSIONS_365D'] >= 3).astype(int)
   
    # Days since last admission categories
    df_history['DAYS_SINCE_LAST_CAT'] = pd.cut(
       df_history['DAYS_SINCE_LAST_ADMISSION'],
       bins=[-1, 7, 30, 90, 180, 365, 9999],
       labels=['No_Prior', 'Within_7d', 'Within_30d', 'Within_90d', 'Within_180d', 'Within_365d'],
       right=True
    )
   
    # Average LOS categories
    df_history['PRIOR_AVG_LOS_CAT'] = pd.cut(
       df_history['PRIOR_AVG_LOS_365D'],
       bins=[0, 3, 7, 14, 999],
       labels=['Short_Avg', 'Medium_Avg', 'Long_Avg', 'Very_Long_Avg'],
       right=False
    )
   
    # Summary statistics
    print(f"\nPRIOR ADMISSION STATISTICS:")
    print(f"├── Patients with prior admissions (365d): {(df_history['PRIOR_ADMISSIONS_365D'] > 0).sum():,}")
    print(f"├── Frequent flyers (≥3 in 365d): {df_history['FREQUENT_FLYER'].sum():,}")
    print(f"├── High utilizers (≥2 in 90d): {df_history['HIGH_UTILIZER_90D'].sum():,}")
    print(f"├── Recent admissions (30d): {df_history['RECENT_ADMISSION_30D'].sum():,}")
    print(f"└── Mean prior admissions (365d): {df_history['PRIOR_ADMISSIONS_365D'].mean():.1f}")
   
    # Readmission rate by prior admission history
    print(f"\nREADMISSION RATES BY PRIOR HISTORY:")
   
    # By frequent flyer status
    ff_rates = df_history.groupby('FREQUENT_FLYER')['READMISSION_30DAY'].mean()
    print(f"├── Non-frequent flyers: {ff_rates[0]*100:.1f}%")
    print(f"├── Frequent flyers: {ff_rates[1]*100:.1f}%")
   
    # By recent admission status
    recent_rates = df_history.groupby('RECENT_ADMISSION_30D')['READMISSION_30DAY'].mean()
    print(f"├── No recent admission: {recent_rates[0]*100:.1f}%")
    print(f"└── Recent admission (30d): {recent_rates[1]*100:.1f}%")
   
    print(f"\n✓ Prior admission history features created")
   
    return df_history

# Create prior admission features
df_with_history = create_prior_admission_features(df_with_chronic, df_inpatient)

CREATING PRIOR ADMISSION HISTORY FEATURES
Analyzing prior admissions for 66,773 index admissions...
  Processing batch 1: records 0 to 5,000
  Processing batch 2: records 5,000 to 10,000
    Processed 10,000 records...
  Processing batch 3: records 10,000 to 15,000
  Processing batch 4: records 15,000 to 20,000
    Processed 20,000 records...
  Processing batch 5: records 20,000 to 25,000
  Processing batch 6: records 25,000 to 30,000
    Processed 30,000 records...
  Processing batch 7: records 30,000 to 35,000
  Processing batch 8: records 35,000 to 40,000
    Processed 40,000 records...
  Processing batch 9: records 40,000 to 45,000
  Processing batch 10: records 45,000 to 50,000
    Processed 50,000 records...
  Processing batch 11: records 50,000 to 55,000
  Processing batch 12: records 55,000 to 60,000
    Processed 60,000 records...
  Processing batch 13: records 60,000 to 65,000
  Processing batch 14: records 65,000 to 66,773

Creating derived prior admission features...

PRIOR

### Block 8: Advanced Risk Indicators

In [36]:
def create_advanced_risk_indicators(df):
    """
    Create advanced risk indicators combining multiple feature types
    """
    print("CREATING ADVANCED RISK INDICATORS")
    print("="*40)
    
    df_risk = df.copy()
    
    # 1. Composite risk scores
    print("Creating composite risk scores...")
    
    # Age-based risk (higher weight for older patients)
    if 'AGE_AT_ADMISSION' in df_risk.columns:
        df_risk['AGE_RISK_SCORE'] = np.where(
            df_risk['AGE_AT_ADMISSION'] >= 85, 3,
            np.where(df_risk['AGE_AT_ADMISSION'] >= 75, 2,
                    np.where(df_risk['AGE_AT_ADMISSION'] >= 65, 1, 0))
        )
    
    # Comorbidity risk score
    if 'CHRONIC_CONDITION_COUNT' in df_risk.columns:
        df_risk['COMORBIDITY_RISK_SCORE'] = np.where(
            df_risk['CHRONIC_CONDITION_COUNT'] >= 6, 3,
            np.where(df_risk['CHRONIC_CONDITION_COUNT'] >= 3, 2,
                    np.where(df_risk['CHRONIC_CONDITION_COUNT'] >= 1, 1, 0))
        )
    
    # Prior utilization risk score
    if 'PRIOR_ADMISSIONS_365D' in df_risk.columns:
        df_risk['UTILIZATION_RISK_SCORE'] = np.where(
            df_risk['PRIOR_ADMISSIONS_365D'] >= 4, 3,
            np.where(df_risk['PRIOR_ADMISSIONS_365D'] >= 2, 2,
                    np.where(df_risk['PRIOR_ADMISSIONS_365D'] >= 1, 1, 0))
        )
    
    # Clinical complexity risk score
    clinical_risk_factors = []
    if 'LONG_STAY' in df_risk.columns:
        clinical_risk_factors.append('LONG_STAY')
    if 'HIGH_RISK_DRG' in df_risk.columns:
        clinical_risk_factors.append('HIGH_RISK_DRG')
    if 'HIGH_READMISSION_DX' in df_risk.columns:
        clinical_risk_factors.append('HIGH_READMISSION_DX')
    
    if clinical_risk_factors:
        df_risk['CLINICAL_COMPLEXITY_SCORE'] = df_risk[clinical_risk_factors].sum(axis=1)
    
    # 2. Comprehensive risk score
    print("Creating comprehensive risk score...")
    
    risk_components = []
    if 'AGE_RISK_SCORE' in df_risk.columns:
        risk_components.append('AGE_RISK_SCORE')
    if 'COMORBIDITY_RISK_SCORE' in df_risk.columns:
        risk_components.append('COMORBIDITY_RISK_SCORE')
    if 'UTILIZATION_RISK_SCORE' in df_risk.columns:
        risk_components.append('UTILIZATION_RISK_SCORE')
    if 'CLINICAL_COMPLEXITY_SCORE' in df_risk.columns:
        risk_components.append('CLINICAL_COMPLEXITY_SCORE')
    
    if risk_components:
        df_risk['COMPREHENSIVE_RISK_SCORE'] = df_risk[risk_components].sum(axis=1)
        
        # Risk categories
        df_risk['RISK_CATEGORY'] = pd.cut(
            df_risk['COMPREHENSIVE_RISK_SCORE'],
            bins=[-1, 2, 5, 8, 20],
            labels=['Low_Risk', 'Moderate_Risk', 'High_Risk', 'Very_High_Risk'],
            right=True
        )
        
        # Print risk distribution
        risk_dist = df_risk['RISK_CATEGORY'].value_counts()
        print(f"Risk category distribution:")
        for category, count in risk_dist.items():
            pct = count / len(df_risk) * 100
            readmission_rate = df_risk[df_risk['RISK_CATEGORY'] == category]['READMISSION_30DAY'].mean()
            print(f"  {category}: {count:,} ({pct:.1f}%) - {readmission_rate*100:.1f}% readmission rate")
    
    # 3. Specific high-risk patient profiles
    print("\nIdentifying high-risk patient profiles...")
    
    # Elderly with multiple comorbidities
    if 'HIGH_RISK_AGE' in df_risk.columns and 'HIGH_COMORBIDITY' in df_risk.columns:
        df_risk['ELDERLY_COMPLEX'] = (
            (df_risk['HIGH_RISK_AGE'] == 1) & (df_risk['HIGH_COMORBIDITY'] == 1)
        ).astype(int)
        
        elderly_complex_count = df_risk['ELDERLY_COMPLEX'].sum()
        if elderly_complex_count > 0:
            elderly_complex_rate = df_risk[df_risk['ELDERLY_COMPLEX'] == 1]['READMISSION_30DAY'].mean()
            print(f"  Elderly + Complex: {elderly_complex_count:,} ({elderly_complex_rate*100:.1f}% readmission rate)")
    
    # Frequent flyer with recent admission
    if 'FREQUENT_FLYER' in df_risk.columns and 'RECENT_ADMISSION_30D' in df_risk.columns:
        df_risk['FREQUENT_RECENT'] = (
            (df_risk['FREQUENT_FLYER'] == 1) & (df_risk['RECENT_ADMISSION_30D'] == 1)
        ).astype(int)
        
        freq_recent_count = df_risk['FREQUENT_RECENT'].sum()
        if freq_recent_count > 0:
            freq_recent_rate = df_risk[df_risk['FREQUENT_RECENT'] == 1]['READMISSION_30DAY'].mean()
            print(f"  Frequent + Recent: {freq_recent_count:,} ({freq_recent_rate*100:.1f}% readmission rate)")
    
    # Complex cardiac patients
    if 'HAS_CARDIOVASCULAR' in df_risk.columns and 'HIGH_COMORBIDITY' in df_risk.columns:
        df_risk['COMPLEX_CARDIAC'] = (
            (df_risk['HAS_CARDIOVASCULAR'] == 1) & (df_risk['HIGH_COMORBIDITY'] == 1)
        ).astype(int)
        
        complex_cardiac_count = df_risk['COMPLEX_CARDIAC'].sum()
        if complex_cardiac_count > 0:
            complex_cardiac_rate = df_risk[df_risk['COMPLEX_CARDIAC'] == 1]['READMISSION_30DAY'].mean()
            print(f"  Complex Cardiac: {complex_cardiac_count:,} ({complex_cardiac_rate*100:.1f}% readmission rate)")
    
    # 4. Discharge timing risk factors
    print("\nCreating discharge timing risk factors...")
    
    # Weekend discharge
    if 'NCH_BENE_DSCHRG_DT_DATE' in df_risk.columns:
        df_risk['DISCHARGE_DAY_OF_WEEK'] = df_risk['NCH_BENE_DSCHRG_DT_DATE'].dt.dayofweek
        df_risk['WEEKEND_DISCHARGE'] = df_risk['DISCHARGE_DAY_OF_WEEK'].isin([4, 5]).astype(int)  # Friday, Saturday
        
        weekend_discharge_count = df_risk['WEEKEND_DISCHARGE'].sum()
        if weekend_discharge_count > 0:
            weekend_discharge_rate = df_risk[df_risk['WEEKEND_DISCHARGE'] == 1]['READMISSION_30DAY'].mean()
            print(f"  Weekend discharges: {weekend_discharge_count:,} ({weekend_discharge_rate*100:.1f}% readmission rate)")
    
    # Short stay + high comorbidity (potentially premature discharge)
    if 'SHORT_STAY' in df_risk.columns and 'MODERATE_COMORBIDITY' in df_risk.columns:
        df_risk['PREMATURE_DISCHARGE_RISK'] = (
            (df_risk['SHORT_STAY'] == 1) & 
            ((df_risk['MODERATE_COMORBIDITY'] == 1) | (df_risk['HIGH_COMORBIDITY'] == 1))
        ).astype(int)
        
        premature_count = df_risk['PREMATURE_DISCHARGE_RISK'].sum()
        if premature_count > 0:
            premature_rate = df_risk[df_risk['PREMATURE_DISCHARGE_RISK'] == 1]['READMISSION_30DAY'].mean()
            print(f"  Premature discharge risk: {premature_count:,} ({premature_rate*100:.1f}% readmission rate)")
    
    # 5. Social determinants proxies
    print("\nCreating social determinants proxies...")
    
    # High-volume state as proxy for healthcare access
    if 'HIGH_VOLUME_STATE' in df_risk.columns:
        low_access_proxy = 1 - df_risk['HIGH_VOLUME_STATE']  # Inverse of high-volume state
        df_risk['LIMITED_ACCESS_PROXY'] = low_access_proxy
    
    # Minority status + high comorbidity (health disparities)
    if 'IS_MINORITY' in df_risk.columns and 'HIGH_COMORBIDITY' in df_risk.columns:
        df_risk['DISPARITY_RISK'] = (
            (df_risk['IS_MINORITY'] == 1) & (df_risk['HIGH_COMORBIDITY'] == 1)
        ).astype(int)
    
    print(f"\n✓ Advanced risk indicators created")
    
    return df_risk

# Create advanced risk indicators
df_with_risk = create_advanced_risk_indicators(df_with_history)

CREATING ADVANCED RISK INDICATORS
Creating composite risk scores...
Creating comprehensive risk score...
Risk category distribution:
  Moderate_Risk: 34,150 (51.1%) - 8.2% readmission rate
  High_Risk: 25,552 (38.3%) - 12.8% readmission rate
  Low_Risk: 4,129 (6.2%) - 4.3% readmission rate
  Very_High_Risk: 2,942 (4.4%) - 17.2% readmission rate

Identifying high-risk patient profiles...
  Elderly + Complex: 12,192 (14.6% readmission rate)
  Frequent + Recent: 1,454 (17.1% readmission rate)
  Complex Cardiac: 34,304 (14.6% readmission rate)

Creating discharge timing risk factors...
  Weekend discharges: 18,836 (10.3% readmission rate)
  Premature discharge risk: 7,418 (9.9% readmission rate)

Creating social determinants proxies...

✓ Advanced risk indicators created


### Block 9: Feature Selection and Engineering Validation

In [37]:
def validate_and_select_features(df):
    """
    Validate engineered features and select best features for modeling
    """
    print("FEATURE VALIDATION AND SELECTION")
    print("="*40)
    
    # 1. Feature completeness assessment
    print("Assessing feature completeness...")
    
    feature_completeness = {}
    total_records = len(df)
    
    for col in df.columns:
        if col != 'READMISSION_30DAY':  # Exclude target variable
            missing_count = df[col].isnull().sum()
            completeness_pct = (1 - missing_count / total_records) * 100
            feature_completeness[col] = completeness_pct
    
    # Categorize features by completeness
    complete_features = [col for col, pct in feature_completeness.items() if pct == 100]
    mostly_complete = [col for col, pct in feature_completeness.items() if 95 <= pct < 100]
    incomplete_features = [col for col, pct in feature_completeness.items() if pct < 95]
    
    print(f"├── Complete features (100%): {len(complete_features)}")
    print(f"├── Mostly complete (95-99%): {len(mostly_complete)}")
    print(f"└── Incomplete features (<95%): {len(incomplete_features)}")
    
    if incomplete_features:
        print(f"\nIncomplete features:")
        for feature in incomplete_features[:10]:  # Show first 10
            pct = feature_completeness[feature]
            print(f"  {feature}: {pct:.1f}% complete")
    
    # 2. Feature correlation with target variable
    print(f"\nAnalyzing feature correlation with target...")
    
    # Calculate correlation for numeric features
    numeric_features = df.select_dtypes(include=[np.number]).columns
    numeric_features = [col for col in numeric_features if col != 'READMISSION_30DAY']
    
    feature_correlations = {}
    for feature in numeric_features:
        if df[feature].notna().sum() > 100:  # At least 100 non-null values
            correlation = df[feature].corr(df['READMISSION_30DAY'])
            if not np.isnan(correlation):
                feature_correlations[feature] = abs(correlation)
    
    # Sort by correlation strength
    sorted_correlations = sorted(feature_correlations.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Top 15 features by correlation with readmission:")
    for i, (feature, corr) in enumerate(sorted_correlations[:15]):
        print(f"  {i+1:2d}. {feature}: {corr:.4f}")
    
    # 3. Categorical feature analysis
    print(f"\nAnalyzing categorical features...")
    
    categorical_features = df.select_dtypes(include=['object', 'category']).columns
    categorical_readmission_rates = {}
    
    for feature in categorical_features:
        if df[feature].notna().sum() > 100:
            rates = df.groupby(feature)['READMISSION_30DAY'].agg(['count', 'mean']).round(4)
            # Calculate variance in readmission rates across categories
            rate_variance = rates['mean'].var()
            categorical_readmission_rates[feature] = rate_variance
    
    # Sort by variance (features with more discriminative categories)
    sorted_categorical = sorted(categorical_readmission_rates.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Top 10 categorical features by rate variance:")
    for i, (feature, variance) in enumerate(sorted_categorical[:10]):
        print(f"  {i+1:2d}. {feature}: {variance:.6f}")
    
    # 4. Feature importance by type
    print(f"\nCategorizing features by type and importance...")
    
    # High-value features (complete + correlated)
    high_value_features = [
        col for col in complete_features 
        if col in feature_correlations and feature_correlations[col] > 0.02
    ]
    
    # Essential demographic features
    essential_demographic = [
        'AGE_AT_ADMISSION', 'GENDER', 'RACE_ETHNICITY', 'AGE_CATEGORY'
    ]
    essential_demographic = [col for col in essential_demographic if col in df.columns]
    
    # Essential clinical features
    essential_clinical = [
        'LOS_CALCULATED', 'LOS_CATEGORY', 'CLM_DRG_CD', 'ICD9_DGNS_CD_1', 
        'HIGH_RISK_DRG', 'HIGH_READMISSION_DX'
    ]
    essential_clinical = [col for col in essential_clinical if col in df.columns]
    
    # Essential chronic condition features
    essential_chronic = [
        'CHRONIC_CONDITION_COUNT', 'HAS_CARDIOVASCULAR', 'HAS_METABOLIC',
        'HIGH_COMORBIDITY', 'MULTI_SYSTEM_DISEASE'
    ]
    essential_chronic = [col for col in essential_chronic if col in df.columns]
    
    # Essential history features
    essential_history = [
        'PRIOR_ADMISSIONS_365D', 'PRIOR_ADMISSIONS_90D', 'FREQUENT_FLYER',
        'RECENT_ADMISSION_30D', 'DAYS_SINCE_LAST_ADMISSION'
    ]
    essential_history = [col for col in essential_history if col in df.columns]
    
    # Risk indicators
    risk_indicators = [
        'COMPREHENSIVE_RISK_SCORE', 'RISK_CATEGORY', 'ELDERLY_COMPLEX',
        'COMPLEX_CARDIAC', 'PREMATURE_DISCHARGE_RISK'
    ]
    risk_indicators = [col for col in risk_indicators if col in df.columns]
    
    print(f"Feature categories:")
    print(f"├── High-value features: {len(high_value_features)}")
    print(f"├── Essential demographic: {len(essential_demographic)}")
    print(f"├── Essential clinical: {len(essential_clinical)}")
    print(f"├── Essential chronic conditions: {len(essential_chronic)}")
    print(f"├── Essential history: {len(essential_history)}")
    print(f"└── Risk indicators: {len(risk_indicators)}")
    
    # 5. Create modeling feature set
    modeling_features = list(set(
        essential_demographic + essential_clinical + essential_chronic + 
        essential_history + risk_indicators + high_value_features
    ))
    
    # Add target variable
    modeling_features.append('READMISSION_30DAY')
    
    # Ensure all features exist in dataframe
    modeling_features = [col for col in modeling_features if col in df.columns]
    
    print(f"\nRecommended modeling features: {len(modeling_features)-1} features + target")
    
    return modeling_features, {
        'complete_features': complete_features,
        'high_value_features': high_value_features,
        'feature_correlations': sorted_correlations,
        'categorical_variance': sorted_categorical
    }

# Validate and select features
modeling_features, feature_analysis = validate_and_select_features(df_with_risk)

FEATURE VALIDATION AND SELECTION
Assessing feature completeness...
├── Complete features (100%): 84
├── Mostly complete (95-99%): 27
└── Incomplete features (<95%): 6

Incomplete features:
  DAYS_TO_READMISSION: 10.1% complete
  READMISSION_DATE: 10.1% complete
  DAYS_SINCE_LAST_ADMISSION: 41.6% complete
  PRIOR_AVG_LOS_365D: 37.5% complete
  DAYS_SINCE_LAST_CAT: 41.6% complete
  PRIOR_AVG_LOS_CAT: 37.5% complete

Analyzing feature correlation with target...
Top 15 features by correlation with readmission:
   1. CHRONIC_CONDITION_COUNT: 0.1772
   2. COMPLEX_CARDIAC: 0.1519
   3. HIGH_COMORBIDITY: 0.1516
   4. NCH_BENE_DSCHRG_DT: 0.1513
   5. CLM_ADMSN_DT: 0.1507
   6. discharge_year: 0.1492
   7. DISCHARGE_YEAR: 0.1492
   8. ADMISSION_YEAR: 0.1484
   9. BENE_YEAR: 0.1481
  10. COMORBIDITY_RISK_SCORE: 0.1466
  11. CKD_CHF_COMBO: 0.1407
  12. DIABETES_CHF_COMBO: 0.1193
  13. HAS_RESPIRATORY: 0.1185
  14. COMPREHENSIVE_RISK_SCORE: 0.1090
  15. MODERATE_COMORBIDITY: 0.0997

Analyzing categ

### Block 10: Create Final Feature Dataset

In [38]:
def create_final_feature_dataset(df, modeling_features):
    """
    Create final dataset optimized for machine learning
    """
    print("CREATING FINAL FEATURE DATASET")
    print("="*35)
    
    # Select modeling features
    df_final = df[modeling_features].copy()
    
    print(f"Selected {len(modeling_features)-1} features + target variable")
    
    # Handle any remaining missing values
    print(f"\nHandling missing values in final dataset...")
    
    missing_summary = df_final.isnull().sum()
    features_with_missing = missing_summary[missing_summary > 0]
    
    if len(features_with_missing) > 0:
        print(f"Features with missing values:")
        for feature, count in features_with_missing.items():
            pct = count / len(df_final) * 100
            print(f"  {feature}: {count:,} ({pct:.1f}%)")
            
            # Handle missing values based on feature type
            if df_final[feature].dtype in ['int64', 'float64']:
                # Numeric features: fill with median
                median_val = df_final[feature].median()
                df_final[feature].fillna(median_val, inplace=True)
                print(f"    → Filled with median: {median_val}")
            else:
                # Categorical features: fill with mode or 'Unknown'
                if df_final[feature].mode().empty:
                    df_final[feature].fillna('Unknown', inplace=True)
                    print(f"    → Filled with 'Unknown'")
                else:
                    mode_val = df_final[feature].mode()[0]
                    df_final[feature].fillna(mode_val, inplace=True)
                    print(f"    → Filled with mode: {mode_val}")
    else:
        print("✓ No missing values in final dataset")
    
    # Data type optimization
    print(f"\nOptimizing data types...")
    
    # Convert boolean columns to int8
    bool_columns = df_final.select_dtypes(include=['bool']).columns
    if len(bool_columns) > 0:
        for col in bool_columns:
            df_final[col] = df_final[col].astype('int8')
        print(f"  ✓ Converted {len(bool_columns)} boolean columns to int8")
    
    # Optimize integer columns
    int_columns = df_final.select_dtypes(include=['int64']).columns
    for col in int_columns:
        if col != 'READMISSION_30DAY':  # Keep target as int64
            max_val = df_final[col].max()
            min_val = df_final[col].min()
            
            if min_val >= 0 and max_val <= 255:
                df_final[col] = df_final[col].astype('uint8')
            elif min_val >= -128 and max_val <= 127:
                df_final[col] = df_final[col].astype('int8')
            elif min_val >= -32768 and max_val <= 32767:
                df_final[col] = df_final[col].astype('int16')
            elif min_val >= -2147483648 and max_val <= 2147483647:
                df_final[col] = df_final[col].astype('int32')
    
    # Convert high-cardinality categorical columns to category dtype
    categorical_columns = df_final.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        unique_values = df_final[col].nunique()
        if unique_values < len(df_final) * 0.5:  # Less than 50% unique values
            df_final[col] = df_final[col].astype('category')
    
    print(f"  ✓ Data types optimized")
    
    # Memory usage comparison
    memory_usage = df_final.memory_usage(deep=True).sum() / 1024**2
    print(f"  Final memory usage: {memory_usage:.1f} MB")
    
    # Feature summary
    print(f"\nFINAL DATASET SUMMARY:")
    print(f"├── Records: {len(df_final):,}")
    print(f"├── Features: {len(df_final.columns)-1}")
    print(f"├── Target variable: READMISSION_30DAY")
    print(f"├── Readmission rate: {df_final['READMISSION_30DAY'].mean()*100:.2f}%")
    print(f"├── Memory usage: {memory_usage:.1f} MB")
    print(f"└── Data types: {dict(df_final.dtypes.value_counts())}")
    
    # Feature type breakdown
    feature_types = {
        'Demographic': [col for col in df_final.columns if any(x in col.lower() for x in ['age', 'gender', 'race', 'sex'])],
        'Clinical': [col for col in df_final.columns if any(x in col.lower() for x in ['los', 'drg', 'dx', 'icd9', 'stay'])],
        'Chronic_Conditions': [col for col in df_final.columns if col.startswith('SP_') or 'condition' in col.lower() or 'comorbid' in col.lower()],
        'Prior_History': [col for col in df_final.columns if 'prior' in col.lower() or 'frequent' in col.lower() or 'days_since' in col.lower()],
        'Risk_Indicators': [col for col in df_final.columns if 'risk' in col.lower() or 'complex' in col.lower()],
        'Temporal': [col for col in df_final.columns if any(x in col.lower() for x in ['admission', 'discharge', 'weekend', 'season'])],
        'Other': []
    }
    
    # Classify remaining features
    classified_features = set()
    for category, features in feature_types.items():
        classified_features.update(features)
    
    feature_types['Other'] = [col for col in df_final.columns if col not in classified_features and col != 'READMISSION_30DAY']
    
    print(f"\nFEATURE TYPE BREAKDOWN:")
    for category, features in feature_types.items():
        if features:
            print(f"├── {category}: {len(features)} features")
    
    return df_final

# Create final feature dataset
df_final_features = create_final_feature_dataset(df_with_risk, modeling_features)

CREATING FINAL FEATURE DATASET
Selected 55 features + target variable

Handling missing values in final dataset...
Features with missing values:
  ICD9_DGNS_CD_1: 95 (0.1%)
    → Filled with mode: 486
  AGE_CATEGORY: 226 (0.3%)
    → Filled with mode: Young_Senior
  DAYS_SINCE_LAST_ADMISSION: 39,005 (58.4%)
    → Filled with median: 86.0
  AGE_AT_ADMISSION: 226 (0.3%)
    → Filled with median: 74.98151950718686
  GENDER: 226 (0.3%)
    → Filled with mode: Female
  RACE_ETHNICITY: 226 (0.3%)
    → Filled with mode: White

Optimizing data types...
  ✓ Data types optimized
  Final memory usage: 7.1 MB

FINAL DATASET SUMMARY:
├── Records: 66,773
├── Features: 55
├── Target variable: READMISSION_30DAY
├── Readmission rate: 10.13%
├── Memory usage: 7.1 MB
└── Data types: {dtype('uint8'): np.int64(38), dtype('int32'): np.int64(7), dtype('float64'): np.int64(3), CategoricalDtype(categories=['Low_Risk', 'Moderate_Risk', 'High_Risk', 'Very_High_Risk'], ordered=True, categories_dtype=object): np.

### Block 11: Save Feature Engineered Dataset

In [None]:
def save_feature_dataset(df_final, feature_analysis):
    """
    Save the final feature-engineered dataset and documentation
    """
    print("SAVING FEATURE-ENGINEERED DATASET")
    print("="*40)
    
    # Create features directory
    features_path = "data/features/"
    os.makedirs(features_path, exist_ok=True)
    
    # Save main feature dataset
    feature_file_csv = f"{features_path}readmission_features_final.csv"
    feature_file_parquet = f"{features_path}readmission_features_final.parquet"
    
    df_final.to_csv(feature_file_csv, index=False)
    df_final.to_parquet(feature_file_parquet)
    
    print(f"✓ Feature dataset saved:")
    print(f"  ├── CSV: {feature_file_csv}")
    print(f"  └── Parquet: {feature_file_parquet}")
    
    # Save feature list for modeling
    feature_list = [col for col in df_final.columns if col != 'READMISSION_30DAY']
    feature_list_file = f"{features_path}modeling_features_list.txt"
    
    with open(feature_list_file, 'w') as f:
        f.write("MODELING FEATURES LIST\n")
        f.write("="*30 + "\n\n")
        f.write(f"Total Features: {len(feature_list)}\n")
        f.write(f"Target Variable: READMISSION_30DAY\n")
        f.write(f"Readmission Rate: {df_final['READMISSION_30DAY'].mean()*100:.2f}%\n\n")
        f.write("Features by Category:\n")
        f.write("-" * 20 + "\n")
        
        # Categorize features that actually exist
        feature_categories = {
            'Demographic': [col for col in feature_list if any(x in col.lower() for x in ['age', 'gender', 'race', 'sex', 'male', 'female', 'white', 'black', 'minority'])],
            'Clinical': [col for col in feature_list if any(x in col.lower() for x in ['los', 'drg', 'dx', 'icd9', 'stay', 'admission', 'discharge'])],
            'Chronic_Conditions': [col for col in feature_list if col.startswith('SP_') or any(x in col.lower() for x in ['condition', 'comorbid', 'chronic', 'cardiovascular', 'metabolic', 'diabetes', 'chf'])],
            'Prior_History': [col for col in feature_list if any(x in col.lower() for x in ['prior', 'frequent', 'days_since', 'recent', 'utiliz'])],
            'Risk_Indicators': [col for col in feature_list if any(x in col.lower() for x in ['risk', 'complex', 'score', 'high_risk'])],
            'Temporal': [col for col in feature_list if any(x in col.lower() for x in ['weekend', 'season', 'month', 'quarter', 'year', 'day_of_week', 'holiday'])],
        }
        
        # Add uncategorized features
        categorized = set()
        for category, features in feature_categories.items():
            categorized.update(features)
        
        feature_categories['Other'] = [col for col in feature_list if col not in categorized]
        
        # Write categorized features
        for category, features in feature_categories.items():
            if features:
                f.write(f"\n{category} ({len(features)} features):\n")
                for i, feature in enumerate(features, 1):
                    f.write(f"  {i:2d}. {feature}\n")
    
    print(f"✓ Feature list saved: {feature_list_file}")
    
    # Save feature engineering documentation
    documentation_file = f"{features_path}feature_engineering_documentation.txt"
    
    with open(documentation_file, 'w') as f:
        f.write("FEATURE ENGINEERING DOCUMENTATION\n")
        f.write("="*50 + "\n\n")
        
        f.write("DATASET OVERVIEW:\n")
        f.write("-" * 20 + "\n")
        f.write(f"Total Records: {len(df_final):,}\n")
        f.write(f"Total Features: {len(df_final.columns)-1}\n")
        f.write(f"Target Variable: READMISSION_30DAY\n")
        f.write(f"Readmission Rate: {df_final['READMISSION_30DAY'].mean()*100:.2f}%\n")
        f.write(f"Memory Usage: {df_final.memory_usage(deep=True).sum() / 1024**2:.1f} MB\n")
        f.write(f"Missing Values: {df_final.isnull().sum().sum()}\n\n")
        
        f.write("FEATURE CREATION PROCESS:\n")
        f.write("-" * 30 + "\n")
        f.write("1. Enhanced Demographic Features\n")
        f.write("   - Age groups and risk categories\n")
        f.write("   - Gender encoding\n")
        f.write("   - Race/ethnicity grouping\n")
        f.write("   - Geographic features\n")
        f.write("   - ESRD indicators\n\n")
        
        f.write("2. Advanced Clinical Features\n")
        f.write("   - Length of stay categories\n")
        f.write("   - DRG frequency and risk indicators\n")
        f.write("   - Primary diagnosis features\n")
        f.write("   - Admission timing features\n\n")
        
        f.write("3. Chronic Condition Features\n")
        f.write("   - Individual condition analysis\n")
        f.write("   - Condition severity categories\n")
        f.write("   - Comorbidity burden indicators\n")
        f.write("   - High-risk condition combinations\n\n")
        
        f.write("4. Prior Admission History\n")
        f.write("   - Prior admissions in multiple time windows\n")
        f.write("   - Days since last admission\n")
        f.write("   - Frequent flyer identification\n")
        f.write("   - Prior length of stay metrics\n\n")
        
        f.write("5. Advanced Risk Indicators\n")
        f.write("   - Composite risk scores\n")
        f.write("   - High-risk patient profiles\n")
        f.write("   - Discharge timing risk factors\n")
        f.write("   - Social determinants proxies\n\n")
        
        # Only include validation results if they exist
        if 'feature_correlations' in feature_analysis and feature_analysis['feature_correlations']:
            f.write("TOP CORRELATED FEATURES:\n")
            for i, (feature, corr) in enumerate(feature_analysis['feature_correlations'][:10], 1):
                if feature in df_final.columns:  # Only include features that exist
                    f.write(f"{i:2d}. {feature}: {corr:.4f}\n")
            f.write("\n")
        
        f.write("DATA TYPES DISTRIBUTION:\n")
        dtype_counts = df_final.dtypes.value_counts()
        for dtype, count in dtype_counts.items():
            f.write(f"- {dtype}: {count} features\n")
    
    print(f"✓ Documentation saved: {documentation_file}")
    
    # Create correlation matrix only for numeric features that exist
    print("Creating correlation matrix for numeric features...")
    numeric_features = df_final.select_dtypes(include=[np.number]).columns.tolist()
    
    if len(numeric_features) > 1:
        try:
            correlation_matrix = df_final[numeric_features].corr()
            correlation_file = f"{features_path}feature_correlation_matrix.csv"
            correlation_matrix.to_csv(correlation_file)
            print(f"✓ Correlation matrix saved: {correlation_file}")
            print(f"  Features included: {len(numeric_features)}")
        except Exception as e:
            print(f"⚠️ Could not create correlation matrix: {e}")
    else:
        print("⚠️ Not enough numeric features for correlation matrix")
    
    # Create summary file
    summary_file = f"{features_path}feature_engineering_summary.txt"
    
    with open(summary_file, 'w') as f:
        f.write("FEATURE ENGINEERING SUMMARY\n")
        f.write("="*40 + "\n\n")
        f.write(f"FINAL DATASET STATISTICS:\n")
        f.write(f"- Records: {len(df_final):,}\n")
        f.write(f"- Features: {len(df_final.columns)-1}\n")
        f.write(f"- Target: READMISSION_30DAY\n")
        f.write(f"- Readmission Rate: {df_final['READMISSION_30DAY'].mean()*100:.2f}%\n")
        f.write(f"- Memory Usage: {df_final.memory_usage(deep=True).sum() / 1024**2:.1f} MB\n")
        f.write(f"- Missing Values: {df_final.isnull().sum().sum()}\n\n")
        f.write(f"STATUS: Ready for Model Development (Phase E)\n")
    
    print(f"✓ Summary saved: {summary_file}")

# Save feature dataset with proper error handling
print("ATTEMPTING TO SAVE FEATURE DATASET...")
try:
    save_feature_dataset(df_final_features, feature_analysis)
    print("\n All files saved successfully!")
except Exception as e:
    print(f"\n Error during save: {e}")
    print("Performing basic save...")
    
    # Backup basic save
    features_path = "data/features/"
    os.makedirs(features_path, exist_ok=True)
    
    df_final_features.to_csv(f"{features_path}readmission_features_final.csv", index=False)
    df_final_features.to_parquet(f"{features_path}readmission_features_final.parquet")
    
    print("✓ Basic dataset files saved")

ATTEMPTING TO SAVE FEATURE DATASET...
SAVING FEATURE-ENGINEERED DATASET
✓ Feature dataset saved:
  ├── CSV: data/features/readmission_features_final.csv
  └── Parquet: data/features/readmission_features_final.parquet
✓ Feature list saved: data/features/modeling_features_list.txt
✓ Documentation saved: data/features/feature_engineering_documentation.txt
Creating correlation matrix for numeric features...
✓ Correlation matrix saved: data/features/feature_correlation_matrix.csv
  Features included: 49
✓ Summary saved: data/features/feature_engineering_summary.txt

🎉 All files saved successfully!


### Block 12: Feature Engineering Validation

In [41]:
def final_feature_validation(df_final):
    """
    Perform final validation of the feature-engineered dataset
    """
    print("FINAL FEATURE ENGINEERING VALIDATION")
    print("="*45)
    
    # 1. Data quality checks
    print("Data Quality Validation:")
    
    # Check for missing values
    missing_values = df_final.isnull().sum().sum()
    print(f"├── Missing values: {missing_values} (should be 0)")
    
    # Check for infinite values
    numeric_cols = df_final.select_dtypes(include=[np.number]).columns
    infinite_values = 0
    for col in numeric_cols:
        inf_count = np.isinf(df_final[col]).sum()
        infinite_values += inf_count
        if inf_count > 0:
            print(f"├── Infinite values in {col}: {inf_count}")
    
    if infinite_values == 0:
        print(f"├── Infinite values: 0 ✓")
    else:
        print(f"├── Infinite values: {infinite_values} ⚠️")
    
    # Check target variable distribution
    target_dist = df_final['READMISSION_30DAY'].value_counts().sort_index()
    print(f"├── Target distribution: {dict(target_dist)}")
    print(f"└── Class balance: {target_dist[1]/len(df_final)*100:.1f}% positive class")
    
    # 2. Feature range validation
    print(f"\nFeature Range Validation:")
    
    # Check for reasonable ranges in key features
    key_validations = {
        'AGE_AT_ADMISSION': (0, 120),
        'LOS_CALCULATED': (0, 365),
        'CHRONIC_CONDITION_COUNT': (0, 20),
        'PRIOR_ADMISSIONS_365D': (0, 50),
        'COMPREHENSIVE_RISK_SCORE': (0, 20)
    }
    
    for feature, (min_expected, max_expected) in key_validations.items():
        if feature in df_final.columns:
            actual_min = df_final[feature].min()
            actual_max = df_final[feature].max()
            
            if min_expected <= actual_min and actual_max <= max_expected:
                status = "✓"
            else:
                status = "⚠️"
            
            print(f"├── {feature}: [{actual_min:.1f}, {actual_max:.1f}] {status}")
    
    # 3. Feature correlation analysis
    print(f"\nFeature Correlation Analysis:")
    
    # Check for highly correlated features (potential multicollinearity)
    correlation_matrix = df_final[numeric_cols].corr()
    
    # Find pairs with high correlation (>0.8, excluding self-correlation)
    high_corr_pairs = []
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            corr_val = abs(correlation_matrix.iloc[i, j])
            if corr_val > 0.8:
                feature1 = correlation_matrix.columns[i]
                feature2 = correlation_matrix.columns[j]
                high_corr_pairs.append((feature1, feature2, corr_val))
    
    if high_corr_pairs:
        print(f"├── High correlation pairs (>0.8): {len(high_corr_pairs)}")
        for feat1, feat2, corr in high_corr_pairs[:5]:  # Show first 5
            print(f"    {feat1} ↔ {feat2}: {corr:.3f}")
    else:
        print(f"├── High correlation pairs (>0.8): 0 ✓")
    
    # 4. Business logic validation
    print(f"\nBusiness Logic Validation:")
    
    # Readmission rates should increase with risk indicators
    risk_validations = [
        ('HIGH_RISK_AGE', 'Age Risk'),
        ('HIGH_COMORBIDITY', 'Comorbidity Risk'),
        ('FREQUENT_FLYER', 'Frequent Flyer'),
        ('RECENT_ADMISSION_30D', 'Recent Admission')
    ]
    
    overall_rate = df_final['READMISSION_30DAY'].mean()
    
    for risk_feature, description in risk_validations:
        if risk_feature in df_final.columns:
            high_risk_rate = df_final[df_final[risk_feature] == 1]['READMISSION_30DAY'].mean()
            low_risk_rate = df_final[df_final[risk_feature] == 0]['READMISSION_30DAY'].mean()
            
            if high_risk_rate > low_risk_rate:
                status = "✓"
            else:
                status = "⚠️"
            
            print(f"├── {description}: {high_risk_rate*100:.1f}% vs {low_risk_rate*100:.1f}% {status}")
    
    # 5. Model readiness assessment
    print(f"\nModel Readiness Assessment:")
    
    # Check data types
    suitable_types = ['int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', 
                     'float32', 'float64', 'category']
    
    unsuitable_features = []
    for col in df_final.columns:
        if col != 'READMISSION_30DAY' and str(df_final[col].dtype) not in suitable_types:
            unsuitable_features.append((col, df_final[col].dtype))
    
    if unsuitable_features:
        print(f"├── Unsuitable data types: {len(unsuitable_features)}")
        for feat, dtype in unsuitable_features:
            print(f"    {feat}: {dtype}")
    else:
        print(f"├── Data types: All suitable for modeling ✓")
    
    # Check feature count
    feature_count = len(df_final.columns) - 1  # Exclude target
    if 20 <= feature_count <= 100:
        print(f"├── Feature count: {feature_count} (optimal range) ✓")
    elif feature_count < 20:
        print(f"├── Feature count: {feature_count} (may need more features) ⚠️")
    else:
        print(f"├── Feature count: {feature_count} (consider feature selection) ⚠️")
    
    # Check sample size
    sample_size = len(df_final)
    min_samples = feature_count * 20  # Rule of thumb: 20 samples per feature
    
    if sample_size >= min_samples:
        print(f"├── Sample size: {sample_size:,} (adequate) ✓")
    else:
        print(f"├── Sample size: {sample_size:,} (may be insufficient) ⚠️")
    
    print(f"└── Memory efficiency: {df_final.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    
    # 6. Final recommendations
    print(f"\nFINAL RECOMMENDATIONS:")
    
    if missing_values == 0 and infinite_values == 0:
        print("├── Data quality: Excellent ✓")
    else:
        print("├── Data quality: Issues detected - review before modeling ⚠️")
    
    if len(high_corr_pairs) <= 3:
        print("├── Multicollinearity: Acceptable ✓")
    else:
        print("├── Multicollinearity: Consider feature selection ⚠️")
    
    if 10 <= df_final['READMISSION_30DAY'].mean()*100 <= 20:
        print("├── Class balance: Realistic for healthcare ✓")
    else:
        print("├── Class balance: May need resampling techniques ⚠️")
    
    print("└── Dataset ready for model development ✓")
    
    return {
        'data_quality_score': 100 - (missing_values + infinite_values),
        'feature_count': feature_count,
        'sample_size': sample_size,
        'readmission_rate': df_final['READMISSION_30DAY'].mean(),
        'memory_usage_mb': df_final.memory_usage(deep=True).sum() / 1024**2
    }

# Perform final validation
validation_results = final_feature_validation(df_final_features)

FINAL FEATURE ENGINEERING VALIDATION
Data Quality Validation:
├── Missing values: 0 (should be 0)
├── Infinite values: 0 ✓
├── Target distribution: {0: np.int64(60006), 1: np.int64(6767)}
└── Class balance: 10.1% positive class

Feature Range Validation:
├── AGE_AT_ADMISSION: [24.6, 101.7] ✓
├── LOS_CALCULATED: [1.0, 36.0] ✓
├── CHRONIC_CONDITION_COUNT: [0.0, 11.0] ✓
├── PRIOR_ADMISSIONS_365D: [0.0, 10.0] ✓
├── COMPREHENSIVE_RISK_SCORE: [0.0, 12.0] ✓

Feature Correlation Analysis:
├── High correlation pairs (>0.8): 27
    HIGH_COMORBIDITY ↔ COMORBIDITY_RISK_SCORE: 0.876
    HIGH_COMORBIDITY ↔ CHRONIC_CONDITION_COUNT: 0.827
    HIGH_COMORBIDITY ↔ COMPLEX_CARDIAC: 0.997
    CLM_ADMSN_DT ↔ ADMISSION_YEAR: 0.999
    CLM_ADMSN_DT ↔ NCH_BENE_DSCHRG_DT: 0.990

Business Logic Validation:
├── Comorbidity Risk: 14.6% vs 5.4% ✓
├── Frequent Flyer: 15.5% vs 9.8% ✓
├── Recent Admission: 14.4% vs 9.7% ✓

Model Readiness Assessment:
├── Data types: All suitable for modeling ✓
├── Feature count: 55 (o

### Block 13: Phase D Summary

In [42]:
print("\n" + "="*60)
print("PHASE D COMPLETE: FEATURE ENGINEERING")
print("="*60)

print("✅ ACCOMPLISHMENTS:")
print("├── Enhanced demographic features (age groups, race, geography)")
print("├── Advanced clinical features (LOS categories, DRG risk, diagnosis)")
print("├── Comprehensive chronic condition features")
print("├── Prior admission history analysis")
print("├── Advanced risk indicators and composite scores")
print("├── Feature validation and selection")
print("├── Data type optimization and memory efficiency")
print("└── Model-ready dataset creation")

print(f"\n📊 FINAL FEATURE DATASET STATISTICS:")
feature_count = len(df_final_features.columns) - 1
readmission_rate = df_final_features['READMISSION_30DAY'].mean() * 100
memory_usage = df_final_features.memory_usage(deep=True).sum() / 1024**2

print(f"├── Total records: {len(df_final_features):,}")
print(f"├── Engineered features: {feature_count}")
print(f"├── Target variable: READMISSION_30DAY")
print(f"├── Readmission rate: {readmission_rate:.2f}%")
print(f"├── Memory usage: {memory_usage:.1f} MB")
print(f"├── Data quality score: {validation_results['data_quality_score']:.0f}/100")
print(f"└── Missing values: {df_final_features.isnull().sum().sum()}")

print(f"\n🎯 FEATURE CATEGORIES CREATED:")
feature_categories = {
    'Demographic': len([col for col in df_final_features.columns if any(x in col.lower() for x in ['age', 'gender', 'race', 'sex'])]),
    'Clinical': len([col for col in df_final_features.columns if any(x in col.lower() for x in ['los', 'drg', 'dx', 'icd9'])]),
    'Chronic Conditions': len([col for col in df_final_features.columns if col.startswith('SP_') or 'condition' in col.lower()]),
    'Prior History': len([col for col in df_final_features.columns if 'prior' in col.lower() or 'frequent' in col.lower()]),
    'Risk Indicators': len([col for col in df_final_features.columns if 'risk' in col.lower() or 'complex' in col.lower()]),
    'Temporal': len([col for col in df_final_features.columns if any(x in col.lower() for x in ['admission', 'weekend', 'season'])])
}

for category, count in feature_categories.items():
    if count > 0:
        print(f"├── {category}: {count} features")

print(f"\n🔬 MODEL READINESS ASSESSMENT:")
if validation_results['data_quality_score'] >= 95:
    print("├── Data quality: Excellent ✓")
elif validation_results['data_quality_score'] >= 85:
    print("├── Data quality: Good ✓")
else:
    print("├── Data quality: Needs attention ⚠️")

if 20 <= feature_count <= 100:
    print("├── Feature count: Optimal range ✓")
else:
    print("├── Feature count: Consider adjustment ⚠️")

if validation_results['sample_size'] >= feature_count * 10:
    print("├── Sample size: Adequate for modeling ✓")
else:
    print("├── Sample size: May be insufficient ⚠️")

if 8 <= readmission_rate <= 20:
    print("├── Readmission rate: Realistic range ✓")
else:
    print("├── Readmission rate: Outside expected range ⚠️")

print("└── Overall assessment: Ready for modeling ✓")

print(f"\n🚀 NEXT STEPS:")
print("├── Ready for Notebook 5: Model Development")
print("├── Train multiple machine learning algorithms")
print("├── Hyperparameter tuning and optimization")
print("├── Model evaluation and comparison")
print("├── Feature importance analysis")
print("├── Business impact calculation")
print("└── Model interpretation and validation")

print(f"\n✓ Phase D Complete - Feature Engineering Successful!")
print(f"✓ Dataset saved: data/features/readmission_features_final.parquet")
print(f"✓ {feature_count} high-quality features ready for machine learning!")


PHASE D COMPLETE: FEATURE ENGINEERING
✅ ACCOMPLISHMENTS:
├── Enhanced demographic features (age groups, race, geography)
├── Advanced clinical features (LOS categories, DRG risk, diagnosis)
├── Comprehensive chronic condition features
├── Prior admission history analysis
├── Advanced risk indicators and composite scores
├── Feature validation and selection
├── Data type optimization and memory efficiency
└── Model-ready dataset creation

📊 FINAL FEATURE DATASET STATISTICS:
├── Total records: 66,773
├── Engineered features: 55
├── Target variable: READMISSION_30DAY
├── Readmission rate: 10.13%
├── Memory usage: 7.1 MB
├── Data quality score: 100/100
└── Missing values: 0

🎯 FEATURE CATEGORIES CREATED:
├── Demographic: 4 features
├── Clinical: 8 features
├── Chronic Conditions: 1 features
├── Prior History: 8 features
├── Risk Indicators: 10 features
├── Temporal: 15 features

🔬 MODEL READINESS ASSESSMENT:
├── Data quality: Excellent ✓
├── Feature count: Optimal range ✓
├── Sample size: