### Block 1: Environment Setup

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

print("Environment setup complete!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

Environment setup complete!
Pandas version: 2.2.3
NumPy version: 2.2.2


### Block 2: Load Master Datasets

In [31]:
# Load cleaned master datasets from Phase B
import os

processed_path = "data/processed/"

print("Loading master datasets from Phase B...")

# Load master datasets
print("Loading beneficiary master dataset...")
df_beneficiary = pd.read_parquet(f"{processed_path}beneficiary_master_clean.parquet")

print("Loading inpatient master dataset...")
df_inpatient = pd.read_parquet(f"{processed_path}inpatient_master_clean.parquet")

print("✓ Master datasets loaded successfully!")
print(f"├── Beneficiary Master: {len(df_beneficiary):,} records")
print(f"├── Unique Beneficiaries: {df_beneficiary['DESYNPUF_ID'].nunique():,}")
print(f"├── Inpatient Master: {len(df_inpatient):,} claims")
print(f"└── Unique Beneficiaries with Claims: {df_inpatient['DESYNPUF_ID'].nunique():,}")

Loading master datasets from Phase B...
Loading beneficiary master dataset...
Loading inpatient master dataset...
✓ Master datasets loaded successfully!
├── Beneficiary Master: 343,644 records
├── Unique Beneficiaries: 116,352
├── Inpatient Master: 66,773 claims
└── Unique Beneficiaries with Claims: 37,780


### Block 3: Date Processing and Validation

In [32]:
def process_and_validate_dates(df):
    """
    Convert integer dates to datetime and validate date logic
    """
    print("PROCESSING AND VALIDATING DATES")
    print("="*40)
    
    df_clean = df.copy()
    
    # Convert integer dates (YYYYMMDD) to datetime
    date_columns = ['CLM_ADMSN_DT', 'NCH_BENE_DSCHRG_DT', 'CLM_FROM_DT', 'CLM_THRU_DT']
    
    print("Converting date columns from YYYYMMDD to datetime...")
    for col in date_columns:
        if col in df_clean.columns:
            # Convert to datetime, handling any errors
            df_clean[col + '_DATE'] = pd.to_datetime(
                df_clean[col].astype(str), 
                format='%Y%m%d', 
                errors='coerce'
            )
            print(f"  ✓ {col} → {col}_DATE")
    
    # Validate date logic
    print(f"\nValidating date logic...")
    
    # Check admission vs discharge dates
    if 'CLM_ADMSN_DT_DATE' in df_clean.columns and 'NCH_BENE_DSCHRG_DT_DATE' in df_clean.columns:
        invalid_dates = df_clean[
            df_clean['NCH_BENE_DSCHRG_DT_DATE'] < df_clean['CLM_ADMSN_DT_DATE']
        ]
        print(f"  Records with discharge before admission: {len(invalid_dates)}")
        
        if len(invalid_dates) > 0:
            print(f"    Removing {len(invalid_dates)} invalid records...")
            df_clean = df_clean[df_clean['NCH_BENE_DSCHRG_DT_DATE'] >= df_clean['CLM_ADMSN_DT_DATE']]
    
    # Calculate length of stay in days
    if 'CLM_ADMSN_DT_DATE' in df_clean.columns and 'NCH_BENE_DSCHRG_DT_DATE' in df_clean.columns:
        df_clean['LOS_CALCULATED'] = (
            df_clean['NCH_BENE_DSCHRG_DT_DATE'] - df_clean['CLM_ADMSN_DT_DATE']
        ).dt.days + 1  # +1 because same-day stays = 1 day
        
        print(f"  ✓ Calculated length of stay for all records")
    
    # Show date range
    if 'CLM_ADMSN_DT_DATE' in df_clean.columns:
        min_date = df_clean['CLM_ADMSN_DT_DATE'].min()
        max_date = df_clean['NCH_BENE_DSCHRG_DT_DATE'].max()
        print(f"  Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")
    
    print(f"\n✓ Date processing complete")
    print(f"  Final records: {len(df_clean):,}")
    
    return df_clean

# Process dates
df_inpatient_dates = process_and_validate_dates(df_inpatient)

PROCESSING AND VALIDATING DATES
Converting date columns from YYYYMMDD to datetime...
  ✓ CLM_ADMSN_DT → CLM_ADMSN_DT_DATE
  ✓ NCH_BENE_DSCHRG_DT → NCH_BENE_DSCHRG_DT_DATE
  ✓ CLM_FROM_DT → CLM_FROM_DT_DATE
  ✓ CLM_THRU_DT → CLM_THRU_DT_DATE

Validating date logic...
  Records with discharge before admission: 0
  ✓ Calculated length of stay for all records
  Date range: 2007-11-27 to 2010-12-31

✓ Date processing complete
  Final records: 66,773


### Block 4: Define Index Admissions

In [33]:
def identify_index_admissions(df):
    """
    Identify index admissions (qualifying admissions for readmission analysis)
    Exclude deaths, transfers, and other non-qualifying discharges
    """
    print("IDENTIFYING INDEX ADMISSIONS")
    print("="*40)
    
    df_index = df.copy()
    original_count = len(df_index)
    
    print(f"Starting with {original_count:,} total admissions")
    
    # Remove records with missing critical dates
    before_date_filter = len(df_index)
    df_index = df_index[
        df_index['CLM_ADMSN_DT_DATE'].notna() & 
        df_index['NCH_BENE_DSCHRG_DT_DATE'].notna()
    ]
    after_date_filter = len(df_index)
    print(f"├── After removing missing dates: {after_date_filter:,} ({before_date_filter - after_date_filter:,} removed)")
    
    # Remove same-day stays with LOS = 0 (likely not real admissions)
    before_los_filter = len(df_index)
    df_index = df_index[df_index['LOS_CALCULATED'] >= 1]
    after_los_filter = len(df_index)
    print(f"├── After removing 0-day stays: {after_los_filter:,} ({before_los_filter - after_los_filter:,} removed)")
    
    # Remove extremely long stays (likely data errors or transfers)
    before_long_stay = len(df_index)
    df_index = df_index[df_index['LOS_CALCULATED'] <= 365]  # Max 1 year stay
    after_long_stay = len(df_index)
    print(f"├── After removing >365 day stays: {after_long_stay:,} ({before_long_stay - after_long_stay:,} removed)")
    
    # Check for potential in-hospital deaths (using beneficiary death data)
    # We'll cross-reference with beneficiary death dates later
    
    # Sort by beneficiary and discharge date for chronological analysis
    df_index = df_index.sort_values(['DESYNPUF_ID', 'NCH_BENE_DSCHRG_DT_DATE'])
    
    # Add admission sequence number for each beneficiary
    df_index['ADMISSION_SEQ'] = df_index.groupby('DESYNPUF_ID').cumcount() + 1
    
    print(f"\n✓ Index admission identification complete")
    print(f"  Final index admissions: {len(df_index):,}")
    print(f"  Exclusion rate: {(original_count - len(df_index))/original_count*100:.1f}%")
    
    # Summary statistics
    print(f"\nINDEX ADMISSION STATISTICS:")
    print(f"├── Unique beneficiaries: {df_index['DESYNPUF_ID'].nunique():,}")
    print(f"├── Average admissions per beneficiary: {len(df_index)/df_index['DESYNPUF_ID'].nunique():.1f}")
    print(f"├── Median length of stay: {df_index['LOS_CALCULATED'].median():.1f} days")
    print(f"└── Mean length of stay: {df_index['LOS_CALCULATED'].mean():.1f} days")
    
    return df_index

# Identify index admissions
df_index_admissions = identify_index_admissions(df_inpatient_dates)

IDENTIFYING INDEX ADMISSIONS
Starting with 66,773 total admissions
├── After removing missing dates: 66,773 (0 removed)
├── After removing 0-day stays: 66,773 (0 removed)
├── After removing >365 day stays: 66,773 (0 removed)

✓ Index admission identification complete
  Final index admissions: 66,773
  Exclusion rate: 0.0%

INDEX ADMISSION STATISTICS:
├── Unique beneficiaries: 37,780
├── Average admissions per beneficiary: 1.8
├── Median length of stay: 5.0 days
└── Mean length of stay: 6.7 days


### Block 5: Handle In-Hospital Deaths

In [34]:
def identify_and_exclude_deaths(df_admissions, df_beneficiaries):
    """
    Identify and exclude in-hospital deaths from index admissions
    """
    print("IDENTIFYING AND EXCLUDING IN-HOSPITAL DEATHS")
    print("="*45)
    
    df_clean = df_admissions.copy()
    
    # Get death dates from beneficiary data
    death_data = df_beneficiaries[df_beneficiaries['BENE_DEATH_DT'].notna()][
        ['DESYNPUF_ID', 'BENE_DEATH_DT', 'BENE_YEAR']
    ].copy()
    
    if len(death_data) > 0:
        # Convert death dates to datetime
        death_data['DEATH_DATE'] = pd.to_datetime(
            death_data['BENE_DEATH_DT'].astype(str), 
            format='%Y%m%d', 
            errors='coerce'
        )
        
        # Get the most recent death date for each beneficiary (in case of duplicates)
        death_dates = death_data.groupby('DESYNPUF_ID')['DEATH_DATE'].max().reset_index()
        
        print(f"Found {len(death_dates):,} beneficiaries with death records")
        
        # Merge death dates with admissions
        df_with_deaths = df_clean.merge(
            death_dates, 
            on='DESYNPUF_ID', 
            how='left'
        )
        
        # Identify in-hospital deaths (death during admission)
        in_hospital_deaths = df_with_deaths[
            (df_with_deaths['DEATH_DATE'].notna()) &
            (df_with_deaths['DEATH_DATE'] >= df_with_deaths['CLM_ADMSN_DT_DATE']) &
            (df_with_deaths['DEATH_DATE'] <= df_with_deaths['NCH_BENE_DSCHRG_DT_DATE'])
        ]
        
        print(f"├── In-hospital deaths identified: {len(in_hospital_deaths):,}")
        
        # Identify deaths within 30 days of discharge (not readmission candidates)
        death_within_30_days = df_with_deaths[
            (df_with_deaths['DEATH_DATE'].notna()) &
            (df_with_deaths['DEATH_DATE'] > df_with_deaths['NCH_BENE_DSCHRG_DT_DATE']) &
            (df_with_deaths['DEATH_DATE'] <= df_with_deaths['NCH_BENE_DSCHRG_DT_DATE'] + pd.Timedelta(days=30))
        ]
        
        print(f"├── Deaths within 30 days post-discharge: {len(death_within_30_days):,}")
        
        # Create flags for analysis
        df_with_deaths['IN_HOSPITAL_DEATH'] = (
            (df_with_deaths['DEATH_DATE'].notna()) &
            (df_with_deaths['DEATH_DATE'] >= df_with_deaths['CLM_ADMSN_DT_DATE']) &
            (df_with_deaths['DEATH_DATE'] <= df_with_deaths['NCH_BENE_DSCHRG_DT_DATE'])
        )
        
        df_with_deaths['DEATH_WITHIN_30_DAYS'] = (
            (df_with_deaths['DEATH_DATE'].notna()) &
            (df_with_deaths['DEATH_DATE'] > df_with_deaths['NCH_BENE_DSCHRG_DT_DATE']) &
            (df_with_deaths['DEATH_DATE'] <= df_with_deaths['NCH_BENE_DSCHRG_DT_DATE'] + pd.Timedelta(days=30))
        )
        
        # Exclude in-hospital deaths from readmission analysis
        df_eligible = df_with_deaths[~df_with_deaths['IN_HOSPITAL_DEATH']].copy()
        
        print(f"├── Records excluded (in-hospital deaths): {len(in_hospital_deaths):,}")
        print(f"└── Eligible admissions remaining: {len(df_eligible):,}")
        
    else:
        print("No death records found in beneficiary data")
        df_eligible = df_clean.copy()
        df_eligible['IN_HOSPITAL_DEATH'] = False
        df_eligible['DEATH_WITHIN_30_DAYS'] = False
        df_eligible['DEATH_DATE'] = pd.NaT
    
    return df_eligible

# Handle deaths
df_eligible_admissions = identify_and_exclude_deaths(df_index_admissions, df_beneficiary)

IDENTIFYING AND EXCLUDING IN-HOSPITAL DEATHS
Found 5,461 beneficiaries with death records
├── In-hospital deaths identified: 0
├── Deaths within 30 days post-discharge: 0
├── Records excluded (in-hospital deaths): 0
└── Eligible admissions remaining: 66,773


### Block 6: Create 30-Day Readmission Windows

In [35]:
def create_readmission_windows(df):
    """
    Create 30-day readmission observation windows for each eligible admission
    """
    print("CREATING 30-DAY READMISSION WINDOWS")
    print("="*40)
    
    df_windows = df.copy()
    
    # Calculate 30-day window end date
    df_windows['WINDOW_START_DATE'] = df_windows['NCH_BENE_DSCHRG_DT_DATE']
    df_windows['WINDOW_END_DATE'] = df_windows['NCH_BENE_DSCHRG_DT_DATE'] + pd.Timedelta(days=30)
    
    print(f"Created 30-day windows for {len(df_windows):,} eligible admissions")
    
    # Handle edge cases for observation windows
    
    # 1. Check for windows that extend beyond data collection period
    max_data_date = df_windows['NCH_BENE_DSCHRG_DT_DATE'].max()
    cutoff_date = max_data_date - pd.Timedelta(days=30)
    
    print(f"├── Data collection ends: {max_data_date.strftime('%Y-%m-%d')}")
    print(f"├── 30-day window cutoff: {cutoff_date.strftime('%Y-%m-%d')}")
    
    # Flag admissions with incomplete observation windows
    df_windows['INCOMPLETE_WINDOW'] = df_windows['WINDOW_END_DATE'] > max_data_date
    incomplete_windows = df_windows['INCOMPLETE_WINDOW'].sum()
    
    print(f"├── Admissions with incomplete 30-day windows: {incomplete_windows:,}")
    
    # 2. Handle deaths within observation window
    if 'DEATH_DATE' in df_windows.columns:
        df_windows['DEATH_IN_WINDOW'] = (
            df_windows['DEATH_DATE'].notna() &
            (df_windows['DEATH_DATE'] > df_windows['WINDOW_START_DATE']) &
            (df_windows['DEATH_DATE'] <= df_windows['WINDOW_END_DATE'])
        )
        
        deaths_in_window = df_windows['DEATH_IN_WINDOW'].sum()
        print(f"├── Deaths within 30-day window: {deaths_in_window:,}")
        
        # Adjust window end date for deaths (can't be readmitted after death)
        death_mask = df_windows['DEATH_IN_WINDOW']
        df_windows.loc[death_mask, 'WINDOW_END_DATE'] = df_windows.loc[death_mask, 'DEATH_DATE']
    
    # 3. Calculate actual observation days for each admission
    df_windows['OBSERVATION_DAYS'] = (
        df_windows['WINDOW_END_DATE'] - df_windows['WINDOW_START_DATE']
    ).dt.days
    
    print(f"\nOBSERVATION WINDOW STATISTICS:")
    print(f"├── Mean observation days: {df_windows['OBSERVATION_DAYS'].mean():.1f}")
    print(f"├── Median observation days: {df_windows['OBSERVATION_DAYS'].median():.1f}")
    print(f"├── Admissions with full 30-day window: {(df_windows['OBSERVATION_DAYS'] == 30).sum():,}")
    print(f"└── Admissions with partial window: {(df_windows['OBSERVATION_DAYS'] < 30).sum():,}")
    
    return df_windows

# Create readmission windows
df_with_windows = create_readmission_windows(df_eligible_admissions)

CREATING 30-DAY READMISSION WINDOWS
Created 30-day windows for 66,773 eligible admissions
├── Data collection ends: 2010-12-31
├── 30-day window cutoff: 2010-12-01
├── Admissions with incomplete 30-day windows: 313
├── Deaths within 30-day window: 0

OBSERVATION WINDOW STATISTICS:
├── Mean observation days: 30.0
├── Median observation days: 30.0
├── Admissions with full 30-day window: 66,773
└── Admissions with partial window: 0


### Block 7: Identify Readmission Events

In [36]:
def identify_readmissions(df_with_windows, df_all_admissions):
    """
    Identify 30-day readmission events by matching subsequent admissions
    """
    print("IDENTIFYING 30-DAY READMISSION EVENTS")
    print("="*40)
    
    # Prepare datasets
    index_admissions = df_with_windows.copy()
    all_admissions = df_all_admissions.copy()
    
    # Ensure all admissions have datetime dates
    if 'CLM_ADMSN_DT_DATE' not in all_admissions.columns:
        all_admissions['CLM_ADMSN_DT_DATE'] = pd.to_datetime(
            all_admissions['CLM_ADMSN_DT'].astype(str), 
            format='%Y%m%d', 
            errors='coerce'
        )
    
    print(f"Analyzing {len(index_admissions):,} index admissions for readmissions...")
    
    # Initialize readmission flags
    index_admissions['HAS_READMISSION'] = False
    index_admissions['READMISSION_DATE'] = pd.NaT
    index_admissions['DAYS_TO_READMISSION'] = np.nan
    index_admissions['READMISSION_CLM_ID'] = np.nan
    
    # Process in batches for memory efficiency
    batch_size = 10000
    readmission_count = 0
    
    for i in range(0, len(index_admissions), batch_size):
        batch_end = min(i + batch_size, len(index_admissions))
        batch = index_admissions.iloc[i:batch_end].copy()
        
        print(f"  Processing batch {i//batch_size + 1}: records {i:,} to {batch_end:,}")
        
        for idx, admission in batch.iterrows():
            beneficiary_id = admission['DESYNPUF_ID']
            window_start = admission['WINDOW_START_DATE']
            window_end = admission['WINDOW_END_DATE']
            index_claim_id = admission['CLM_ID']
            
            # Find all subsequent admissions for this beneficiary
            subsequent_admissions = all_admissions[
                (all_admissions['DESYNPUF_ID'] == beneficiary_id) &
                (all_admissions['CLM_ADMSN_DT_DATE'] > window_start) &
                (all_admissions['CLM_ADMSN_DT_DATE'] <= window_end) &
                (all_admissions['CLM_ID'] != index_claim_id)  # Exclude same admission
            ]
            
            if len(subsequent_admissions) > 0:
                # Take the first (earliest) readmission
                first_readmission = subsequent_admissions.loc[
                    subsequent_admissions['CLM_ADMSN_DT_DATE'].idxmin()
                ]
                
                # Calculate days to readmission
                days_to_readmission = (
                    first_readmission['CLM_ADMSN_DT_DATE'] - window_start
                ).days
                
                # Update index admission record
                index_admissions.loc[idx, 'HAS_READMISSION'] = True
                index_admissions.loc[idx, 'READMISSION_DATE'] = first_readmission['CLM_ADMSN_DT_DATE']
                index_admissions.loc[idx, 'DAYS_TO_READMISSION'] = days_to_readmission
                index_admissions.loc[idx, 'READMISSION_CLM_ID'] = first_readmission['CLM_ID']
                
                readmission_count += 1
    
    print(f"\n✓ Readmission identification complete")
    print(f"  Total readmissions found: {readmission_count:,}")
    
    return index_admissions

# Identify readmissions
df_final_cohort = identify_readmissions(df_with_windows, df_inpatient_dates)

IDENTIFYING 30-DAY READMISSION EVENTS
Analyzing 66,773 index admissions for readmissions...
  Processing batch 1: records 0 to 10,000
  Processing batch 2: records 10,000 to 20,000
  Processing batch 3: records 20,000 to 30,000
  Processing batch 4: records 30,000 to 40,000
  Processing batch 5: records 40,000 to 50,000
  Processing batch 6: records 50,000 to 60,000
  Processing batch 7: records 60,000 to 66,773

✓ Readmission identification complete
  Total readmissions found: 6,767


### Block 8: Calculate Readmission Statistics

In [37]:
def calculate_readmission_statistics(df):
    """
    Calculate comprehensive readmission statistics and rates
    """
    print("CALCULATING READMISSION STATISTICS")
    print("="*40)
    
    # Basic readmission metrics
    total_admissions = len(df)
    total_readmissions = df['HAS_READMISSION'].sum()
    readmission_rate = total_readmissions / total_admissions * 100
    
    print(f"OVERALL READMISSION STATISTICS:")
    print(f"├── Total eligible admissions: {total_admissions:,}")
    print(f"├── Readmissions within 30 days: {total_readmissions:,}")
    print(f"├── Overall readmission rate: {readmission_rate:.2f}%")
    print(f"└── Unique beneficiaries: {df['DESYNPUF_ID'].nunique():,}")
    
    # Readmission timing analysis
    readmitted = df[df['HAS_READMISSION'] == True]
    
    if len(readmitted) > 0:
        print(f"\nREADMISSION TIMING ANALYSIS:")
        print(f"├── Mean days to readmission: {readmitted['DAYS_TO_READMISSION'].mean():.1f}")
        print(f"├── Median days to readmission: {readmitted['DAYS_TO_READMISSION'].median():.1f}")
        print(f"├── Readmissions within 7 days: {(readmitted['DAYS_TO_READMISSION'] <= 7).sum():,} ({(readmitted['DAYS_TO_READMISSION'] <= 7).mean()*100:.1f}%)")
        print(f"├── Readmissions within 15 days: {(readmitted['DAYS_TO_READMISSION'] <= 15).sum():,} ({(readmitted['DAYS_TO_READMISSION'] <= 15).mean()*100:.1f}%)")
        print(f"└── Readmissions 16-30 days: {(readmitted['DAYS_TO_READMISSION'] > 15).sum():,} ({(readmitted['DAYS_TO_READMISSION'] > 15).mean()*100:.1f}%)")
    
    # Readmission rate by year
    if 'discharge_year' in df.columns:
        yearly_rates = df.groupby('discharge_year').agg({
            'HAS_READMISSION': ['count', 'sum', 'mean']
        }).round(3)
        yearly_rates.columns = ['Total_Admissions', 'Readmissions', 'Rate']
        yearly_rates['Rate_Percent'] = yearly_rates['Rate'] * 100
        
        print(f"\nREADMISSION RATES BY YEAR:")
        for year, row in yearly_rates.iterrows():
            print(f"├── {year}: {row['Rate_Percent']:.2f}% ({row['Readmissions']:.0f}/{row['Total_Admissions']:.0f})")
    
    # Length of stay analysis
    print(f"\nLENGTH OF STAY vs READMISSION:")
    los_groups = pd.cut(df['LOS_CALCULATED'], bins=[0, 1, 3, 7, 14, 30, 999], 
                       labels=['1 day', '2-3 days', '4-7 days', '8-14 days', '15-30 days', '>30 days'])
    los_readmission = df.groupby(los_groups)['HAS_READMISSION'].agg(['count', 'sum', 'mean']).round(3)
    los_readmission.columns = ['Total', 'Readmissions', 'Rate']
    los_readmission['Rate_Percent'] = los_readmission['Rate'] * 100
    
    for los_group, row in los_readmission.iterrows():
        print(f"├── {los_group}: {row['Rate_Percent']:.1f}% ({row['Readmissions']:.0f}/{row['Total']:.0f})")
    
    # Patient-level analysis (beneficiaries with multiple admissions)
    patient_stats = df.groupby('DESYNPUF_ID').agg({
        'HAS_READMISSION': ['count', 'sum'],
        'ADMISSION_SEQ': 'max'
    })
    patient_stats.columns = ['Total_Admissions', 'Total_Readmissions', 'Max_Admission_Seq']
    
    multiple_admissions = patient_stats[patient_stats['Total_Admissions'] > 1]
    
    print(f"\nPATIENT-LEVEL STATISTICS:")
    print(f"├── Patients with multiple admissions: {len(multiple_admissions):,}")
    print(f"├── Patients with at least one readmission: {(patient_stats['Total_Readmissions'] > 0).sum():,}")
    print(f"└── Average admissions per patient: {patient_stats['Total_Admissions'].mean():.1f}")
    
    return {
        'total_admissions': total_admissions,
        'total_readmissions': total_readmissions,
        'readmission_rate': readmission_rate,
        'unique_patients': df['DESYNPUF_ID'].nunique()
    }

# Calculate statistics
readmission_stats = calculate_readmission_statistics(df_final_cohort)

CALCULATING READMISSION STATISTICS
OVERALL READMISSION STATISTICS:
├── Total eligible admissions: 66,773
├── Readmissions within 30 days: 6,767
├── Overall readmission rate: 10.13%
└── Unique beneficiaries: 37,780

READMISSION TIMING ANALYSIS:
├── Mean days to readmission: 14.1
├── Median days to readmission: 13.0
├── Readmissions within 7 days: 1,955 (28.9%)
├── Readmissions within 15 days: 3,858 (57.0%)
└── Readmissions 16-30 days: 2,909 (43.0%)

READMISSION RATES BY YEAR:
├── 2008.0: 15.50% (4249/27496)
├── 2009.0: 7.60% (1925/25293)
├── 2010.0: 4.20% (586/13916)

LENGTH OF STAY vs READMISSION:
├── 1 day: 8.7% (83/959)
├── 2-3 days: 9.2% (1609/17574)
├── 4-7 days: 10.0% (3013/30074)
├── 8-14 days: 11.0% (1406/12800)
├── 15-30 days: 11.7% (519/4435)
├── >30 days: 14.7% (137/931)

PATIENT-LEVEL STATISTICS:
├── Patients with multiple admissions: 15,168
├── Patients with at least one readmission: 5,117
└── Average admissions per patient: 1.8


### Block 9: Validate Business Logic

In [38]:
def validate_readmission_logic(df):
    """
    Validate the readmission identification logic with detailed checks
    """
    print("VALIDATING READMISSION BUSINESS LOGIC")
    print("="*40)
    
    # 1. Check for logical consistency
    print("LOGICAL CONSISTENCY CHECKS:")
    
    # All readmissions should have valid dates
    readmitted = df[df['HAS_READMISSION'] == True]
    invalid_readmission_dates = readmitted[readmitted['READMISSION_DATE'].isna()]
    print(f"├── Readmissions with missing dates: {len(invalid_readmission_dates)} (should be 0)")
    
    # Days to readmission should be between 1-30
    if len(readmitted) > 0:
        invalid_days = readmitted[
            (readmitted['DAYS_TO_READMISSION'] < 1) | 
            (readmitted['DAYS_TO_READMISSION'] > 30)
        ]
        print(f"├── Readmissions outside 1-30 day window: {len(invalid_days)} (should be 0)")
        
        # Check that readmission dates are within observation windows
        outside_window = readmitted[
            (readmitted['READMISSION_DATE'] <= readmitted['WINDOW_START_DATE']) |
            (readmitted['READMISSION_DATE'] > readmitted['WINDOW_END_DATE'])
        ]
        print(f"├── Readmissions outside observation window: {len(outside_window)} (should be 0)")
    
    # 2. Sample validation - manually check a few cases
    print(f"\nSAMPLE VALIDATION:")
    
    # Show examples of readmissions
    if len(readmitted) > 0:
        sample_readmissions = readmitted[['DESYNPUF_ID', 'CLM_ID', 'NCH_BENE_DSCHRG_DT_DATE', 
                                        'READMISSION_DATE', 'DAYS_TO_READMISSION']].head(3)
        print("Sample readmission cases:")
        for idx, row in sample_readmissions.iterrows():
            print(f"  Patient {row['DESYNPUF_ID']}: Discharged {row['NCH_BENE_DSCHRG_DT_DATE'].strftime('%Y-%m-%d')}, "
                  f"Readmitted {row['READMISSION_DATE'].strftime('%Y-%m-%d')} ({row['DAYS_TO_READMISSION']:.0f} days)")
    
    # Show examples of non-readmissions
    non_readmitted = df[df['HAS_READMISSION'] == False].sample(3)
    print("Sample non-readmission cases:")
    for idx, row in non_readmitted.iterrows():
        window_end = row['WINDOW_END_DATE'].strftime('%Y-%m-%d')
        discharge = row['NCH_BENE_DSCHRG_DT_DATE'].strftime('%Y-%m-%d')
        print(f"  Patient {row['DESYNPUF_ID']}: Discharged {discharge}, "
              f"30-day window ends {window_end}, No readmission")
    
    # 3. Cross-validation with original data
    print(f"\nCROSS-VALIDATION:")
    
    # Check total admission counts
    original_eligible = len(df)
    flagged_readmissions = df['HAS_READMISSION'].sum()
    
    print(f"├── Eligible index admissions: {original_eligible:,}")
    print(f"├── Flagged as readmissions: {flagged_readmissions:,}")
    print(f"├── Flagged as non-readmissions: {original_eligible - flagged_readmissions:,}")
    print(f"└── Total should equal eligible: {original_eligible == len(df)} ✓")
    
    # 4. Compare with Medicare benchmarks
    print(f"\nBENCHMARK COMPARISON:")
    current_rate = (flagged_readmissions / original_eligible) * 100
    print(f"├── Our calculated rate: {current_rate:.2f}%")
    print(f"├── Typical Medicare rate: 12-15%")
    
    if 8 <= current_rate <= 20:
        print(f"└── Rate is within reasonable range ✓")
    else:
        print(f"└── Rate may need investigation ⚠️")
    
    # 5. Check for edge cases
    print(f"\nEDGE CASE ANALYSIS:")
    
    # Same-day readmissions (should be very rare)
    same_day = readmitted[readmitted['DAYS_TO_READMISSION'] == 0] if len(readmitted) > 0 else pd.DataFrame()
    print(f"├── Same-day readmissions: {len(same_day)} (should be rare)")
    
    # Very early readmissions (within 3 days)
    early_readmissions = readmitted[readmitted['DAYS_TO_READMISSION'] <= 3] if len(readmitted) > 0 else pd.DataFrame()
    print(f"├── Readmissions ≤3 days: {len(early_readmissions)} ({len(early_readmissions)/len(readmitted)*100:.1f}% of readmissions)" if len(readmitted) > 0 else "├── No readmissions to analyze")
   
    # Late readmissions (25-30 days)
    late_readmissions = readmitted[readmitted['DAYS_TO_READMISSION'] >= 25] if len(readmitted) > 0 else pd.DataFrame()
    print(f"├── Readmissions 25-30 days: {len(late_readmissions)} ({len(late_readmissions)/len(readmitted)*100:.1f}% of readmissions)" if len(readmitted) > 0 else "├── No readmissions to analyze")
   
    # Multiple readmissions for same patient
    if len(readmitted) > 0:
       multiple_readmissions = readmitted.groupby('DESYNPUF_ID').size()
       patients_multiple = (multiple_readmissions > 1).sum()
       print(f"└── Patients with multiple index admissions leading to readmission: {patients_multiple}")
   
    print(f"\n✓ Business logic validation complete")

# Validate logic
validate_readmission_logic(df_final_cohort)

VALIDATING READMISSION BUSINESS LOGIC
LOGICAL CONSISTENCY CHECKS:
├── Readmissions with missing dates: 0 (should be 0)
├── Readmissions outside 1-30 day window: 0 (should be 0)
├── Readmissions outside observation window: 0 (should be 0)

SAMPLE VALIDATION:
Sample readmission cases:
  Patient 00016F745862898F: Discharged 2009-09-02, Readmitted 2009-09-17 (15 days)
  Patient 0007F12A492FD25D: Discharged 2010-06-06, Readmitted 2010-06-16 (10 days)
  Patient 0007F12A492FD25D: Discharged 2010-06-12, Readmitted 2010-06-16 (4 days)
Sample non-readmission cases:
  Patient 2144DA6807B3F504: Discharged 2010-01-27, 30-day window ends 2010-02-26, No readmission
  Patient 7D0B7AAB91F6448A: Discharged 2009-10-02, 30-day window ends 2009-11-01, No readmission
  Patient 0B54CE74FA91FB83: Discharged 2008-12-19, 30-day window ends 2009-01-18, No readmission

CROSS-VALIDATION:
├── Eligible index admissions: 66,773
├── Flagged as readmissions: 6,767
├── Flagged as non-readmissions: 60,006
└── Total shoul

### Block 10: Create Target Variable Dataset

In [42]:
def create_target_variable_dataset(df_cohort, df_beneficiaries):
    """
    Create final dataset with target variable and key features for modeling
    """
    print("CREATING TARGET VARIABLE DATASET")
    print("="*40)
    
    # Start with cohort and select key columns
    target_df = df_cohort[[
        'DESYNPUF_ID', 'CLM_ID', 'CLM_ADMSN_DT', 'CLM_ADMSN_DT_DATE', 
        'NCH_BENE_DSCHRG_DT', 'NCH_BENE_DSCHRG_DT_DATE',
        'LOS_CALCULATED', 'CLM_DRG_CD', 'ICD9_DGNS_CD_1',
        'HAS_READMISSION', 'DAYS_TO_READMISSION', 'READMISSION_DATE',
        'ADMISSION_SEQ', 'IN_HOSPITAL_DEATH', 'DEATH_WITHIN_30_DAYS',
        'OBSERVATION_DAYS', 'discharge_year'
    ]].copy()
    
    print(f"Starting with {len(target_df):,} index admissions")
    
    # Create the binary target variable (30-day readmission)
    target_df['READMISSION_30DAY'] = target_df['HAS_READMISSION'].astype(int)
    
    # Add year and month features from admission date
    target_df['ADMISSION_YEAR'] = target_df['CLM_ADMSN_DT_DATE'].dt.year
    target_df['ADMISSION_MONTH'] = target_df['CLM_ADMSN_DT_DATE'].dt.month
    target_df['ADMISSION_QUARTER'] = target_df['CLM_ADMSN_DT_DATE'].dt.quarter
    target_df['ADMISSION_DAY_OF_WEEK'] = target_df['CLM_ADMSN_DT_DATE'].dt.dayofweek
    
    # Add discharge year and month
    target_df['DISCHARGE_YEAR'] = target_df['NCH_BENE_DSCHRG_DT_DATE'].dt.year
    target_df['DISCHARGE_MONTH'] = target_df['NCH_BENE_DSCHRG_DT_DATE'].dt.month
    
    # Merge with beneficiary data to get demographics and chronic conditions
    # Use the beneficiary data from the year of admission
    print("Merging with beneficiary demographics and chronic conditions...")
    
    beneficiary_features = df_beneficiaries[[
        'DESYNPUF_ID', 'BENE_YEAR', 'BENE_BIRTH_DT', 'BENE_SEX_IDENT_CD', 
        'BENE_RACE_CD', 'BENE_ESRD_IND', 'SP_STATE_CODE',
        'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
        'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 
        'SP_RA_OA', 'SP_STRKETIA'
    ]].copy()
    
    # Merge with beneficiary data based on admission year
    target_with_demo = target_df.merge(
        beneficiary_features,
        left_on=['DESYNPUF_ID', 'ADMISSION_YEAR'],
        right_on=['DESYNPUF_ID', 'BENE_YEAR'],
        how='left'
    )
    
    merged_count = len(target_with_demo[target_with_demo['BENE_YEAR'].notna()])
    print(f"  ✓ Successfully merged {merged_count:,} records with beneficiary data")
    
    # Calculate age at admission - CORRECTED VERSION
    if 'BENE_BIRTH_DT' in target_with_demo.columns:
        print("Calculating age at admission...")
        
        # Handle birth dates as floats and convert to integers first
        birth_date_mask = target_with_demo['BENE_BIRTH_DT'].notna()
        print(f"  Records with birth dates: {birth_date_mask.sum():,}")
        
        if birth_date_mask.sum() > 0:
            # Convert float birth dates to integers, then to datetime
            target_with_demo.loc[birth_date_mask, 'BIRTH_DATE'] = pd.to_datetime(
                target_with_demo.loc[birth_date_mask, 'BENE_BIRTH_DT'].astype(int).astype(str), 
                format='%Y%m%d', 
                errors='coerce'
            )
            
            # Calculate age only for valid birth dates and admission dates
            valid_dates_mask = (
                target_with_demo['BIRTH_DATE'].notna() & 
                target_with_demo['CLM_ADMSN_DT_DATE'].notna()
            )
            
            target_with_demo.loc[valid_dates_mask, 'AGE_AT_ADMISSION'] = (
                target_with_demo.loc[valid_dates_mask, 'CLM_ADMSN_DT_DATE'] - 
                target_with_demo.loc[valid_dates_mask, 'BIRTH_DATE']
            ).dt.days / 365.25
            
            successful_age_calc = target_with_demo['AGE_AT_ADMISSION'].notna().sum()
            print(f"  ✓ Successfully calculated age for {successful_age_calc:,} records")
            
            # Check age ranges for validation
            if successful_age_calc > 0:
                min_age = target_with_demo['AGE_AT_ADMISSION'].min()
                max_age = target_with_demo['AGE_AT_ADMISSION'].max()
                mean_age = target_with_demo['AGE_AT_ADMISSION'].mean()
                print(f"  ✓ Age range: {min_age:.1f} to {max_age:.1f} years (mean: {mean_age:.1f})")
                
                # Sample age calculation verification
                sample_ages = target_with_demo[target_with_demo['AGE_AT_ADMISSION'].notna()][
                    ['DESYNPUF_ID', 'BENE_BIRTH_DT', 'CLM_ADMSN_DT_DATE', 'AGE_AT_ADMISSION']
                ].head(3)
                print(f"  Sample age calculations:")
                for _, row in sample_ages.iterrows():
                    print(f"    Born {int(row['BENE_BIRTH_DT'])}, admitted {row['CLM_ADMSN_DT_DATE'].strftime('%Y-%m-%d')}, age {row['AGE_AT_ADMISSION']:.1f}")
            else:
                print("  ⚠️ No successful age calculations")
        else:
            print("  ⚠️ No valid birth dates found")
            target_with_demo['AGE_AT_ADMISSION'] = np.nan
            target_with_demo['BIRTH_DATE'] = pd.NaT
    
    # Create chronic condition count
    chronic_conditions = ['SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
                         'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 
                         'SP_RA_OA', 'SP_STRKETIA']
    
    available_conditions = [col for col in chronic_conditions if col in target_with_demo.columns]
    if available_conditions:
        target_with_demo['CHRONIC_CONDITION_COUNT'] = target_with_demo[available_conditions].sum(axis=1)
        print(f"  ✓ Created chronic condition count from {len(available_conditions)} conditions")
    
    # Add weekend admission flag
    target_with_demo['WEEKEND_ADMISSION'] = target_with_demo['ADMISSION_DAY_OF_WEEK'].isin([5, 6])
    
    # Create season variable
    season_map = {12: 'Winter', 1: 'Winter', 2: 'Winter',
                  3: 'Spring', 4: 'Spring', 5: 'Spring',
                  6: 'Summer', 7: 'Summer', 8: 'Summer',
                  9: 'Fall', 10: 'Fall', 11: 'Fall'}
    target_with_demo['ADMISSION_SEASON'] = target_with_demo['ADMISSION_MONTH'].map(season_map)
    
    print(f"\n✓ Target variable dataset created")
    print(f"  Final records: {len(target_with_demo):,}")
    print(f"  Features available: {len(target_with_demo.columns)} columns")
    
    return target_with_demo

# Create target dataset
df_target_dataset = create_target_variable_dataset(df_final_cohort, df_beneficiary)

CREATING TARGET VARIABLE DATASET
Starting with 66,773 index admissions
Merging with beneficiary demographics and chronic conditions...
  ✓ Successfully merged 66,547 records with beneficiary data
Calculating age at admission...
  Records with birth dates: 66,547
  ✓ Successfully calculated age for 66,547 records
  ✓ Age range: 24.6 to 101.7 years (mean: 73.8)
  Sample age calculations:
    Born 19230501, admitted 2010-03-12, age 86.9
    Born 19430101, admitted 2009-04-12, age 66.3
    Born 19430101, admitted 2009-08-31, age 66.7
  ✓ Created chronic condition count from 11 conditions

✓ Target variable dataset created
  Final records: 66,773
  Features available: 46 columns


### checking block

In [43]:
# Debug age calculation issue
print("DEBUGGING AGE CALCULATION:")
print("="*30)

# Check the structure of key columns
print("Sample data:")
print(df_target_dataset[['DESYNPUF_ID', 'BENE_BIRTH_DT', 'CLM_ADMSN_DT_DATE', 'AGE_AT_ADMISSION']].head(10))

print(f"\nData types:")
print(df_target_dataset[['BENE_BIRTH_DT', 'CLM_ADMSN_DT_DATE', 'AGE_AT_ADMISSION']].dtypes)

print(f"\nMissing values:")
print(df_target_dataset[['BENE_BIRTH_DT', 'CLM_ADMSN_DT_DATE', 'AGE_AT_ADMISSION']].isnull().sum())

print(f"\nUnique birth date values (first 10):")
print(df_target_dataset['BENE_BIRTH_DT'].dropna().unique()[:10])

DEBUGGING AGE CALCULATION:
Sample data:
        DESYNPUF_ID  BENE_BIRTH_DT CLM_ADMSN_DT_DATE  AGE_AT_ADMISSION
0  00013D2EFD8E45D1     19230501.0        2010-03-12         86.863792
1  00016F745862898F     19430101.0        2009-04-12         66.277892
2  00016F745862898F     19430101.0        2009-08-31         66.663929
3  00016F745862898F     19430101.0        2009-09-17         66.710472
4  00016F745862898F     19430101.0        2010-06-26         67.482546
5  00052705243EA128     19340501.0        2008-09-12         74.368241
6  0007F12A492FD25D     19190901.0        2008-09-19         89.051335
7  0007F12A492FD25D     19190901.0        2010-06-02         90.751540
8  0007F12A492FD25D     19190901.0        2010-05-22         90.721424
9  0007F12A492FD25D     19190901.0        2010-06-16         90.789870

Data types:
BENE_BIRTH_DT               float64
CLM_ADMSN_DT_DATE    datetime64[ns]
AGE_AT_ADMISSION            float64
dtype: object

Missing values:
BENE_BIRTH_DT        226
CL

### Block 11: Final Target Variable Validation

In [44]:
def validate_target_variable(df_target):
    """
    Final validation of target variable and dataset quality
    """
    print("FINAL TARGET VARIABLE VALIDATION")
    print("="*40)
    
    # 1. Target variable distribution
    target_distribution = df_target['READMISSION_30DAY'].value_counts().sort_index()
    total_records = len(df_target)
    
    print("TARGET VARIABLE DISTRIBUTION:")
    print(f"├── No Readmission (0): {target_distribution[0]:,} ({target_distribution[0]/total_records*100:.1f}%)")
    print(f"├── Readmission (1): {target_distribution[1]:,} ({target_distribution[1]/total_records*100:.1f}%)")
    print(f"└── Total Records: {total_records:,}")
    
    # 2. Class balance assessment
    minority_class_pct = min(target_distribution) / total_records * 100
    print(f"\nCLASS BALANCE:")
    print(f"├── Minority class percentage: {minority_class_pct:.1f}%")
    
    if minority_class_pct >= 10:
        print(f"└── Class balance is acceptable for modeling ✓")
    elif minority_class_pct >= 5:
        print(f"└── Moderate class imbalance - consider resampling techniques")
    else:
        print(f"└── Severe class imbalance - resampling strongly recommended")
    
    # 3. Data completeness for key modeling features
    print(f"\nKEY FEATURE COMPLETENESS:")
    
    key_features = ['AGE_AT_ADMISSION', 'BENE_SEX_IDENT_CD', 'LOS_CALCULATED', 
                   'CHRONIC_CONDITION_COUNT', 'CLM_DRG_CD']
    
    for feature in key_features:
        if feature in df_target.columns:
            completeness = (1 - df_target[feature].isnull().mean()) * 100
            print(f"├── {feature}: {completeness:.1f}% complete")
    
    # 4. Demographic validation
    if 'AGE_AT_ADMISSION' in df_target.columns:
        # Use .mean(), .median(), .min(), .max() directly instead of describe()
        age_mean = df_target['AGE_AT_ADMISSION'].mean()
        age_median = df_target['AGE_AT_ADMISSION'].median()
        age_min = df_target['AGE_AT_ADMISSION'].min()
        age_max = df_target['AGE_AT_ADMISSION'].max()
        
        print(f"\nAGE DEMOGRAPHICS:")
        print(f"├── Mean age: {age_mean:.1f} years")
        print(f"├── Median age: {age_median:.1f} years")
        print(f"├── Age range: {age_min:.1f} to {age_max:.1f} years")
        
        # Age distribution by readmission status
        age_by_readmission = df_target.groupby('READMISSION_30DAY')['AGE_AT_ADMISSION'].mean()
        print(f"├── Mean age (no readmission): {age_by_readmission[0]:.1f} years")
        print(f"└── Mean age (readmission): {age_by_readmission[1]:.1f} years")
    
    # 5. Chronic conditions analysis
    if 'CHRONIC_CONDITION_COUNT' in df_target.columns:
        # Use direct methods instead of describe()
        chronic_mean = df_target['CHRONIC_CONDITION_COUNT'].mean()
        chronic_median = df_target['CHRONIC_CONDITION_COUNT'].median()
        chronic_max = df_target['CHRONIC_CONDITION_COUNT'].max()
        
        print(f"\nCHRONIC CONDITIONS:")
        print(f"├── Mean conditions: {chronic_mean:.1f}")
        print(f"├── Median conditions: {chronic_median:.1f}")
        print(f"├── Max conditions: {chronic_max:.0f}")
        
        # Chronic conditions by readmission status
        chronic_by_readmission = df_target.groupby('READMISSION_30DAY')['CHRONIC_CONDITION_COUNT'].mean()
        print(f"├── Mean conditions (no readmission): {chronic_by_readmission[0]:.1f}")
        print(f"└── Mean conditions (readmission): {chronic_by_readmission[1]:.1f}")
    
    # 6. Length of stay analysis
    if 'LOS_CALCULATED' in df_target.columns:
        # Use direct methods instead of describe()
        los_mean = df_target['LOS_CALCULATED'].mean()
        los_median = df_target['LOS_CALCULATED'].median()
        
        print(f"\nLENGTH OF STAY:")
        print(f"├── Mean LOS: {los_mean:.1f} days")
        print(f"├── Median LOS: {los_median:.1f} days")
        
        # LOS by readmission status
        los_by_readmission = df_target.groupby('READMISSION_30DAY')['LOS_CALCULATED'].mean()
        print(f"├── Mean LOS (no readmission): {los_by_readmission[0]:.1f} days")
        print(f"└── Mean LOS (readmission): {los_by_readmission[1]:.1f} days")
    
    # 7. Temporal distribution
    if 'ADMISSION_YEAR' in df_target.columns:
        yearly_distribution = df_target['ADMISSION_YEAR'].value_counts().sort_index()
        print(f"\nTEMPORAL DISTRIBUTION:")
        for year, count in yearly_distribution.items():
            readmission_rate = df_target[df_target['ADMISSION_YEAR'] == year]['READMISSION_30DAY'].mean() * 100
            print(f"├── {year}: {count:,} admissions ({readmission_rate:.1f}% readmission rate)")
    
    print(f"\n✓ Target variable validation complete")
    print(f"✓ Dataset ready for feature engineering and modeling")

# Validate final target variable
validate_target_variable(df_target_dataset)

FINAL TARGET VARIABLE VALIDATION
TARGET VARIABLE DISTRIBUTION:
├── No Readmission (0): 60,006 (89.9%)
├── Readmission (1): 6,767 (10.1%)
└── Total Records: 66,773

CLASS BALANCE:
├── Minority class percentage: 10.1%
└── Class balance is acceptable for modeling ✓

KEY FEATURE COMPLETENESS:
├── AGE_AT_ADMISSION: 99.7% complete
├── BENE_SEX_IDENT_CD: 99.7% complete
├── LOS_CALCULATED: 100.0% complete
├── CHRONIC_CONDITION_COUNT: 100.0% complete
├── CLM_DRG_CD: 100.0% complete

AGE DEMOGRAPHICS:
├── Mean age: 73.8 years
├── Median age: 75.0 years
├── Age range: 24.6 to 101.7 years
├── Mean age (no readmission): 73.7 years
└── Mean age (readmission): 73.8 years

CHRONIC CONDITIONS:
├── Mean conditions: 5.5
├── Median conditions: 6.0
├── Max conditions: 11
├── Mean conditions (no readmission): 5.3
└── Mean conditions (readmission): 6.7

LENGTH OF STAY:
├── Mean LOS: 6.7 days
├── Median LOS: 5.0 days
├── Mean LOS (no readmission): 6.7 days
└── Mean LOS (readmission): 7.2 days

TEMPORAL DISTRI

### Block 12: Save Target Variable Dataset

In [47]:
def save_target_dataset(df_target):
    """
    Save the final target variable dataset for modeling
    """
    print("SAVING TARGET VARIABLE DATASET")
    print("="*35)
    
    # Create features directory
    features_path = "data/features/"
    os.makedirs(features_path, exist_ok=True)
    
    # Save main target dataset
    target_file_csv = f"{features_path}readmission_target_dataset.csv"
    target_file_parquet = f"{features_path}readmission_target_dataset.parquet"
    
    df_target.to_csv(target_file_csv, index=False)
    df_target.to_parquet(target_file_parquet)
    
    print(f"✓ Target dataset saved:")
    print(f"  ├── CSV: {target_file_csv}")
    print(f"  └── Parquet: {target_file_parquet}")
    
    # Save readmission-only records for analysis
    readmissions_only = df_target[df_target['READMISSION_30DAY'] == 1].copy()
    readmission_file = f"{features_path}readmission_events_only.csv"
    readmissions_only.to_csv(readmission_file, index=False)
    
    print(f"✓ Readmission events saved: {readmission_file}")
    print(f"  └── {len(readmissions_only):,} readmission events")
    
    # Create data dictionary
    data_dict_file = f"{features_path}target_dataset_dictionary.txt"
    with open(data_dict_file, 'w') as f:
        f.write("READMISSION TARGET DATASET - DATA DICTIONARY\n")
        f.write("="*60 + "\n\n")
        
        f.write("TARGET VARIABLE:\n")
        f.write("- READMISSION_30DAY: Binary flag (0/1) for 30-day readmission\n\n")
        
        f.write("KEY IDENTIFIERS:\n")
        f.write("- DESYNPUF_ID: Patient identifier\n")
        f.write("- CLM_ID: Claim identifier for index admission\n\n")
        
        f.write("TEMPORAL FEATURES:\n")
        f.write("- CLM_ADMSN_DT_DATE: Admission date\n")
        f.write("- NCH_BENE_DSCHRG_DT_DATE: Discharge date\n")
        f.write("- ADMISSION_YEAR/MONTH/QUARTER: Temporal components\n")
        f.write("- ADMISSION_DAY_OF_WEEK: Day of week (0=Monday)\n")
        f.write("- WEEKEND_ADMISSION: Boolean for weekend admission\n")
        f.write("- ADMISSION_SEASON: Season of admission\n\n")
        
        f.write("CLINICAL FEATURES:\n")
        f.write("- LOS_CALCULATED: Length of stay in days\n")
        f.write("- CLM_DRG_CD: Diagnosis Related Group code\n")
        f.write("- ICD9_DGNS_CD_1: Primary diagnosis code\n\n")
        
        f.write("DEMOGRAPHIC FEATURES:\n")
        f.write("- AGE_AT_ADMISSION: Patient age at admission\n")
        f.write("- BENE_SEX_IDENT_CD: Gender (1=Male, 2=Female)\n")
        f.write("- BENE_RACE_CD: Race code\n")
        f.write("- SP_STATE_CODE: State code\n\n")
        
        f.write("CHRONIC CONDITIONS (Boolean flags):\n")
        f.write("- SP_ALZHDMTA: Alzheimer's/Dementia\n")
        f.write("- SP_CHF: Congestive Heart Failure\n")
        f.write("- SP_CHRNKIDN: Chronic Kidney Disease\n")
        f.write("- SP_CNCR: Cancer\n")
        f.write("- SP_COPD: COPD\n")
        f.write("- SP_DEPRESSN: Depression\n")
        f.write("- SP_DIABETES: Diabetes\n")
        f.write("- SP_ISCHMCHT: Ischemic Heart Disease\n")
        f.write("- SP_OSTEOPRS: Osteoporosis\n")
        f.write("- SP_RA_OA: Rheumatoid Arthritis/Osteoarthritis\n")
        f.write("- SP_STRKETIA: Stroke/TIA\n")
        f.write("- CHRONIC_CONDITION_COUNT: Total chronic conditions\n\n")
        
        f.write("READMISSION DETAILS:\n")
        f.write("- HAS_READMISSION: Same as READMISSION_30DAY\n")
        f.write("- DAYS_TO_READMISSION: Days from discharge to readmission\n")
        f.write("- READMISSION_DATE: Date of readmission (if occurred)\n\n")
        
        f.write("QUALITY FLAGS:\n")
        f.write("- ADMISSION_SEQ: Admission sequence number for patient\n")
        f.write("- IN_HOSPITAL_DEATH: Flag for in-hospital death\n")
        f.write("- DEATH_WITHIN_30_DAYS: Flag for death within 30 days\n")
        f.write("- OBSERVATION_DAYS: Actual observation period length\n\n")
        
        f.write(f"DATASET STATISTICS:\n")
        f.write(f"- Total Records: {len(df_target):,}\n")
        f.write(f"- Readmissions: {df_target['READMISSION_30DAY'].sum():,}\n")
        f.write(f"- Readmission Rate: {df_target['READMISSION_30DAY'].mean()*100:.2f}%\n")
        f.write(f"- Unique Patients: {df_target['DESYNPUF_ID'].nunique():,}\n")
        f.write(f"- Date Range: {df_target['CLM_ADMSN_DT_DATE'].min().strftime('%Y-%m-%d')} to {df_target['NCH_BENE_DSCHRG_DT_DATE'].max().strftime('%Y-%m-%d')}\n")
    
    print(f"✓ Data dictionary saved: {data_dict_file}")
    
    # Save summary statistics
    summary_file = f"{features_path}target_creation_summary.txt"
    with open(summary_file, 'w') as f:
        f.write("TARGET VARIABLE CREATION SUMMARY\n")
        f.write("="*40 + "\n\n")
        f.write(f"Processing Pipeline:\n")
        f.write(f"1. Started with {len(df_target):,} eligible admissions\n")
        f.write(f"2. Excluded in-hospital deaths and invalid dates\n")
        f.write(f"3. Created 30-day observation windows\n")
        f.write(f"4. Identified readmission events\n")
        f.write(f"5. Merged with demographic and clinical features\n\n")
        f.write(f"Final Dataset:\n")
        f.write(f"- Records: {len(df_target):,}\n")
        f.write(f"- Features: {len(df_target.columns)}\n")
        f.write(f"- Readmission Rate: {df_target['READMISSION_30DAY'].mean()*100:.2f}%\n")
        f.write(f"- Ready for Feature Engineering (Phase D)\n")
    
    print(f"✓ Summary saved: {summary_file}")

# Save target dataset
save_target_dataset(df_target_dataset)

SAVING TARGET VARIABLE DATASET
✓ Target dataset saved:
  ├── CSV: data/features/readmission_target_dataset.csv
  └── Parquet: data/features/readmission_target_dataset.parquet
✓ Readmission events saved: data/features/readmission_events_only.csv
  └── 6,767 readmission events
✓ Data dictionary saved: data/features/target_dataset_dictionary.txt
✓ Summary saved: data/features/target_creation_summary.txt


### Block 13: Phase C Summary

In [48]:
print("\n" + "="*60)
print("PHASE C COMPLETE: TARGET VARIABLE CREATION")
print("="*60)

print("✅ ACCOMPLISHMENTS:")
print("├── Processed and validated all admission/discharge dates")
print("├── Identified index admissions (eligible for readmission analysis)")
print("├── Excluded in-hospital deaths and invalid records")
print("├── Created 30-day observation windows")
print("├── Identified readmission events using temporal matching")
print("├── Merged with demographic and chronic condition data")
print("├── Created binary target variable (READMISSION_30DAY)")
print("└── Validated business logic and data quality")

print(f"\n📊 FINAL TARGET DATASET STATISTICS:")
total_records = len(df_target_dataset)
readmissions = df_target_dataset['READMISSION_30DAY'].sum()
readmission_rate = readmissions / total_records * 100

print(f"├── Total eligible admissions: {total_records:,}")
print(f"├── 30-day readmissions: {readmissions:,}")
print(f"├── Readmission rate: {readmission_rate:.2f}%")
print(f"├── Unique patients: {df_target_dataset['DESYNPUF_ID'].nunique():,}")
print(f"├── Date range: {df_target_dataset['CLM_ADMSN_DT_DATE'].min().strftime('%Y-%m-%d')} to {df_target_dataset['NCH_BENE_DSCHRG_DT_DATE'].max().strftime('%Y-%m-%d')}")
print(f"└── Available features: {len(df_target_dataset.columns)} columns")

print(f"\n🎯 TARGET VARIABLE QUALITY:")
print(f"├── Class balance: {(1-readmission_rate/100)*100:.1f}% / {readmission_rate:.1f}%")
print(f"├── Medicare benchmark: 12-15% (Our rate: {readmission_rate:.1f}%)")

if 8 <= readmission_rate <= 20:
    print(f"├── Rate within expected range ✓")
else:
    print(f"├── Rate may need investigation ⚠️")

print(f"└── Ready for machine learning modeling ✓")

print(f"\n🚀 NEXT STEPS:")
print("├── Ready for Notebook 4: Feature Engineering")
print("├── Create demographic and clinical features")
print("├── Engineer prior admission history features")
print("├── Build comorbidity scores and risk indicators")
print("├── Prepare features for model training")
print("└── Feature validation and selection")

print(f"\n✓ Phase C Complete - Target Variable Successfully Created!")


PHASE C COMPLETE: TARGET VARIABLE CREATION
✅ ACCOMPLISHMENTS:
├── Processed and validated all admission/discharge dates
├── Identified index admissions (eligible for readmission analysis)
├── Excluded in-hospital deaths and invalid records
├── Created 30-day observation windows
├── Identified readmission events using temporal matching
├── Merged with demographic and chronic condition data
├── Created binary target variable (READMISSION_30DAY)
└── Validated business logic and data quality

📊 FINAL TARGET DATASET STATISTICS:
├── Total eligible admissions: 66,773
├── 30-day readmissions: 6,767
├── Readmission rate: 10.13%
├── Unique patients: 37,780
├── Date range: 2007-11-27 to 2010-12-31
└── Available features: 46 columns

🎯 TARGET VARIABLE QUALITY:
├── Class balance: 89.9% / 10.1%
├── Medicare benchmark: 12-15% (Our rate: 10.1%)
├── Rate within expected range ✓
└── Ready for machine learning modeling ✓

🚀 NEXT STEPS:
├── Ready for Notebook 4: Feature Engineering
├── Create demographic