In [2]:
# ===================================================================
# Step 1: Setup and Imports
# ===================================================================
import pandas as pd
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from tqdm import tqdm
import joblib

print("Step 1: All libraries imported successfully.")

# ===================================================================
# Step 2: Load and Prepare Initial Data
# ===================================================================
print("\nStep 2: Loading raw data...")

# --- PATHS CORRECTED ---
# Paths are now relative to the 'notebooks' folder
path_to_raw_data = '../../data_collection/synthea/output/csv/'
path_to_processed_data = '../data/processed/'
# --- END CORRECTION ---

os.makedirs(path_to_processed_data, exist_ok=True)

try:
    patients_df = pd.read_csv(path_to_raw_data + 'patients.csv')
    conditions_df = pd.read_csv(path_to_raw_data + 'conditions.csv')
    observations_df = pd.read_csv(path_to_raw_data + 'observations.csv')
    
    conditions_df['START'] = pd.to_datetime(conditions_df['START'])
    observations_df['DATE'] = pd.to_datetime(observations_df['DATE'])
    patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'])
    
    if observations_df['DATE'].dt.tz is not None:
         observations_df['DATE'] = observations_df['DATE'].dt.tz_localize(None)

    print("-> Raw data files loaded successfully and dates standardized.")
except FileNotFoundError as e:
    print(f"\nERROR: Could not find raw data files. Checked location: {path_to_raw_data}")
    print(f"Full error: {e}")
    # exit() # Commented out exit() to allow notebook to run partially
except Exception as e:
    print(f"\nAn unexpected error occurred loading data: {e}")
    # exit()

# ===================================================================
# Step 3: Define Cohorts and Snapshot Dates
# ===================================================================
print("\nStep 3: Defining patient cohorts and snapshot dates to prevent leakage...")

if 'conditions_df' in locals() and 'patients_df' in locals():
    ckd_conditions = conditions_df[conditions_df['DESCRIPTION'].str.contains('Chronic kidney disease', case=False)].copy()
    ckd_event_dates = ckd_conditions.loc[ckd_conditions.groupby('PATIENT')['START'].idxmin()]
    ckd_event_dates = ckd_event_dates[['PATIENT', 'START']].rename(columns={'START': 'EVENT_DATE'})
    ckd_event_dates['SNAPSHOT_DATE'] = ckd_event_dates['EVENT_DATE'] - pd.DateOffset(years=2)
    ckd_event_dates['CKD'] = 1

    ckd_patient_ids = ckd_event_dates['PATIENT'].unique()
    control_patient_ids = patients_df[~patients_df['Id'].isin(ckd_patient_ids)]['Id']
    control_last_obs = observations_df[observations_df['PATIENT'].isin(control_patient_ids)].copy()
    control_snapshot_dates = control_last_obs.loc[control_last_obs.groupby('PATIENT')['DATE'].idxmax()]
    control_snapshot_dates = control_snapshot_dates[['PATIENT', 'DATE']].rename(columns={'DATE': 'SNAPSHOT_DATE'})
    control_snapshot_dates['CKD'] = 0

    snapshot_df = pd.concat([ckd_event_dates[['PATIENT', 'SNAPSHOT_DATE', 'CKD']], control_snapshot_dates])
    print(f"-> Created time-aware cohorts for {len(snapshot_df)} patients.")
else:
    print("-> Skipping Step 3, raw data not loaded.")


# ===================================================================
# Step 4: Time-Aware Feature Engineering
# ===================================================================
print("\nStep 4: Engineering features using time-aware logic...")
tqdm.pandas(desc="Processing Patients") 

FEATURE_CODES = {
    'BMI': '39156-5',
    'SYSTOLIC_BP': '8480-6',
    'SERUM_CREATININE': '2160-0'
}

def get_time_aware_features(patient_row):
    patient_id = patient_row['PATIENT']
    snapshot_date = patient_row['SNAPSHOT_DATE']
    
    obs_before_snapshot = observations_df[(observations_df['PATIENT'] == patient_id) & (observations_df['DATE'] < snapshot_date)]
    cond_before_snapshot = conditions_df[(conditions_df['PATIENT'] == patient_id) & (conditions_df['START'] < snapshot_date)]
    
    features = {'PATIENT': patient_id}
    
    for name, code in FEATURE_CODES.items():
        latest_obs = obs_before_snapshot[obs_before_snapshot['CODE'] == code].sort_values('DATE', ascending=False).head(1)
        features[name] = latest_obs['VALUE'].iloc[0] if not latest_obs.empty else None
        
    features['HAS_DIABETES'] = 1 if not cond_before_snapshot[cond_before_snapshot['DESCRIPTION'].str.contains('diabetes', na=False) & ~cond_before_snapshot['DESCRIPTION'].str.contains('Prediabetes', na=False)].empty else 0
    features['HAS_HYPERTENSION'] = 1 if not cond_before_snapshot[cond_before_snapshot['DESCRIPTION'].str.contains('Hypertension', na=False)].empty else 0
        
    return pd.Series(features)

if 'snapshot_df' in locals():
    feature_df = snapshot_df.progress_apply(get_time_aware_features, axis=1)
    print("-> Time-aware features created.")
else:
    print("-> Skipping Step 4, cohorts not defined.")

# ===================================================================
# Step 5: Combine, Clean, and Save
# ===================================================================
print("\nStep 5: Combining and cleaning data...")

if 'feature_df' in locals():
    final_df = pd.merge(feature_df, snapshot_df, on='PATIENT')
    final_df = pd.merge(final_df, patients_df[['Id', 'BIRTHDATE']], left_on='PATIENT', right_on='Id')
    final_df['AGE'] = (final_df['SNAPSHOT_DATE'].dt.year - final_df['BIRTHDATE'].dt.year)
    final_df = final_df.drop(columns=['Id', 'BIRTHDATE'])

    continuous_features = ['AGE', 'BMI', 'SYSTOLIC_BP', 'SERUM_CREATININE']
    binary_features = ['HAS_DIABETES', 'HAS_HYPERTENSION']
    feature_columns = continuous_features + binary_features

    final_df = final_df[feature_columns + ['CKD']].copy()

    print("-> Performing robust data cleaning and imputation...")
    cols_to_drop = []
    for col in feature_columns:
        final_df[col] = pd.to_numeric(final_df[col], errors='coerce')

    for col in feature_columns:
        if final_df[col].isnull().all():
            cols_to_drop.append(col)
            print(f"  WARNING: Feature column '{col}' is completely empty and will be dropped.")

    if cols_to_drop:
        final_df = final_df.drop(columns=cols_to_drop)
        continuous_features = [c for c in continuous_features if c not in cols_to_drop]
        binary_features = [b for b in binary_features if b not in cols_to_drop]

    for col in continuous_features + binary_features:
        if final_df[col].isnull().any():
            median_val = final_df[col].median()
            final_df[col] = final_df[col].fillna(median_val)

    print("-> Missing values handled.")
    output_path = path_to_processed_data + 'cleaned_ckd_predictive_data.csv'
    final_df.to_csv(output_path, index=False)
    print(f"-> Clean data saved to '{output_path}'")
    print(f"-> Shape of final model-ready data: {final_df.shape}")
else:
    print("-> Skipping Step 5, features not engineered.")

# ===================================================================
# Step 6: Prepare for Modeling
# ===================================================================
print("\nStep 6: Preprocessing features and splitting data...")

if 'final_df' in locals():
    feature_columns = continuous_features + binary_features
    X = final_df[feature_columns]
    y = final_df['CKD']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), continuous_features),
            ('passthrough_binary', 'passthrough', binary_features)
        ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"-> Training set size: {X_train.shape[0]} samples")
    print(f"-> Test set size: {X_test.shape[0]} samples")
else:
    print("-> Skipping Step 6, final_df not created.")

# ===================================================================
# Step 7: Train and Evaluate Models
# ===================================================================
print("\nStep 7: Training and evaluating predictive models...")

if 'X_train' in locals():
    try:
        scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    except (ZeroDivisionError, KeyError):
        scale_pos_weight = 1

    models = {
        "Logistic Regression": Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression(random_state=42, class_weight='balanced'))]),
        "Random Forest": Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))]),
        "XGBoost": Pipeline(steps=[('preprocessor', preprocessor), ('classifier', xgb.XGBClassifier(random_state=42, scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss'))])
    }

    for name, pipeline in models.items():
        print(f"\n{'='*20} {name} {'='*20}")
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
        
        print(f"\nClassification Report:")
        print(classification_report(y_test, y_pred))
        acc_score = accuracy_score(y_test, y_pred)
        print(f"Accuracy Score: {acc_score:.4f}")
        auc_score = roc_auc_score(y_test, y_pred_proba)
        print(f"AUC-ROC Score: {auc_score:.4f}\n")
else:
    print("-> Skipping Step 7, data not split.")

# ===================================================================
# Step 8: Save the Final Model
# ===================================================================
print("\nStep 8: Saving best model to file...")

if 'models' in locals():
    # This selects the trained Random Forest model from the 'models' dictionary
    final_model = models["Random Forest"]

    # --- PATHS CORRECTED ---
    # Define the directory and create it if it doesn't exist
    model_directory = '../models/'
    # --- END CORRECTION ---
    
    os.makedirs(model_directory, exist_ok=True)

    # Define the full filename
    model_filename = model_directory + 'ckd_model.joblib'

    # Save the model
    joblib.dump(final_model, model_filename)

    print(f"✅ CKD model saved successfully to {model_filename}")
else:
    print("-> Skipping Step 8, models not trained.")

print("\n--- End of Script ---")

Step 1: All libraries imported successfully.

Step 2: Loading raw data...
-> Raw data files loaded successfully and dates standardized.

Step 3: Defining patient cohorts and snapshot dates to prevent leakage...
-> Created time-aware cohorts for 1171 patients.

Step 4: Engineering features using time-aware logic...


Processing Patients: 100%|██████████| 1171/1171 [01:06<00:00, 17.63it/s]


-> Time-aware features created.

Step 5: Combining and cleaning data...
-> Performing robust data cleaning and imputation...
-> Missing values handled.
-> Clean data saved to '../data/processed/cleaned_ckd_predictive_data.csv'
-> Shape of final model-ready data: (1171, 7)

Step 6: Preprocessing features and splitting data...
-> Training set size: 936 samples
-> Test set size: 235 samples

Step 7: Training and evaluating predictive models...


Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.53      0.68       213
           1       0.13      0.68      0.22        22

    accuracy                           0.54       235
   macro avg       0.54      0.61      0.45       235
weighted avg       0.87      0.54      0.64       235

Accuracy Score: 0.5447
AUC-ROC Score: 0.6391



Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       213
           1       1.00

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
