In [None]:
# Standard library imports
import os
import sys
import time
from datetime import datetime
from pathlib import Path

# Runtime environment knobs, exported before TensorFlow is imported further down
# in this cell.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '-1',  # disable GPU (CUDA drivers not available in dev container)
    'TF_CPP_MIN_LOG_LEVEL': '2',   # suppress TensorFlow warnings
    'OMP_NUM_THREADS': '8',        # limit thread usage
})

# Third party imports
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline

# Put the sibling models directory at the front of the import search path so the
# custom transformer module imported below can be found.
models_path = (Path('..') / 'models').resolve()
sys.path.insert(0, str(models_path))

# Import custom transformers from models directory
from logistic_regression_transformers import IDColumnDropper, IQRClipper

# Import ensemble modules from functions package
from functions import ensemble_database
from functions.ensemble_hill_climbing import (
    generate_random_pipeline,
    calculate_ensemble_diversity,
    quick_optimize_pipeline,
    adaptive_simulated_annealing_acceptance,
    update_temperature,
    compute_pipeline_hash,
    log_iteration
)
from functions.ensemble_stage2_model import (
    build_stage2_dnn,
    train_stage2_dnn,
    save_checkpoint,
    load_checkpoint,
    evaluate_ensemble
)

# Configure TensorFlow: only surface errors from TensorFlow's Python logger.
tf.get_logger().setLevel('ERROR')
print(f"TensorFlow version: {tf.__version__}")
# Report the core count of the machine actually running the notebook instead of a
# hard-coded figure. os.cpu_count() can return None, hence the fallback text.
print(f"Running on: CPU ({os.cpu_count() or 'unknown'} cores)")
print("GPU disabled: CUDA drivers not available in dev container")

### Configuration

In [None]:
# ---------------------------------------------------------------------------
# Tunable settings for the whole notebook.
# ---------------------------------------------------------------------------

# Seed used for pipeline generation only; data sampling is deliberately unseeded.
RANDOM_STATE = 315

# Number of founder models
N_FOUNDERS = 5

# Hill climbing
MAX_ITERATIONS = 500
PLATEAU_ITERATIONS = 100      # stop after this many iterations without improvement
BASE_TEMPERATURE = 0.01
TEMPERATURE_DECAY = 0.995

# Training sample size (subset of the data, for faster training)
TRAINING_SAMPLE_SIZE = 50000

# Stage 1 optimization
QUICK_OPTIMIZE_ITERATIONS = 10
QUICK_OPTIMIZE_CV = 3
QUICK_OPTIMIZE_JOBS = 8

# Founder optimization (reduced for speed)
FOUNDER_OPTIMIZE_ITERATIONS = 5
FOUNDER_OPTIMIZE_CV = 3

# Stage 2 DNN
STAGE2_EPOCHS = 100
STAGE2_BATCH_SIZE = 128
STAGE2_PATIENCE = 10

# Set to True to resume from a saved checkpoint
RESUME_FROM_CHECKPOINT = False

# Filesystem layout
DATA_DIR = Path('../data')
MODELS_DIR = Path('../models')
ENSEMBLE_DIR = MODELS_DIR / 'ensemble_stage1_models'
CHECKPOINT_PATH = MODELS_DIR / 'ensemble_checkpoint.pkl'

# Make sure the output directories exist
for required_dir in (DATA_DIR, ENSEMBLE_DIR):
    required_dir.mkdir(parents=True, exist_ok=True)

# Initialize SQLite database for logging
ensemble_database.init_database()

# Echo the settings that matter most for this run
config_summary = [
    ("Random state", RANDOM_STATE),
    ("Number of founder models", N_FOUNDERS),
    ("Training sample size", f"{TRAINING_SAMPLE_SIZE:,}"),
    ("Max iterations", MAX_ITERATIONS),
    ("Plateau threshold", PLATEAU_ITERATIONS),
    ("Founder optimization iterations", FOUNDER_OPTIMIZE_ITERATIONS),
    ("Resume from checkpoint", RESUME_FROM_CHECKPOINT),
    ("Ensemble directory", ENSEMBLE_DIR),
    ("Database", ensemble_database.DB_PATH),
]
print("Configuration:")
for setting_name, setting_value in config_summary:
    print(f"  {setting_name}: {setting_value}")

### Data Loading

In [None]:
# Target column
label = 'diagnosed_diabetes'

# Read the training CSV and discard exact duplicate rows
train_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_train.csv'
train_df = pd.read_csv(train_df_path).drop_duplicates()

print(f"Training data shape: {train_df.shape}")
print("Class distribution:")
print(train_df[label].value_counts(normalize=True))

# Feature groups (from logistic regression notebook)
numerical_features = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'bmi',
    'waist_to_hip_ratio',
    'systolic_bp',
    'diastolic_bp',
    'heart_rate',
    'cholesterol_total',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'triglycerides',
    'family_history_diabetes',
    'hypertension_history',
    'cardiovascular_history',
]

# Ordinal columns, each with its category order listed lowest to highest
ordinal_features = ['education_level', 'income_level']
education_categories = [['No formal', 'Highschool', 'Graduate', 'Postgraduate']]
income_categories = [['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]

# Unordered categorical columns
nominal_features = ['gender', 'ethnicity', 'smoking_status', 'employment_status']

### Create Base Preprocessor

This preprocessor will be shared across all stage 1 models for consistent encoding.

In [None]:
# Numeric branch: project-defined IQRClipper followed by standardization.
# NOTE(review): IQRClipper presumably clips outliers at 2.0x the IQR — confirm in
# models/logistic_regression_transformers.py.
numerical_pipeline = Pipeline(steps=[
    ('clipper', IQRClipper(iqr_multiplier=2.0)),
    ('scaler', StandardScaler()),
])

# Ordinal branch: explicit category order per column; unseen values map to -1.
ordinal_encoder = OrdinalEncoder(
    categories=[*education_categories, *income_categories],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Nominal branch: dense one-hot encoding with the first level dropped.
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Assemble the three branches into the base preprocessor
column_branches = [
    ('num', numerical_pipeline, numerical_features),
    ('ord', ordinal_encoder, ordinal_features),
    ('nom', onehot_encoder, nominal_features),
]
base_preprocessor = ColumnTransformer(transformers=column_branches)

print("Base preprocessor created")
for group_name, group_columns in (
    ("Numerical", numerical_features),
    ("Ordinal", ordinal_features),
    ("Nominal", nominal_features),
):
    print(f"  {group_name} features: {len(group_columns)}")

## Initialize or Resume Ensemble

### Option 1: Initialize Founder Model (if starting fresh)

In [None]:
# Train, score, persist and log the founder (iteration 0) stage 1 model.
#
# NOTE(review): when RESUME_FROM_CHECKPOINT is True this cell does nothing, and no
# cell visible in this notebook calls the imported load_checkpoint. In that case
# ensemble_models, X_full, y_full, stage2_model, etc. are never defined and the
# hill climbing loop will fail. Confirm how resuming is meant to work.
if not RESUME_FROM_CHECKPOINT:
    print("=" * 80)
    print("INITIALIZING FOUNDER MODEL")
    print("=" * 80)

    ensemble_models = []

    # Prepare features and labels
    X_full = train_df.drop(columns=[label])
    y_full = train_df[label]

    # Random sample size for founder, drawn from a seeded generator. The upper
    # bound now comes from TRAINING_SAMPLE_SIZE (randint's upper bound is
    # exclusive, hence the +1) instead of repeating the literal 50001. The lower
    # bound is capped so that a smaller TRAINING_SAMPLE_SIZE still gives a valid
    # range.
    rng = np.random.RandomState(RANDOM_STATE)
    min_sample_size = min(10000, TRAINING_SAMPLE_SIZE)
    founder_sample_size = rng.randint(min_sample_size, TRAINING_SAMPLE_SIZE + 1)

    # Random train/test split (no random_state = different split each time)
    X_train, X_val, y_train, y_val = train_test_split(
        X_full,
        y_full,
        train_size=founder_sample_size,
        stratify=y_full
    )

    print("\nTraining founder model")
    print("-" * 80)
    print(f"  Train samples: {len(X_train):,}")
    print(f"  Validation samples: {len(X_val):,}")

    # Generate random pipeline for founder
    pipeline, metadata = generate_random_pipeline(
        iteration=0,
        random_state=RANDOM_STATE,
        base_preprocessor=base_preprocessor
    )

    print("  Pipeline config:")
    print(f"    Classifier: {metadata['classifier_type']}")
    print(f"    Transformers: {metadata['transformers_used']}")
    print(f"    Use PCA: {metadata['use_pca']}")

    # Optimize/fit on the training sample and time it
    print("  Training on sample...")
    start_time = time.time()

    fitted_pipeline, cv_score = quick_optimize_pipeline(
        pipeline=pipeline,
        X=X_train,
        y=y_train,
        n_iter=FOUNDER_OPTIMIZE_ITERATIONS,
        cv=FOUNDER_OPTIMIZE_CV,
        n_jobs=QUICK_OPTIMIZE_JOBS,
        random_state=RANDOM_STATE
    )

    training_time = time.time() - start_time
    print(f"  Training complete ({training_time:.1f}s)")
    print(f"  CV ROC-AUC: {cv_score:.6f}")

    # Score the validation rows: positive-class probability when available,
    # otherwise the raw decision_function output.
    if hasattr(fitted_pipeline, 'predict_proba'):
        val_pred = fitted_pipeline.predict_proba(X_val)[:, 1]
    else:
        val_pred = fitted_pipeline.decision_function(X_val)

    # Calculate validation AUC
    val_auc = roc_auc_score(y_val, val_pred)

    print(f"  Validation ROC-AUC: {val_auc:.6f}")

    # Keep the founder in memory and persist it to disk
    ensemble_models.append(fitted_pipeline)
    model_path = ENSEMBLE_DIR / 'founder_model.joblib'
    joblib.dump(fitted_pipeline, model_path)

    # Log founder
    pipeline_hash = compute_pipeline_hash(fitted_pipeline, metadata)
    ensemble_id = "founder"
    log_iteration(
        iteration=0,
        fold=0,
        accepted=True,
        rejection_reason='founder',
        pipeline_hash=pipeline_hash,
        stage1_cv_score=cv_score,
        stage1_val_auc=val_auc,
        stage2_val_auc=val_auc,  # No stage 2 yet, use stage 1 score
        ensemble_size=1,
        diversity_score=0.0,
        temperature=BASE_TEMPERATURE,
        metadata=metadata,
        ensemble_id=ensemble_id
    )

    print(f"\n{'=' * 80}")
    print("FOUNDER MODEL COMPLETE")
    print(f"{'=' * 80}")
    print(f"Validation ROC-AUC: {val_auc:.6f}")

### Initialize Stage 2 DNN (if starting fresh)

In [None]:
# Build and fit the first stage 2 DNN on the founder's validation predictions,
# then seed the hill climbing state (best score, start iteration, temperature).
if not RESUME_FROM_CHECKPOINT:
    banner = '=' * 80
    print(f"\n{banner}")
    print("INITIALIZING STAGE 2 DNN")
    print(f"{banner}")

    # With a single stage 1 model, a minimal architecture is enough
    initial_stage2_config = dict(
        n_models=1,
        n_layers=1,
        units_per_layer=32,
        dropout=0.2,
        batch_norm=False,
        activation='relu',
        learning_rate=0.001,
    )
    stage2_model = build_stage2_dnn(**initial_stage2_config)

    print("Initial stage 2 DNN architecture:")
    stage2_model.summary()

    # Stage 2 input is the founder's validation prediction as a single column
    X_stage2 = val_pred.reshape(-1, 1)
    y_stage2 = y_val.values

    # Simple positional 80/20 train/val split
    split_idx = int(len(X_stage2) * 0.8)
    X_train_s2, X_val_s2 = X_stage2[:split_idx], X_stage2[split_idx:]
    y_train_s2, y_val_s2 = y_stage2[:split_idx], y_stage2[split_idx:]

    print("\nTraining stage 2 on founder validation set...")

    # NOTE(review): epochs/batch_size/patience are literals here; the STAGE2_EPOCHS,
    # STAGE2_BATCH_SIZE and STAGE2_PATIENCE constants from the configuration cell are
    # not used — confirm whether that is intentional for the founder.
    ensemble_id = "founder_stage2"
    stage2_model, history = train_stage2_dnn(
        model=stage2_model,
        X_train=X_train_s2,
        y_train=y_train_s2,
        X_val=X_val_s2,
        y_val=y_val_s2,
        epochs=50,
        batch_size=32,
        patience=10,
        log_path=ensemble_id,
        iteration=0,
        fold=0
    )

    # Score on the full founder validation set.
    # NOTE(review): X_stage2 includes the 80% slice the DNN was just trained on, so
    # this baseline is not a purely held-out figure — confirm this is intended.
    stage2_pred = stage2_model.predict(X_stage2, verbose=0).flatten()
    stage2_auc = roc_auc_score(y_stage2, stage2_pred)

    print(f"  Stage 2 ROC-AUC: {stage2_auc:.6f}")

    # The founder's stage 2 score is the baseline to beat
    best_ensemble_score = stage2_auc

    print("\nInitial ensemble performance:")
    print(f"  Stage 2 ROC-AUC: {best_ensemble_score:.6f}")

    # Hill climbing state
    start_iteration = 1  # founder is iteration 0
    temperature = BASE_TEMPERATURE
    acceptance_history = []

## Hill Climbing Loop

Iteratively add diverse models with simulated annealing acceptance.

In [None]:
def predict_positive_scores(model, X):
    """Return one score per row of ``X`` from a fitted stage 1 model.

    Uses column 1 of ``predict_proba`` when the model exposes it, otherwise falls
    back to ``decision_function``.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]
    return model.decision_function(X)


print(f"\n{'=' * 80}")
print("STARTING HILL CLIMBING LOOP")
print(f"{'=' * 80}")

iterations_since_improvement = 0
consecutive_rejections = 0

# Lower bound for the per-iteration training sample size. The upper bound comes
# from TRAINING_SAMPLE_SIZE; capping the lower bound keeps the range valid if
# that constant is set below 10000.
min_sample_size = min(10000, TRAINING_SAMPLE_SIZE)

# Keep `iteration` defined for the summary below even when the range is empty
# (e.g. start_iteration >= MAX_ITERATIONS).
iteration = start_iteration - 1

for iteration in range(start_iteration, MAX_ITERATIONS):
    print(f"\n{'=' * 80}")
    print(f"Iteration {iteration + 1}/{MAX_ITERATIONS}")
    print(f"{'=' * 80}")
    print(f"Current ensemble size: {len(ensemble_models)}")
    print(f"Best score: {best_ensemble_score:.6f}")
    print(f"Temperature: {temperature:.6f}")
    print(f"Iterations since improvement: {iterations_since_improvement}/{PLATEAU_ITERATIONS}")

    # Random sample size for this iteration (seeded per iteration; randint's
    # upper bound is exclusive, hence the +1)
    rng = np.random.RandomState(RANDOM_STATE + iteration)
    iteration_sample_size = rng.randint(min_sample_size, TRAINING_SAMPLE_SIZE + 1)

    # Random train/test split for this iteration (no random_state = different split each time)
    X_train, X_val, y_train, y_val = train_test_split(
        X_full,
        y_full,
        train_size=iteration_sample_size,
        stratify=y_full
    )

    print(f"  Train samples: {len(X_train):,}")
    print(f"  Validation samples: {len(X_val):,}")

    # Generate random pipeline
    pipeline, metadata = generate_random_pipeline(
        iteration=iteration,
        random_state=RANDOM_STATE + iteration,
        base_preprocessor=base_preprocessor
    )

    print("\nPipeline configuration:")
    print(f"  Classifier: {metadata['classifier_type']}")
    print(f"  Transformers: {', '.join(metadata['transformers_used']) if metadata['transformers_used'] else 'None'}")

    # Quick optimize
    print("\nOptimizing pipeline...")
    optimized_pipeline, cv_score = quick_optimize_pipeline(
        pipeline=pipeline,
        X=X_train,
        y=y_train,
        n_iter=QUICK_OPTIMIZE_ITERATIONS,
        cv=QUICK_OPTIMIZE_CV,
        n_jobs=QUICK_OPTIMIZE_JOBS,
        random_state=RANDOM_STATE + iteration
    )

    print(f"  Stage 1 CV ROC-AUC: {cv_score:.6f}")

    # Evaluate the candidate on this iteration's validation rows
    val_pred = predict_positive_scores(optimized_pipeline, X_val)
    val_auc = roc_auc_score(y_val, val_pred)
    print(f"  Stage 1 validation ROC-AUC: {val_auc:.6f}")

    # Add to candidate pool and evaluate ensemble
    print("\nEvaluating ensemble with candidate...")
    candidate_ensemble = ensemble_models + [optimized_pipeline]

    # NOTE(review): stage2_model was built with n_models=1 and is never rebuilt or
    # retrained inside this loop — confirm evaluate_ensemble copes with ensembles
    # of more than one stage 1 model.
    candidate_score = evaluate_ensemble(
        stage1_models=candidate_ensemble,
        stage2_model=stage2_model,
        X=X_val,
        y=y_val
    )

    print(f"  Candidate ensemble ROC-AUC: {candidate_score:.6f}")

    # Calculate diversity: one column of scores per model. The candidate's column
    # is val_pred, already computed above, so only existing members are re-scored.
    all_predictions = np.column_stack(
        [predict_positive_scores(member, X_val) for member in ensemble_models] + [val_pred]
    )
    diversity_score = calculate_ensemble_diversity(all_predictions)
    print(f"  Diversity score: {diversity_score:.6f}")

    # Simulated annealing acceptance
    accept, reason = adaptive_simulated_annealing_acceptance(
        current_score=best_ensemble_score,
        candidate_score=candidate_score,
        temperature=temperature,
        random_state=RANDOM_STATE + iteration
    )

    # Status glyphs were mis-encoded (UTF-8 read as cp1252); restored here.
    print(f"\nDecision: {'✓ ACCEPT' if accept else '✗ REJECT'}")
    print(f"  Reason: {reason}")

    # Log iteration
    pipeline_hash = compute_pipeline_hash(optimized_pipeline, metadata)
    ensemble_id = f"iter_{iteration}"
    log_iteration(
        iteration=iteration,
        fold=iteration % N_FOUNDERS,  # Cycle through founder indices for logging
        accepted=accept,
        rejection_reason=reason,
        pipeline_hash=pipeline_hash,
        stage1_cv_score=cv_score,
        stage1_val_auc=val_auc,
        stage2_val_auc=candidate_score,
        ensemble_size=len(candidate_ensemble) if accept else len(ensemble_models),
        diversity_score=diversity_score,
        temperature=temperature,
        metadata=metadata,
        ensemble_id=ensemble_id
    )

    # Update ensemble if accepted
    if accept:
        ensemble_models.append(optimized_pipeline)
        acceptance_history.append(True)
        consecutive_rejections = 0

        # Save model
        model_path = ENSEMBLE_DIR / f'model_{iteration}.joblib'
        joblib.dump(optimized_pipeline, model_path)

        # An accepted candidate is not necessarily an improvement (annealing can
        # accept a worse score), so track the best score separately.
        if candidate_score > best_ensemble_score:
            print(f"  🎉 New best score: {candidate_score:.6f} (Δ={candidate_score - best_ensemble_score:.6f})")
            best_ensemble_score = candidate_score
            iterations_since_improvement = 0
        else:
            iterations_since_improvement += 1
    else:
        acceptance_history.append(False)
        consecutive_rejections += 1
        iterations_since_improvement += 1

    # Update temperature
    temperature = update_temperature(
        iteration=iteration,
        acceptance_history=acceptance_history,
        current_temperature=temperature,
        base_temperature=BASE_TEMPERATURE,
        decay_rate=TEMPERATURE_DECAY
    )

    # Check termination conditions
    if iterations_since_improvement >= PLATEAU_ITERATIONS:
        print(f"\n{'=' * 80}")
        print(f"TERMINATING: No improvement for {PLATEAU_ITERATIONS} iterations")
        print(f"{'=' * 80}")
        break

# Guard against an empty history (loop body never executed)
acceptance_rate = sum(acceptance_history) / len(acceptance_history) if acceptance_history else 0.0

print(f"\n{'=' * 80}")
print("HILL CLIMBING COMPLETE")
print(f"{'=' * 80}")
print(f"Final ensemble size: {len(ensemble_models)}")
print(f"Best ensemble ROC-AUC: {best_ensemble_score:.6f}")
print(f"Total iterations: {iteration + 1}")
print(f"Acceptance rate: {acceptance_rate:.1%}")

## Save Final Checkpoint and Metadata

In [None]:
import json

# Values shared by the checkpoint and the JSON metadata.
total_iterations = iteration + 1
# Guard the division: acceptance_history is empty if the loop body never ran.
final_acceptance_rate = (
    sum(acceptance_history) / len(acceptance_history) if acceptance_history else 0.0
)
# `current_fold` was never defined anywhere in this notebook (NameError). The hill
# climbing loop logs fold as `iteration % N_FOUNDERS`, so the same value is
# recorded for the last iteration.
# NOTE(review): confirm this matches what load_checkpoint expects.
final_fold = iteration % N_FOUNDERS

# Save final checkpoint
save_checkpoint(
    checkpoint_path=CHECKPOINT_PATH,
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    iteration=iteration,
    temperature=temperature,
    current_fold=final_fold,
    best_score=best_ensemble_score,
    acceptance_history=acceptance_history,
    metadata={
        'total_iterations': total_iterations,
        'final_ensemble_size': len(ensemble_models),
        'acceptance_rate': final_acceptance_rate,
        'best_score': best_ensemble_score
    }
)

# Save ensemble metadata as JSON
metadata_path = MODELS_DIR / 'ensemble_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump({
        'ensemble_size': len(ensemble_models),
        'total_iterations': total_iterations,
        # float() keeps this serializable even if the score is a numpy scalar
        'best_score': float(best_ensemble_score),
        'acceptance_rate': final_acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        # `N_FOLDS` was never defined (NameError). Folds are cycled modulo
        # N_FOUNDERS in the loop, so that is the fold count recorded here.
        'n_folds': N_FOUNDERS,
        'random_state': RANDOM_STATE
    }, f, indent=2)

print(f"\nFinal checkpoint saved: {CHECKPOINT_PATH}")
print(f"Metadata saved: {metadata_path}")

## Summary Statistics

In [None]:
# Final run summary. Counts are derived from acceptance_history (one bool per
# hill climbing iteration).
accepted_count = sum(acceptance_history)
rejected_count = len(acceptance_history) - accepted_count
# Guard the division: the history is empty if the loop body never ran.
summary_acceptance_rate = accepted_count / len(acceptance_history) if acceptance_history else 0.0

print(f"\n{'=' * 80}")
print("ENSEMBLE TRAINING SUMMARY")
print(f"{'=' * 80}")
print("\nFinal Statistics:")
print(f"  Ensemble size: {len(ensemble_models)}")
print(f"  Best validation ROC-AUC: {best_ensemble_score:.6f}")
print(f"  Total iterations: {iteration + 1}")
print(f"  Accepted models: {accepted_count}")
print(f"  Rejected models: {rejected_count}")
print(f"  Acceptance rate: {summary_acceptance_rate:.1%}")
print("\nFiles created:")
# TRAINING_LOG_PATH and STAGE2_LOG_PATH were never defined in this notebook
# (NameError). The configuration cell initializes and reports the SQLite database
# at ensemble_database.DB_PATH as the logging target, so that path is shown.
print(f"  Logging database: {ensemble_database.DB_PATH}")
print(f"  Ensemble models: {ENSEMBLE_DIR}")
print(f"  Checkpoint: {CHECKPOINT_PATH}")
print(f"  Metadata: {metadata_path}")
print(f"\n{'=' * 80}")