## Setup and configuration

### Imports

In [1]:
# Standard library imports
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count

# Disable GPU and limit threading
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Third party imports
import joblib
import json
import numpy as np
import pandas as pd
import tensorflow as tf

# Add models directory to path for ensemble_classifier import
sys.path.insert(0, str(Path('../models').resolve()))

# Import ensemble modules
from ensemble_classifier import EnsembleClassifier
from functions import ensemble_database
from functions.ensemble_initialization import create_data_splits, create_base_preprocessor, train_founder_model
from functions.ensemble_parallel import train_single_candidate, prepare_training_batch
from functions.ensemble_evaluation import evaluate_candidate_ensemble
from functions.ensemble_stage2_training import train_or_expand_stage2_model, save_ensemble_bundle
from functions.ensemble_hill_climbing import (
    adaptive_simulated_annealing_acceptance,
    update_temperature,
    log_iteration
)
from functions.ensemble_stage2_model import save_checkpoint

# Configure TensorFlow
tf.get_logger().setLevel('ERROR')

# Detect available CPUs
n_cpus = cpu_count()
print(f"TensorFlow version: {tf.__version__}")
print(f"Available CPUs: {n_cpus}")

2025-12-08 04:51:45.138485: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765169505.161252    4898 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765169505.168173    4898 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TensorFlow version: 2.18.0
Available CPUs: 24


### Ensemble training & hill climb parameters

In [2]:
# Random state for reproducibility
RANDOM_STATE = 315

# CPU allocation for parallel training
# Set to None to use all available CPUs, or specify a number to limit
N_CPUS = 10

if N_CPUS is None:
    import multiprocessing
    N_CPUS = multiprocessing.cpu_count()

# Parallel training configuration
BATCH_SIZE = 10              # Train this many candidates in parallel
N_WORKERS = N_CPUS           # Use all available CPUs as workers
MODEL_TIMEOUT_MINUTES = 30   # Maximum training time per model (minutes)

# Hill climbing configuration
MAX_ITERATIONS = 500
PLATEAU_ITERATIONS = 100
BASE_TEMPERATURE = 0.05
TEMPERATURE_DECAY = 0.998

# Stage 2 DNN configuration
STAGE2_BATCH_SIZE_MODELS = 10  # Retrain DNN every N accepted models
STAGE2_EPOCHS = 100
STAGE2_BATCH_SIZE = 128
STAGE2_PATIENCE = 10

# Paths
DATA_DIR = Path('../data')
MODELS_BASE_DIR = Path('../models')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
MODELS_DIR = MODELS_BASE_DIR / f'run_{timestamp}'
ENSEMBLE_DIR = MODELS_DIR / 'ensemble_stage1_models'
CHECKPOINT_PATH = MODELS_DIR / 'ensemble_checkpoint.pkl'

# Create directories
DATA_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)
ENSEMBLE_DIR.mkdir(parents=True, exist_ok=True)

print(f"\nConfiguration:")
print(f"  Total CPUs available: {N_CPUS}")
print(f"  Parallel workers: {N_WORKERS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Model timeout: {MODEL_TIMEOUT_MINUTES} minutes")


Configuration:
  Total CPUs available: 10
  Parallel workers: 10
  Batch size: 10
  Model timeout: 30 minutes


### Initialize progress dashboard database

In [3]:
ensemble_database.reset_database()
ensemble_database.init_database()

Deleted existing database: /workspaces/diabetes-prediction/data/ensemble_training.db
Database initialized at: /workspaces/diabetes-prediction/data/ensemble_training.db


## Data preparation

### Load data

In [4]:
# Load training data
train_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_train.csv'
train_df = pd.read_csv(train_df_path)
train_df.drop_duplicates(inplace=True)

print(f'Training data shape: {train_df.shape}\n')
print('Class distribution:')
print(train_df['diagnosed_diabetes'].value_counts(normalize=True))

Training data shape: (700000, 26)

Class distribution:
diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64


### Define features

In [5]:
label = 'diagnosed_diabetes'

numerical_features = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
    'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
    'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides',
    'family_history_diabetes', 'hypertension_history', 'cardiovascular_history'
]

ordinal_features = ['education_level', 'income_level']
education_categories = [['No formal', 'Highschool', 'Graduate', 'Postgraduate']]
income_categories = [['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]
nominal_features = ['gender', 'ethnicity', 'smoking_status', 'employment_status']

### Trainining-validation-validation split

In [6]:
# Create fixed three-way data split
X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2 = create_data_splits(
    train_df, label, RANDOM_STATE
)

print("Data info before batch preparation:")
print(f"  X_train_pool: {type(X_train_pool)} - {X_train_pool.shape if hasattr(X_train_pool, 'shape') else 'N/A'}")
print(f"  X_val_s1: {type(X_val_s1)} - {X_val_s1.shape if hasattr(X_val_s1, 'shape') else 'N/A'}")
print(f"  X_val_s2: {type(y_val_s2)} - {y_val_s2.shape if hasattr(y_val_s2, 'shape') else 'N/A'}")

Data info before batch preparation:
  X_train_pool: <class 'pandas.core.frame.DataFrame'> - (420000, 25)
  X_val_s1: <class 'pandas.core.frame.DataFrame'> - (140000, 25)
  X_val_s2: <class 'pandas.core.series.Series'> - (140000,)


### Base data preprocessor

In [7]:
# Create base preprocessor
base_preprocessor = create_base_preprocessor(
    numerical_features, ordinal_features, nominal_features,
    education_categories, income_categories
)

base_preprocessor


Base preprocessor created
  Numerical features: 18
  Ordinal features: 2
  Nominal features: 4


## Initialize ensemble with founder model

In [8]:
# Train founder model (baseline only - NOT added to ensemble)
founder_auc = train_founder_model(
    X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2,
    base_preprocessor, RANDOM_STATE, BASE_TEMPERATURE, ENSEMBLE_DIR
)

# Initialize ensemble (EMPTY - founder not included)
ensemble_models = []
stage2_model = None
best_ensemble_score = founder_auc

# Initialize hill climbing variables (start at iteration 1, not 0)
start_iteration = 1
temperature = BASE_TEMPERATURE

print(f"\nFounder baseline AUC: {founder_auc:.6f}")
print(f"Ensemble starts empty - first batch will be iterations 1-{BATCH_SIZE}")
print(f"Stage 2 DNN will be trained after {STAGE2_BATCH_SIZE_MODELS} accepted models")

TRAINING FOUNDER MODEL (baseline only - NOT added to ensemble)

Training founder model
--------------------------------------------------------------------------------
  Training samples: 42,000 (10% of 420,000 pool)
  Pipeline config:
    Classifier: adaboost
    Transformers: []
    Dimensionality reduction: fast_ica
  Training pipeline...




  Training complete (156.3s)
  Stage 1 validation AUC: 0.572370
  Stage 2 validation AUC: 0.571482

FOUNDER MODEL COMPLETE - Baseline score established

Founder baseline AUC: 0.571482
Ensemble starts empty - first batch will be iterations 1-10
Stage 2 DNN will be trained after 10 accepted models


## Parallel hill climbing loop

Iteratively trains batches of candidate models in parallel, evaluates with hybrid scoring,
and accepts/rejects using simulated annealing.

In [None]:
iterations_since_improvement = 0
iteration = start_iteration

# Calculate timeout values in seconds
model_timeout_seconds = MODEL_TIMEOUT_MINUTES * 60
batch_timeout_seconds = model_timeout_seconds + 60  # Add 1 minute buffer for batch timeout

while iteration < MAX_ITERATIONS and iterations_since_improvement < PLATEAU_ITERATIONS:

    print(f"BATCH Starting at iteration {iteration}")
    print(f"Ensemble size: {len(ensemble_models)} | Best score: {best_ensemble_score:.6f} | "
          f"Temperature: {temperature:.6f} | No improvement: {iterations_since_improvement}/{PLATEAU_ITERATIONS}")
    
    # Prepare batch of training jobs with intelligent CPU allocation
    batch_jobs = prepare_training_batch(
        iteration, BATCH_SIZE, MAX_ITERATIONS, X_train_pool, y_train_pool,
        X_val_s1, y_val_s1, base_preprocessor, RANDOM_STATE, total_cpus=N_CPUS,
        timeout_minutes=MODEL_TIMEOUT_MINUTES
    )
    
    # Pre-determine classifier types for this batch (for timeout logging)
    batch_classifier_types = {}

    for job in batch_jobs:
        job_iteration = job[0]
        rng = np.random.RandomState(RANDOM_STATE + job_iteration)
        classifier_pool = [
            'logistic', 'random_forest', 'linear_svc',
            'sgd_classifier', 'extra_trees', 'adaboost',
            'naive_bayes', 'lda', 'qda', 'ridge'
            'gradient_boosting', 'mlp', 'knn'
        ]

        batch_classifier_types[job_iteration] = rng.choice(classifier_pool)
    
    print(f"\nTraining {len(batch_jobs)} candidates in parallel ({MODEL_TIMEOUT_MINUTES} min timeout per model)...")
    
    # Train candidates in parallel with timeout
    batch_start_time = time.time()
    trained_candidates = []
    failed_count = 0
    timeout_iterations = []  # Track which iterations timed out

    with ProcessPoolExecutor(max_workers=N_WORKERS) as executor:

        futures = {executor.submit(train_single_candidate, job): job for job in batch_jobs}
        completed = 0

        for future in as_completed(futures, timeout=batch_timeout_seconds):

            completed += 1
            job = futures[future]

            try:

                # Individual job timeout
                result = future.result(timeout=model_timeout_seconds)
                trained_candidates.append(result)

                print(f"  [{completed}/{len(batch_jobs)}] Iteration {result['iteration']}: "
                      f"{result['metadata']['classifier_type']} AUC={result['val_auc_s1']:.6f} "
                      f"({result['training_time']:.1f}s)")

            except TimeoutError:

                failed_count += 1
                timeout_iterations.append(job[0])  # Store iteration number that timed out
                timeout_classifier = batch_classifier_types[job[0]]
                print(f"  [{completed}/{len(batch_jobs)}] Iteration {job[0]} TIMEOUT: {timeout_classifier} exceeded {MODEL_TIMEOUT_MINUTES} minutes")
            
            except Exception as e:

                failed_count += 1
                print(f"  [{completed}/{len(batch_jobs)}] Iteration {job[0]} FAILED: {e}")
    
    if failed_count > 0:
        print(f"\n{failed_count}/{len(batch_jobs)} models failed during training")
    
    # Log timeout iterations to database
    for timeout_iter in timeout_iterations:

        timeout_classifier = batch_classifier_types[timeout_iter]

        log_iteration(
            iteration=timeout_iter,
            accepted=False,
            rejection_reason="timeout",
            pipeline_hash="timeout",
            stage1_val_auc=0.0,
            stage2_val_auc=0.0,
            ensemble_size=len(ensemble_models),
            diversity_score=0.0,
            temperature=temperature,
            metadata={'classifier_type': timeout_classifier, 'transformers_used': []},
            ensemble_id=f"iter_{timeout_iter}",
            training_memory_mb=None,
            stage2_memory_mb=None,
            training_time_sec=None,
            stage2_time_sec=None,
            timeout=True
        )
    
    if len(trained_candidates) == 0:

        print(f"\nWARNING: All models in batch failed! Continuing to next batch...")
        iteration += len(batch_jobs)  # Move past failed iterations
        continue  # Continue to next batch instead of breaking
    
    batch_time = time.time() - batch_start_time
    print(f"\nBatch complete ({batch_time:.1f}s, {batch_time/len(trained_candidates):.1f}s per model)")
    
    # Sort by iteration number
    trained_candidates.sort(key=lambda x: x['iteration'])
    
    # Process each trained candidate for acceptance/rejection
    for result in trained_candidates:
        current_iter = result['iteration']
        fitted_pipeline = result['fitted_pipeline']
        metadata = result['metadata']
        val_auc_s1 = result['val_auc_s1']
        pipeline_hash = result['pipeline_hash']
        training_memory_mb = result.get('memory_mb', None)
        training_time_sec = result.get('training_time_sec', None)
        
        print(f"\n{'-' * 80}")
        print(f"Iteration {current_iter}: {metadata['classifier_type']} | Stage 1 AUC: {val_auc_s1:.6f}")
        
        # Evaluate ensemble with candidate
        if len(ensemble_models) == 0:

            # First model - just use its score
            candidate_score = val_auc_s1
            diversity_score = 0.0
            aggregation_method = "single_model"
            print(f"  Ensemble AUC ({aggregation_method}): {candidate_score:.6f} (first model)")

        else:

            # Evaluate as ensemble
            candidate_ensemble = ensemble_models + [fitted_pipeline]
            candidate_score, diversity_score, aggregation_method = evaluate_candidate_ensemble(
                candidate_ensemble, ensemble_models, stage2_model,
                X_val_s1, X_val_s2, y_val_s1, y_val_s2
            )

            print(f"  Ensemble AUC ({aggregation_method}): {candidate_score:.6f} | Diversity: {diversity_score:.6f}")
        
        # Simulated annealing acceptance (with diversity bonus)
        accept, reason = adaptive_simulated_annealing_acceptance(
            current_score=best_ensemble_score,
            candidate_score=candidate_score,
            temperature=temperature,
            random_state=RANDOM_STATE + current_iter,
            diversity_score=diversity_score
        )
        
        print(f"  Decision: {'ACCEPT' if accept else 'REJECT'} ({reason})")
        
        # Track stage 2 memory and time for this iteration
        stage2_memory_mb = None
        stage2_time_sec = None
        
        # Log iteration
        log_iteration(
            iteration=current_iter,
            accepted=accept,
            rejection_reason=reason,
            pipeline_hash=pipeline_hash,
            stage1_val_auc=val_auc_s1,
            stage2_val_auc=candidate_score,
            ensemble_size=len(ensemble_models) + 1 if accept else len(ensemble_models),
            diversity_score=diversity_score,
            temperature=temperature,
            metadata=metadata,
            ensemble_id=f"iter_{current_iter}",
            training_memory_mb=training_memory_mb,
            stage2_memory_mb=stage2_memory_mb,
            training_time_sec=training_time_sec,
            stage2_time_sec=stage2_time_sec,
            timeout=False
        )
        
        # Update ensemble if accepted
        if accept:
            ensemble_models.append(fitted_pipeline)
            
            # Save model
            model_path = ENSEMBLE_DIR / f'model_{current_iter}.joblib'
            joblib.dump(fitted_pipeline, model_path)
            
            # Check if we should train/retrain stage 2 DNN
            if len(ensemble_models) % STAGE2_BATCH_SIZE_MODELS == 0 and len(ensemble_models) > 0:

                stage2_model, final_score, stage2_memory_mb, stage2_time_sec = train_or_expand_stage2_model(
                    ensemble_models, stage2_model, X_val_s1, y_val_s1, X_val_s2, y_val_s2,
                    STAGE2_EPOCHS, STAGE2_BATCH_SIZE, STAGE2_PATIENCE, current_iter
                )
                
                # Save ensemble bundle checkpoint
                save_ensemble_bundle(
                    ensemble_models, stage2_model, best_ensemble_score, current_iter,
                    MODELS_DIR, RANDOM_STATE, BATCH_SIZE, N_WORKERS, base_preprocessor,
                    numerical_features, ordinal_features, nominal_features,
                    education_categories, income_categories
                )
            
            # Check if this is the best score
            if candidate_score > best_ensemble_score:

                print(f"  New best score: {candidate_score:.6f} (Δ={candidate_score - best_ensemble_score:.6f})")
                best_ensemble_score = candidate_score
                iterations_since_improvement = 0

            else:
                iterations_since_improvement += 1
        else:
            iterations_since_improvement += 1
        
        # Update temperature
        temperature = update_temperature(
            iteration=current_iter,
            acceptance_history=[accept],
            current_temperature=temperature,
            base_temperature=BASE_TEMPERATURE,
            decay_rate=TEMPERATURE_DECAY
        )
    
    # Move to next batch
    iteration += len(trained_candidates)
    
    # Check termination
    if iterations_since_improvement >= PLATEAU_ITERATIONS:
        print(f"TERMINATING: No improvement for {PLATEAU_ITERATIONS} iterations")
        break

# Calculate final acceptance rate and timeout rate
conn = ensemble_database.sqlite3.connect(ensemble_database.DB_PATH)
acceptance_stats = conn.execute("SELECT COUNT(*) as total, SUM(accepted) as accepted FROM ensemble_log WHERE iteration_num > 0").fetchone()
timeout_stats = conn.execute("SELECT SUM(timeout) as timeouts FROM ensemble_log WHERE iteration_num > 0").fetchone()
conn.close()

acceptance_rate = acceptance_stats[1] / acceptance_stats[0] if acceptance_stats[0] > 0 else 0.0
timeout_rate = timeout_stats[0] / acceptance_stats[0] if acceptance_stats[0] > 0 else 0.0

print(f"Final ensemble size: {len(ensemble_models)}")
print(f"Best ensemble AUC: {best_ensemble_score:.6f}")
print(f"Total iterations: {iteration - 1}")
print(f"Acceptance rate: {acceptance_rate:.1%}")
print(f"Timeout rate: {timeout_rate:.1%}")

BATCH Starting at iteration 1
Ensemble size: 0 | Best score: 0.571482 | Temperature: 0.050000 | No improvement: 0/100

Training 10 candidates in parallel (30 min timeout per model)...
[Iteration 1] Training lda + ratio, Sample size: 146651 rows (34.9%)
[Iteration 2] Training lasso + sum, difference, kmeans, Sample size: 80348 rows (19.1%)
[Iteration 3] Training logistic + None, Sample size: 115927 rows (27.6%)
[Iteration 4] Training gradient_boosting + None, Sample size: 167804 rows (40.0%)
  [1/10] Iteration 1 FAILED: The leading minor of order 15 of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed.
[Iteration 5] Training random_forest + iqr_clipper, log, quantile_transform, Sample size: 132008 rows (31.4%)
[Iteration 6] Training qda + ratio, kmeans, Sample size: 153629 rows (36.6%)
[Iteration 7] Training mlp + None, Sample size: 93984 rows (22.4%)
  [2/10] Iteration 3: logistic AUC=0.576894 (2.6s)
[Iteration 8]



  [4/10] Iteration 11: linear_svc AUC=0.583371 (16.7s)
  [5/10] Iteration 10: lda AUC=0.523824 (120.8s)
  [6/10] Iteration 4 TIMEOUT: random_forest exceeded 30 minutes
  [7/10] Iteration 5 TIMEOUT: linear_svc exceeded 30 minutes
  [8/10] Iteration 6 TIMEOUT: extra_trees exceeded 30 minutes
  [9/10] Iteration 8 TIMEOUT: qda exceeded 30 minutes
  [10/10] Iteration 13 TIMEOUT: extra_trees exceeded 30 minutes

6/10 models failed during training

Batch complete (1804.8s, 451.2s per model)

--------------------------------------------------------------------------------
Iteration 9: lda | Stage 1 AUC: 0.622577
  Ensemble AUC (simple mean (all)): 0.677543 | Diversity: 0.269871
  Decision: ACCEPT (diversity_bonus: Δ=-0.007803, div=0.270, bonus=0.011506)

--------------------------------------------------------------------------------
Iteration 10: lda | Stage 1 AUC: 0.523824
  Ensemble AUC (simple mean (all)): 0.676786 | Diversity: 0.202813
  Decision: ACCEPT (diversity_bonus: Δ=-0.008560, div

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


[Iteration 17] Training naive_bayes + None, Sample size: 157690 rows (37.5%)
  [3/10] Iteration 14: lasso AUC=0.591899 (2.6s)
  [4/10] Iteration 15 FAILED: Input contains NaN.
  [5/10] Iteration 17: naive_bayes AUC=0.559299 (2.7s)
  [6/10] Iteration 16: random_forest AUC=0.633499 (8.0s)




  [7/10] Iteration 11: linear_svc AUC=0.510807 (14.6s)
  [8/10] Iteration 10: lda AUC=0.551753 (92.7s)


## Save final checkpoint and bundle

In [None]:
# Save final checkpoint
save_checkpoint(
    checkpoint_path=CHECKPOINT_PATH,
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    iteration=iteration - 1,
    temperature=temperature,
    best_score=best_ensemble_score,
    acceptance_history=[],
    metadata={
        'total_iterations': iteration,
        'final_ensemble_size': len(ensemble_models),
        'acceptance_rate': acceptance_rate,
        'best_score': best_ensemble_score,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    }
)

# Save metadata
metadata_path = MODELS_DIR / 'ensemble_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump({
        'ensemble_size': len(ensemble_models),
        'total_iterations': iteration,
        'best_score': best_ensemble_score,
        'acceptance_rate': acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'random_state': RANDOM_STATE,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    }, f, indent=2)

print(f"\nCheckpoint saved: {CHECKPOINT_PATH}")
print(f"Metadata saved: {metadata_path}")

In [None]:
# Save final ensemble bundle for Kaggle
ensemble_bundle_path = MODELS_DIR / 'ensemble_bundle.joblib'

ensemble_bundle = {
    'ensemble_models': ensemble_models,
    'stage2_model': stage2_model,
    'metadata': {
        'ensemble_size': len(ensemble_models),
        'total_iterations': iteration,
        'best_score': best_ensemble_score,
        'acceptance_rate': acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'random_state': RANDOM_STATE,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    },
    'base_preprocessor': base_preprocessor,
    'feature_info': {
        'numerical_features': numerical_features,
        'ordinal_features': ordinal_features,
        'nominal_features': nominal_features,
        'education_categories': education_categories,
        'income_categories': income_categories
    }
}

joblib.dump(ensemble_bundle, ensemble_bundle_path, compress=3)

print(f"\nFinal ensemble bundle saved: {ensemble_bundle_path}")
print(f"File size: {ensemble_bundle_path.stat().st_size / (1024**2):.1f} MB")
print(f"\nTo load on Kaggle:")
print(f"  ensemble_bundle = joblib.load('ensemble_bundle.joblib')")
print(f"  ensemble_models = ensemble_bundle['ensemble_models']")
print(f"  stage2_model = ensemble_bundle['stage2_model']")

In [None]:
# Import the wrapper class
sys.path.insert(0, str(MODELS_BASE_DIR))

# Create wrapped model
wrapped_model = EnsembleClassifier(
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    aggregation='mean'  # Fallback if stage2_model is None
)

# Save as single joblib file
wrapped_model_path = MODELS_DIR / 'ensemble_model.joblib'
joblib.dump(wrapped_model, wrapped_model_path, compress=3)

print(f"\nWrapped ensemble model saved: {wrapped_model_path}")
print(f"File size: {wrapped_model_path.stat().st_size / (1024**2):.1f} MB")
print(f"\nModel info: {wrapped_model}")
print(f"\nTo use on Kaggle:")
print(f"  1. Upload to Kaggle dataset:")
print(f"     - {wrapped_model_path.name}")
print(f"     - {MODELS_BASE_DIR / 'ensemble_classifier.py'}")
print(f"  2. In inference notebook:")
print(f"     from ensemble_classifier import EnsembleClassifier")
print(f"     model = joblib.load('ensemble_model.joblib')")
print(f"     predictions = model.predict(test_df)")

## Create wrapped ensemble model for Kaggle

Create a sklearn-compatible wrapper that bundles the entire ensemble into a single classifier.
This makes inference identical to the logistic regression workflow.

## Summary

In [None]:
print(f"\n{'=' * 80}")
print("ENSEMBLE TRAINING SUMMARY")
print(f"{'=' * 80}")
print(f"\nFinal Statistics:")
print(f"  Ensemble size: {len(ensemble_models)}")
print(f"  Best validation AUC: {best_ensemble_score:.6f}")
print(f"  Total iterations: {iteration}")
print(f"  Acceptance rate: {acceptance_rate:.1%}")
print(f"  Parallel configuration: {BATCH_SIZE} candidates, {N_WORKERS} workers")
print(f"\nFiles created:")
print(f"  Database: {ensemble_database.DB_PATH}")
print(f"  Models: {ENSEMBLE_DIR}")
print(f"  Checkpoint: {CHECKPOINT_PATH}")
print(f"  Metadata: {metadata_path}")
print(f"  Bundle: {ensemble_bundle_path}")
print(f"\n{'=' * 80}")