## Setup and configuration

### Critical Fixes Applied:

1. **Pre-sample Training Data**: Sample training data in main process before sending to workers - workers receive only ~2.5-15% of full pool (much faster serialization)
2. **Keep DataFrames**: Data kept as DataFrames (required by sklearn ColumnTransformer for column name access)
3. **Founder Simplification**: Founder model establishes baseline score but is NOT added to ensemble, simplifying batch/iteration indexing
4. **Batch Size = N_CPUS**: Set batch size to match CPU count for maximum parallelization (20 workers training 20 models simultaneously)
5. **Enhanced Error Handling**: Better progress tracking and graceful handling of failed models
6. **Empty Ensemble Start**: Ensemble starts empty at iteration 1, avoiding index confusion
7. **Zero Features Protection**: ConstantFeatureRemover keeps at least 1 feature to prevent downstream errors

### Serialization Impact:
- **Before**: 5 workers √ó 40,000 rows √ó 30 cols = ~48 MB per batch
- **After**: 20 workers √ó ~2,400 rows √ó 30 cols = ~14 MB per batch (~70% reduction!)

### CPU Utilization Strategy:
- **Before**: 5 workers, each with multiple cores (only ~2 cores used total)
- **After**: 20 workers, each with 1 core = all 20 cores busy simultaneously

In [1]:
# Standard library imports
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count

# Disable GPU and limit threading
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# os.environ['OMP_NUM_THREADS'] = '2'  # DISABLED - Let sklearn control parallelism via n_jobs

# Third party imports
import joblib
import json
import numpy as np
import pandas as pd
import tensorflow as tf

# Add models directory to path for ensemble_classifier import
sys.path.insert(0, str(Path('../models').resolve()))

# Import ensemble modules
from ensemble_classifier import EnsembleClassifier
from functions import ensemble_database
from functions.ensemble_initialization import create_data_splits, create_base_preprocessor, train_founder_model
from functions.ensemble_parallel import train_single_candidate, prepare_training_batch
from functions.ensemble_evaluation import evaluate_candidate_ensemble
from functions.ensemble_stage2_training import train_or_expand_stage2_model, save_ensemble_bundle
from functions.ensemble_hill_climbing import (
    adaptive_simulated_annealing_acceptance,
    update_temperature,
    log_iteration
)
from functions.ensemble_stage2_model import save_checkpoint

# Configure TensorFlow
tf.get_logger().setLevel('ERROR')

# Detect available CPUs
n_cpus = cpu_count()
print(f"TensorFlow version: {tf.__version__}")
print(f"Available CPUs: {n_cpus}")
print(f"GPU disabled: CUDA drivers not available in dev container")
print(f"OMP_NUM_THREADS: Not set (sklearn controls parallelism via n_jobs)")


2025-12-07 18:08:22.000810: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765148902.026365 1058970 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765148902.037563 1058970 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TensorFlow version: 2.18.0
Available CPUs: 24
GPU disabled: CUDA drivers not available in dev container
OMP_NUM_THREADS: Not set (sklearn controls parallelism via n_jobs)


### Configuration parameters

In [None]:
# Random state for reproducibility
RANDOM_STATE = 315

# CPU allocation for parallel training
# Set to None to use all available CPUs, or specify a number to limit
N_CPUS = 20  # Will use all available cores by default

if N_CPUS is None:
    import multiprocessing
    N_CPUS = multiprocessing.cpu_count()

# Parallel training configuration
BATCH_SIZE = 20  # Train this many candidates in parallel (increased to match N_CPUS)
N_WORKERS = N_CPUS  # Use all available CPUs as workers
MODEL_TIMEOUT_MINUTES = 30  # Maximum time per model (minutes)

# Hill climbing configuration
MAX_ITERATIONS = 500
PLATEAU_ITERATIONS = 100
BASE_TEMPERATURE = 0.05  # Increased from 0.01 for better exploration
TEMPERATURE_DECAY = 0.998  # Slowed from 0.995 for sustained exploration

# Stage 2 DNN configuration
STAGE2_BATCH_SIZE_MODELS = 10  # Retrain DNN every N accepted models
STAGE2_EPOCHS = 100
STAGE2_BATCH_SIZE = 128
STAGE2_PATIENCE = 10

# Paths
DATA_DIR = Path('../data')
MODELS_BASE_DIR = Path('../models')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
MODELS_DIR = MODELS_BASE_DIR / f'run_{timestamp}'
ENSEMBLE_DIR = MODELS_DIR / 'ensemble_stage1_models'
CHECKPOINT_PATH = MODELS_DIR / 'ensemble_checkpoint.pkl'

# Create directories
DATA_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)
ENSEMBLE_DIR.mkdir(parents=True, exist_ok=True)

# Initialize database
ensemble_database.reset_database()
ensemble_database.init_database()

print(f"\nConfiguration:")
print(f"  Total CPUs available: {N_CPUS}")
print(f"  Parallel workers: {N_WORKERS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Model timeout: {MODEL_TIMEOUT_MINUTES} minutes")
print(f"  Note: Each worker trains ONE model. Models use internal parallelism via n_jobs=1")
print(f"        - This maximizes CPU utilization by training {N_WORKERS} models simultaneously")
print(f"        - Models that benefit from parallelism (RF, KNN) get n_jobs > 1 when few workers")

Deleted existing database: /mnt/arkk/kaggle/diabetes-prediction/data/ensemble_training.db
Database initialized at: /mnt/arkk/kaggle/diabetes-prediction/data/ensemble_training.db

Configuration:
  Total CPUs available: 20
  Parallel workers: 20
  Batch size: 20
  Model timeout: 30 minutes
  Note: Each worker trains ONE model. Models use internal parallelism via n_jobs=1
        - This maximizes CPU utilization by training 20 models simultaneously
        - Models that benefit from parallelism (RF, KNN) get n_jobs > 1 when few workers


## Data loading and preparation

In [3]:
# Load training data
train_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_train.csv'
train_df = pd.read_csv(train_df_path)
train_df.drop_duplicates(inplace=True)

print(f"Training data shape: {train_df.shape}")
print(f"Class distribution:")
print(train_df['diagnosed_diabetes'].value_counts(normalize=True))

# Define features
label = 'diagnosed_diabetes'
numerical_features = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
    'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
    'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides',
    'family_history_diabetes', 'hypertension_history', 'cardiovascular_history'
]
ordinal_features = ['education_level', 'income_level']
education_categories = [['No formal', 'Highschool', 'Graduate', 'Postgraduate']]
income_categories = [['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]
nominal_features = ['gender', 'ethnicity', 'smoking_status', 'employment_status']

Training data shape: (700000, 26)
Class distribution:
diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64


In [4]:
# Create fixed three-way data split
X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2 = create_data_splits(
    train_df, label, RANDOM_STATE
)


Fixed data split:
--------------------------------------------------------------------------------
  Training pool: 420,000 samples (60%)
  Stage 1 validation: 140,000 samples (20%) - for stage 1 eval & stage 2 training
  Stage 2 validation: 140,000 samples (20%) - for stage 2 eval (HELD OUT)


In [5]:
print("Data info before batch preparation:")
print(f"  X_train_pool: {type(X_train_pool)} - {X_train_pool.shape if hasattr(X_train_pool, 'shape') else 'N/A'}")
print(f"  y_train_pool: {type(y_train_pool)} - {y_train_pool.shape if hasattr(y_train_pool, 'shape') else 'N/A'}")
print(f"  X_val_s1: {type(X_val_s1)} - {X_val_s1.shape if hasattr(X_val_s1, 'shape') else 'N/A'}")
print(f"\nOptimizations:")
print(f"  1. Data kept as DataFrames (required by ColumnTransformer)")
print(f"  2. Each worker receives PRE-SAMPLED data (~2.5-27.5% of pool)")
print(f"  3. Estimated data per worker: ~{len(X_train_pool) * 0.15:.0f} rows (vs {len(X_train_pool)} full pool)")
print(f"  4. CPU strategy: Train {BATCH_SIZE} models in parallel, each with n_jobs=1")
print(f"     - Maximizes CPU utilization: {BATCH_SIZE} processes √ó 1 core = {BATCH_SIZE} cores used")

Data info before batch preparation:
  X_train_pool: <class 'pandas.core.frame.DataFrame'> - (420000, 25)
  y_train_pool: <class 'pandas.core.series.Series'> - (420000,)
  X_val_s1: <class 'pandas.core.frame.DataFrame'> - (140000, 25)

Optimizations:
  1. Data kept as DataFrames (required by ColumnTransformer)
  2. Each worker receives PRE-SAMPLED data (~2.5-27.5% of pool)
  3. Estimated data per worker: ~63000 rows (vs 420000 full pool)
  4. CPU strategy: Train 20 models in parallel, each with n_jobs=1
     - Maximizes CPU utilization: 20 processes √ó 1 core = 20 cores used


### Diagnostic: Check data types before parallel processing

In [6]:
# Create base preprocessor
base_preprocessor = create_base_preprocessor(
    numerical_features, ordinal_features, nominal_features,
    education_categories, income_categories
)


Base preprocessor created
  Numerical features: 18
  Ordinal features: 2
  Nominal features: 4


## Initialize ensemble with founder model

In [7]:
# Train founder model (baseline only - NOT added to ensemble)
founder_auc = train_founder_model(
    X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2,
    base_preprocessor, RANDOM_STATE, BASE_TEMPERATURE, ENSEMBLE_DIR
)

# Initialize ensemble (EMPTY - founder not included)
ensemble_models = []
stage2_model = None
best_ensemble_score = founder_auc

# Initialize hill climbing variables (start at iteration 1, not 0)
start_iteration = 1
temperature = BASE_TEMPERATURE

print(f"\nFounder baseline AUC: {founder_auc:.6f}")
print(f"Ensemble starts empty - first batch will be iterations 1-{BATCH_SIZE}")
print(f"Stage 2 DNN will be trained after {STAGE2_BATCH_SIZE_MODELS} accepted models")

TRAINING FOUNDER MODEL (baseline only - NOT added to ensemble)

Training founder model
--------------------------------------------------------------------------------
  Training samples: 42,000 (10% of 420,000 pool)
  Pipeline config:
    Classifier: random_forest
    Transformers: []
    Dimensionality reduction: factor_analysis
  Training pipeline...
  Training complete (15.5s)
  Stage 1 validation AUC: 0.592439
  Stage 2 validation AUC: 0.593767

FOUNDER MODEL COMPLETE - Baseline score established

Founder baseline AUC: 0.593767
Ensemble starts empty - first batch will be iterations 1-20
Stage 2 DNN will be trained after 10 accepted models


## Parallel hill climbing loop

Iteratively trains batches of candidate models in parallel, evaluates with hybrid scoring,
and accepts/rejects using simulated annealing.

In [None]:
print(f"\n{'=' * 80}")
print("STARTING PARALLEL HILL CLIMBING LOOP")
print(f"{'=' * 80}")
print(f"Batch size: {BATCH_SIZE} candidates trained in parallel")
print(f"Workers: {N_WORKERS} parallel processes")
print(f"Total CPUs: {N_CPUS} (distributed intelligently across models)")

iterations_since_improvement = 0
iteration = start_iteration

# Calculate timeout values in seconds
model_timeout_seconds = MODEL_TIMEOUT_MINUTES * 60
batch_timeout_seconds = model_timeout_seconds + 60  # Add 1 minute buffer for batch timeout

while iteration < MAX_ITERATIONS and iterations_since_improvement < PLATEAU_ITERATIONS:
    print(f"\n{'=' * 80}")
    print(f"BATCH Starting at iteration {iteration}")
    print(f"{'=' * 80}")
    print(f"Ensemble size: {len(ensemble_models)} | Best score: {best_ensemble_score:.6f} | "
          f"Temperature: {temperature:.6f} | No improvement: {iterations_since_improvement}/{PLATEAU_ITERATIONS}")
    
    # Prepare batch of training jobs with intelligent CPU allocation
    print(f"\nPreparing batch jobs...")
    batch_jobs = prepare_training_batch(
        iteration, BATCH_SIZE, MAX_ITERATIONS, X_train_pool, y_train_pool,
        X_val_s1, y_val_s1, base_preprocessor, RANDOM_STATE, total_cpus=N_CPUS,
        timeout_minutes=MODEL_TIMEOUT_MINUTES
    )
    print(f"Batch prepared: {len(batch_jobs)} jobs ready")
    
    # Pre-determine classifier types for this batch (for timeout logging)
    batch_classifier_types = {}
    for job in batch_jobs:
        job_iteration = job[0]
        rng = np.random.RandomState(RANDOM_STATE + job_iteration)
        classifier_pool = [
            'logistic', 'random_forest', 'linear_svc',
            'sgd_classifier', 'extra_trees', 'adaboost',
            'naive_bayes', 'lda', 'qda', 'ridge'
            # TEMPORARILY DISABLED (too slow):
            # 'gradient_boosting', 'mlp', 'knn'
        ]
        batch_classifier_types[job_iteration] = rng.choice(classifier_pool)
    
    print(f"\nTraining {len(batch_jobs)} candidates in parallel ({MODEL_TIMEOUT_MINUTES} min timeout per model)...")
    batch_start_time = time.time()
    
    # Train candidates in parallel with timeout
    trained_candidates = []
    failed_count = 0
    timeout_iterations = []  # Track which iterations timed out
    with ProcessPoolExecutor(max_workers=N_WORKERS) as executor:
        futures = {executor.submit(train_single_candidate, job): job for job in batch_jobs}
        
        completed = 0
        for future in as_completed(futures, timeout=batch_timeout_seconds):
            completed += 1
            job = futures[future]
            try:
                # Individual job timeout
                result = future.result(timeout=model_timeout_seconds)
                trained_candidates.append(result)
                print(f"  [{completed}/{len(batch_jobs)}] ‚úì Iteration {result['iteration']}: "
                      f"{result['metadata']['classifier_type']} AUC={result['val_auc_s1']:.6f} "
                      f"({result['training_time']:.1f}s)")
            except TimeoutError:
                failed_count += 1
                timeout_iterations.append(job[0])  # Store iteration number that timed out
                timeout_classifier = batch_classifier_types[job[0]]
                print(f"  [{completed}/{len(batch_jobs)}] ‚úó Iteration {job[0]} TIMEOUT: {timeout_classifier} exceeded {MODEL_TIMEOUT_MINUTES} minutes")
            except Exception as e:
                failed_count += 1
                print(f"  [{completed}/{len(batch_jobs)}] ‚úó Iteration {job[0]} FAILED: {e}")
    
    if failed_count > 0:
        print(f"\n‚ö†Ô∏è  {failed_count}/{len(batch_jobs)} models failed during training")
    
    # Log timeout iterations to database
    for timeout_iter in timeout_iterations:
        timeout_classifier = batch_classifier_types[timeout_iter]
        log_iteration(
            iteration=timeout_iter,
            accepted=False,
            rejection_reason="timeout",
            pipeline_hash="timeout",
            stage1_val_auc=0.0,
            stage2_val_auc=0.0,
            ensemble_size=len(ensemble_models),
            diversity_score=0.0,
            temperature=temperature,
            metadata={'classifier_type': timeout_classifier, 'transformers_used': []},
            ensemble_id=f"iter_{timeout_iter}",
            training_memory_mb=None,
            stage2_memory_mb=None,
            training_time_sec=None,
            stage2_time_sec=None,
            timeout=True
        )
    
    if len(trained_candidates) == 0:
        print(f"\n‚ö†Ô∏è  WARNING: All models in batch failed! Continuing to next batch...")
        iteration += len(batch_jobs)  # Move past failed iterations
        continue  # Continue to next batch instead of breaking
    
    batch_time = time.time() - batch_start_time
    print(f"\nBatch complete ({batch_time:.1f}s, {batch_time/len(trained_candidates):.1f}s per model)")
    
    # Sort by iteration number
    trained_candidates.sort(key=lambda x: x['iteration'])
    
    # Process each trained candidate for acceptance/rejection
    for result in trained_candidates:
        current_iter = result['iteration']
        fitted_pipeline = result['fitted_pipeline']
        metadata = result['metadata']
        val_auc_s1 = result['val_auc_s1']
        pipeline_hash = result['pipeline_hash']
        training_memory_mb = result.get('memory_mb', None)
        training_time_sec = result.get('training_time_sec', None)
        
        print(f"\n{'-' * 80}")
        print(f"Iteration {current_iter}: {metadata['classifier_type']} | Stage 1 AUC: {val_auc_s1:.6f}")
        
        # Evaluate ensemble with candidate
        if len(ensemble_models) == 0:
            # First model - just use its score
            candidate_score = val_auc_s1
            diversity_score = 0.0
            aggregation_method = "single_model"
            print(f"  Ensemble AUC ({aggregation_method}): {candidate_score:.6f} (first model)")
        else:
            # Evaluate as ensemble
            candidate_ensemble = ensemble_models + [fitted_pipeline]
            candidate_score, diversity_score, aggregation_method = evaluate_candidate_ensemble(
                candidate_ensemble, ensemble_models, stage2_model,
                X_val_s1, X_val_s2, y_val_s1, y_val_s2
            )
            print(f"  Ensemble AUC ({aggregation_method}): {candidate_score:.6f} | Diversity: {diversity_score:.6f}")
        
        # Simulated annealing acceptance (with diversity bonus)
        accept, reason = adaptive_simulated_annealing_acceptance(
            current_score=best_ensemble_score,
            candidate_score=candidate_score,
            temperature=temperature,
            random_state=RANDOM_STATE + current_iter,
            diversity_score=diversity_score
        )
        
        print(f"  Decision: {'‚úì ACCEPT' if accept else '‚úó REJECT'} ({reason})")
        
        # Track stage 2 memory and time for this iteration
        stage2_memory_mb = None
        stage2_time_sec = None
        
        # Log iteration
        log_iteration(
            iteration=current_iter,
            accepted=accept,
            rejection_reason=reason,
            pipeline_hash=pipeline_hash,
            stage1_val_auc=val_auc_s1,
            stage2_val_auc=candidate_score,
            ensemble_size=len(ensemble_models) + 1 if accept else len(ensemble_models),
            diversity_score=diversity_score,
            temperature=temperature,
            metadata=metadata,
            ensemble_id=f"iter_{current_iter}",
            training_memory_mb=training_memory_mb,
            stage2_memory_mb=stage2_memory_mb,
            training_time_sec=training_time_sec,
            stage2_time_sec=stage2_time_sec,
            timeout=False
        )
        
        # Update ensemble if accepted
        if accept:
            ensemble_models.append(fitted_pipeline)
            
            # Save model
            model_path = ENSEMBLE_DIR / f'model_{current_iter}.joblib'
            joblib.dump(fitted_pipeline, model_path)
            
            # Check if we should train/retrain stage 2 DNN
            if len(ensemble_models) % STAGE2_BATCH_SIZE_MODELS == 0 and len(ensemble_models) > 0:
                stage2_model, final_score, stage2_memory_mb, stage2_time_sec = train_or_expand_stage2_model(
                    ensemble_models, stage2_model, X_val_s1, y_val_s1, X_val_s2, y_val_s2,
                    STAGE2_EPOCHS, STAGE2_BATCH_SIZE, STAGE2_PATIENCE, current_iter
                )
                
                # Save ensemble bundle checkpoint
                save_ensemble_bundle(
                    ensemble_models, stage2_model, best_ensemble_score, current_iter,
                    MODELS_DIR, RANDOM_STATE, BATCH_SIZE, N_WORKERS, base_preprocessor,
                    numerical_features, ordinal_features, nominal_features,
                    education_categories, income_categories
                )
                print(f"{'=' * 80}\n")
            
            # Check if this is the best score
            if candidate_score > best_ensemble_score:
                print(f"  üéâ New best score: {candidate_score:.6f} (Œî={candidate_score - best_ensemble_score:.6f})")
                best_ensemble_score = candidate_score
                iterations_since_improvement = 0
            else:
                iterations_since_improvement += 1
        else:
            iterations_since_improvement += 1
        
        # Update temperature
        temperature = update_temperature(
            iteration=current_iter,
            acceptance_history=[accept],
            current_temperature=temperature,
            base_temperature=BASE_TEMPERATURE,
            decay_rate=TEMPERATURE_DECAY
        )
    
    # Move to next batch
    iteration += len(trained_candidates)
    
    # Check termination
    if iterations_since_improvement >= PLATEAU_ITERATIONS:
        print(f"\n{'=' * 80}")
        print(f"TERMINATING: No improvement for {PLATEAU_ITERATIONS} iterations")
        print(f"{'=' * 80}")
        break

# Calculate final acceptance rate and timeout rate
conn = ensemble_database.sqlite3.connect(ensemble_database.DB_PATH)
acceptance_stats = conn.execute("SELECT COUNT(*) as total, SUM(accepted) as accepted FROM ensemble_log WHERE iteration_num > 0").fetchone()
timeout_stats = conn.execute("SELECT SUM(timeout) as timeouts FROM ensemble_log WHERE iteration_num > 0").fetchone()
conn.close()
acceptance_rate = acceptance_stats[1] / acceptance_stats[0] if acceptance_stats[0] > 0 else 0.0
timeout_rate = timeout_stats[0] / acceptance_stats[0] if acceptance_stats[0] > 0 else 0.0

print(f"\n{'=' * 80}")
print("HILL CLIMBING COMPLETE")
print(f"{'=' * 80}")
print(f"Final ensemble size: {len(ensemble_models)}")
print(f"Best ensemble AUC: {best_ensemble_score:.6f}")
print(f"Total iterations: {iteration - 1}")
print(f"Acceptance rate: {acceptance_rate:.1%}")
print(f"Timeout rate: {timeout_rate:.1%}")


STARTING PARALLEL HILL CLIMBING LOOP
Batch size: 20 candidates trained in parallel
Workers: 20 parallel processes
Total CPUs: 20 (distributed intelligently across models)

BATCH Starting at iteration 1
Ensemble size: 0 | Best score: 0.593767 | Temperature: 0.010000 | No improvement: 0/100

Preparing batch jobs...
Batch prepared: 20 jobs ready

Training 20 candidates in parallel (30 min timeout per model)...
Batch prepared: 20 jobs ready

Training 20 candidates in parallel (30 min timeout per model)...

[Iteration 1] Training qda
  Sample size: 53215 rows (12.0%)
  Feature sampling: 63.2%
  Transformers: ratio

[Iteration 1] Training qda
  Sample size: 53215 rows (12.0%)
  Feature sampling: 63.2%
  Transformers: ratio

[Iteration 2] Training logistic_regression
  Sample size: 22826 rows (9.2%)
  Feature sampling: 42.2%
  Transformers: sum, noise_injector, rbf_sampler

[Iteration 2] Training logistic_regression
  Sample size: 22826 rows (9.2%)
  Feature sampling: 42.2%
  Transformers: s




[Iteration 24] Training extra_trees

  Sample size: 42079 rows (6.4%)
  Feature sampling: 55.5%
  Sample size: 42079 rows (6.4%)
  Feature sampling: 55.5%
  Transformers: None
  Transformers: None

[Iteration 25] Training lda
  Sample size: 21501 rows (1.7%)
[Iteration 25] Training lda
  Sample size: 21501 rows (1.7%)
  Feature sampling: 41.3%

  Feature sampling: 41.3%
  Transformers: quantile_transform, kde
  Transformers: quantile_transform, kde
  [3/20] ‚úì Iteration 17: linear_svc AUC=0.551956 (2.5s)
  [4/20] ‚úì Iteration 20: logistic_regression AUC=0.642634 (1.9s)
  [3/20] ‚úì Iteration 17: linear_svc AUC=0.551956 (2.5s)
  [4/20] ‚úì Iteration 20: logistic_regression AUC=0.642634 (1.9s)

[Iteration 26] Training linear_svc
  Sample size: 20563 rows (12.1%)
[Iteration 26] Training linear_svc
  Sample size: 20563 rows (12.1%)
  Feature sampling: 40.6%

  Feature sampling: 40.6%
  Transformers: kmeans, quantile_transform, standard_scaler  Transformers: kmeans, quantile_transform, s



  [5/20] ‚úì Iteration 18: linear_svc AUC=0.592933 (3.6s)
  [6/20] ‚úì Iteration 23: qda AUC=0.565642 (2.0s)

[Iteration 29] Training adaboost
[Iteration 29] Training adaboost
  Sample size: 23376 rows (13.2%)

  Sample size: 23376 rows (13.2%)
  Feature sampling: 42.6%
  Transformers: kmeans, log
  Feature sampling: 42.6%
  Transformers: kmeans, log

[Iteration 30] Training extra_trees
  Sample size: 26671 rows (1.6%)

[Iteration 30] Training extra_trees
  Sample size: 26671 rows (1.6%)
  Feature sampling: 44.8%
  Transformers: square, difference  Feature sampling: 44.8%
  Transformers: square, difference


[Iteration 31] Training linear_svc
  Sample size: 39531 rows (1.5%)
[Iteration 31] Training linear_svc
  Sample size: 39531 rows (1.5%)
  Feature sampling: 53.7%
  Transformers: None
  Feature sampling: 53.7%
  Transformers: None


[Iteration 32] Training sgd_classifier

[Iteration 32] Training sgd_classifier
  Sample size: 20638 rows (12.8%)
  Feature sampling: 40.7%  Sample size:

2025-12-07 19:10:36.194375: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected



  Stage 2 DNN trained!
  DNN ensemble AUC: 0.651464
  Memory used: 35.3 MB
  Time elapsed: 34.4s
  Bundle checkpoint saved: ensemble_bundle_iter_16.joblib (0.1 MB)

  üéâ New best score: 0.652453 (Œî=0.000196)

--------------------------------------------------------------------------------
Iteration 17: linear_svc | Stage 1 AUC: 0.551956
  Ensemble AUC (hybrid (DNN√ó10 + mean√ó1)): 0.651298 | Diversity: 0.332162
  Decision: ‚úó REJECT (rejected: Œî=-0.001155, P=0.882367)

--------------------------------------------------------------------------------
Iteration 18: linear_svc | Stage 1 AUC: 0.592933
  Ensemble AUC (hybrid (DNN√ó10 + mean√ó1)): 0.651298 | Diversity: 0.332162
  Decision: ‚úó REJECT (rejected: Œî=-0.001155, P=0.882367)

--------------------------------------------------------------------------------
Iteration 18: linear_svc | Stage 1 AUC: 0.592933
  Ensemble AUC (hybrid (DNN√ó10 + mean√ó1)): 0.650961 | Diversity: 0.363477
  Decision: ‚úì ACCEPT (simulated_annealing: Œî




[Iteration 24] Training extra_trees
  Sample size: 42079 rows (6.4%)
  Feature sampling: 55.5%
  Sample size: 42079 rows (6.4%)
  Feature sampling: 55.5%
  Transformers: None

  Transformers: None

[Iteration 25] Training lda

[Iteration 25] Training lda
  Sample size: 21501 rows (1.7%)
  Sample size: 21501 rows (1.7%)
  Feature sampling: 41.3%
  Transformers: quantile_transform, kde  Feature sampling: 41.3%
  Transformers: quantile_transform, kde


[Iteration 26] Training linear_svc
[Iteration 26] Training linear_svc
  Sample size: 20563 rows (12.1%)
  Feature sampling: 40.6%
  Transformers: kmeans, quantile_transform, standard_scaler

  Sample size: 20563 rows (12.1%)
  Feature sampling: 40.6%
  Transformers: kmeans, quantile_transform, standard_scaler

[Iteration 27] Training sgd_classifier
  Sample size: 9005 rows (10.8%)
  Feature sampling: 32.6%
  Transformers: quantile_transform, binning, kde

[Iteration 27] Training sgd_classifier
  Sample size: 9005 rows (10.8%)
  Feature sam



  Sample size: 9584 rows (12.9%)





  Feature sampling: 33.0%




  Transformers: kmeans  Transformers: kmeans





[Iteration 29] Training adaboost
  Sample size: 23376 rows (13.2%)

[Iteration 29] Training adaboost
  Sample size: 23376 rows (13.2%)
  Feature sampling: 42.6%
  Transformers: kmeans, log
  Feature sampling: 42.6%
  Transformers: kmeans, log

[Iteration 30] Training extra_trees
  Sample size: 26671 rows (1.6%)
  Feature sampling: 44.8%
  Transformers: square, difference

[Iteration 30] Training extra_trees
  Sample size: 26671 rows (1.6%)
  Feature sampling: 44.8%
  Transformers: square, difference

[Iteration 31] Training linear_svc
  Sample size: 39531 rows (1.5%)

[Iteration 31] Training linear_svc
  Sample size: 39531 rows (1.5%)
  Feature sampling: 53.7%
  Transformers: None
  Feature sampling: 53.7%
  Transformers: None
  [2/20] ‚úì Iteration 22: naive_bayes AUC=0.588946 (2.0s)
  [2/20] ‚úì Iteration 22: naive_bayes AUC=0.588946 (2.0s)

[Iteration 32] Training sgd_classifier
  Sample size: 20638 rows (12.8%)
  Feature sampling: 40.7%
  Transformers: None

[Iteration 32] Traini



  [6/20] ‚úì Iteration 38: naive_bayes AUC=0.602484 (2.2s)
  [7/20] ‚úì Iteration 41: lda AUC=0.594546 (1.9s)
  [7/20] ‚úì Iteration 41: lda AUC=0.594546 (1.9s)
  [8/20] ‚úì Iteration 35: linear_svc AUC=0.557449 (5.5s)
  [8/20] ‚úì Iteration 35: linear_svc AUC=0.557449 (5.5s)
  [9/20] ‚úì Iteration 39: linear_svc AUC=0.520385 (5.1s)
  [9/20] ‚úì Iteration 39: linear_svc AUC=0.520385 (5.1s)
  [10/20] ‚úó Iteration 28 FAILED: Process terminated without result (iteration 28)
  [10/20] ‚úó Iteration 28 FAILED: Process terminated without result (iteration 28)
  [11/20] ‚úó Iteration 29 FAILED: Process terminated without result (iteration 29)
  [11/20] ‚úó Iteration 29 FAILED: Process terminated without result (iteration 29)
  [12/20] ‚úó Iteration 33 FAILED: Process terminated without result (iteration 33)
  [12/20] ‚úó Iteration 33 FAILED: Process terminated without result (iteration 33)
  [13/20] ‚úó Iteration 26 FAILED: Process terminated without result (iteration 26)
  [13/20] ‚úó Itera




[Iteration 42] Training adaboost
  Sample size: 47018 rows (9.9%)
  Feature sampling: 58.9%
  Transformers: None

[Iteration 43] Training extra_trees  [5/20] ‚úì Iteration 38: naive_bayes AUC=0.595087 (1.8s)

  Sample size: 46461 rows (13.0%)
  Feature sampling: 58.5%
  Transformers: None

[Iteration 44] Training lda
  Sample size: 56009 rows (4.8%)
  Feature sampling: 65.2%
  Transformers: sqrt, noise_injector, rbf_sampler

[Iteration 45] Training sgd_classifier
  Sample size: 33996 rows (4.0%)
  Feature sampling: 49.9%
  Transformers: power_transform

[Iteration 46] Training extra_trees
  Sample size: 11613 rows (9.2%)
  Feature sampling: 34.4%
  Transformers: product
  [6/20] ‚úì Iteration 41: lda AUC=0.623516 (2.9s)

[Iteration 47] Training sgd_classifier
  Sample size: 6171 rows (13.3%)
  Feature sampling: 30.6%
  Transformers: kmeans, sum, noise_injector

[Iteration 48] Training logistic_regression
  Sample size: 11337 rows (11.3%)
  Feature sampling: 34.2%
  Transformers: kde, 

## Save final checkpoint and bundle

In [None]:
# Save final checkpoint
save_checkpoint(
    checkpoint_path=CHECKPOINT_PATH,
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    iteration=iteration - 1,
    temperature=temperature,
    best_score=best_ensemble_score,
    acceptance_history=[],
    metadata={
        'total_iterations': iteration,
        'final_ensemble_size': len(ensemble_models),
        'acceptance_rate': acceptance_rate,
        'best_score': best_ensemble_score,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    }
)

# Save metadata
metadata_path = MODELS_DIR / 'ensemble_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump({
        'ensemble_size': len(ensemble_models),
        'total_iterations': iteration,
        'best_score': best_ensemble_score,
        'acceptance_rate': acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'random_state': RANDOM_STATE,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    }, f, indent=2)

print(f"\nCheckpoint saved: {CHECKPOINT_PATH}")
print(f"Metadata saved: {metadata_path}")

In [None]:
# Save final ensemble bundle for Kaggle
ensemble_bundle_path = MODELS_DIR / 'ensemble_bundle.joblib'

ensemble_bundle = {
    'ensemble_models': ensemble_models,
    'stage2_model': stage2_model,
    'metadata': {
        'ensemble_size': len(ensemble_models),
        'total_iterations': iteration,
        'best_score': best_ensemble_score,
        'acceptance_rate': acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'random_state': RANDOM_STATE,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    },
    'base_preprocessor': base_preprocessor,
    'feature_info': {
        'numerical_features': numerical_features,
        'ordinal_features': ordinal_features,
        'nominal_features': nominal_features,
        'education_categories': education_categories,
        'income_categories': income_categories
    }
}

joblib.dump(ensemble_bundle, ensemble_bundle_path, compress=3)

print(f"\nFinal ensemble bundle saved: {ensemble_bundle_path}")
print(f"File size: {ensemble_bundle_path.stat().st_size / (1024**2):.1f} MB")
print(f"\nTo load on Kaggle:")
print(f"  ensemble_bundle = joblib.load('ensemble_bundle.joblib')")
print(f"  ensemble_models = ensemble_bundle['ensemble_models']")
print(f"  stage2_model = ensemble_bundle['stage2_model']")

In [None]:
# Import the wrapper class
sys.path.insert(0, str(MODELS_BASE_DIR))

# Create wrapped model
wrapped_model = EnsembleClassifier(
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    aggregation='mean'  # Fallback if stage2_model is None
)

# Save as single joblib file
wrapped_model_path = MODELS_DIR / 'ensemble_model.joblib'
joblib.dump(wrapped_model, wrapped_model_path, compress=3)

print(f"\nWrapped ensemble model saved: {wrapped_model_path}")
print(f"File size: {wrapped_model_path.stat().st_size / (1024**2):.1f} MB")
print(f"\nModel info: {wrapped_model}")
print(f"\nTo use on Kaggle:")
print(f"  1. Upload to Kaggle dataset:")
print(f"     - {wrapped_model_path.name}")
print(f"     - {MODELS_BASE_DIR / 'ensemble_classifier.py'}")
print(f"  2. In inference notebook:")
print(f"     from ensemble_classifier import EnsembleClassifier")
print(f"     model = joblib.load('ensemble_model.joblib')")
print(f"     predictions = model.predict(test_df)")

## Create wrapped ensemble model for Kaggle

Create a sklearn-compatible wrapper that bundles the entire ensemble into a single classifier.
This makes inference identical to the logistic regression workflow.

## Summary

In [None]:
print(f"\n{'=' * 80}")
print("ENSEMBLE TRAINING SUMMARY")
print(f"{'=' * 80}")
print(f"\nFinal Statistics:")
print(f"  Ensemble size: {len(ensemble_models)}")
print(f"  Best validation AUC: {best_ensemble_score:.6f}")
print(f"  Total iterations: {iteration}")
print(f"  Acceptance rate: {acceptance_rate:.1%}")
print(f"  Parallel configuration: {BATCH_SIZE} candidates, {N_WORKERS} workers")
print(f"\nFiles created:")
print(f"  Database: {ensemble_database.DB_PATH}")
print(f"  Models: {ENSEMBLE_DIR}")
print(f"  Checkpoint: {CHECKPOINT_PATH}")
print(f"  Metadata: {metadata_path}")
print(f"  Bundle: {ensemble_bundle_path}")
print(f"\n{'=' * 80}")