## Setup and configuration

### Imports

In [1]:
# Standard library imports
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count

# GPU Configuration - Use P100 (GPU 1) for Stage 2 DNN training
# Set library path for CUDA libraries installed via pip
venv_cuda_libs = '/mnt/arkk/kaggle/diabetes-prediction/.venv/lib/python3.12/site-packages/nvidia/cudnn/lib'

if 'LD_LIBRARY_PATH' in os.environ:
    os.environ['LD_LIBRARY_PATH'] = f"{os.environ['LD_LIBRARY_PATH']}:{venv_cuda_libs}"

else:
    os.environ['LD_LIBRARY_PATH'] = venv_cuda_libs

os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # 0=GTX1080, 1=P100
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Third party imports
import joblib
import json
import numpy as np
import pandas as pd
import tensorflow as tf

# Add models directory to path for ensemble_classifier import
sys.path.insert(0, str(Path('../models').resolve()))

# Import ensemble modules
from ensemble_classifier import EnsembleClassifier
from functions import ensemble_database
from functions.ensemble_initialization import create_data_splits, create_base_preprocessor, train_founder_model
from functions.ensemble_parallel import train_single_candidate, prepare_training_batch
from functions.ensemble_evaluation import evaluate_candidate_ensemble
from functions.ensemble_stage2_training import train_or_expand_stage2_model, save_ensemble_bundle
from functions.ensemble_hill_climbing import (
    adaptive_simulated_annealing_acceptance,
    update_temperature,
    log_iteration
)
from functions.ensemble_stage2_model import save_checkpoint

# Configure TensorFlow
tf.get_logger().setLevel('ERROR')

# Detect available CPUs
n_cpus = cpu_count()
print(f"TensorFlow version: {tf.__version__}")
print(f"Available CPUs: {n_cpus}")

# Check GPU
gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs available: {gpus}")
print(f"Number of GPUs: {len(gpus)}")

2025-12-08 02:32:40.204420: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765179160.228091 1192274 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765179160.235381 1192274 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TensorFlow version: 2.18.0
Available CPUs: 24
GPUs available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Number of GPUs: 1


### Ensemble training & hill climb parameters

In [2]:
# Random state for reproducibility
RANDOM_STATE = 315

# CPU allocation for parallel training
# Set to None to use all available CPUs, or specify a number to limit
N_CPUS = 20

if N_CPUS is None:
    import multiprocessing
    N_CPUS = multiprocessing.cpu_count()

# Parallel training configuration
BATCH_SIZE = 20                  # Train this many candidates in parallel
N_WORKERS = N_CPUS               # Use all available CPUs as workers
MODEL_TIMEOUT_MINUTES = 2 * 60   # Maximum training time per model (minutes)

# Hill climbing configuration
MAX_ITERATIONS = 1000
PLATEAU_ITERATIONS = 100
BASE_TEMPERATURE = 0.05
TEMPERATURE_DECAY = 0.998

# Stage 2 DNN configuration
STAGE2_BATCH_SIZE_MODELS = 20  # Retrain DNN every N accepted models
STAGE2_EPOCHS = 100
STAGE2_BATCH_SIZE = 128
STAGE2_PATIENCE = 10

# Paths
DATA_DIR = Path('../data')
MODELS_BASE_DIR = Path('../models')
LOGS_DIR = Path('../logs')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
MODELS_DIR = MODELS_BASE_DIR / f'run_{timestamp}'
ENSEMBLE_DIR = MODELS_DIR / 'ensemble_stage1_models'
CHECKPOINT_PATH = MODELS_DIR / 'ensemble_checkpoint.pkl'
LOG_FILE = LOGS_DIR / f'training_{timestamp}.log'

# Create directories
DATA_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)
ENSEMBLE_DIR.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(parents=True, exist_ok=True)

print(f"\nConfiguration:")
print(f"  Total CPUs available: {N_CPUS}")
print(f"  Parallel workers: {N_WORKERS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Model timeout: {MODEL_TIMEOUT_MINUTES} minutes")
print(f"  Log file: {LOG_FILE}")


Configuration:
  Total CPUs available: 20
  Parallel workers: 20
  Batch size: 20
  Model timeout: 120 minutes


### Initialize progress dashboard database

In [3]:
ensemble_database.reset_database()
ensemble_database.init_database()

Deleted existing database: /mnt/arkk/kaggle/diabetes-prediction/data/ensemble_training.db
Database initialized at: /mnt/arkk/kaggle/diabetes-prediction/data/ensemble_training.db


## Data preparation

### Load data

In [4]:
# Load training data
train_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_train.csv'
train_df = pd.read_csv(train_df_path)
train_df.drop_duplicates(inplace=True)

print(f'Training data shape: {train_df.shape}\n')
print('Class distribution:')
print(train_df['diagnosed_diabetes'].value_counts(normalize=True))

Training data shape: (700000, 26)

Class distribution:
diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64


### Define features

In [5]:
label = 'diagnosed_diabetes'

numerical_features = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
    'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
    'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides',
    'family_history_diabetes', 'hypertension_history', 'cardiovascular_history'
]

ordinal_features = ['education_level', 'income_level']
education_categories = [['No formal', 'Highschool', 'Graduate', 'Postgraduate']]
income_categories = [['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]
nominal_features = ['gender', 'ethnicity', 'smoking_status', 'employment_status']

### Trainining-validation-validation split

In [6]:
# Create fixed three-way data split
X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2 = create_data_splits(
    train_df, label, RANDOM_STATE
)

print("Data info before batch preparation:")
print(f"  X_train_pool: {type(X_train_pool)} - {X_train_pool.shape if hasattr(X_train_pool, 'shape') else 'N/A'}")
print(f"  X_val_s1: {type(X_val_s1)} - {X_val_s1.shape if hasattr(X_val_s1, 'shape') else 'N/A'}")
print(f"  X_val_s2: {type(y_val_s2)} - {y_val_s2.shape if hasattr(y_val_s2, 'shape') else 'N/A'}")

Data info before batch preparation:
  X_train_pool: <class 'pandas.core.frame.DataFrame'> - (420000, 25)
  X_val_s1: <class 'pandas.core.frame.DataFrame'> - (140000, 25)
  X_val_s2: <class 'pandas.core.series.Series'> - (140000,)


### Base data preprocessor

In [7]:
# Create base preprocessor
base_preprocessor = create_base_preprocessor(
    numerical_features, ordinal_features, nominal_features,
    education_categories, income_categories
)

base_preprocessor


Base preprocessor created
  Numerical features: 18
  Ordinal features: 2
  Nominal features: 4


## Initialize ensemble with founder model

In [8]:
# Train founder model (baseline only - NOT added to ensemble)
founder_auc = train_founder_model(
    X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2,
    base_preprocessor, RANDOM_STATE, BASE_TEMPERATURE, ENSEMBLE_DIR
)

# Initialize ensemble (EMPTY - founder not included)
ensemble_models = []
stage2_model = None
best_ensemble_score = founder_auc

# Initialize hill climbing variables (start at iteration 1, not 0)
start_iteration = 1
temperature = BASE_TEMPERATURE

print(f"\nFounder baseline AUC: {founder_auc:.6f}")
print(f"Ensemble starts empty - first batch will be iterations 1-{BATCH_SIZE}")
print(f"Stage 2 DNN will be trained after {STAGE2_BATCH_SIZE_MODELS} accepted models")

TRAINING FOUNDER MODEL (baseline only - NOT added to ensemble)

Training founder model
--------------------------------------------------------------------------------
  Training samples: 42,000 (10% of 420,000 pool)
  Pipeline config:
    Classifier: naive_bayes
    Transformers: []
    Dimensionality reduction: fast_ica
  Training pipeline...

Training founder model
--------------------------------------------------------------------------------
  Training samples: 42,000 (10% of 420,000 pool)
  Pipeline config:
    Classifier: naive_bayes
    Transformers: []
    Dimensionality reduction: fast_ica
  Training pipeline...
  Training complete (227.9s)
  Training complete (227.9s)
  Stage 1 validation AUC: 0.578409
  Stage 2 validation AUC: 0.580149

FOUNDER MODEL COMPLETE - Baseline score established

Founder baseline AUC: 0.580149
Ensemble starts empty - first batch will be iterations 1-20
Stage 2 DNN will be trained after 20 accepted models
  Stage 1 validation AUC: 0.578409
  Stage 

## Parallel hill climbing loop

Iteratively trains batches of candidate models in parallel, evaluates with hybrid scoring,
and accepts/rejects using simulated annealing.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import joblib
import logging
from datetime import datetime
from pathlib import Path
from sklearn.metrics import roc_auc_score

# Add functions directory to path
sys.path.insert(0, str(Path.cwd() / 'functions'))

from functions.ensemble_initialization import initialize_founder_model
from functions.ensemble_hill_climbing import (
    run_hill_climbing_iteration, update_temperature, compute_pipeline_hash, log_iteration
)
from functions.ensemble_parallel import train_candidates_parallel
from functions.ensemble_config import (
    BASE_TEMPERATURE, TEMPERATURE_DECAY, PLATEAU_ITERATIONS, TIMEOUT_THRESHOLD_SEC,
    STAGE2_BATCH_SIZE_MODELS, STAGE2_EPOCHS, STAGE2_BATCH_SIZE, STAGE2_PATIENCE
)
from functions.ensemble_stage2_training import train_or_expand_stage2_model, save_ensemble_bundle
from functions import ensemble_database

# Get feature info
numerical_features = X_train_diabetes.select_dtypes(include=[np.number]).columns.tolist()
ordinal_features = ['education', 'income']
nominal_features = [col for col in X_train_diabetes.columns if col not in numerical_features + ordinal_features]

education_categories = ['Never attended school or only kindergarten', 
                       'Grades 1 through 8 (Elementary)', 
                       'Grades 9 through 11 (Some high school)', 
                       'Grade 12 or GED (High school graduate)', 
                       'College 1 year to 3 years (Some college or technical school)', 
                       'College 4 years or more (College graduate)']

income_categories = ['Less than $10,000', '$10,000 to less than $15,000', 
                    '$15,000 to less than $20,000', '$20,000 to less than $25,000', 
                    '$25,000 to less than $35,000', '$35,000 to less than $50,000', 
                    '$50,000 to less than $75,000', '$75,000 or more']

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(MODELS_DIR / 'ensemble_training.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Initialize ensemble
logger.info(f"Starting ensemble hill climbing at {datetime.now()}")
ensemble_models = []
stage2_model = None
temperature = BASE_TEMPERATURE
best_ensemble_score = 0.0
iteration = 0
iterations_since_improvement = 0

# Create batch directory
ENSEMBLE_DIR = MODELS_DIR / f"run_{RUN_ID}" / "ensemble_stage1_models"
ENSEMBLE_DIR.mkdir(parents=True, exist_ok=True)

# Initialize founder model (iteration 0)
founder_score = initialize_founder_model(
    X_train=X_train_diabetes,
    y_train=y_train_diabetes,
    X_val_s1=X_val_s1_diabetes,
    y_val_s1=y_val_s1_diabetes,
    X_val_s2=X_val_s2_diabetes,
    y_val_s2=y_val_s2_diabetes,
    base_preprocessor=base_preprocessor,
    base_temperature=BASE_TEMPERATURE,
    models_dir=ENSEMBLE_DIR,
    random_state=RANDOM_STATE
)

best_ensemble_score = founder_score
iteration = 1  # Start hill climbing iterations after founder

logger.info(f"Founder baseline: {founder_score:.6f}")

print(f"\n{'=' * 80}")
print("HILL CLIMBING OPTIMIZATION")
print(f"{'=' * 80}\n")

# Main hill climbing loop (batched parallel)
while iteration < MAX_ITERATIONS:
    logger.info(f"\n{'=' * 80}")
    logger.info(f"BATCH starting at iteration {iteration}")
    logger.info(f"Current ensemble size: {len(ensemble_models)}")
    logger.info(f"Best score: {best_ensemble_score:.6f}")
    logger.info(f"Temperature: {temperature:.4f}")
    logger.info(f"{'=' * 80}")
    
    print(f"\nBatch starting at iteration {iteration} (ensemble size={len(ensemble_models)}, best={best_ensemble_score:.6f})")
    
    # Train candidates in parallel
    trained_candidates = train_candidates_parallel(
        ensemble_models=ensemble_models,
        stage2_model=stage2_model,
        X_train=X_train_diabetes,
        y_train=y_train_diabetes,
        X_val_s1=X_val_s1_diabetes,
        y_val_s1=y_val_s1_diabetes,
        X_val_s2=X_val_s2_diabetes,
        y_val_s2=y_val_s2_diabetes,
        temperature=temperature,
        batch_size=BATCH_SIZE,
        n_workers=N_WORKERS,
        timeout_sec=TIMEOUT_THRESHOLD_SEC,
        base_preprocessor=base_preprocessor,
        numerical_features=numerical_features,
        ordinal_features=ordinal_features,
        nominal_features=nominal_features,
        education_categories=education_categories,
        income_categories=income_categories,
        random_state=RANDOM_STATE
    )
    
    # Process results
    accepted_count = 0
    for result in trained_candidates:
        current_iter = iteration
        iteration += 1  # Increment for next candidate
        
        if result['timeout']:
            logger.info(f"\nIteration {current_iter}: TIMEOUT (>{TIMEOUT_THRESHOLD_SEC}s)")
            
            # Log timeout entry
            log_iteration(
                iteration=current_iter,
                accepted=False,
                rejection_reason='timeout',
                pipeline_hash='timeout',
                stage1_val_auc=0.0,
                stage2_val_auc=0.0,
                ensemble_size=len(ensemble_models),
                diversity_score=0.0,
                temperature=temperature,
                metadata={'classifier_type': 'timeout', 'transformers_used': []},
                ensemble_id=f"iter_{current_iter}",
                timeout=True
            )
            continue
        
        # Extract results
        fitted_pipeline = result['fitted_pipeline']
        metadata = result['metadata']
        val_auc_s1 = result['val_auc_s1']
        candidate_score = result['candidate_score']
        diversity_score = result['diversity_score']
        pipeline_hash = result['pipeline_hash']
        accept = result['accept']
        reason = result['reason']
        training_memory_mb = result['training_memory_mb']
        training_time_sec = result['training_time_sec']
        stage2_memory_mb = result['stage2_memory_mb']
        stage2_time_sec = result['stage2_time_sec']
        
        # Determine logged score for database
        logged_ensemble_score = candidate_score if accept else best_ensemble_score
        
        # Log to database
        log_iteration(
            iteration=current_iter,
            accepted=accept,
            rejection_reason=reason,
            pipeline_hash=pipeline_hash,
            stage1_val_auc=val_auc_s1,
            stage2_val_auc=logged_ensemble_score,
            ensemble_size=len(ensemble_models) + 1 if accept else len(ensemble_models),
            diversity_score=diversity_score,
            temperature=temperature,
            metadata=metadata,
            ensemble_id=f"iter_{current_iter}",
            training_memory_mb=training_memory_mb,
            stage2_memory_mb=stage2_memory_mb,
            training_time_sec=training_time_sec,
            stage2_time_sec=stage2_time_sec,
            timeout=False
        )
        
        # Update ensemble if accepted
        if accept:
            accepted_count += 1
            ensemble_models.append(fitted_pipeline)
            
            # Save model
            model_path = ENSEMBLE_DIR / f'model_{current_iter}.joblib'
            joblib.dump(fitted_pipeline, model_path)
            
            # Check if we should train/retrain stage 2 DNN
            if len(ensemble_models) % STAGE2_BATCH_SIZE_MODELS == 0 and len(ensemble_models) > 0:

                stage2_model, final_score, stage2_memory_mb, stage2_time_sec, stage2_tp, stage2_fp, stage2_tn, stage2_fn = train_or_expand_stage2_model(
                    ensemble_models, stage2_model, X_val_s1, y_val_s1, X_val_s2, y_val_s2,
                    STAGE2_EPOCHS, STAGE2_BATCH_SIZE, STAGE2_PATIENCE, current_iter
                )
                
                # Log DNN retrain score as separate entry
                logger.info(f"\n{'=' * 80}")
                logger.info(f"DNN RETRAINED at {len(ensemble_models)} models | Final ensemble AUC: {final_score:.6f}")
                logger.info(f"Memory: {stage2_memory_mb:.1f}MB, Time: {stage2_time_sec:.1f}s")
                logger.info(f"{'=' * 80}")
                
                print(f"\n{'=' * 80}")
                print(f"DNN RETRAINED at {len(ensemble_models)} models | Final ensemble AUC: {final_score:.6f}")
                print(f"{'=' * 80}")
                
                log_iteration(
                    iteration=current_iter,
                    accepted=True,
                    rejection_reason=f"dnn_retrain_batch_{len(ensemble_models)}",
                    pipeline_hash=f"dnn_retrain_{len(ensemble_models)}",
                    stage1_val_auc=val_auc_s1,
                    stage2_val_auc=final_score,
                    ensemble_size=len(ensemble_models),
                    diversity_score=diversity_score,
                    temperature=temperature,
                    metadata={'classifier_type': 'dnn_retrain', 'transformers_used': []},
                    ensemble_id=f"dnn_retrain_{len(ensemble_models)}",
                    training_memory_mb=None,
                    stage2_memory_mb=stage2_memory_mb,
                    training_time_sec=None,
                    stage2_time_sec=stage2_time_sec,
                    timeout=False,
                    stage2_tp=stage2_tp,
                    stage2_fp=stage2_fp,
                    stage2_tn=stage2_tn,
                    stage2_fn=stage2_fn
                )
                
                # Save ensemble bundle checkpoint
                save_ensemble_bundle(
                    ensemble_models, stage2_model, best_ensemble_score, current_iter,
                    MODELS_DIR, RANDOM_STATE, BATCH_SIZE, N_WORKERS, base_preprocessor,
                    numerical_features, ordinal_features, nominal_features,
                    education_categories, income_categories
                )
            
            # Check if this is the best score
            if candidate_score > best_ensemble_score:
                logger.info(f"    NEW BEST: {candidate_score:.6f} (Δ={candidate_score - best_ensemble_score:.6f})")
                best_ensemble_score = candidate_score
                iterations_since_improvement = 0
            else:
                iterations_since_improvement += 1
        else:
            iterations_since_improvement += 1
        
        # Update temperature
        temperature = update_temperature(
            iteration=current_iter,
            acceptance_history=[accept],
            current_temperature=temperature,
            base_temperature=BASE_TEMPERATURE,
            decay_rate=TEMPERATURE_DECAY
        )
    
    # Log batch summary
    logger.info(f"Batch summary: {accepted_count}/{len(trained_candidates)} accepted, "
               f"Ensemble size now {len(ensemble_models)}, Best score {best_ensemble_score:.6f}")
    
    # Move to next batch
    iteration += len(trained_candidates)
    
    # Check termination
    if iterations_since_improvement >= PLATEAU_ITERATIONS:
        logger.info(f"TERMINATING: No improvement for {PLATEAU_ITERATIONS} iterations")
        print(f"TERMINATING: No improvement for {PLATEAU_ITERATIONS} iterations")
        break

# Calculate final acceptance rate and timeout rate
conn = ensemble_database.sqlite3.connect(ensemble_database.DB_PATH)
acceptance_stats = conn.execute("SELECT COUNT(*) as total, SUM(accepted) as accepted FROM ensemble_log WHERE iteration_num > 0").fetchone()
timeout_stats = conn.execute("SELECT SUM(timeout) as timeouts FROM ensemble_log WHERE iteration_num > 0").fetchone()
conn.close()

acceptance_rate = acceptance_stats[1] / acceptance_stats[0] if acceptance_stats[0] > 0 else 0.0
timeout_rate = timeout_stats[0] / acceptance_stats[0] if acceptance_stats[0] > 0 else 0.0

logger.info("\n" + "=" * 80)
logger.info("TRAINING COMPLETE")
logger.info("=" * 80)
logger.info(f"Total iterations: {iteration - 1}")
logger.info(f"Final ensemble size: {len(ensemble_models)}")
logger.info(f"Best ensemble AUC: {best_ensemble_score:.6f}")
logger.info(f"Acceptance rate: {acceptance_rate:.1%}")
logger.info(f"Timeout rate: {timeout_rate:.1%}")
logger.info("=" * 80)

print(f"\n{'=' * 80}")
print(f"Total iterations: {iteration - 1}")
print(f"Final ensemble size: {len(ensemble_models)}")
print(f"Best ensemble AUC: {best_ensemble_score:.6f}")
print(f"Acceptance rate: {acceptance_rate:.1%}")
print(f"Timeout rate: {timeout_rate:.1%}")
print(f"{'=' * 80}")


BATCH Starting at iteration 1
Ensemble size: 0 | Best score: 0.580149 | Temperature: 0.050000 | No improvement: 0/100

Training 20 candidates in parallel (120 min timeout per model)...

Training 20 candidates in parallel (120 min timeout per model)...
  [1/20] Iteration 1: ridge AUC=0.603326 (3.0s)
  [1/20] Iteration 1: ridge AUC=0.603326 (3.0s)
  [2/20] Iteration 3: logistic AUC=0.652528 (2.6s)
  [2/20] Iteration 3: logistic AUC=0.652528 (2.6s)
  [3/20] Iteration 2 FAILED: n_components(12) must be <= n_features(7).
  [3/20] Iteration 2 FAILED: n_components(12) must be <= n_features(7).
  [4/20] Iteration 9: ridge AUC=0.585364 (2.2s)
  [4/20] Iteration 9: ridge AUC=0.585364 (2.2s)
  [5/20] Iteration 15: lda AUC=0.526176 (3.3s)
  [5/20] Iteration 15: lda AUC=0.526176 (3.3s)
  [6/20] Iteration 14: lasso AUC=0.656725 (5.3s)
  [6/20] Iteration 14: lasso AUC=0.656725 (5.3s)
  [7/20] Iteration 17: lda AUC=0.607280 (4.1s)
  [7/20] Iteration 17: lda AUC=0.607280 (4.1s)




  [8/20] Iteration 16: random_forest AUC=0.615958 (13.5s)




  [9/20] Iteration 11: linear_svc AUC=0.498639 (32.5s)




  [10/20] Iteration 18: linear_svc AUC=0.655322 (41.3s)
  [11/20] Iteration 12: mlp AUC=0.637592 (137.2s)
  [11/20] Iteration 12: mlp AUC=0.637592 (137.2s)
  [12/20] Iteration 10: ridge AUC=0.553904 (156.2s)
  [12/20] Iteration 10: ridge AUC=0.553904 (156.2s)
  [13/20] Iteration 4 TIMEOUT: random_forest exceeded 120 minutes
  [13/20] Iteration 4 TIMEOUT: random_forest exceeded 120 minutes
  [14/20] Iteration 5 TIMEOUT: linear_svc exceeded 120 minutes
  [14/20] Iteration 5 TIMEOUT: linear_svc exceeded 120 minutes
  [15/20] Iteration 6 TIMEOUT: extra_trees exceeded 120 minutes
  [16/20] Iteration 7 TIMEOUT: logistic exceeded 120 minutes
  [15/20] Iteration 6 TIMEOUT: extra_trees exceeded 120 minutes
  [16/20] Iteration 7 TIMEOUT: logistic exceeded 120 minutes
  [17/20] Iteration 8 TIMEOUT: mlp exceeded 120 minutes
  [17/20] Iteration 8 TIMEOUT: mlp exceeded 120 minutes
  [18/20] Iteration 13 TIMEOUT: extra_trees exceeded 120 minutes
  [18/20] Iteration 13 TIMEOUT: extra_trees exceeded 12

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


  [4/20] Iteration 21 FAILED: Input contains NaN.
  [5/20] Iteration 23: ridge AUC=0.613431 (4.0s)
  [5/20] Iteration 23: ridge AUC=0.613431 (4.0s)
  [6/20] Iteration 22: ridge AUC=0.561016 (4.8s)
  [6/20] Iteration 22: ridge AUC=0.561016 (4.8s)
  [7/20] Iteration 16: random_forest AUC=0.656650 (13.3s)
  [7/20] Iteration 16: random_forest AUC=0.656650 (13.3s)




  [8/20] Iteration 31: gradient_boosting AUC=0.655331 (31.5s)




  [9/20] Iteration 25: linear_svc AUC=0.646944 (43.2s)




  [10/20] Iteration 18: linear_svc AUC=0.686301 (52.0s)
  [11/20] Iteration 12: mlp AUC=0.505658 (106.0s)
  [11/20] Iteration 12: mlp AUC=0.505658 (106.0s)
  [12/20] Iteration 27: naive_bayes AUC=0.592721 (244.2s)
  [12/20] Iteration 27: naive_bayes AUC=0.592721 (244.2s)
  [13/20] Iteration 13 TIMEOUT: extra_trees exceeded 120 minutes
  [13/20] Iteration 13 TIMEOUT: extra_trees exceeded 120 minutes
  [14/20] Iteration 19 TIMEOUT: linear_svc exceeded 120 minutes
  [14/20] Iteration 19 TIMEOUT: linear_svc exceeded 120 minutes
  [15/20] Iteration 20 TIMEOUT: qda exceeded 120 minutes
  [15/20] Iteration 20 TIMEOUT: qda exceeded 120 minutes
  [16/20] Iteration 24 TIMEOUT: qda exceeded 120 minutes
  [16/20] Iteration 24 TIMEOUT: qda exceeded 120 minutes
  [17/20] Iteration 26 TIMEOUT: sgd_classifier exceeded 120 minutes
  [17/20] Iteration 26 TIMEOUT: sgd_classifier exceeded 120 minutes
  [18/20] Iteration 28 TIMEOUT: random_forest exceeded 120 minutes
  [18/20] Iteration 28 TIMEOUT: random_

I0000 00:00:1765194033.805006 1192274 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6856 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1070, pci bus id: 0000:04:00.0, compute capability: 6.1
I0000 00:00:1765194037.355113 1279233 service.cc:148] XLA service 0x7e946c00c560 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1765194037.355142 1279233 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce GTX 1070, Compute Capability 6.1
I0000 00:00:1765194037.355113 1279233 service.cc:148] XLA service 0x7e946c00c560 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1765194037.355142 1279233 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce GTX 1070, Compute Capability 6.1
I0000 00:00:1765194037.677465 1279233 cuda_dnn.cc:529] Loaded cuDNN version 91700
I0000 00:00:1765194037.677465 1279233 cuda_dnn.cc:529] Loaded cuDNN version 91700
I

KeyboardInterrupt: 

## Save final checkpoint and bundle

In [None]:
# Save final checkpoint
save_checkpoint(
    checkpoint_path=CHECKPOINT_PATH,
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    iteration=iteration - 1,
    temperature=temperature,
    best_score=best_ensemble_score,
    acceptance_history=[],
    metadata={
        'total_iterations': iteration,
        'final_ensemble_size': len(ensemble_models),
        'acceptance_rate': acceptance_rate,
        'best_score': best_ensemble_score,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    }
)

# Save metadata
metadata_path = MODELS_DIR / 'ensemble_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump({
        'ensemble_size': len(ensemble_models),
        'total_iterations': iteration,
        'best_score': best_ensemble_score,
        'acceptance_rate': acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'random_state': RANDOM_STATE,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    }, f, indent=2)

print(f"\nCheckpoint saved: {CHECKPOINT_PATH}")
print(f"Metadata saved: {metadata_path}")

In [None]:
# Save final ensemble bundle for Kaggle
ensemble_bundle_path = MODELS_DIR / 'ensemble_bundle.joblib'

ensemble_bundle = {
    'ensemble_models': ensemble_models,
    'stage2_model': stage2_model,
    'metadata': {
        'ensemble_size': len(ensemble_models),
        'total_iterations': iteration,
        'best_score': best_ensemble_score,
        'acceptance_rate': acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'random_state': RANDOM_STATE,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    },
    'base_preprocessor': base_preprocessor,
    'feature_info': {
        'numerical_features': numerical_features,
        'ordinal_features': ordinal_features,
        'nominal_features': nominal_features,
        'education_categories': education_categories,
        'income_categories': income_categories
    }
}

joblib.dump(ensemble_bundle, ensemble_bundle_path, compress=3)

print(f"\nFinal ensemble bundle saved: {ensemble_bundle_path}")
print(f"File size: {ensemble_bundle_path.stat().st_size / (1024**2):.1f} MB")
print(f"\nTo load on Kaggle:")
print(f"  ensemble_bundle = joblib.load('ensemble_bundle.joblib')")
print(f"  ensemble_models = ensemble_bundle['ensemble_models']")
print(f"  stage2_model = ensemble_bundle['stage2_model']")

In [None]:
# Import the wrapper class
sys.path.insert(0, str(MODELS_BASE_DIR))

# Create wrapped model
wrapped_model = EnsembleClassifier(
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    aggregation='mean'  # Fallback if stage2_model is None
)

# Save as single joblib file
wrapped_model_path = MODELS_DIR / 'ensemble_model.joblib'
joblib.dump(wrapped_model, wrapped_model_path, compress=3)

print(f"\nWrapped ensemble model saved: {wrapped_model_path}")
print(f"File size: {wrapped_model_path.stat().st_size / (1024**2):.1f} MB")
print(f"\nModel info: {wrapped_model}")
print(f"\nTo use on Kaggle:")
print(f"  1. Upload to Kaggle dataset:")
print(f"     - {wrapped_model_path.name}")
print(f"     - {MODELS_BASE_DIR / 'ensemble_classifier.py'}")
print(f"  2. In inference notebook:")
print(f"     from ensemble_classifier import EnsembleClassifier")
print(f"     model = joblib.load('ensemble_model.joblib')")
print(f"     predictions = model.predict(test_df)")

## Create wrapped ensemble model for Kaggle

Create a sklearn-compatible wrapper that bundles the entire ensemble into a single classifier.
This makes inference identical to the logistic regression workflow.

## Summary

In [None]:
print(f"\n{'=' * 80}")
print("ENSEMBLE TRAINING SUMMARY")
print(f"{'=' * 80}")
print(f"\nFinal Statistics:")
print(f"  Ensemble size: {len(ensemble_models)}")
print(f"  Best validation AUC: {best_ensemble_score:.6f}")
print(f"  Total iterations: {iteration}")
print(f"  Acceptance rate: {acceptance_rate:.1%}")
print(f"  Parallel configuration: {BATCH_SIZE} candidates, {N_WORKERS} workers")
print(f"\nFiles created:")
print(f"  Database: {ensemble_database.DB_PATH}")
print(f"  Models: {ENSEMBLE_DIR}")
print(f"  Checkpoint: {CHECKPOINT_PATH}")
print(f"  Metadata: {metadata_path}")
print(f"  Bundle: {ensemble_bundle_path}")
print(f"\n{'=' * 80}")