## Setup and configuration

### Imports

In [1]:
# Standard library imports
import logging
import os
import sys
# import time
from datetime import datetime
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count

# Tensorflow logging configuration - must set before importing TensorFlow
# turns off INFO and WARNING messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Third party imports
import joblib
import json
# import numpy as np
import pandas as pd
import tensorflow as tf

# Add models directory to path for ensemble_classifier import
sys.path.insert(0, str(Path('../models').resolve()))

# Import ensemble modules
from ensemble_classifier import EnsembleClassifier
from functions import ensemble_database
from functions.ensemble_initialization import create_data_splits, create_base_preprocessor, train_founder_model
from functions.ensemble_parallel import train_single_candidate, prepare_training_batch
from functions.ensemble_evaluation import evaluate_candidate_ensemble
from functions.ensemble_stage2_training import train_or_expand_stage2_model, save_ensemble_bundle
from functions.ensemble_hill_climbing import (
    adaptive_simulated_annealing_acceptance,
    update_temperature,
    log_iteration
)

from functions.ensemble_stage2_model import save_checkpoint
from functions.ensemble_stage2_training import (
    generate_pseudo_labels, 
    augment_training_pool_with_pseudo_labels
)

2025-12-09 13:00:05.420991: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765303205.444751   69008 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765303205.452162   69008 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Configuration

In [2]:
# Grab a timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Set up logging
Path('../logs').mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(f'../logs/training.log', mode='w')
    ]
)

logger = logging.getLogger(__name__)

# Configure TensorFlow
tf.get_logger().setLevel('ERROR')

# Check GPU
gpus = tf.config.list_physical_devices('GPU')
print(f'GPUs available: {gpus}')
print(f'Number of GPUs: {len(gpus)}')

# Check CPUs
n_cpus = cpu_count()
print(f'TensorFlow version: {tf.__version__}')
print(f'Available CPUs: {n_cpus}')

GPUs available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
Number of GPUs: 2
TensorFlow version: 2.18.0
Available CPUs: 24


### Ensemble training & hill climb parameters

In [None]:
# Configuration
RANDOM_STATE = 315
label = 'diagnosed_diabetes'

# Parallel training configuration
BATCH_SIZE = 20                  # Train this many candidates in parallel
N_WORKERS = BATCH_SIZE           # Use all available CPUs as workers
MODEL_TIMEOUT_MINUTES = 4 * 60   # Maximum training time per model (minutes)

# Hill climbing configuration
MAX_ITERATIONS = 1000
PLATEAU_ITERATIONS = 100
BASE_TEMPERATURE = 0.001
TEMPERATURE_DECAY = 0.998

# Stage 2 DNN configuration
STAGE2_BATCH_SIZE_MODELS = 20  # Retrain DNN every N accepted models
STAGE2_EPOCHS = 100
STAGE2_BATCH_SIZE = 128
STAGE2_PATIENCE = 10

# Pseudo-labeling configuration
PSEUDO_LABEL_ENABLED = True           # Enable pseudo-labeling
PSEUDO_CONFIDENCE_THRESHOLD = 0.95    # Only use very confident predictions
PSEUDO_MAX_FRACTION = 0.15            # Max 15% of training pool can be pseudo-labeled
PSEUDO_BALANCE_CLASSES = True         # Ensure balanced pseudo-labeled samples

# Paths
DATA_DIR = Path('../data')
MODELS_BASE_DIR = Path('../models')
MODELS_DIR = MODELS_BASE_DIR / f'run_{timestamp}'
ENSEMBLE_DIR = MODELS_DIR / 'ensemble_stage1_models'
CHECKPOINT_PATH = MODELS_DIR / 'ensemble_checkpoint.pkl'

# Create directories
DATA_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)
ENSEMBLE_DIR.mkdir(parents=True, exist_ok=True)

print(f'\nConfiguration:')
print(f'  Parallel workers: {N_WORKERS}')
print(f'  Batch size: {BATCH_SIZE}')
print(f'  Model timeout: {MODEL_TIMEOUT_MINUTES} minutes')
print(f'\nPseudo-labeling:')
print(f'  Enabled: {PSEUDO_LABEL_ENABLED}')
print(f'  Confidence threshold: {PSEUDO_CONFIDENCE_THRESHOLD}')
print(f'  Max fraction: {PSEUDO_MAX_FRACTION * 100:.0f}%')


Configuration:
  Parallel workers: 20
  Batch size: 20
  Model timeout: 240 minutes

Pseudo-labeling:
  Enabled: True
  Confidence threshold: 0.95
  Max fraction: 15%


### Initialize progress dashboard database

In [4]:
ensemble_database.reset_database()
ensemble_database.init_database()

Deleted existing database: /mnt/arkk/kaggle/diabetes-prediction/data/ensemble_training.db
Database initialized at: /mnt/arkk/kaggle/diabetes-prediction/data/ensemble_training.db


## Data preparation

### Load data

In [5]:
# Load training data
train_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_train.csv'
train_df = pd.read_csv(train_df_path)
train_df.drop_duplicates(inplace=True)

print(f'Training data shape: {train_df.shape}\n')
print('Class distribution:')
print(train_df['diagnosed_diabetes'].value_counts(normalize=True))

Training data shape: (700000, 26)

Class distribution:
diagnosed_diabetes
1.0    0.623296
0.0    0.376704
Name: proportion, dtype: float64


In [6]:
# Load test data for pseudo-labeling
test_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_test.csv'
test_df = pd.read_csv(test_df_path)
test_df.drop_duplicates(inplace=True)

print(f'\nTest data shape: {test_df.shape}')
print('Test data will be used for pseudo-labeling after Stage 2 DNN training')


Test data shape: (300000, 25)
Test data will be used for pseudo-labeling after Stage 2 DNN training


### Define features

In [7]:
label = 'diagnosed_diabetes'

numerical_features = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
    'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
    'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides',
    'family_history_diabetes', 'hypertension_history', 'cardiovascular_history'
]

ordinal_features = ['education_level', 'income_level']
education_categories = [['No formal', 'Highschool', 'Graduate', 'Postgraduate']]
income_categories = [['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]
nominal_features = ['gender', 'ethnicity', 'smoking_status', 'employment_status']

### Trainining-validation-validation split

In [8]:
# Create fixed three-way data split
X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2 = create_data_splits(
    train_df, label, RANDOM_STATE
)

print('Data info before batch preparation:')
print(f'  X_train_pool: {type(X_train_pool)} - {X_train_pool.shape}')
print(f'  X_val_s1: {type(X_val_s1)} - {X_val_s1.shape}')
print(f'  X_val_s2: {type(y_val_s2)} - {y_val_s2.shape}')

Data info before batch preparation:
  X_train_pool: <class 'pandas.core.frame.DataFrame'> - (420000, 25)
  X_val_s1: <class 'pandas.core.frame.DataFrame'> - (245000, 25)
  X_val_s2: <class 'pandas.core.series.Series'> - (35000,)


### Base data preprocessor

In [9]:
# Create base preprocessor
base_preprocessor = create_base_preprocessor(
    numerical_features, ordinal_features, nominal_features,
    education_categories, income_categories
)

base_preprocessor


Base preprocessor created
  Numerical features: 18
  Ordinal features: 2
  Nominal features: 4


## Initialize ensemble with founder model

In [10]:
# Train founder model (baseline only - NOT added to ensemble)
founder_auc = train_founder_model(
    X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2,
    base_preprocessor, RANDOM_STATE, BASE_TEMPERATURE, ENSEMBLE_DIR
)

# Initialize ensemble (EMPTY - founder not included)
ensemble_models = []
stage2_model = None
best_ensemble_score = founder_auc

# Initialize hill climbing variables (start at iteration 1, not 0)
start_iteration = 1
temperature = BASE_TEMPERATURE

print(f'\nFounder baseline AUC: {founder_auc:.4f}')
print(f'Ensemble starts empty - first batch will be iterations 1-{BATCH_SIZE}')
print(f'Stage 2 DNN will be trained after {STAGE2_BATCH_SIZE_MODELS} accepted models')

TRAINING FOUNDER MODEL (baseline only - NOT added to ensemble)

Training founder model
--------------------------------------------------------------------------------
  Training samples: 42,000 (10% of 420,000 pool)
  Pipeline config:
    Classifier: naive_bayes
    Transformers: []
    Dimensionality reduction: fast_ica
  Training pipeline...

Training founder model
--------------------------------------------------------------------------------
  Training samples: 42,000 (10% of 420,000 pool)
  Pipeline config:
    Classifier: naive_bayes
    Transformers: []
    Dimensionality reduction: fast_ica
  Training pipeline...
  Training complete (11.5s)
  Training complete (11.5s)
  Stage 1 validation AUC: 0.574092
  Stage 2 validation AUC: 0.572672

FOUNDER MODEL COMPLETE - Baseline score established

Founder baseline AUC: 0.5727
Ensemble starts empty - first batch will be iterations 1-20
Stage 2 DNN will be trained after 20 accepted models
  Stage 1 validation AUC: 0.574092
  Stage 2 va

## Parallel hill climbing loop

Iteratively trains batches of candidate models in parallel, evaluates with hybrid scoring,
and accepts/rejects using simulated annealing.

In [None]:
# Initialize hill climbing loop variables
iteration = start_iteration
iterations_since_improvement = 0

logger.info(f'Starting ensemble hill climbing at {datetime.now()}')
logger.info(f'Founder baseline: {founder_auc:.4f}')

# Main hill climbing loop (batched parallel)
while iteration < MAX_ITERATIONS:

    logger.info(f'BATCH starting at iteration {iteration}')
    logger.info(f'Current ensemble size: {len(ensemble_models)}')
    logger.info(f'Best score: {best_ensemble_score:.4f}')
    logger.info(f'Temperature: {temperature:.4f}')
    
    # Prepare batch of training tasks
    batch_tasks = prepare_training_batch(
        iteration=iteration,
        batch_size=BATCH_SIZE,
        max_iterations=MAX_ITERATIONS,
        X_train_pool=X_train_pool,
        y_train_pool=y_train_pool,
        X_val_s1=X_val_s1,
        y_val_s1=y_val_s1,
        base_preprocessor=base_preprocessor,
        random_state=RANDOM_STATE,
        total_cpus=N_WORKERS,
        timeout_minutes=MODEL_TIMEOUT_MINUTES
    )
    
    # Train candidates in parallel
    with ProcessPoolExecutor(max_workers=N_WORKERS) as executor:
    
        futures = {
            executor.submit(train_single_candidate, task): task[0]
            for task in batch_tasks
        }
        
        # Collect results with timeout
        results = []

        for future in as_completed(futures, timeout=MODEL_TIMEOUT_MINUTES * 60):

            try:
                result = future.result(timeout=60)  # 1 minute for result extraction
                results.append(result)

            except TimeoutError as e:

                iter_num = futures[future]
                logger.error(f'Iteration {iter_num} timed out: {e}')
                results.append({
                    'iteration': iter_num,
                    'timeout': True,
                    'accept': False,
                    'reason': 'timeout',
                    'fitted_pipeline': None,
                    'metadata': {},
                    'val_auc_s1': 0.0,
                    'candidate_score': 0.0,
                    'diversity_score': 0.0,
                    'pipeline_hash': 'timeout',
                    'training_memory_mb': 0.0,
                    'training_time_sec': 0.0,
                    'stage2_memory_mb': 0.0,
                    'stage2_time_sec': 0.0
                })

            except Exception as e:

                iter_num = futures[future]
                logger.error(f'Iteration {iter_num} failed: {e}')
                results.append({
                    'iteration': iter_num,
                    'timeout': False,
                    'accept': False,
                    'reason': 'exception',
                    'fitted_pipeline': None,
                    'metadata': {},
                    'val_auc_s1': 0.0,
                    'candidate_score': 0.0,
                    'diversity_score': 0.0,
                    'pipeline_hash': 'exception',
                    'training_memory_mb': 0.0,
                    'training_time_sec': 0.0,
                    'stage2_memory_mb': 0.0,
                    'stage2_time_sec': 0.0
                })
    
    # Sort results by iteration number
    results.sort(key=lambda x: x['iteration'])
    
    # Process results
    accepted_count = 0

    for result in results:
        current_iter = result['iteration']
        
        # Handle timeout
        if result.get('timeout', False):
            logger.info(f'\nIteration {current_iter}: TIMEOUT')
            
            # Log timeout entry
            log_iteration(
                iteration=current_iter,
                accepted=False,
                rejection_reason='timeout',
                pipeline_hash='timeout',
                stage1_val_auc=0.0,
                stage2_val_auc=0.0,
                ensemble_size=len(ensemble_models),
                diversity_score=0.0,
                temperature=temperature,
                metadata={'classifier_type': 'timeout', 'transformers_used': []},
                ensemble_id=f'iter_{current_iter}',
                timeout=True
            )

            continue

        # Handle other exceptions
        if result.get('fitted_pipeline', None) is None:
            logger.info(f'\nIteration {current_iter}: EXCEPTION during training')
            
            # Log exception entry
            log_iteration(
                iteration=current_iter,
                accepted=False,
                rejection_reason='exception',
                pipeline_hash='exception',
                stage1_val_auc=0.0,
                stage2_val_auc=0.0,
                ensemble_size=len(ensemble_models),
                diversity_score=0.0,
                temperature=temperature,
                metadata={'classifier_type': 'exception', 'transformers_used': []},
                ensemble_id=f'iter_{current_iter}',
                timeout=False
            )

            continue
        
        # Extract basic results from training
        fitted_pipeline = result['fitted_pipeline']
        metadata = result['metadata']
        val_auc_s1 = result['val_auc_s1']
        pipeline_hash = result['pipeline_hash']
        training_memory_mb = result['memory_mb']
        training_time_sec = result['training_time_sec']
        
        # Now evaluate the candidate using the ensemble evaluation logic        
        eval_result = evaluate_candidate_ensemble(
            fitted_pipeline=fitted_pipeline,
            X_val_s2=X_val_s2,
            y_val_s2=y_val_s2,
            ensemble_models=ensemble_models,
            stage2_model=stage2_model,
            best_ensemble_score=best_ensemble_score,
            temperature=temperature
        )
        
        candidate_score = eval_result['candidate_score']
        diversity_score = eval_result['diversity_score']
        accept = eval_result['accept']
        reason = eval_result['reason']
        stage2_memory_mb = eval_result.get('stage2_memory_mb', 0.0)
        stage2_time_sec = eval_result.get('stage2_time_sec', 0.0)
        
        # Determine logged score for database
        logged_ensemble_score = candidate_score if accept else best_ensemble_score
        
        # Log to database
        log_iteration(
            iteration=current_iter,
            accepted=accept,
            rejection_reason=reason,
            pipeline_hash=pipeline_hash,
            stage1_val_auc=val_auc_s1,
            stage2_val_auc=logged_ensemble_score,
            ensemble_size=len(ensemble_models) + 1 if accept else len(ensemble_models),
            diversity_score=diversity_score,
            temperature=temperature,
            metadata=metadata,
            ensemble_id=f'iter_{current_iter}',
            training_memory_mb=training_memory_mb,
            stage2_memory_mb=stage2_memory_mb,
            training_time_sec=training_time_sec,
            stage2_time_sec=stage2_time_sec,
            timeout=False
        )
        
        # Update ensemble if accepted
        if accept:
            accepted_count += 1
            ensemble_models.append(fitted_pipeline)
            
            # Save model
            model_path = ENSEMBLE_DIR / f'model_{current_iter}.joblib'
            joblib.dump(fitted_pipeline, model_path)
            
            # Check if we should train/retrain stage 2 DNN
            if len(ensemble_models) % STAGE2_BATCH_SIZE_MODELS == 0 and len(ensemble_models) > 0:
                stage2_model, final_score, stage2_memory_mb, stage2_time_sec, stage2_tp, stage2_fp, stage2_tn, stage2_fn = train_or_expand_stage2_model(
                    ensemble_models, stage2_model, X_val_s1, y_val_s1, X_val_s2, y_val_s2,
                    STAGE2_EPOCHS, STAGE2_BATCH_SIZE, STAGE2_PATIENCE, current_iter
                )
                
                # Log DNN retrain
                logger.info(f'DNN RETRAINED at {len(ensemble_models)} models | Final ensemble AUC: {final_score:.6f}')
                logger.info(f'Memory: {stage2_memory_mb:.1f}MB, Time: {stage2_time_sec:.1f}s')
                
                # PSEUDO-LABELING: Generate and augment training pool
                if PSEUDO_LABEL_ENABLED and stage2_model is not None:
                    
                    # Generate pseudo-labels from test set
                    X_pseudo, y_pseudo, pseudo_stats = generate_pseudo_labels(
                        ensemble_models=ensemble_models,
                        stage2_model=stage2_model,
                        test_df=test_df,
                        confidence_threshold=PSEUDO_CONFIDENCE_THRESHOLD,
                        balance_classes=PSEUDO_BALANCE_CLASSES
                    )
                    
                    # Augment training pool if we got pseudo-labels
                    if len(X_pseudo) > 0:

                        X_train_pool, y_train_pool, aug_stats = augment_training_pool_with_pseudo_labels(
                            X_train_pool=X_train_pool,
                            y_train_pool=y_train_pool,
                            X_pseudo=X_pseudo,
                            y_pseudo=y_pseudo,
                            max_pseudo_fraction=PSEUDO_MAX_FRACTION
                        )
                        
                        logger.info(f'Training pool augmented with {aug_stats['pseudo_size']:,} pseudo-labeled samples')
                        logger.info(f'New training pool size: {aug_stats['augmented_size']:,}')

                    else:
                        logger.info('No pseudo-labels generated (no high-confidence predictions)')
                
                print(f'DNN RETRAINED at {len(ensemble_models)} models | Final ensemble AUC: {final_score:.6f}')
                
                log_iteration(
                    iteration=current_iter,
                    accepted=True,
                    rejection_reason=f'dnn_retrain_batch_{len(ensemble_models)}',
                    pipeline_hash=f"dnn_retrain_{len(ensemble_models)}",
                    stage1_val_auc=val_auc_s1,
                    stage2_val_auc=final_score,
                    ensemble_size=len(ensemble_models),
                    diversity_score=diversity_score,
                    temperature=temperature,
                    metadata={'classifier_type': 'dnn_retrain', 'transformers_used': []},
                    ensemble_id=f'dnn_retrain_{len(ensemble_models)}',
                    training_memory_mb=None,
                    stage2_memory_mb=stage2_memory_mb,
                    training_time_sec=None,
                    stage2_time_sec=stage2_time_sec,
                    timeout=False,
                    stage2_tp=stage2_tp,
                    stage2_fp=stage2_fp,
                    stage2_tn=stage2_tn,
                    stage2_fn=stage2_fn
                )
                
                # Save ensemble bundle checkpoint
                save_ensemble_bundle(
                    ensemble_models, stage2_model, best_ensemble_score, current_iter,
                    MODELS_DIR, RANDOM_STATE, BATCH_SIZE, N_WORKERS, base_preprocessor,
                    numerical_features, ordinal_features, nominal_features,
                    education_categories, income_categories
                )
            
            # Check if this is the best score
            if candidate_score > best_ensemble_score:

                logger.info(f'    NEW BEST: {candidate_score:.6f} (Δ={candidate_score - best_ensemble_score:.6f})')
                best_ensemble_score = candidate_score
                iterations_since_improvement = 0

            else:
                iterations_since_improvement += 1

        else:
            iterations_since_improvement += 1
        
        # Update temperature
        temperature = update_temperature(
            iteration=current_iter,
            acceptance_history=[accept],
            current_temperature=temperature,
            base_temperature=BASE_TEMPERATURE,
            decay_rate=TEMPERATURE_DECAY
        )
    
    # Log batch summary
    logger.info(
        f'Batch summary: {accepted_count}/{len(results)} accepted, '
        f'Ensemble size now {len(ensemble_models)}, Best score {best_ensemble_score:.4f}'
    )
    
    # Move to next batch
    iteration += len(batch_tasks)
    
    # Check termination
    if iterations_since_improvement >= PLATEAU_ITERATIONS:

        logger.info(f'TERMINATING: No improvement for {PLATEAU_ITERATIONS} iterations')
        print(f'TERMINATING: No improvement for {PLATEAU_ITERATIONS} iterations')

        break

# Calculate final acceptance rate and timeout rate
conn = ensemble_database.sqlite3.connect(ensemble_database.DB_PATH)
acceptance_stats = conn.execute('SELECT COUNT(*) as total, SUM(accepted) as accepted FROM ensemble_log WHERE iteration_num > 0').fetchone()
timeout_stats = conn.execute('SELECT SUM(timeout) as timeouts FROM ensemble_log WHERE iteration_num > 0').fetchone()
conn.close()

acceptance_rate = acceptance_stats[1] / acceptance_stats[0] if acceptance_stats[0] > 0 else 0.0
timeout_rate = timeout_stats[0] / acceptance_stats[0] if acceptance_stats[0] > 0 else 0.0

logger.info('TRAINING COMPLETE')
logger.info(f'Total iterations: {iteration - 1}')
logger.info(f'Final ensemble size: {len(ensemble_models)}')
logger.info(f'Best ensemble AUC: {best_ensemble_score:.4f}')
logger.info(f'Acceptance rate: {acceptance_rate:.1%}')
logger.info(f'Timeout rate: {timeout_rate:.1%}')

print(f'\nTotal iterations: {iteration - 1}')
print(f'Final ensemble size: {len(ensemble_models)}')
print(f'Best ensemble AUC: {best_ensemble_score:.6f}')
print(f'Acceptance rate: {acceptance_rate:.1%}')
print(f'Timeout rate: {timeout_rate:.1%}')



## Save final checkpoint and bundle

In [None]:
# Save final checkpoint
save_checkpoint(
    checkpoint_path=CHECKPOINT_PATH,
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    iteration=iteration - 1,
    temperature=temperature,
    best_score=best_ensemble_score,
    acceptance_history=[],
    metadata={
        'total_iterations': iteration,
        'final_ensemble_size': len(ensemble_models),
        'acceptance_rate': acceptance_rate,
        'best_score': best_ensemble_score,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    }
)

# Save metadata
metadata_path = MODELS_DIR / 'ensemble_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump({
        'ensemble_size': len(ensemble_models),
        'total_iterations': iteration,
        'best_score': best_ensemble_score,
        'acceptance_rate': acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'random_state': RANDOM_STATE,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    }, f, indent=2)

print(f"\nCheckpoint saved: {CHECKPOINT_PATH}")
print(f"Metadata saved: {metadata_path}")

In [None]:
# Save final ensemble bundle for Kaggle
ensemble_bundle_path = MODELS_DIR / 'ensemble_bundle.joblib'

ensemble_bundle = {
    'ensemble_models': ensemble_models,
    'stage2_model': stage2_model,
    'metadata': {
        'ensemble_size': len(ensemble_models),
        'total_iterations': iteration,
        'best_score': best_ensemble_score,
        'acceptance_rate': acceptance_rate,
        'training_completed': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'random_state': RANDOM_STATE,
        'parallel_batch_size': BATCH_SIZE,
        'n_workers': N_WORKERS
    },
    'base_preprocessor': base_preprocessor,
    'feature_info': {
        'numerical_features': numerical_features,
        'ordinal_features': ordinal_features,
        'nominal_features': nominal_features,
        'education_categories': education_categories,
        'income_categories': income_categories
    }
}

joblib.dump(ensemble_bundle, ensemble_bundle_path, compress=3)

print(f"\nFinal ensemble bundle saved: {ensemble_bundle_path}")
print(f"File size: {ensemble_bundle_path.stat().st_size / (1024**2):.1f} MB")
print(f"\nTo load on Kaggle:")
print(f"  ensemble_bundle = joblib.load('ensemble_bundle.joblib')")
print(f"  ensemble_models = ensemble_bundle['ensemble_models']")
print(f"  stage2_model = ensemble_bundle['stage2_model']")

In [None]:
# Import the wrapper class
sys.path.insert(0, str(MODELS_BASE_DIR))

# Create wrapped model
wrapped_model = EnsembleClassifier(
    ensemble_models=ensemble_models,
    stage2_model=stage2_model,
    aggregation='mean'  # Fallback if stage2_model is None
)

# Save as single joblib file
wrapped_model_path = MODELS_DIR / 'ensemble_model.joblib'
joblib.dump(wrapped_model, wrapped_model_path, compress=3)

print(f"\nWrapped ensemble model saved: {wrapped_model_path}")
print(f"File size: {wrapped_model_path.stat().st_size / (1024**2):.1f} MB")
print(f"\nModel info: {wrapped_model}")
print(f"\nTo use on Kaggle:")
print(f"  1. Upload to Kaggle dataset:")
print(f"     - {wrapped_model_path.name}")
print(f"     - {MODELS_BASE_DIR / 'ensemble_classifier.py'}")
print(f"  2. In inference notebook:")
print(f"     from ensemble_classifier import EnsembleClassifier")
print(f"     model = joblib.load('ensemble_model.joblib')")
print(f"     predictions = model.predict(test_df)")

## Create wrapped ensemble model for Kaggle

Create a sklearn-compatible wrapper that bundles the entire ensemble into a single classifier.
This makes inference identical to the logistic regression workflow.

## Summary

In [None]:
print(f"\n{'=' * 80}")
print("ENSEMBLE TRAINING SUMMARY")
print(f"{'=' * 80}")
print(f"\nFinal Statistics:")
print(f"  Ensemble size: {len(ensemble_models)}")
print(f"  Best validation AUC: {best_ensemble_score:.6f}")
print(f"  Total iterations: {iteration}")
print(f"  Acceptance rate: {acceptance_rate:.1%}")
print(f"  Parallel configuration: {BATCH_SIZE} candidates, {N_WORKERS} workers")
print(f"\nFiles created:")
print(f"  Database: {ensemble_database.DB_PATH}")
print(f"  Models: {ENSEMBLE_DIR}")
print(f"  Checkpoint: {CHECKPOINT_PATH}")
print(f"  Metadata: {metadata_path}")
print(f"  Bundle: {ensemble_bundle_path}")
print(f"\n{'=' * 80}")