In [None]:
import json
import sys
from datetime import datetime
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

# Make the legacy helper modules importable: put <cwd>/functions at the
# front of the import search path, but only once so that re-running this
# cell does not keep stacking duplicate entries.
functions_path = Path.cwd() / 'functions'
functions_dir = str(functions_path)

already_registered = functions_dir in sys.path
if not already_registered:
    sys.path.insert(0, functions_dir)

# Import refactored modules
from ensemble.config import EnsembleConfig
from ensemble.data import create_three_way_split, create_base_preprocessor
from ensemble.tracking import EnsembleDatabase, setup_logger
from ensemble.parallel import prepare_training_batch, train_batch_parallel
from ensemble.core import AcceptanceCriterion, DiversityScorer
from ensemble.stage2 import Stage2Model, Stage2Trainer

# Legacy modules (will be refactored later)
from ensemble_hill_climbing import (
    compute_pipeline_hash,
    evaluate_ensemble_stage1,
    update_temperature
)
from ensemble_evaluation import evaluate_hybrid_ensemble

ModuleNotFoundError: No module named 'ensemble'

## Configuration

In [None]:
# Load configuration
config = EnsembleConfig()

# Training parameters
MAX_ITERATIONS = 1000   # total number of candidate iterations to attempt
BATCH_SIZE = 20         # candidates prepared and trained per batch
N_WORKERS = 20          # forwarded as total_cpus / max_workers to the parallel helpers
TIMEOUT_MINUTES = 60    # forwarded as timeout_minutes to prepare_training_batch

# Paths (all resolved relative to the parent of the working directory)
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT / 'data' / 'diabetes_prediction_train.csv'
DB_PATH = PROJECT_ROOT / 'logs' / 'ensemble_refactored.db'
MODEL_DIR = PROJECT_ROOT / 'models'

# Create run directory
run_id = datetime.now().strftime('run_%Y%m%d_%H%M%S')
run_dir = MODEL_DIR / run_id
run_dir.mkdir(parents=True, exist_ok=True)

# Make sure the directory that will hold the database file exists too.
# NOTE(review): EnsembleDatabase may already create it itself; this call is
# a no-op in that case and avoids a failure on a fresh checkout otherwise.
DB_PATH.parent.mkdir(parents=True, exist_ok=True)

print(f'Run ID: {run_id}')
print(f'Run directory: {run_dir}')
print(f'Database: {DB_PATH}')

## Initialize Components

In [None]:
# ---- Data: read the training CSV and discard the 'id' column ----
print('Loading data...')
train_df = pd.read_csv(DATA_PATH).drop(columns=['id'])
n_loaded = len(train_df)
print(f'Loaded {n_loaded:,} samples')

# ---- Splits: training pool + stage 1 / stage 2 validation sets ----
print('\nCreating data splits...')
data_splits = create_three_way_split(
    df=train_df,
    target_column='diagnosed_diabetes',
    random_state=config.random_state,
)

# Report the size of each split (element 0 of each getter's return value).
n_train_pool = len(data_splits.get_train_pool()[0])
print(f'Training pool: {n_train_pool:,} samples')
n_val_stage1 = len(data_splits.get_val_stage1()[0])
print(f'Stage 1 validation: {n_val_stage1:,} samples')
n_val_stage2 = len(data_splits.get_val_stage2()[0])
print(f'Stage 2 validation: {n_val_stage2:,} samples')

# ---- Preprocessor: built from the training-pool features ----
print('\nCreating preprocessor...')
X_train_pool, y_train_pool = data_splits.get_train_pool()
base_preprocessor = create_base_preprocessor(X_train_pool)
print('✓ Preprocessor ready')

# ---- Database: reset() is called before initialize() on every run ----
print('\nInitializing database...')
database = EnsembleDatabase(db_path=DB_PATH)
database.reset()
database.initialize()

# ---- Logger: writes to training.log inside this run's directory ----
training_log_path = run_dir / 'training.log'
logger = setup_logger(name='ensemble_training', log_file=training_log_path)
logger.info(f'Starting training run: {run_id}')

# ---- Hill-climbing state shared with the training loop ----
current_ensemble = []
current_stage1_auc = current_stage2_auc = best_auc = 0.0
plateau_counter = 0
temperature = config.hill_climbing.base_temperature

print('\nAll components initialized')

## Main Training Loop

In [None]:
# Hill-climbing driver.
#
# Candidates are trained in parallel batches. Each successfully trained
# pipeline is tentatively appended to the current ensemble, scored on the
# stage 1 validation split, and kept or discarded according to
# `acceptance.should_accept`. Every candidate (accepted or not) is logged to
# the database. The temperature is updated once per batch, and the loop stops
# early once `plateau_counter` reaches `config.hill_climbing.plateau_iterations`.

# Initialize acceptance criterion and diversity scorer
acceptance = AcceptanceCriterion(temperature=temperature)
diversity_scorer = DiversityScorer()

# Get validation data
X_val_s1, y_val_s1 = data_splits.get_val_stage1()
X_val_s2, y_val_s2 = data_splits.get_val_stage2()

# Positive-class probabilities of every current ensemble member on the
# stage 1 validation set, kept in the same order as `current_ensemble`.
# Caching them means each candidate costs one predict_proba call for the
# diversity score instead of one per ensemble member. Rebuilt from
# `current_ensemble` here so the cell also works when re-run with a
# non-empty ensemble.
# NOTE(review): assumes a fitted pipeline returns the same probabilities on
# repeated calls with the same input.
ensemble_val_probs = [
    member.predict_proba(X_val_s1)[:, 1] for member in current_ensemble
]

# Ceiling division: a trailing partial batch is still run when
# MAX_ITERATIONS is not a multiple of BATCH_SIZE.
n_batches = -(-MAX_ITERATIONS // BATCH_SIZE)

print(f'Starting hill climbing with {N_WORKERS} workers...')
print(f'Target: {MAX_ITERATIONS} iterations')
print('=' * 70)

for batch_num in range(n_batches):
    iteration_start = batch_num * BATCH_SIZE
    # The last batch may be smaller so the total never exceeds MAX_ITERATIONS.
    batch_size = min(BATCH_SIZE, MAX_ITERATIONS - iteration_start)

    # Prepare batch
    logger.info(f'Batch {batch_num}: Preparing {batch_size} candidates')

    batch_jobs = prepare_training_batch(
        iteration=iteration_start,
        batch_size=batch_size,
        max_iterations=MAX_ITERATIONS,
        X_train_pool=X_train_pool,
        y_train_pool=y_train_pool,
        X_val_s1=X_val_s1,
        y_val_s1=y_val_s1,
        base_preprocessor=base_preprocessor,
        random_state=config.random_state,
        total_cpus=N_WORKERS,
        timeout_minutes=TIMEOUT_MINUTES,
        batch_num=batch_num
    )

    # Train batch in parallel
    results = train_batch_parallel(
        batch_jobs=batch_jobs,
        database=database,
        logger=logger,
        max_workers=N_WORKERS
    )

    # Process results (entries that are None are treated as failed jobs)
    successful = [r for r in results if r is not None]
    logger.info(f'Batch {batch_num}: {len(successful)}/{len(results)} completed')

    for result in successful:
        iteration = result['iteration']
        pipeline = result['pipeline']

        # Compute pipeline hash
        pipeline_hash = compute_pipeline_hash(pipeline)

        # Evaluate candidate ensemble (current members plus this pipeline)
        candidate_ensemble = current_ensemble + [pipeline]
        candidate_stage1_auc = evaluate_ensemble_stage1(
            candidate_ensemble, X_val_s1, y_val_s1
        )

        # Compute diversity from the cached member predictions plus the
        # candidate's own; a single-model ensemble gets a diversity of 0.0.
        candidate_probs = pipeline.predict_proba(X_val_s1)[:, 1]
        if len(candidate_ensemble) > 1:
            predictions = np.array(ensemble_val_probs + [candidate_probs])
            diversity_score = diversity_scorer.score(predictions)
        else:
            diversity_score = 0.0

        # Acceptance decision
        accepted, reason = acceptance.should_accept(
            current_score=current_stage1_auc,
            candidate_score=candidate_stage1_auc
        )

        if accepted:
            current_ensemble = candidate_ensemble
            ensemble_val_probs.append(candidate_probs)
            current_stage1_auc = candidate_stage1_auc

            # Only a new best score resets the plateau counter; an accepted
            # candidate that does not beat the best still counts towards it.
            if candidate_stage1_auc > best_auc:
                best_auc = candidate_stage1_auc
                plateau_counter = 0
            else:
                plateau_counter += 1

            logger.info(
                f'Iteration {iteration}: ACCEPTED - '
                f'Stage1 AUC={candidate_stage1_auc:.4f}, '
                f'Ensemble size={len(current_ensemble)}, '
                f'Diversity={diversity_score:.4f}'
            )
        else:
            plateau_counter += 1

        # Log to database (one row per candidate, accepted or rejected)
        database.insert_iteration({
            'timestamp': datetime.now().isoformat(),
            'iteration_num': iteration,
            'ensemble_id': f'ensemble_{iteration}',
            'stage1_val_auc': candidate_stage1_auc,
            'stage2_val_auc': current_stage2_auc,
            'diversity_score': diversity_score,
            'temperature': temperature,
            'accepted': 1 if accepted else 0,
            'rejection_reason': reason if not accepted else None,
            'num_models': len(candidate_ensemble),
            'classifier_type': result['classifier_type'],
            'transformers_used': '',
            'use_pca': 0,
            'pca_components': None,
            'pipeline_hash': pipeline_hash,
            'training_memory_mb': result.get('memory_mb', 0.0),
            'stage2_memory_mb': 0.0,
            'training_time_sec': 0.0,
            'stage2_time_sec': 0.0,
            'timeout': 0,
            'stage2_tp': None,
            'stage2_fp': None,
            'stage2_tn': None,
            'stage2_fn': None
        })

    # Update temperature once per batch and push it into the criterion
    temperature = update_temperature(
        temperature,
        config.hill_climbing.temperature_decay
    )
    acceptance.temperature = temperature

    # Status update
    print(f'\nBatch {batch_num} complete:')
    print(f'  Ensemble size: {len(current_ensemble)}')
    print(f'  Best Stage 1 AUC: {best_auc:.4f}')
    print(f'  Temperature: {temperature:.6f}')
    print(f'  Plateau counter: {plateau_counter}')

    # Check for early stopping (evaluated once per batch, not per candidate)
    if plateau_counter >= config.hill_climbing.plateau_iterations:
        logger.info(f'Plateau detected after {plateau_counter} iterations. Stopping.')
        print('\n⚠ Plateau detected. Stopping early.')
        break

print('\n' + '=' * 70)
print('Training complete')
print(f'Final ensemble size: {len(current_ensemble)}')
print(f'Best Stage 1 AUC: {best_auc:.4f}')

## Save Results

In [None]:
# Persist the final ensemble and a JSON summary of the run.
# `joblib` and `json` are imported in the notebook's top import cell.

# Save ensemble models: one joblib file per member, numbered by position
ensemble_dir = run_dir / 'ensemble_stage1_models'
ensemble_dir.mkdir(parents=True, exist_ok=True)

for idx, model in enumerate(current_ensemble):
    model_path = ensemble_dir / f'model_{idx}.joblib'
    joblib.dump(model, model_path)

print(f'✓ Saved {len(current_ensemble)} models to {ensemble_dir}')

# Save metadata
metadata = {
    'run_id': run_id,
    'timestamp': datetime.now().isoformat(),
    'ensemble_size': len(current_ensemble),
    # Highest stage 1 AUC seen during the run.
    'best_stage1_auc': best_auc,
    # Stage 1 AUC of the ensemble that is actually saved above.
    # NOTE(review): this can be lower than best_stage1_auc if the acceptance
    # criterion ever accepts a candidate that scores below the best — confirm.
    'final_stage1_auc': current_stage1_auc,
    'config': config.__dict__
}

# default=str: values json cannot encode natively are written as str(value).
metadata_path = run_dir / 'metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2, default=str)

print(f'Saved metadata to {metadata_path}')