## Setup

In [None]:
# GPU configuration - pin this kernel to the P100 (CUDA device index 0)
import os
import sys

# Directory holding the CUDA/cuDNN shared libraries installed via pip in the project venv.
venv_cuda_libs = '/mnt/arkk/kaggle/diabetes-prediction/.venv/lib/python3.12/site-packages/nvidia/cudnn/lib'

# Append the directory to LD_LIBRARY_PATH only if it is not already listed, so that
# re-running this cell does not keep growing the variable with duplicate entries.
# NOTE(review): the dynamic loader normally reads LD_LIBRARY_PATH at process start;
# confirm that setting it from inside the running kernel has the intended effect.
existing_ld_path = os.environ.get('LD_LIBRARY_PATH')
if existing_ld_path is None:
    os.environ['LD_LIBRARY_PATH'] = venv_cuda_libs
elif venv_cuda_libs not in existing_ld_path.split(':'):
    os.environ['LD_LIBRARY_PATH'] = f"{existing_ld_path}:{venv_cuda_libs}"

# Both variables are set before TensorFlow is imported below.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # 0=P100, 1=GTX1080
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Verify that TensorFlow can see the selected GPU
import tensorflow as tf
print(f"TensorFlow version: {tf.__version__}")
gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs available: {gpus}")
print(f"Number of GPUs: {len(gpus)}")

In [None]:
# Standard imports
# NOTE: `sys` is also imported in the GPU setup cell above; the repeat is harmless.
import sys
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.metrics import roc_auc_score

# Put the current working directory (resolved to an absolute path) at the front of
# sys.path so the local `functions` package imported below can be found.
sys.path.insert(0, str(Path('.').resolve()))

# Import ensemble modules - use same optimization function as online path!
# create_data_splits: builds the train-pool / validation splits used below.
# optimize_and_update_config: runs the stage-2 hyperparameter search.
from functions.ensemble_initialization import create_data_splits
from functions.ensemble_stage2_training import optimize_and_update_config
from functions import ensemble_config

### Configuration

In [None]:
# Run-wide constants
RANDOM_STATE = 315
LABEL = 'diagnosed_diabetes'

# Hyperparameter search budget - deliberately small so the search cannot hang.
MAX_TRIALS = 10  # was 50, which led to 7+ hour hangs
EXECUTIONS_PER_TRIAL = 1  # was 3; statistical confidence is not critical here

# Location of the database holding ensemble state
DB_PATH = '../data/ensemble_training.db'

# Echo the active settings so they are recorded in the notebook output.
config_summary = [
    "Configuration:",
    f"  Max trials: {MAX_TRIALS}",
    f"  Executions per trial: {EXECUTIONS_PER_TRIAL}",
    f"  Database: {DB_PATH}",
    "  Estimated time: ~10-15 minutes on P100 GPU",
    "  (Old settings: 50 trials × 3 exec = 7+ hours!)",
]
print("\n".join(config_summary))

## Data Preparation

In [None]:
# Fetch the training CSV and discard exact duplicate rows in one chained expression
train_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_train.csv'
train_df = pd.read_csv(train_df_path).drop_duplicates()

# Report the frame size and the relative frequency of each label value
print(f'Training data shape: {train_df.shape}')
print('Class distribution:')
label_shares = train_df[LABEL].value_counts(normalize=True)
print(label_shares)

In [None]:
# Build the 60/20/20 split with the same helper the hill-climbing run uses
splits = create_data_splits(train_df, LABEL, RANDOM_STATE)
X_train_pool, X_val_s1, X_val_s2, y_train_pool, y_val_s1, y_val_s2 = splits

# Show the shape of each feature split next to its nominal share of the data
print("\nData splits (same as hill climbing):")
for split_name, split_features, share in (
    ("X_train_pool", X_train_pool, "60%"),
    ("X_val_s1", X_val_s1, "20%"),
    ("X_val_s2", X_val_s2, "20%"),
):
    print(f"  {split_name}: {split_features.shape} ({share})")

## Load Stage 1 Models from Prior Ensemble Run

In [None]:
import joblib
from glob import glob

# Directory holding the Stage 1 models saved by the prior ensemble run
models_path = "../models/run_20251208_045148/ensemble_stage1_models"

# Number of Stage 1 models to load for this test
N_STAGE1_MODELS = 5

print(f"Loading models from: {models_path}\n")

# Find all model files (the `model_*` pattern is meant to exclude the founder);
# sorting keeps the selection deterministic across runs.
model_files = sorted(glob(f"{models_path}/model_*.joblib"))

# Fail fast with a clear message instead of silently continuing with zero models
# and hitting an obscure error later inside the optimization step.
if not model_files:
    raise FileNotFoundError(
        f"No Stage 1 model files matching 'model_*.joblib' found in: {models_path}"
    )

selected_files = model_files[:N_STAGE1_MODELS]

# Load the selected models and report each one's AUC on the stage-1 validation split
stage1_models = []

for model_file in selected_files:
    model_name = Path(model_file).stem
    # NOTE: joblib.load unpickles arbitrary objects - only load trusted model files.
    model = joblib.load(model_file)

    # Score with class-1 probabilities when available; otherwise fall back to
    # decision_function (e.g., LinearSVC), which roc_auc_score also accepts.
    if hasattr(model, 'predict_proba'):
        pred = model.predict_proba(X_val_s1)[:, 1]
    else:
        pred = model.decision_function(X_val_s1)

    auc = roc_auc_score(y_val_s1, pred)

    stage1_models.append(model)
    print(f"  {model_name}: AUC = {auc:.6f}")

print(f"\n{len(stage1_models)} Stage 1 models loaded!")
print(f"Model files: {[Path(f).stem for f in selected_files]}")

## Test Fast Optimization Settings

**Tests the NEW fast optimization settings (10 trials × 1 exec = ~10-15 min)**

This notebook validates:
- Fast optimization completes in reasonable time (~10-15 min vs 7+ hours)
- Uses SAME `optimize_and_update_config()` function as online hill climbing
- Results are good enough for production use

**Previous issue:**
- 50 trials × 3 executions = 150 training runs
- Each trial took ~1-2 minutes
- Total time: 7+ hours (HUNG the overnight training)

**New settings:**
- 10 trials × 1 execution = 10 training runs  
- Each trial: ~1 minute
- Total time: ~10-15 minutes (PRACTICAL!)

Using the shared `optimize_and_update_config()` function ensures:
- Identical search space
- Same data preparation (conservative 95/5 split)
- Same architecture builders (pyramid/funnel)
- Results directly applicable to production

In [None]:
import time

banner = "=" * 80

print(banner)
print("STARTING OPTIMIZATION TEST")
print(f"Settings: {MAX_TRIALS} trials × {EXECUTIONS_PER_TRIAL} execution(s)")
print("Expected time: ~10-15 minutes")
print(banner)

# Time the optimization with a monotonic clock: unlike time.time(), perf_counter()
# is unaffected by system clock adjustments during a multi-minute run.
start_time = time.perf_counter()

# Run optimization - this will update ensemble_config.STAGE2_DNN_CONFIG in-memory
optimized_config = optimize_and_update_config(
    ensemble_models=stage1_models,
    X_val_s1=X_val_s1,
    y_val_s1=y_val_s1,
    X_val_s2=X_val_s2,
    y_val_s2=y_val_s2,
    max_trials=MAX_TRIALS,
    executions_per_trial=EXECUTIONS_PER_TRIAL
)

elapsed_time = time.perf_counter() - start_time

# Report total wall time and the mean time per configured trial
print("\n" + banner)
print("OPTIMIZATION COMPLETE")
print(banner)
print(f"Elapsed time: {elapsed_time/60:.1f} minutes ({elapsed_time:.0f} seconds)")
print(f"Average time per trial: {elapsed_time/MAX_TRIALS:.1f} seconds")
print(banner)

## Results

In [None]:
from pprint import pformat

# Extract key hyperparameters from the config returned by the optimization cell
layers = optimized_config['architecture']['hidden_layers']
lr = optimized_config['training']['learning_rate']

print("\nKey Hyperparameters:")
print("-" * 40)
print(f"Number of layers: {len(layers)}")
print("Layer configuration:")
for i, layer in enumerate(layers, 1):
    print(f"  Layer {i}: {layer['units']} units, dropout={layer['dropout']:.3f}")
print(f"Learning rate: {lr:.6f}")
print("-" * 40)

# Emit the config in a copy-paste form for ensemble_config.py, as the Summary describes.
# NOTE(review): assumes `optimized_config` is the complete STAGE2_DNN_CONFIG dict - confirm
# against optimize_and_update_config() before pasting.
print("\nCopy-paste config for ensemble_config.py:")
print(f"STAGE2_DNN_CONFIG = {pformat(optimized_config, sort_dicts=False)}")

## Summary

✅ **Test notebook complete!**

### What we did:

1. ✅ Loaded training data (60/20/20 split)
2. ✅ Loaded actual Stage 1 models from batch 1
3. ✅ Ran **`optimize_and_update_config()`** (same as online hill climbing)
4. ✅ Displayed optimized hyperparameters
5. ✅ Generated copy-paste config for `ensemble_config.py`

### Key points:

- **Shared code path**: Uses identical optimization function as online training
- **Real data**: Uses actual Stage 1 models from hill climbing run
- **GPU-optimized**: ~10-15 min on P100 GPU with the fast settings (10 trials × 1 execution)
- **Focused search**: pyramid/funnel architectures, 2-3 layers, 64-256 units
- **Conservative split**: 95% training (266k), 5% validation (14k)

### Next steps:

1. Copy the `STAGE2_DNN_CONFIG` above into `ensemble_config.py`
2. Run full hill climbing with GPU enabled
3. Architecture will be automatically optimized at batches 10, 20, 30+
4. Compare performance against baseline