# Gold Prediction SubModel Training - Temporal Context Transformer
## Attempt 1

**Generated by**: builder_model agent  
**Architecture**: Asymmetric Transformer Autoencoder with Masked Reconstruction  
**Input**: 14 features (5 base + 9 submodel outputs)  
**Output**: 1 column (temporal_context_score, 0-1)  
**Target params**: ~6,200 (3K-10K range)  
**Window size**: 5-20 days (Optuna search)  

Self-contained: Data fetch → Preprocessing → Training → Evaluation → Save results

## Cell 1: Imports and Setup

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

# Set random seeds
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"=== Gold SubModel Training: temporal_context attempt 1 ===")
print(f"Started: {datetime.now().isoformat()}")
print(f"Device: {device}")
print(f"PyTorch version: {torch.__version__}")

=== Gold SubModel Training: temporal_context attempt 1 ===
Started: 2026-02-18T01:38:06.899896
Device: cuda
PyTorch version: 2.7.1+cu118


  from .autonotebook import tqdm as notebook_tqdm


## Cell 2: Data Fetching (API-based, self-contained)

In [2]:
def fetch_data():
    """
    Fetch and prepare data for temporal context transformer.
    Returns: (train_df, val_df, test_df, full_df, scaler)
    """
    print("\n" + "="*80)
    print("DATA FETCHING")
    print("="*80)
    
    # ===== 1. Import data APIs =====
    import yfinance as yf
    try:
        from fredapi import Fred
    except ImportError:
        import subprocess
        print("Installing fredapi...")
        subprocess.run(["pip", "install", "fredapi"], check=True)
        from fredapi import Fred
    
    # Get FRED API key from environment variable
    FRED_API_KEY = "3ffb68facdf6321e180e380c00e909c8"
    fred = Fred(api_key=FRED_API_KEY)
    
    # ===== 2. Fetch base features (FRED + Yahoo) =====
    print("\n[1/5] Fetching base features from FRED and Yahoo...")
    
    # Real interest rate (10Y TIPS)
    real_rate = fred.get_series('DFII10', observation_start='2015-01-01')
    real_rate_df = pd.DataFrame({'real_rate_real_rate': real_rate})
    print(f"   Real rate: {len(real_rate_df)} rows")
    
    # DXY (Dollar Index) - using FRED for stability
    dxy = fred.get_series('DTWEXBGS', observation_start='2015-01-01')
    dxy_df = pd.DataFrame({'dxy_dxy': dxy})
    print(f"   DXY: {len(dxy_df)} rows")
    
    # VIX
    vix = fred.get_series('VIXCLS', observation_start='2015-01-01')
    vix_df = pd.DataFrame({'vix_vix': vix})
    print(f"   VIX: {len(vix_df)} rows")
    
    # Yield spread (10Y - 2Y)
    dgs10 = fred.get_series('DGS10', observation_start='2015-01-01')
    dgs2 = fred.get_series('DGS2', observation_start='2015-01-01')
    yield_spread = dgs10 - dgs2
    yield_spread_df = pd.DataFrame({'yield_curve_yield_spread': yield_spread})
    print(f"   Yield spread: {len(yield_spread_df)} rows")
    
    # Inflation expectation (10Y Breakeven)
    inflation_exp = fred.get_series('T10YIE', observation_start='2015-01-01')
    inflation_exp_df = pd.DataFrame({'inflation_expectation_inflation_expectation': inflation_exp})
    print(f"   Inflation expectation: {len(inflation_exp_df)} rows")
    
    # Merge base features
    base_df = real_rate_df.join(dxy_df, how='outer')
    base_df = base_df.join(vix_df, how='outer')
    base_df = base_df.join(yield_spread_df, how='outer')
    base_df = base_df.join(inflation_exp_df, how='outer')
    base_df = base_df.sort_index()
    
    # Forward fill and backward fill (max 5 days)
    base_df = base_df.ffill(limit=5)
    base_df = base_df.bfill(limit=5)
    base_df = base_df.dropna()
    
    print(f"   Base features merged: {len(base_df)} rows, {base_df.shape[1]} columns")
    
    # ===== 3. Transform base features =====
    print("\n[2/5] Transforming base features (diff)...")
    
    base_features = pd.DataFrame(index=base_df.index)
    base_features['real_rate_change'] = base_df['real_rate_real_rate'].diff()
    base_features['dxy_change'] = base_df['dxy_dxy'].diff()
    base_features['vix'] = base_df['vix_vix']  # No transformation (already stationary)
    base_features['yield_spread_change'] = base_df['yield_curve_yield_spread'].diff()
    base_features['inflation_exp_change'] = base_df['inflation_expectation_inflation_expectation'].diff()
    
    print(f"   Base features transformed: {list(base_features.columns)}")
    
    # ===== 4. Load submodel outputs from Kaggle Dataset =====
    print("\n[3/5] Loading submodel outputs from Kaggle Dataset...")
    
    # Note: In Kaggle environment, the dataset is mounted at /kaggle/input/
    # For local testing, adjust the path
    if os.path.exists("/kaggle/input/gold-prediction-submodels/"):
        submodel_path = "/kaggle/input/gold-prediction-submodels/"
    else:
        submodel_path = "../../data/submodel_outputs/"
    
    # VIX submodel (2 columns)
    vix_sub = pd.read_csv(submodel_path + "vix.csv")
    vix_sub['date'] = pd.to_datetime(vix_sub['date'])
    vix_sub = vix_sub.set_index('date').sort_index()
    vix_features = vix_sub[['vix_regime_probability', 'vix_mean_reversion_z']].copy()
    print(f"   VIX: {len(vix_features)} rows")
    
    # Technical submodel (3 columns) - handle timezone
    tech_sub = pd.read_csv(submodel_path + "technical.csv")
    tech_sub['date'] = tech_sub['date'].str[:10]  # Extract YYYY-MM-DD
    tech_sub['date'] = pd.to_datetime(tech_sub['date'])
    tech_sub = tech_sub.set_index('date').sort_index()
    tech_features = tech_sub[['tech_trend_regime_prob', 'tech_mean_reversion_z', 'tech_volatility_regime']].copy()
    print(f"   Technical: {len(tech_features)} rows")
    
    # Cross-asset submodel (2 columns)
    xasset_sub = pd.read_csv(submodel_path + "cross_asset.csv")
    xasset_sub['Date'] = pd.to_datetime(xasset_sub['Date'])
    xasset_sub = xasset_sub.set_index('Date').sort_index()
    xasset_features = xasset_sub[['xasset_regime_prob', 'xasset_divergence']].copy()
    print(f"   Cross-asset: {len(xasset_features)} rows")
    
    # ETF flow submodel (1 column)
    etf_sub = pd.read_csv(submodel_path + "etf_flow.csv")
    etf_sub['Date'] = pd.to_datetime(etf_sub['Date'])
    etf_sub = etf_sub.set_index('Date').sort_index()
    etf_features = etf_sub[['etf_regime_prob']].copy()
    print(f"   ETF flow: {len(etf_features)} rows")
    
    # Options market submodel (1 column) - handle timezone
    options_sub = pd.read_csv(submodel_path + "options_market.csv")
    options_sub['Date'] = options_sub['Date'].str[:10]  # Extract YYYY-MM-DD
    options_sub['Date'] = pd.to_datetime(options_sub['Date'])
    options_sub = options_sub.set_index('Date').sort_index()
    options_features = options_sub[['options_risk_regime_prob']].copy()
    print(f"   Options: {len(options_features)} rows")
    
    # ===== 5. Merge all features =====
    print("\n[4/5] Merging all features...")
    
    merged_df = base_features.copy()
    merged_df = merged_df.join(vix_features, how='inner')
    merged_df = merged_df.join(tech_features, how='inner')
    merged_df = merged_df.join(xasset_features, how='inner')
    merged_df = merged_df.join(etf_features, how='inner')
    merged_df = merged_df.join(options_features, how='inner')
    
    # Handle NaN values
    merged_df = merged_df.ffill(limit=5)
    merged_df = merged_df.bfill()
    merged_df = merged_df.dropna()
    
    # Remove infinite values
    inf_mask = np.isinf(merged_df.values).any(axis=1)
    if inf_mask.any():
        merged_df = merged_df[~inf_mask]
    
    print(f"   Final merged: {len(merged_df)} rows, {merged_df.shape[1]} columns")
    print(f"   Date range: {merged_df.index.min()} to {merged_df.index.max()}")
    
    # Verify we have exactly 14 columns
    assert merged_df.shape[1] == 14, f"Expected 14 columns, got {merged_df.shape[1]}"
    
    # ===== 6. Time-series split (70/15/15) =====
    print("\n[5/5] Splitting data (70/15/15)...")
    
    n = len(merged_df)
    train_end = int(n * 0.70)
    val_end = int(n * 0.85)
    
    train_df = merged_df.iloc[:train_end].copy()
    val_df = merged_df.iloc[train_end:val_end].copy()
    test_df = merged_df.iloc[val_end:].copy()
    
    print(f"   Train: {len(train_df)} rows ({train_df.index.min()} to {train_df.index.max()})")
    print(f"   Val:   {len(val_df)} rows ({val_df.index.min()} to {val_df.index.max()})")
    print(f"   Test:  {len(test_df)} rows ({test_df.index.min()} to {test_df.index.max()})")
    
    # ===== 7. Standardization =====
    print("\nStandardizing features...")
    
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_df)
    val_scaled = scaler.transform(val_df)
    test_scaled = scaler.transform(test_df)
    full_scaled = scaler.transform(merged_df)
    
    # Convert back to DataFrame
    train_df = pd.DataFrame(train_scaled, index=train_df.index, columns=train_df.columns)
    val_df = pd.DataFrame(val_scaled, index=val_df.index, columns=val_df.columns)
    test_df = pd.DataFrame(test_scaled, index=test_df.index, columns=test_df.columns)
    full_df = pd.DataFrame(full_scaled, index=merged_df.index, columns=merged_df.columns)
    
    print("   [OK] Features standardized using train set statistics")
    
    print("\n" + "="*80)
    print("DATA FETCHING COMPLETE")
    print("="*80)
    
    return train_df, val_df, test_df, full_df, scaler

# Fetch data
train_data, val_data, test_data, full_data, scaler = fetch_data()
print(f"\nData ready: train={len(train_data)}, val={len(val_data)}, test={len(test_data)}, full={len(full_data)}")


DATA FETCHING



[1/5] Fetching base features from FRED and Yahoo...


   Real rate: 2901 rows


   DXY: 2897 rows


   VIX: 2903 rows


   Yield spread: 2901 rows


   Inflation expectation: 2902 rows
   Base features merged: 2902 rows, 5 columns

[2/5] Transforming base features (diff)...
   Base features transformed: ['real_rate_change', 'dxy_change', 'vix', 'yield_spread_change', 'inflation_exp_change']

[3/5] Loading submodel outputs from Kaggle Dataset...
   VIX: 2858 rows
   Technical: 2860 rows
   Cross-asset: 2774 rows
   ETF flow: 2839 rows
   Options: 2798 rows

[4/5] Merging all features...
   Final merged: 2716 rows, 14 columns
   Date range: 2015-01-30 00:00:00 to 2026-02-12 00:00:00

[5/5] Splitting data (70/15/15)...
   Train: 1901 rows (2015-01-30 00:00:00 to 2022-10-04 00:00:00)
   Val:   407 rows (2022-10-05 00:00:00 to 2024-06-18 00:00:00)
   Test:  408 rows (2024-06-20 00:00:00 to 2026-02-12 00:00:00)

Standardizing features...
   [OK] Features standardized using train set statistics

DATA FETCHING COMPLETE

Data ready: train=1901, val=407, test=408, full=2716


## Cell 3: Windowing Function

In [3]:
def create_windows(data, window_size):
    """
    Create sliding windows from time-series data.
    
    Args:
        data: DataFrame (N, 14) with date index
        window_size: int, number of time steps per window
    
    Returns:
        windows: tensor (N-W+1, W, 14)
        dates: list of dates for each window (end date)
    """
    values = data.values
    n_samples = len(values)
    
    if n_samples < window_size:
        raise ValueError(f"Not enough samples ({n_samples}) for window size {window_size}")
    
    windows = []
    dates = []
    
    for i in range(window_size - 1, n_samples):
        window = values[i - window_size + 1:i + 1]
        windows.append(window)
        dates.append(data.index[i])
    
    windows = np.array(windows)
    windows_tensor = torch.FloatTensor(windows)
    
    return windows_tensor, dates

print("Windowing function defined.")

Windowing function defined.


## Cell 4: PyTorch Dataset

In [4]:
class WindowDataset(Dataset):
    """
    PyTorch Dataset for windowed time-series data.
    """
    def __init__(self, windows):
        """
        Args:
            windows: tensor (N, W, 14)
        """
        self.windows = windows
    
    def __len__(self):
        return len(self.windows)
    
    def __getitem__(self, idx):
        return self.windows[idx]

print("WindowDataset class defined.")

WindowDataset class defined.


## Cell 5: Model Definition - Temporal Context Transformer

In [5]:
class TemporalContextTransformer(nn.Module):
    """
    Asymmetric Transformer Autoencoder for temporal context extraction.
    
    Architecture:
      Input (batch, seq, 14)
        -> Input Projection (14 -> d_model)
        -> Learned Positional Encoding
        -> TransformerEncoder (L layers, H heads)
        -> Mean Pool over time
        -> Bottleneck Linear (d_model -> 1)
        -> Sigmoid -> context_score (0-1)
      
      Reconstruction branch (training only):
        -> Bottleneck (1) -> Expand (d_model)
        -> Repeat to seq_len
        -> Output Projection (d_model -> 14)
    """
    
    def __init__(self, input_dim=14, d_model=24, n_heads=2, n_layers=1,
                 ffn_ratio=2, dropout=0.2, max_seq_len=20):
        super().__init__()
        
        # Input projection
        self.input_proj = nn.Linear(input_dim, d_model)
        
        # Learned positional encoding
        self.pos_encoding = nn.Embedding(max_seq_len, d_model)
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * ffn_ratio,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True  # Pre-Norm for stability
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        # Bottleneck
        self.bottleneck = nn.Sequential(
            nn.Linear(d_model, 1),
            nn.Sigmoid()
        )
        
        # Lightweight Decoder (reconstruction branch)
        self.decoder_expand = nn.Linear(1, d_model)
        self.decoder_output = nn.Linear(d_model, input_dim)
        
        # Dropout for input
        self.input_dropout = nn.Dropout(dropout)
        
    def encode(self, x):
        """
        x: (batch, seq_len, input_dim)
        Returns: context_score (batch, 1), pooled (batch, d_model)
        """
        batch_size, seq_len, _ = x.shape
        
        # Input projection
        h = self.input_proj(x)  # (batch, seq, d_model)
        
        # Add positional encoding
        positions = torch.arange(seq_len, device=x.device)
        h = h + self.pos_encoding(positions).unsqueeze(0)
        
        # Apply input dropout
        h = self.input_dropout(h)
        
        # Transformer encoder
        encoded = self.encoder(h)  # (batch, seq, d_model)
        
        # Mean pool over time
        pooled = encoded.mean(dim=1)  # (batch, d_model)
        
        # Bottleneck -> context score
        context_score = self.bottleneck(pooled)  # (batch, 1)
        
        return context_score, pooled
    
    def decode(self, context_score, seq_len):
        """
        Reconstruct from bottleneck for masked reconstruction loss.
        context_score: (batch, 1)
        Returns: (batch, seq_len, input_dim)
        """
        # Expand bottleneck
        expanded = self.decoder_expand(context_score)  # (batch, d_model)
        
        # Repeat to sequence length
        expanded = expanded.unsqueeze(1).repeat(1, seq_len, 1)  # (batch, seq, d_model)
        
        # Output projection
        reconstructed = self.decoder_output(expanded)  # (batch, seq, input_dim)
        
        return reconstructed
    
    def forward(self, x, mask_ratio=0.2):
        """
        Forward pass with masked reconstruction.
        x: (batch, seq_len, input_dim)
        Returns: context_score, reconstructed, mask
        """
        batch_size, seq_len, input_dim = x.shape
        
        # Create random mask (mask time steps, not features)
        mask = torch.rand(batch_size, seq_len, device=x.device) < mask_ratio
        # Ensure at least 1 step is masked and 1 is unmasked
        mask[:, 0] = False  # Keep first step
        if seq_len > 2:
            mask[:, -1] = True   # Always mask last step
        
        # Apply mask (zero out masked positions)
        x_masked = x.clone()
        x_masked[mask] = 0.0
        
        # Encode
        context_score, pooled = self.encode(x_masked)
        
        # Decode (reconstruct)
        reconstructed = self.decode(context_score, seq_len)
        
        return context_score, reconstructed, mask
    
    def extract(self, x):
        """
        Extract context score for inference (no masking).
        x: (batch, seq_len, input_dim)
        Returns: context_score (batch, 1)
        """
        context_score, _ = self.encode(x)
        return context_score

# Test model instantiation
test_model = TemporalContextTransformer()
n_params = sum(p.numel() for p in test_model.parameters())
print(f"\nModel defined. Test model parameters: {n_params:,}")
print(f"Expected range: 3,000-10,000 params")
del test_model


Model defined. Test model parameters: 6,135
Expected range: 3,000-10,000 params


## Cell 6: Loss Function and Training Loop

In [6]:
def masked_reconstruction_loss(original, reconstructed, mask):
    """
    MSE loss computed ONLY on masked time steps.
    
    original: (batch, seq_len, 14)
    reconstructed: (batch, seq_len, 14)
    mask: (batch, seq_len) -- True where masked
    """
    # Expand mask to feature dimension
    mask_expanded = mask.unsqueeze(-1).expand_as(original)  # (batch, seq, 14)
    
    # Compute MSE only on masked positions
    diff = (original - reconstructed) ** 2
    masked_diff = diff[mask_expanded]
    
    if masked_diff.numel() == 0:
        return torch.tensor(0.0, requires_grad=True, device=original.device)
    
    return masked_diff.mean()


def train_model(model, train_loader, val_loader, config):
    """
    Train the model with early stopping.
    
    Returns: model, metrics_dict
    """
    model = model.to(device)
    
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config.get('weight_decay', 0.01),
        betas=(0.9, 0.999),
        eps=1e-8
    )
    
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=20, T_mult=2
    )
    
    best_val_loss = float('inf')
    best_train_loss = float('inf')
    patience_counter = 0
    best_state = None
    
    max_epochs = config.get('max_epochs', 200)
    patience = config.get('patience', 10)
    mask_ratio = config.get('mask_ratio', 0.2)
    
    for epoch in range(max_epochs):
        # Train
        model.train()
        train_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            
            optimizer.zero_grad()
            context_score, reconstructed, mask = model(batch, mask_ratio=mask_ratio)
            loss = masked_reconstruction_loss(batch, reconstructed, mask)
            
            loss.backward()
            # Gradient clipping
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        
        # Validate
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                context_score, reconstructed, mask = model(batch, mask_ratio=mask_ratio)
                loss = masked_reconstruction_loss(batch, reconstructed, mask)
                val_loss += loss.item()
        
        val_loss /= len(val_loader)
        
        # Step scheduler
        scheduler.step()
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_train_loss = train_loss
            patience_counter = 0
            best_state = {k: v.clone().cpu() for k, v in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"   Early stopping at epoch {epoch+1}")
                break
        
        if (epoch + 1) % 20 == 0:
            print(f"   Epoch {epoch+1}/{max_epochs}: train_loss={train_loss:.6f}, val_loss={val_loss:.6f}")
    
    # Restore best weights
    if best_state:
        model.load_state_dict(best_state)
    
    overfit_ratio = best_val_loss / (best_train_loss + 1e-10)
    
    metrics = {
        'train_loss': best_train_loss,
        'val_loss': best_val_loss,
        'overfit_ratio': overfit_ratio,
        'epochs_trained': epoch + 1
    }
    
    return model, metrics

print("Training functions defined.")

Training functions defined.


## Cell 7: Optuna HPO

In [7]:
def run_hpo(train_data, val_data, n_trials=30, timeout=1800):
    """
    Run hyperparameter optimization using Optuna.
    
    Returns: best_params, best_value, n_completed_trials
    """
    print("\n" + "="*80)
    print("HYPERPARAMETER OPTIMIZATION (OPTUNA)")
    print("="*80)
    
    def objective(trial):
        # Sample hyperparameters
        window_size = trial.suggest_categorical('window_size', [5, 10, 15, 20])

        # Flatten parameter space: (d_model, n_heads) combinations
        model_config = trial.suggest_categorical('model_config', [
            (16, 2),
            (24, 2),
            (24, 4),
            (32, 2),
            (32, 4)
        ])
        d_model, n_heads = model_config

        n_layers = trial.suggest_int('n_layers', 1, 2)
        ffn_ratio = trial.suggest_categorical('ffn_ratio', [2, 3])
        dropout = trial.suggest_float('dropout', 0.1, 0.3)
        mask_ratio = trial.suggest_float('mask_ratio', 0.15, 0.30)
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 3e-3, log=True)
        weight_decay = trial.suggest_float('weight_decay', 0.01, 0.1, log=True)
        patience = trial.suggest_categorical('patience', [7, 10, 15])
        
        config = {
            'window_size': window_size,
            'd_model': d_model,
            'n_heads': n_heads,
            'n_layers': n_layers,
            'ffn_ratio': ffn_ratio,
            'dropout': dropout,
            'mask_ratio': mask_ratio,
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'patience': patience,
            'max_epochs': 200,
            'batch_size': 64
        }
        
        # Create windows
        try:
            train_windows, _ = create_windows(train_data, window_size)
            val_windows, _ = create_windows(val_data, window_size)
        except ValueError as e:
            print(f"   Trial failed: {e}")
            return float('inf')
        
        # Create dataloaders
        train_dataset = WindowDataset(train_windows)
        val_dataset = WindowDataset(val_windows)
        
        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=False)
        val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)
        
        # Build model
        model = TemporalContextTransformer(
            input_dim=14,
            d_model=d_model,
            n_heads=n_heads,
            n_layers=n_layers,
            ffn_ratio=ffn_ratio,
            dropout=dropout,
            max_seq_len=20
        )
        
        # Train
        model, metrics = train_model(model, train_loader, val_loader, config)
        
        val_loss = metrics['val_loss']
        overfit_ratio = metrics['overfit_ratio']
        
        # Pruning
        trial.report(val_loss, step=metrics['epochs_trained'])
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        # Penalize overfitting
        penalty = max(0, overfit_ratio - 1.5) * 0.1
        
        return val_loss + penalty
    
    # Run optimization
    study = optuna.create_study(
        direction='minimize',
        sampler=TPESampler(seed=SEED),
        pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10)
    )
    
    study.optimize(objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)
    
    print("\n" + "="*80)
    print("OPTUNA RESULTS")
    print("="*80)
    print(f"Number of completed trials: {len(study.trials)}")
    print(f"Best trial value: {study.best_value:.6f}")
    print(f"Best parameters:")
    for key, value in study.best_params.items():
        print(f"   {key}: {value}")
    
    return study.best_params, study.best_value, len(study.trials)

print("HPO function defined.")

HPO function defined.


## Cell 8: Run HPO

In [8]:
# Run hyperparameter optimization
best_params, best_value, n_completed = run_hpo(
    train_data, val_data,
    n_trials=30,
    timeout=1800  # 30 minutes
)

print(f"\n[OK] HPO complete. Best validation loss: {best_value:.6f}")

[32m[I 2026-02-18 01:38:12,001][0m A new study created in memory with name: no-name-be31da77-f234-4df0-9332-e81c9dd8ab71[0m



HYPERPARAMETER OPTIMIZATION (OPTUNA)


  0%|          | 0/30 [00:00<?, ?it/s]

   Epoch 20/200: train_loss=1.018332, val_loss=0.942577


                                      



  0%|          | 0/30 [00:12<?, ?it/s]

Best trial: 0. Best value: 0.869323:   0%|          | 0/30 [00:12<?, ?it/s]

Best trial: 0. Best value: 0.869323:   3%|▎         | 1/30 [00:12<05:57, 12.33s/it]

Best trial: 0. Best value: 0.869323:   3%|▎         | 1/30 [00:12<05:57, 12.33s/it, 12.32/1800 seconds]

   Early stopping at epoch 27
[32m[I 2026-02-18 01:38:24,327][0m Trial 0 finished with value: 0.8693226405552456 and parameters: {'window_size': 10, 'model_config': (32, 2), 'n_layers': 2, 'ffn_ratio': 3, 'dropout': 0.26648852816008434, 'mask_ratio': 0.1818508666017414, 'learning_rate': 0.00018559980846490597, 'weight_decay': 0.015254729458052608, 'patience': 10}. Best is trial 0 with value: 0.8693226405552456.[0m


                                                                                                       



Best trial: 0. Best value: 0.869323:   3%|▎         | 1/30 [00:17<05:57, 12.33s/it, 12.32/1800 seconds]

Best trial: 1. Best value: 0.848066:   3%|▎         | 1/30 [00:17<05:57, 12.33s/it, 12.32/1800 seconds]

Best trial: 1. Best value: 0.848066:   7%|▋         | 2/30 [00:17<03:40,  7.88s/it, 12.32/1800 seconds]

Best trial: 1. Best value: 0.848066:   7%|▋         | 2/30 [00:17<03:40,  7.88s/it, 17.10/1800 seconds]

   Early stopping at epoch 14
[32m[I 2026-02-18 01:38:29,102][0m Trial 1 finished with value: 0.8480663469859532 and parameters: {'window_size': 10, 'model_config': (24, 4), 'n_layers': 2, 'ffn_ratio': 3, 'dropout': 0.1341048247374583, 'mask_ratio': 0.1597577389477919, 'learning_rate': 0.00252126790477792, 'weight_decay': 0.0923915031962725, 'patience': 7}. Best is trial 1 with value: 0.8480663469859532.[0m


   Epoch 20/200: train_loss=0.910082, val_loss=0.915929


                                                                                                       



Best trial: 1. Best value: 0.848066:   7%|▋         | 2/30 [00:26<03:40,  7.88s/it, 17.10/1800 seconds]

Best trial: 2. Best value: 0.810552:   7%|▋         | 2/30 [00:26<03:40,  7.88s/it, 17.10/1800 seconds]

Best trial: 2. Best value: 0.810552:  10%|█         | 3/30 [00:26<03:55,  8.73s/it, 17.10/1800 seconds]

Best trial: 2. Best value: 0.810552:  10%|█         | 3/30 [00:26<03:55,  8.73s/it, 26.83/1800 seconds]

   Early stopping at epoch 29
[32m[I 2026-02-18 01:38:38,830][0m Trial 2 finished with value: 0.8105517285210746 and parameters: {'window_size': 5, 'model_config': (24, 2), 'n_layers': 2, 'ffn_ratio': 2, 'dropout': 0.2939169255529117, 'mask_ratio': 0.26626992350416717, 'learning_rate': 0.002442046084491142, 'weight_decay': 0.07849235338159358, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.982965, val_loss=0.886789


                                                                                                       



Best trial: 2. Best value: 0.810552:  10%|█         | 3/30 [00:34<03:55,  8.73s/it, 26.83/1800 seconds]

Best trial: 2. Best value: 0.810552:  10%|█         | 3/30 [00:34<03:55,  8.73s/it, 26.83/1800 seconds]

Best trial: 2. Best value: 0.810552:  13%|█▎        | 4/30 [00:34<03:36,  8.31s/it, 26.83/1800 seconds]

Best trial: 2. Best value: 0.810552:  13%|█▎        | 4/30 [00:34<03:36,  8.31s/it, 34.50/1800 seconds]

   Early stopping at epoch 32
[32m[I 2026-02-18 01:38:46,502][0m Trial 3 finished with value: 0.8557278939655849 and parameters: {'window_size': 20, 'model_config': (24, 2), 'n_layers': 1, 'ffn_ratio': 2, 'dropout': 0.29737738732010344, 'mask_ratio': 0.2658367153944986, 'learning_rate': 0.00019657448966046135, 'weight_decay': 0.010127963257331486, 'patience': 7}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=1.033801, val_loss=0.945230


   Epoch 40/200: train_loss=1.001741, val_loss=0.881472


   Epoch 60/200: train_loss=1.013461, val_loss=0.935711


                                                                                                       



Best trial: 2. Best value: 0.810552:  13%|█▎        | 4/30 [00:53<03:36,  8.31s/it, 34.50/1800 seconds]

Best trial: 2. Best value: 0.810552:  13%|█▎        | 4/30 [00:53<03:36,  8.31s/it, 34.50/1800 seconds]

Best trial: 2. Best value: 0.810552:  17%|█▋        | 5/30 [00:53<05:03, 12.13s/it, 34.50/1800 seconds]

Best trial: 2. Best value: 0.810552:  17%|█▋        | 5/30 [00:53<05:03, 12.13s/it, 53.41/1800 seconds]

   Early stopping at epoch 79
[32m[I 2026-02-18 01:39:05,410][0m Trial 4 finished with value: 0.8399424382618496 and parameters: {'window_size': 5, 'model_config': (16, 2), 'n_layers': 1, 'ffn_ratio': 2, 'dropout': 0.2774425485152653, 'mask_ratio': 0.22083223877429237, 'learning_rate': 0.00015019490572374368, 'weight_decay': 0.05167075260023277, 'patience': 15}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=1.033319, val_loss=0.965665


                                                                                                       



Best trial: 2. Best value: 0.810552:  17%|█▋        | 5/30 [01:05<05:03, 12.13s/it, 53.41/1800 seconds]

Best trial: 2. Best value: 0.810552:  17%|█▋        | 5/30 [01:05<05:03, 12.13s/it, 53.41/1800 seconds]

Best trial: 2. Best value: 0.810552:  20%|██        | 6/30 [01:05<04:53, 12.25s/it, 53.41/1800 seconds]

Best trial: 2. Best value: 0.810552:  20%|██        | 6/30 [01:05<04:53, 12.25s/it, 65.88/1800 seconds]

   Early stopping at epoch 37
[32m[I 2026-02-18 01:39:17,882][0m Trial 5 finished with value: 0.8793254154069083 and parameters: {'window_size': 10, 'model_config': (24, 4), 'n_layers': 2, 'ffn_ratio': 3, 'dropout': 0.2511102277086097, 'mask_ratio': 0.18431972482374337, 'learning_rate': 0.0001299297674052037, 'weight_decay': 0.01948729021356848, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=1.009601, val_loss=0.896938


                                                                                                       



Best trial: 2. Best value: 0.810552:  20%|██        | 6/30 [01:12<04:53, 12.25s/it, 65.88/1800 seconds]

Best trial: 2. Best value: 0.810552:  20%|██        | 6/30 [01:12<04:53, 12.25s/it, 65.88/1800 seconds]

Best trial: 2. Best value: 0.810552:  23%|██▎       | 7/30 [01:12<04:00, 10.47s/it, 65.88/1800 seconds]

Best trial: 2. Best value: 0.810552:  23%|██▎       | 7/30 [01:12<04:00, 10.47s/it, 72.68/1800 seconds]

   Early stopping at epoch 28
[32m[I 2026-02-18 01:39:24,684][0m Trial 6 finished with value: 0.8928050994873047 and parameters: {'window_size': 10, 'model_config': (32, 2), 'n_layers': 1, 'ffn_ratio': 3, 'dropout': 0.2636029531844986, 'mask_ratio': 0.2791095874884515, 'learning_rate': 0.00010239273411172726, 'weight_decay': 0.03241509528627273, 'patience': 7}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.941188, val_loss=0.911668


                                                                                                       



Best trial: 2. Best value: 0.810552:  23%|██▎       | 7/30 [01:19<04:00, 10.47s/it, 72.68/1800 seconds]

Best trial: 2. Best value: 0.810552:  23%|██▎       | 7/30 [01:19<04:00, 10.47s/it, 72.68/1800 seconds]

Best trial: 2. Best value: 0.810552:  27%|██▋       | 8/30 [01:19<03:23,  9.24s/it, 72.68/1800 seconds]

Best trial: 2. Best value: 0.810552:  27%|██▋       | 8/30 [01:19<03:23,  9.24s/it, 79.31/1800 seconds]

   Early stopping at epoch 26
[32m[I 2026-02-18 01:39:31,310][0m Trial 7 finished with value: 0.8349365336554391 and parameters: {'window_size': 10, 'model_config': (24, 4), 'n_layers': 1, 'ffn_ratio': 2, 'dropout': 0.10737738947090657, 'mask_ratio': 0.2414346500969845, 'learning_rate': 0.0005527361503993266, 'weight_decay': 0.011258453832483528, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.971242, val_loss=0.942300


                                                                                                       



Best trial: 2. Best value: 0.810552:  27%|██▋       | 8/30 [01:28<03:23,  9.24s/it, 79.31/1800 seconds]

Best trial: 2. Best value: 0.810552:  27%|██▋       | 8/30 [01:28<03:23,  9.24s/it, 79.31/1800 seconds]

Best trial: 2. Best value: 0.810552:  30%|███       | 9/30 [01:28<03:10,  9.09s/it, 79.31/1800 seconds]

Best trial: 2. Best value: 0.810552:  30%|███       | 9/30 [01:28<03:10,  9.09s/it, 88.07/1800 seconds]

   Early stopping at epoch 25
[32m[I 2026-02-18 01:39:40,077][0m Trial 8 finished with value: 0.8683063983917236 and parameters: {'window_size': 15, 'model_config': (24, 2), 'n_layers': 2, 'ffn_ratio': 2, 'dropout': 0.11805795401088166, 'mask_ratio': 0.2752953743383857, 'learning_rate': 0.00029773579612263796, 'weight_decay': 0.015364502781676042, 'patience': 15}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.927969, val_loss=0.855931


                                                                                                       



Best trial: 2. Best value: 0.810552:  30%|███       | 9/30 [01:33<03:10,  9.09s/it, 88.07/1800 seconds]

Best trial: 2. Best value: 0.810552:  30%|███       | 9/30 [01:33<03:10,  9.09s/it, 88.07/1800 seconds]

Best trial: 2. Best value: 0.810552:  33%|███▎      | 10/30 [01:33<02:37,  7.89s/it, 88.07/1800 seconds]

Best trial: 2. Best value: 0.810552:  33%|███▎      | 10/30 [01:33<02:37,  7.89s/it, 93.28/1800 seconds]

   Early stopping at epoch 23
[32m[I 2026-02-18 01:39:45,280][0m Trial 9 finished with value: 0.8169306984969548 and parameters: {'window_size': 20, 'model_config': (32, 2), 'n_layers': 1, 'ffn_ratio': 3, 'dropout': 0.27546787067619616, 'mask_ratio': 0.18869124415727334, 'learning_rate': 0.0009437923703573488, 'weight_decay': 0.06564810589958753, 'patience': 7}. Best is trial 2 with value: 0.8105517285210746.[0m


                                                                                                        



Best trial: 2. Best value: 0.810552:  33%|███▎      | 10/30 [01:37<02:37,  7.89s/it, 93.28/1800 seconds]

Best trial: 2. Best value: 0.810552:  33%|███▎      | 10/30 [01:37<02:37,  7.89s/it, 93.28/1800 seconds]

Best trial: 2. Best value: 0.810552:  37%|███▋      | 11/30 [01:37<02:11,  6.91s/it, 93.28/1800 seconds]

Best trial: 2. Best value: 0.810552:  37%|███▋      | 11/30 [01:37<02:11,  6.91s/it, 97.96/1800 seconds]

   Early stopping at epoch 15
[32m[I 2026-02-18 01:39:49,963][0m Trial 10 finished with value: 0.8414711356163025 and parameters: {'window_size': 5, 'model_config': (32, 4), 'n_layers': 2, 'ffn_ratio': 2, 'dropout': 0.20050624353495927, 'mask_ratio': 0.23653947569997247, 'learning_rate': 0.0027374027426971623, 'weight_decay': 0.041396749389544275, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


                                                                                                        



Best trial: 2. Best value: 0.810552:  37%|███▋      | 11/30 [01:40<02:11,  6.91s/it, 97.96/1800 seconds]

Best trial: 2. Best value: 0.810552:  37%|███▋      | 11/30 [01:40<02:11,  6.91s/it, 97.96/1800 seconds]

Best trial: 2. Best value: 0.810552:  40%|████      | 12/30 [01:40<01:38,  5.49s/it, 97.96/1800 seconds]

Best trial: 2. Best value: 0.810552:  40%|████      | 12/30 [01:40<01:38,  5.49s/it, 100.19/1800 seconds]

   Early stopping at epoch 10
[32m[I 2026-02-18 01:39:52,191][0m Trial 11 finished with value: 0.8830616133553642 and parameters: {'window_size': 20, 'model_config': (24, 2), 'n_layers': 1, 'ffn_ratio': 3, 'dropout': 0.22074762071794393, 'mask_ratio': 0.29886582656785377, 'learning_rate': 0.0011911493102132206, 'weight_decay': 0.09241145820526017, 'patience': 7}. Best is trial 2 with value: 0.8105517285210746.[0m


                                                                                                         



Best trial: 2. Best value: 0.810552:  40%|████      | 12/30 [01:44<01:38,  5.49s/it, 100.19/1800 seconds]

Best trial: 2. Best value: 0.810552:  40%|████      | 12/30 [01:44<01:38,  5.49s/it, 100.19/1800 seconds]

Best trial: 2. Best value: 0.810552:  43%|████▎     | 13/30 [01:44<01:25,  5.05s/it, 100.19/1800 seconds]

Best trial: 2. Best value: 0.810552:  43%|████▎     | 13/30 [01:44<01:25,  5.05s/it, 104.23/1800 seconds]

   Early stopping at epoch 19
[32m[I 2026-02-18 01:39:56,234][0m Trial 12 finished with value: 0.864848792552948 and parameters: {'window_size': 5, 'model_config': (32, 2), 'n_layers': 1, 'ffn_ratio': 2, 'dropout': 0.16134245105421344, 'mask_ratio': 0.20662414280782768, 'learning_rate': 0.0012423428966730084, 'weight_decay': 0.06207651806442917, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


                                                                                                         



Best trial: 2. Best value: 0.810552:  43%|████▎     | 13/30 [01:46<01:25,  5.05s/it, 104.23/1800 seconds]

Best trial: 2. Best value: 0.810552:  43%|████▎     | 13/30 [01:46<01:25,  5.05s/it, 104.23/1800 seconds]

Best trial: 2. Best value: 0.810552:  47%|████▋     | 14/30 [01:46<01:08,  4.26s/it, 104.23/1800 seconds]

Best trial: 2. Best value: 0.810552:  47%|████▋     | 14/30 [01:46<01:08,  4.26s/it, 106.66/1800 seconds]

   Early stopping at epoch 8
[32m[I 2026-02-18 01:39:58,663][0m Trial 13 finished with value: 0.8936977301325116 and parameters: {'window_size': 20, 'model_config': (16, 2), 'n_layers': 2, 'ffn_ratio': 3, 'dropout': 0.23189595634382337, 'mask_ratio': 0.19188389978867815, 'learning_rate': 0.0011468920051395848, 'weight_decay': 0.06450285213659933, 'patience': 7}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.986567, val_loss=0.956859


                                                                                                         



Best trial: 2. Best value: 0.810552:  47%|████▋     | 14/30 [01:54<01:08,  4.26s/it, 106.66/1800 seconds]

Best trial: 2. Best value: 0.810552:  47%|████▋     | 14/30 [01:54<01:08,  4.26s/it, 106.66/1800 seconds]

Best trial: 2. Best value: 0.810552:  50%|█████     | 15/30 [01:54<01:20,  5.40s/it, 106.66/1800 seconds]

Best trial: 2. Best value: 0.810552:  50%|█████     | 15/30 [01:54<01:20,  5.40s/it, 114.70/1800 seconds]

   Early stopping at epoch 37
[32m[I 2026-02-18 01:40:06,705][0m Trial 14 finished with value: 0.8556677103042603 and parameters: {'window_size': 15, 'model_config': (32, 4), 'n_layers': 1, 'ffn_ratio': 3, 'dropout': 0.29979661750343023, 'mask_ratio': 0.15543565321798464, 'learning_rate': 0.0006689802001564378, 'weight_decay': 0.07468959677032493, 'patience': 15}. Best is trial 2 with value: 0.8105517285210746.[0m


                                                                                                         



Best trial: 2. Best value: 0.810552:  50%|█████     | 15/30 [01:58<01:20,  5.40s/it, 114.70/1800 seconds]

Best trial: 2. Best value: 0.810552:  50%|█████     | 15/30 [01:58<01:20,  5.40s/it, 114.70/1800 seconds]

Best trial: 2. Best value: 0.810552:  53%|█████▎    | 16/30 [01:58<01:10,  5.00s/it, 114.70/1800 seconds]

Best trial: 2. Best value: 0.810552:  53%|█████▎    | 16/30 [01:58<01:10,  5.00s/it, 118.78/1800 seconds]

   Early stopping at epoch 13
[32m[I 2026-02-18 01:40:10,782][0m Trial 15 finished with value: 0.8250233956745693 and parameters: {'window_size': 20, 'model_config': (32, 2), 'n_layers': 2, 'ffn_ratio': 2, 'dropout': 0.17030433162664235, 'mask_ratio': 0.2504433992591721, 'learning_rate': 0.0018821242414979503, 'weight_decay': 0.04109567876595779, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


                                                                                                         



Best trial: 2. Best value: 0.810552:  53%|█████▎    | 16/30 [02:02<01:10,  5.00s/it, 118.78/1800 seconds]

Best trial: 2. Best value: 0.810552:  53%|█████▎    | 16/30 [02:02<01:10,  5.00s/it, 118.78/1800 seconds]

Best trial: 2. Best value: 0.810552:  57%|█████▋    | 17/30 [02:02<01:01,  4.72s/it, 118.78/1800 seconds]

Best trial: 2. Best value: 0.810552:  57%|█████▋    | 17/30 [02:02<01:01,  4.72s/it, 122.85/1800 seconds]

   Early stopping at epoch 18
[32m[I 2026-02-18 01:40:14,850][0m Trial 16 finished with value: 0.8503855381693158 and parameters: {'window_size': 5, 'model_config': (24, 2), 'n_layers': 1, 'ffn_ratio': 2, 'dropout': 0.23976017097175045, 'mask_ratio': 0.22194274508017123, 'learning_rate': 0.0008431966570898267, 'weight_decay': 0.024477620362977685, 'patience': 7}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=1.023567, val_loss=0.919587


   Epoch 40/200: train_loss=0.941055, val_loss=0.909008


                                                                                                         



Best trial: 2. Best value: 0.810552:  57%|█████▋    | 17/30 [02:17<01:01,  4.72s/it, 122.85/1800 seconds]

Best trial: 2. Best value: 0.810552:  57%|█████▋    | 17/30 [02:17<01:01,  4.72s/it, 122.85/1800 seconds]

Best trial: 2. Best value: 0.810552:  60%|██████    | 18/30 [02:17<01:31,  7.59s/it, 122.85/1800 seconds]

Best trial: 2. Best value: 0.810552:  60%|██████    | 18/30 [02:17<01:31,  7.59s/it, 137.11/1800 seconds]

   Early stopping at epoch 47
[32m[I 2026-02-18 01:40:29,108][0m Trial 17 finished with value: 0.8448534096990313 and parameters: {'window_size': 5, 'model_config': (24, 2), 'n_layers': 2, 'ffn_ratio': 3, 'dropout': 0.2785851959737444, 'mask_ratio': 0.20660162270208032, 'learning_rate': 0.00038713573695610897, 'weight_decay': 0.049764384033919445, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.940269, val_loss=0.882034


                                                                                                         



Best trial: 2. Best value: 0.810552:  60%|██████    | 18/30 [02:24<01:31,  7.59s/it, 137.11/1800 seconds]

Best trial: 2. Best value: 0.810552:  60%|██████    | 18/30 [02:24<01:31,  7.59s/it, 137.11/1800 seconds]

Best trial: 2. Best value: 0.810552:  63%|██████▎   | 19/30 [02:24<01:21,  7.42s/it, 137.11/1800 seconds]

Best trial: 2. Best value: 0.810552:  63%|██████▎   | 19/30 [02:24<01:21,  7.42s/it, 144.15/1800 seconds]

   Early stopping at epoch 23
[32m[I 2026-02-18 01:40:36,154][0m Trial 18 pruned. [0m


   Epoch 20/200: train_loss=0.913822, val_loss=0.894806


   Epoch 40/200: train_loss=0.889360, val_loss=0.874123


                                                                                                         



Best trial: 2. Best value: 0.810552:  63%|██████▎   | 19/30 [02:35<01:21,  7.42s/it, 144.15/1800 seconds]

Best trial: 2. Best value: 0.810552:  63%|██████▎   | 19/30 [02:35<01:21,  7.42s/it, 144.15/1800 seconds]

Best trial: 2. Best value: 0.810552:  67%|██████▋   | 20/30 [02:35<01:24,  8.50s/it, 144.15/1800 seconds]

Best trial: 2. Best value: 0.810552:  67%|██████▋   | 20/30 [02:35<01:24,  8.50s/it, 155.15/1800 seconds]

   Early stopping at epoch 50
[32m[I 2026-02-18 01:40:47,151][0m Trial 19 finished with value: 0.8430841990879604 and parameters: {'window_size': 15, 'model_config': (16, 2), 'n_layers': 1, 'ffn_ratio': 3, 'dropout': 0.18204408451786194, 'mask_ratio': 0.2555041037399341, 'learning_rate': 0.0017348165810743695, 'weight_decay': 0.09965433220579406, 'patience': 15}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.954296, val_loss=0.888178


                                                                                                         



Best trial: 2. Best value: 0.810552:  67%|██████▋   | 20/30 [02:41<01:24,  8.50s/it, 155.15/1800 seconds]

Best trial: 2. Best value: 0.810552:  67%|██████▋   | 20/30 [02:41<01:24,  8.50s/it, 155.15/1800 seconds]

Best trial: 2. Best value: 0.810552:  70%|███████   | 21/30 [02:41<01:10,  7.83s/it, 155.15/1800 seconds]

Best trial: 2. Best value: 0.810552:  70%|███████   | 21/30 [02:41<01:10,  7.83s/it, 161.43/1800 seconds]

   Early stopping at epoch 29
[32m[I 2026-02-18 01:40:53,427][0m Trial 20 pruned. [0m


   Epoch 20/200: train_loss=0.912612, val_loss=0.924098


                                                                                                         



Best trial: 2. Best value: 0.810552:  70%|███████   | 21/30 [02:48<01:10,  7.83s/it, 161.43/1800 seconds]

Best trial: 2. Best value: 0.810552:  70%|███████   | 21/30 [02:48<01:10,  7.83s/it, 161.43/1800 seconds]

Best trial: 2. Best value: 0.810552:  73%|███████▎  | 22/30 [02:48<01:01,  7.75s/it, 161.43/1800 seconds]

Best trial: 2. Best value: 0.810552:  73%|███████▎  | 22/30 [02:48<01:01,  7.75s/it, 168.99/1800 seconds]

   Early stopping at epoch 25
[32m[I 2026-02-18 01:41:00,989][0m Trial 21 finished with value: 0.8553511244910104 and parameters: {'window_size': 20, 'model_config': (32, 2), 'n_layers': 2, 'ffn_ratio': 2, 'dropout': 0.1698534837998375, 'mask_ratio': 0.24488914699075237, 'learning_rate': 0.001984197401273609, 'weight_decay': 0.04263266581745791, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.964416, val_loss=0.921506


                                                                                                         



Best trial: 2. Best value: 0.810552:  73%|███████▎  | 22/30 [02:56<01:01,  7.75s/it, 168.99/1800 seconds]

Best trial: 2. Best value: 0.810552:  73%|███████▎  | 22/30 [02:56<01:01,  7.75s/it, 168.99/1800 seconds]

Best trial: 2. Best value: 0.810552:  77%|███████▋  | 23/30 [02:56<00:54,  7.78s/it, 168.99/1800 seconds]

Best trial: 2. Best value: 0.810552:  77%|███████▋  | 23/30 [02:56<00:54,  7.78s/it, 176.84/1800 seconds]

   Early stopping at epoch 26
[32m[I 2026-02-18 01:41:08,846][0m Trial 22 pruned. [0m


   Epoch 20/200: train_loss=0.908030, val_loss=0.819794


                                                                                                         



Best trial: 2. Best value: 0.810552:  77%|███████▋  | 23/30 [03:05<00:54,  7.78s/it, 176.84/1800 seconds]

Best trial: 2. Best value: 0.810552:  77%|███████▋  | 23/30 [03:06<00:54,  7.78s/it, 176.84/1800 seconds]

Best trial: 2. Best value: 0.810552:  80%|████████  | 24/30 [03:06<00:49,  8.19s/it, 176.84/1800 seconds]

Best trial: 2. Best value: 0.810552:  80%|████████  | 24/30 [03:06<00:49,  8.19s/it, 186.00/1800 seconds]

   Early stopping at epoch 30
[32m[I 2026-02-18 01:41:18,001][0m Trial 23 finished with value: 0.8197942205837795 and parameters: {'window_size': 20, 'model_config': (32, 2), 'n_layers': 2, 'ffn_ratio': 2, 'dropout': 0.18207939734266204, 'mask_ratio': 0.2842059987893025, 'learning_rate': 0.0029606536994747465, 'weight_decay': 0.037567145465111515, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.894058, val_loss=0.874317


                                                                                                         



Best trial: 2. Best value: 0.810552:  80%|████████  | 24/30 [03:14<00:49,  8.19s/it, 186.00/1800 seconds]

Best trial: 2. Best value: 0.810552:  80%|████████  | 24/30 [03:14<00:49,  8.19s/it, 186.00/1800 seconds]

Best trial: 2. Best value: 0.810552:  83%|████████▎ | 25/30 [03:14<00:41,  8.28s/it, 186.00/1800 seconds]

Best trial: 2. Best value: 0.810552:  83%|████████▎ | 25/30 [03:14<00:41,  8.28s/it, 194.49/1800 seconds]

   Early stopping at epoch 28
[32m[I 2026-02-18 01:41:26,491][0m Trial 24 finished with value: 0.8465127604348319 and parameters: {'window_size': 5, 'model_config': (32, 2), 'n_layers': 2, 'ffn_ratio': 2, 'dropout': 0.18695911311145635, 'mask_ratio': 0.28585076195675124, 'learning_rate': 0.0028370429601712035, 'weight_decay': 0.07234533455482471, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


                                                                                                         



Best trial: 2. Best value: 0.810552:  83%|████████▎ | 25/30 [03:18<00:41,  8.28s/it, 194.49/1800 seconds]

Best trial: 2. Best value: 0.810552:  83%|████████▎ | 25/30 [03:18<00:41,  8.28s/it, 194.49/1800 seconds]

Best trial: 2. Best value: 0.810552:  87%|████████▋ | 26/30 [03:18<00:27,  6.97s/it, 194.49/1800 seconds]

Best trial: 2. Best value: 0.810552:  87%|████████▋ | 26/30 [03:18<00:27,  6.97s/it, 198.40/1800 seconds]

   Early stopping at epoch 13
[32m[I 2026-02-18 01:41:30,399][0m Trial 25 pruned. [0m


                                                                                                         



Best trial: 2. Best value: 0.810552:  87%|████████▋ | 26/30 [03:22<00:27,  6.97s/it, 198.40/1800 seconds]

Best trial: 2. Best value: 0.810552:  87%|████████▋ | 26/30 [03:22<00:27,  6.97s/it, 198.40/1800 seconds]

Best trial: 2. Best value: 0.810552:  90%|█████████ | 27/30 [03:22<00:18,  6.18s/it, 198.40/1800 seconds]

Best trial: 2. Best value: 0.810552:  90%|█████████ | 27/30 [03:22<00:18,  6.18s/it, 202.72/1800 seconds]

   Early stopping at epoch 14
[32m[I 2026-02-18 01:41:34,724][0m Trial 26 pruned. [0m


   Epoch 20/200: train_loss=0.958592, val_loss=0.879660


                                                                                                         



Best trial: 2. Best value: 0.810552:  90%|█████████ | 27/30 [03:29<00:18,  6.18s/it, 202.72/1800 seconds]

Best trial: 2. Best value: 0.810552:  90%|█████████ | 27/30 [03:29<00:18,  6.18s/it, 202.72/1800 seconds]

Best trial: 2. Best value: 0.810552:  93%|█████████▎| 28/30 [03:29<00:12,  6.33s/it, 202.72/1800 seconds]

Best trial: 2. Best value: 0.810552:  93%|█████████▎| 28/30 [03:29<00:12,  6.33s/it, 209.42/1800 seconds]

   Early stopping at epoch 22
[32m[I 2026-02-18 01:41:41,420][0m Trial 27 finished with value: 0.8511484265327454 and parameters: {'window_size': 5, 'model_config': (32, 2), 'n_layers': 2, 'ffn_ratio': 2, 'dropout': 0.26020429305198206, 'mask_ratio': 0.28616317727284757, 'learning_rate': 0.0009545217958209343, 'weight_decay': 0.04775598084163709, 'patience': 10}. Best is trial 2 with value: 0.8105517285210746.[0m


   Epoch 20/200: train_loss=0.920677, val_loss=0.870079


                                                                                                         



Best trial: 2. Best value: 0.810552:  93%|█████████▎| 28/30 [03:35<00:12,  6.33s/it, 209.42/1800 seconds]

Best trial: 2. Best value: 0.810552:  93%|█████████▎| 28/30 [03:35<00:12,  6.33s/it, 209.42/1800 seconds]

Best trial: 2. Best value: 0.810552:  97%|█████████▋| 29/30 [03:35<00:06,  6.26s/it, 209.42/1800 seconds]

Best trial: 2. Best value: 0.810552:  97%|█████████▋| 29/30 [03:35<00:06,  6.26s/it, 215.50/1800 seconds]

   Early stopping at epoch 28
[32m[I 2026-02-18 01:41:47,499][0m Trial 28 finished with value: 0.8360454951013837 and parameters: {'window_size': 15, 'model_config': (24, 2), 'n_layers': 1, 'ffn_ratio': 3, 'dropout': 0.22631669605920351, 'mask_ratio': 0.20606517736038132, 'learning_rate': 0.001483986220706737, 'weight_decay': 0.0830431722412956, 'patience': 15}. Best is trial 2 with value: 0.8105517285210746.[0m


                                                                                                         



Best trial: 2. Best value: 0.810552:  97%|█████████▋| 29/30 [03:38<00:06,  6.26s/it, 215.50/1800 seconds]

Best trial: 2. Best value: 0.810552:  97%|█████████▋| 29/30 [03:38<00:06,  6.26s/it, 215.50/1800 seconds]

Best trial: 2. Best value: 0.810552: 100%|██████████| 30/30 [03:38<00:00,  5.37s/it, 215.50/1800 seconds]

Best trial: 2. Best value: 0.810552: 100%|██████████| 30/30 [03:38<00:00,  5.37s/it, 218.81/1800 seconds]

Best trial: 2. Best value: 0.810552: 100%|██████████| 30/30 [03:38<00:00,  7.29s/it, 218.81/1800 seconds]

   Early stopping at epoch 11
[32m[I 2026-02-18 01:41:50,812][0m Trial 29 finished with value: 0.8453250314508166 and parameters: {'window_size': 20, 'model_config': (32, 2), 'n_layers': 2, 'ffn_ratio': 3, 'dropout': 0.2835231688481889, 'mask_ratio': 0.19272995236447615, 'learning_rate': 0.00216409733116546, 'weight_decay': 0.06366781759826982, 'patience': 7}. Best is trial 2 with value: 0.8105517285210746.[0m

OPTUNA RESULTS
Number of completed trials: 30
Best trial value: 0.810552
Best parameters:
   window_size: 5
   model_config: (24, 2)
   n_layers: 2
   ffn_ratio: 2
   dropout: 0.2939169255529117
   mask_ratio: 0.26626992350416717
   learning_rate: 0.002442046084491142
   weight_decay: 0.07849235338159358
   patience: 10

[OK] HPO complete. Best validation loss: 0.810552





## Cell 9: Final Training with Best Parameters

In [9]:
print("\n" + "="*80)
print("FINAL TRAINING WITH BEST PARAMETERS")
print("="*80)

# Extract best hyperparameters
window_size = best_params['window_size']
d_model, n_heads = best_params['model_config']
n_layers = best_params['n_layers']
ffn_ratio = best_params['ffn_ratio']
dropout = best_params['dropout']
mask_ratio = best_params['mask_ratio']
learning_rate = best_params['learning_rate']
weight_decay = best_params['weight_decay']
patience = best_params['patience']

# Create windows for train+val combined (for final training)
train_val_data = pd.concat([train_data, val_data])
train_val_windows, train_val_dates = create_windows(train_val_data, window_size)

# Create windows for validation (to monitor training)
val_windows, val_dates = create_windows(val_data, window_size)

# Create dataloaders
train_val_dataset = WindowDataset(train_val_windows)
val_dataset = WindowDataset(val_windows)

train_val_loader = DataLoader(train_val_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Build final model
final_model = TemporalContextTransformer(
    input_dim=14,
    d_model=d_model,
    n_heads=n_heads,
    n_layers=n_layers,
    ffn_ratio=ffn_ratio,
    dropout=dropout,
    max_seq_len=20
)

# Count parameters
n_params = sum(p.numel() for p in final_model.parameters())
print(f"\nFinal model parameters: {n_params:,}")
print(f"Target range: 3,000-10,000")
print(f"Status: {'✓ OK' if 3000 <= n_params <= 10000 else '⚠ WARNING'}")

# Train final model
final_config = {
    'learning_rate': learning_rate,
    'weight_decay': weight_decay,
    'patience': patience,
    'mask_ratio': mask_ratio,
    'max_epochs': 200,
    'batch_size': 64
}

final_model, final_metrics = train_model(final_model, train_val_loader, val_loader, final_config)

print("\n" + "="*80)
print("FINAL TRAINING METRICS")
print("="*80)
for key, value in final_metrics.items():
    print(f"{key}: {value}")

# Move model to CPU for inference
final_model = final_model.cpu()
final_model.eval()

print("\n[OK] Final training complete.")


FINAL TRAINING WITH BEST PARAMETERS

Final model parameters: 11,007
Target range: 3,000-10,000


   Epoch 20/200: train_loss=0.891611, val_loss=0.883879


   Early stopping at epoch 27

FINAL TRAINING METRICS
train_loss: 0.8963682187928094
val_loss: 0.8526514172554016
overfit_ratio: 0.9512289696176348
epochs_trained: 27

[OK] Final training complete.


## Cell 10: Generate Submodel Output for Full Dataset

In [10]:
print("\n" + "="*80)
print("GENERATING SUBMODEL OUTPUT")
print("="*80)

# Create windows for full dataset
full_windows, full_dates = create_windows(full_data, window_size)

print(f"Full windows: {full_windows.shape}")
print(f"Date range: {full_dates[0]} to {full_dates[-1]}")

# Extract context scores
final_model.eval()
with torch.no_grad():
    context_scores = []
    batch_size = 256
    
    for i in range(0, len(full_windows), batch_size):
        batch = full_windows[i:i+batch_size]
        scores = final_model.extract(batch)
        context_scores.append(scores.cpu().numpy())
    
    context_scores = np.concatenate(context_scores, axis=0).flatten()

# Create output DataFrame
output = pd.DataFrame({
    'date': full_dates,
    'temporal_context_score': context_scores
})

print(f"\nOutput shape: {output.shape}")
print(f"Output columns: {list(output.columns)}")
print(f"\nOutput statistics:")
print(output['temporal_context_score'].describe())

# Check for issues
n_nan = output['temporal_context_score'].isna().sum()
n_inf = np.isinf(output['temporal_context_score']).sum()
is_constant = output['temporal_context_score'].std() < 1e-10

print(f"\nQuality checks:")
print(f"   NaN values: {n_nan} {'✓ OK' if n_nan == 0 else '✗ FAIL'}")
print(f"   Inf values: {n_inf} {'✓ OK' if n_inf == 0 else '✗ FAIL'}")
print(f"   Constant output: {'✗ FAIL' if is_constant else '✓ OK'}")
print(f"   Range [0,1]: {'✓ OK' if output['temporal_context_score'].min() >= 0 and output['temporal_context_score'].max() <= 1 else '✗ FAIL'}")

print("\n[OK] Submodel output generated.")


GENERATING SUBMODEL OUTPUT
Full windows: torch.Size([2712, 5, 14])
Date range: 2015-02-05 00:00:00 to 2026-02-12 00:00:00

Output shape: (2712, 2)
Output columns: ['date', 'temporal_context_score']

Output statistics:
count    2712.000000
mean        0.196092
std         0.302998
min         0.000920
25%         0.005154
50%         0.023000
75%         0.289475
max         1.000000
Name: temporal_context_score, dtype: float64

Quality checks:
   NaN values: 0 ✓ OK
   Inf values: 0 ✓ OK
   Constant output: ✓ OK
   Range [0,1]: ✓ OK

[OK] Submodel output generated.


## Cell 11: Save Results

In [11]:
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

# Save submodel output CSV
output.to_csv("submodel_output.csv", index=False)
print("[OK] Saved: submodel_output.csv")

# Save model
torch.save({
    'model_state': final_model.state_dict(),
    'config': {
        'd_model': d_model,
        'n_heads': n_heads,
        'n_layers': n_layers,
        'ffn_ratio': ffn_ratio,
        'dropout': dropout,
        'window_size': window_size
    }
}, "model.pt")
print("[OK] Saved: model.pt")

# Save training result JSON
result = {
    "feature": "temporal_context",
    "attempt": 1,
    "timestamp": datetime.now().isoformat(),
    "best_params": best_params,
    "metrics": final_metrics,
    "optuna_trials_completed": n_completed,
    "optuna_best_value": best_value,
    "model_param_count": n_params,
    "output_shape": list(output.shape),
    "output_columns": list(output.columns),
    "output_statistics": {
        "mean": float(output['temporal_context_score'].mean()),
        "std": float(output['temporal_context_score'].std()),
        "min": float(output['temporal_context_score'].min()),
        "max": float(output['temporal_context_score'].max()),
        "median": float(output['temporal_context_score'].median())
    },
    "data_info": {
        "train_samples": len(train_data),
        "val_samples": len(val_data),
        "test_samples": len(test_data),
        "full_samples": len(full_data),
        "window_size": window_size,
        "windowed_samples": len(output)
    },
    "quality_checks": {
        "nan_count": int(n_nan),
        "inf_count": int(n_inf),
        "is_constant": bool(is_constant),
        "in_range_0_1": bool(output['temporal_context_score'].min() >= 0 and output['temporal_context_score'].max() <= 1)
    }
}

with open("training_result.json", "w") as f:
    json.dump(result, f, indent=2, default=str)

print("[OK] Saved: training_result.json")

print("\n" + "="*80)
print("TRAINING COMPLETE")
print("="*80)
print(f"Finished: {datetime.now().isoformat()}")
print(f"\nOutputs:")
print(f"  1. submodel_output.csv ({len(output)} rows)")
print(f"  2. model.pt ({n_params:,} parameters)")
print(f"  3. training_result.json")
print("\nReady for evaluator.")


SAVING RESULTS
[OK] Saved: submodel_output.csv
[OK] Saved: model.pt
[OK] Saved: training_result.json

TRAINING COMPLETE
Finished: 2026-02-18T01:42:00.635531

Outputs:
  1. submodel_output.csv (2712 rows)
  2. model.pt (11,007 parameters)
  3. training_result.json

Ready for evaluator.
