# Gold Prediction SubModel Training - Temporal Context Transformer
## Attempt 1

**Generated by**: builder_model agent  
**Architecture**: Asymmetric Transformer Autoencoder with Masked Reconstruction  
**Input**: 14 features (5 base + 9 submodel outputs)  
**Output**: 1 column (temporal_context_score, 0-1)  
**Target params**: ~6,200 (3K-10K range)  
**Window size**: 5-20 days (Optuna search)  

Self-contained: Data fetch → Preprocessing → Training → Evaluation → Save results

## Cell 1: Imports and Setup

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: seed every random number generator this notebook draws from.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device; the CUDA generator is only seeded when a GPU exists.
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Run banner, emitted as one write so the four lines always appear together.
print("\n".join([
    "=== Gold SubModel Training: temporal_context attempt 1 ===",
    f"Started: {datetime.now().isoformat()}",
    f"Device: {device}",
    f"PyTorch version: {torch.__version__}",
]))

## Cell 2: Data Fetching (API-based, self-contained)

In [None]:
def _load_submodel_csv(path, date_col, columns, truncate_date=False):
    """
    Load one submodel output CSV and return the requested columns.

    Args:
        path: CSV file path.
        date_col: name of the date column in that file ('date' or 'Date').
        columns: list of feature columns to keep.
        truncate_date: when True, keep only the first 10 characters
            (YYYY-MM-DD) of each date string before parsing, which drops any
            time / timezone suffix.

    Returns:
        DataFrame indexed by parsed date, sorted ascending, holding `columns`.
    """
    frame = pd.read_csv(path)
    raw_dates = frame[date_col]
    if truncate_date:
        raw_dates = raw_dates.str[:10]
    frame[date_col] = pd.to_datetime(raw_dates)
    return frame.set_index(date_col).sort_index()[columns].copy()


def fetch_data():
    """
    Fetch and prepare data for temporal context transformer.

    Steps: download five FRED series, difference them (VIX is kept in levels),
    inner-join with the nine submodel output columns, split chronologically
    70/15/15 and standardize with statistics fitted on the train split only.

    Returns: (train_df, val_df, test_df, full_df, scaler)

    Raises:
        RuntimeError: if the FRED_API_KEY environment variable is not set.
        AssertionError: if the merged frame does not have exactly 14 columns.
    """
    print("\n" + "="*80)
    print("DATA FETCHING")
    print("="*80)

    # ===== 1. Import data APIs =====
    # (The previous unused `yfinance` import was dropped: nothing below calls
    # it, and it made the cell fail wherever yfinance is not installed.)
    try:
        from fredapi import Fred
    except ImportError:
        import subprocess
        import sys
        print("Installing fredapi...")
        # Use this interpreter's pip so the package lands in the running env.
        subprocess.run([sys.executable, "-m", "pip", "install", "fredapi"], check=True)
        from fredapi import Fred

    # Get FRED API key from environment variable. Credentials must never be
    # committed in the notebook; any key previously embedded here should be
    # treated as leaked and rotated.
    fred_api_key = os.environ.get("FRED_API_KEY")
    if not fred_api_key:
        raise RuntimeError(
            "FRED_API_KEY environment variable is not set; "
            "export it (or add it as a notebook secret) before running this cell."
        )
    fred = Fred(api_key=fred_api_key)

    # ===== 2. Fetch base features (FRED + Yahoo) =====
    print("\n[1/5] Fetching base features from FRED and Yahoo...")

    observation_start = '2015-01-01'

    def fred_series(series_id):
        # Every base series shares the same history start.
        return fred.get_series(series_id, observation_start=observation_start)

    # Real interest rate (10Y TIPS)
    real_rate_df = pd.DataFrame({'real_rate_real_rate': fred_series('DFII10')})
    print(f"   Real rate: {len(real_rate_df)} rows")

    # DXY (Dollar Index) - using FRED for stability
    # NOTE(review): DTWEXBGS is FRED's broad dollar index, used here as a
    # stand-in for DXY -- confirm this proxy is intended.
    dxy_df = pd.DataFrame({'dxy_dxy': fred_series('DTWEXBGS')})
    print(f"   DXY: {len(dxy_df)} rows")

    # VIX
    vix_df = pd.DataFrame({'vix_vix': fred_series('VIXCLS')})
    print(f"   VIX: {len(vix_df)} rows")

    # Yield spread (10Y - 2Y)
    yield_spread = fred_series('DGS10') - fred_series('DGS2')
    yield_spread_df = pd.DataFrame({'yield_curve_yield_spread': yield_spread})
    print(f"   Yield spread: {len(yield_spread_df)} rows")

    # Inflation expectation (10Y Breakeven)
    inflation_exp_df = pd.DataFrame(
        {'inflation_expectation_inflation_expectation': fred_series('T10YIE')}
    )
    print(f"   Inflation expectation: {len(inflation_exp_df)} rows")

    # Merge base features on the union of their dates
    base_df = real_rate_df
    for frame in (dxy_df, vix_df, yield_spread_df, inflation_exp_df):
        base_df = base_df.join(frame, how='outer')
    base_df = base_df.sort_index()

    # Forward fill and backward fill (max 5 days)
    base_df = base_df.ffill(limit=5)
    base_df = base_df.bfill(limit=5)
    base_df = base_df.dropna()

    print(f"   Base features merged: {len(base_df)} rows, {base_df.shape[1]} columns")

    # ===== 3. Transform base features =====
    print("\n[2/5] Transforming base features (diff)...")

    base_features = pd.DataFrame(index=base_df.index)
    base_features['real_rate_change'] = base_df['real_rate_real_rate'].diff()
    base_features['dxy_change'] = base_df['dxy_dxy'].diff()
    base_features['vix'] = base_df['vix_vix']  # No transformation (already stationary)
    base_features['yield_spread_change'] = base_df['yield_curve_yield_spread'].diff()
    base_features['inflation_exp_change'] = base_df['inflation_expectation_inflation_expectation'].diff()

    print(f"   Base features transformed: {list(base_features.columns)}")

    # ===== 4. Load submodel outputs from Kaggle Dataset =====
    print("\n[3/5] Loading submodel outputs from Kaggle Dataset...")

    # In the Kaggle environment the dataset is mounted under /kaggle/input/;
    # anywhere else fall back to the local data directory. (The earlier
    # try/except around a plain string assignment could never raise, so the
    # local fallback was unreachable.)
    kaggle_path = "/kaggle/input/gold-prediction-submodels/"
    local_path = "../data/submodel_outputs/"
    submodel_path = kaggle_path if os.path.isdir(kaggle_path) else local_path

    # VIX submodel (2 columns)
    vix_features = _load_submodel_csv(
        submodel_path + "vix.csv", 'date',
        ['vix_regime_probability', 'vix_mean_reversion_z'],
    )
    print(f"   VIX: {len(vix_features)} rows")

    # Technical submodel (3 columns) - handle timezone
    tech_features = _load_submodel_csv(
        submodel_path + "technical.csv", 'date',
        ['tech_trend_regime_prob', 'tech_mean_reversion_z', 'tech_volatility_regime'],
        truncate_date=True,
    )
    print(f"   Technical: {len(tech_features)} rows")

    # Cross-asset submodel (2 columns)
    xasset_features = _load_submodel_csv(
        submodel_path + "cross_asset.csv", 'Date',
        ['xasset_regime_prob', 'xasset_divergence'],
    )
    print(f"   Cross-asset: {len(xasset_features)} rows")

    # ETF flow submodel (1 column)
    etf_features = _load_submodel_csv(
        submodel_path + "etf_flow.csv", 'Date', ['etf_regime_prob'],
    )
    print(f"   ETF flow: {len(etf_features)} rows")

    # Options market submodel (1 column) - handle timezone
    options_features = _load_submodel_csv(
        submodel_path + "options_market.csv", 'Date',
        ['options_risk_regime_prob'],
        truncate_date=True,
    )
    print(f"   Options: {len(options_features)} rows")

    # ===== 5. Merge all features =====
    print("\n[4/5] Merging all features...")

    # Inner joins keep only dates present in every source.
    merged_df = base_features.copy()
    for features in (vix_features, tech_features, xasset_features,
                     etf_features, options_features):
        merged_df = merged_df.join(features, how='inner')

    # Handle NaN values
    merged_df = merged_df.ffill(limit=5)
    # NOTE(review): this unlimited bfill copies later observations backwards
    # (e.g. into the first diff() row), which is a small lookahead. Kept as-is
    # to preserve row counts; consider dropping leading NaN rows instead.
    merged_df = merged_df.bfill()
    merged_df = merged_df.dropna()

    # Remove infinite values
    inf_mask = np.isinf(merged_df.values).any(axis=1)
    if inf_mask.any():
        merged_df = merged_df[~inf_mask]

    print(f"   Final merged: {len(merged_df)} rows, {merged_df.shape[1]} columns")
    print(f"   Date range: {merged_df.index.min()} to {merged_df.index.max()}")

    # Verify we have exactly 14 columns
    assert merged_df.shape[1] == 14, f"Expected 14 columns, got {merged_df.shape[1]}"

    # ===== 6. Time-series split (70/15/15) =====
    print("\n[5/5] Splitting data (70/15/15)...")

    # Chronological split: no shuffling, later rows are never seen in training.
    n = len(merged_df)
    train_end = int(n * 0.70)
    val_end = int(n * 0.85)

    train_df = merged_df.iloc[:train_end].copy()
    val_df = merged_df.iloc[train_end:val_end].copy()
    test_df = merged_df.iloc[val_end:].copy()

    print(f"   Train: {len(train_df)} rows ({train_df.index.min()} to {train_df.index.max()})")
    print(f"   Val:   {len(val_df)} rows ({val_df.index.min()} to {val_df.index.max()})")
    print(f"   Test:  {len(test_df)} rows ({test_df.index.min()} to {test_df.index.max()})")

    # ===== 7. Standardization =====
    print("\nStandardizing features...")

    # Fit on train only so val/test statistics do not leak into the scaler.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_df)
    val_scaled = scaler.transform(val_df)
    test_scaled = scaler.transform(test_df)
    full_scaled = scaler.transform(merged_df)

    # Convert back to DataFrame
    train_df = pd.DataFrame(train_scaled, index=train_df.index, columns=train_df.columns)
    val_df = pd.DataFrame(val_scaled, index=val_df.index, columns=val_df.columns)
    test_df = pd.DataFrame(test_scaled, index=test_df.index, columns=test_df.columns)
    full_df = pd.DataFrame(full_scaled, index=merged_df.index, columns=merged_df.columns)

    print("   [OK] Features standardized using train set statistics")

    print("\n" + "="*80)
    print("DATA FETCHING COMPLETE")
    print("="*80)

    return train_df, val_df, test_df, full_df, scaler

# Build the standardized splits once; every later cell reads these globals.
train_data, val_data, test_data, full_data, scaler = fetch_data()
print(
    "\nData ready: "
    f"train={len(train_data)}, val={len(val_data)}, "
    f"test={len(test_data)}, full={len(full_data)}"
)

## Cell 3: Windowing Function

In [None]:
def create_windows(data, window_size):
    """
    Create sliding windows (stride 1) from time-series data.

    Args:
        data: DataFrame (N, F) with date index (F is 14 in this notebook)
        window_size: int >= 1, number of time steps per window

    Returns:
        windows: float32 tensor (N-W+1, W, F); window k covers rows k..k+W-1
        dates: list with the index label of the LAST row of each window

    Raises:
        ValueError: if window_size < 1 or data has fewer than window_size rows.
    """
    # A non-positive size used to yield empty windows with misaligned dates.
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    n_samples = len(data)
    if n_samples < window_size:
        raise ValueError(f"Not enough samples ({n_samples}) for window size {window_size}")

    values = torch.as_tensor(data.values, dtype=torch.float32)

    # unfold() slides along the time axis and appends the window axis last:
    # (N, F) -> (N-W+1, F, W). Move it next to the batch axis to get
    # (N-W+1, W, F), then clone so the result owns contiguous memory and never
    # aliases the DataFrame's buffer.
    windows_tensor = (
        values.unfold(0, window_size, 1)
        .movedim(-1, 1)
        .clone(memory_format=torch.contiguous_format)
    )

    # Each window is labelled with its end date.
    dates = list(data.index[window_size - 1:])

    return windows_tensor, dates

print("Windowing function defined.")

## Cell 4: PyTorch Dataset

In [None]:
class WindowDataset(Dataset):
    """
    Map-style Dataset that serves pre-built time-series windows unchanged.

    There are no labels: each item is a single window.
    """

    def __init__(self, windows):
        """
        Args:
            windows: tensor (N, W, 14) of windows
        """
        # Stored by reference; no copy is made.
        self.windows = windows

    def __getitem__(self, idx):
        """Return the window stored at position `idx`."""
        sample = self.windows[idx]
        return sample

    def __len__(self):
        """Return the number of windows held."""
        n_windows = len(self.windows)
        return n_windows

print("WindowDataset class defined.")

## Cell 5: Model Definition - Temporal Context Transformer

In [None]:
class TemporalContextTransformer(nn.Module):
    """
    Asymmetric Transformer Autoencoder for temporal context extraction.

    Architecture:
      Input (batch, seq, 14)
        -> Input Projection (14 -> d_model)
        -> Learned Positional Encoding
        -> TransformerEncoder (L layers, H heads)
        -> Mean Pool over time
        -> Bottleneck Linear (d_model -> 1)
        -> Sigmoid -> context_score (0-1)

      Reconstruction branch (used by forward() for the masked loss):
        -> Bottleneck (1) -> Expand (d_model)
        -> Repeat to seq_len
        -> Output Projection (d_model -> 14)

    Args:
        input_dim: number of features per time step.
        d_model: encoder width.
        n_heads: attention heads per encoder layer.
        n_layers: number of encoder layers.
        ffn_ratio: feed-forward width as a multiple of d_model.
        dropout: dropout rate for the encoder layers and the projected input.
        max_seq_len: longest sequence the positional table can address.
    """

    def __init__(self, input_dim=14, d_model=24, n_heads=2, n_layers=1,
                 ffn_ratio=2, dropout=0.2, max_seq_len=20):
        super().__init__()

        # Input projection
        self.input_proj = nn.Linear(input_dim, d_model)

        # Learned positional encoding (one vector per position < max_seq_len)
        self.pos_encoding = nn.Embedding(max_seq_len, d_model)

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * ffn_ratio,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True  # Pre-Norm for stability
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # Bottleneck: squeeze the pooled representation to one value in (0, 1)
        self.bottleneck = nn.Sequential(
            nn.Linear(d_model, 1),
            nn.Sigmoid()
        )

        # Lightweight Decoder (reconstruction branch)
        self.decoder_expand = nn.Linear(1, d_model)
        self.decoder_output = nn.Linear(d_model, input_dim)

        # Dropout for input
        self.input_dropout = nn.Dropout(dropout)

    def encode(self, x):
        """
        x: (batch, seq_len, input_dim)
        Returns: context_score (batch, 1), pooled (batch, d_model)

        Raises:
            ValueError: if seq_len exceeds the positional table size.
        """
        _, seq_len, _ = x.shape

        # Fail early with a readable message instead of an opaque embedding
        # index error when the window is longer than the positional table.
        max_seq_len = self.pos_encoding.num_embeddings
        if seq_len > max_seq_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {max_seq_len}"
            )

        # Input projection
        h = self.input_proj(x)  # (batch, seq, d_model)

        # Add positional encoding (broadcast over the batch axis)
        positions = torch.arange(seq_len, device=x.device)
        h = h + self.pos_encoding(positions).unsqueeze(0)

        # Apply input dropout
        h = self.input_dropout(h)

        # Transformer encoder
        encoded = self.encoder(h)  # (batch, seq, d_model)

        # Mean pool over time
        pooled = encoded.mean(dim=1)  # (batch, d_model)

        # Bottleneck -> context score
        context_score = self.bottleneck(pooled)  # (batch, 1)

        return context_score, pooled

    def decode(self, context_score, seq_len):
        """
        Reconstruct from bottleneck for masked reconstruction loss.
        context_score: (batch, 1)
        Returns: (batch, seq_len, input_dim)

        Every time step receives the same decoder input, so the
        reconstruction is identical across the sequence axis.
        """
        # Expand bottleneck
        expanded = self.decoder_expand(context_score)  # (batch, d_model)

        # Broadcast to sequence length; expand() is a view, so unlike repeat()
        # it does not materialise seq_len copies.
        expanded = expanded.unsqueeze(1).expand(-1, seq_len, -1)  # (batch, seq, d_model)

        # Output projection
        reconstructed = self.decoder_output(expanded)  # (batch, seq, input_dim)

        return reconstructed

    def forward(self, x, mask_ratio=0.2):
        """
        Forward pass with masked reconstruction.
        x: (batch, seq_len, input_dim) -- not modified in place
        mask_ratio: per-step probability of masking a time step
        Returns: context_score, reconstructed, mask
                 (mask is (batch, seq_len) bool, True where masked)
        """
        batch_size, seq_len, _ = x.shape

        # Create random mask (mask time steps, not features)
        mask = torch.rand(batch_size, seq_len, device=x.device) < mask_ratio
        # Ensure at least 1 step is masked and 1 is unmasked. This is possible
        # for any seq_len >= 2 (the previous `> 2` check skipped seq_len == 2,
        # which could leave nothing masked and give a zero loss).
        mask[:, 0] = False  # Keep first step
        if seq_len > 1:
            mask[:, -1] = True   # Always mask last step

        # Apply mask (zero out masked positions) on a copy of the input
        x_masked = x.clone()
        x_masked[mask] = 0.0

        # Encode
        context_score, _ = self.encode(x_masked)

        # Decode (reconstruct)
        reconstructed = self.decode(context_score, seq_len)

        return context_score, reconstructed, mask

    def extract(self, x):
        """
        Extract context score for inference (no masking).
        x: (batch, seq_len, input_dim)
        Returns: context_score (batch, 1)
        """
        context_score, _ = self.encode(x)
        return context_score

# Test model instantiation
test_model = TemporalContextTransformer()
n_params = sum(p.numel() for p in test_model.parameters())
print(f"\nModel defined. Test model parameters: {n_params:,}")
print(f"Expected range: 3,000-10,000 params")
del test_model

## Cell 6: Loss Function and Training Loop

In [None]:
def masked_reconstruction_loss(original, reconstructed, mask):
    """
    Mean squared error restricted to the masked time steps.

    original:      (batch, seq_len, 14)
    reconstructed: (batch, seq_len, 14)
    mask:          (batch, seq_len) -- True where masked

    Returns a scalar tensor. When no step is masked the result is a fresh
    zero tensor (requires_grad=True) on the input's device.
    """
    squared_error = (original - reconstructed) ** 2

    # Broadcast the per-step mask across the feature axis -> (batch, seq, 14)
    feature_mask = mask.unsqueeze(-1).expand_as(original)
    selected = squared_error[feature_mask]

    if selected.numel() > 0:
        return selected.mean()

    # Nothing was masked in this batch.
    return torch.tensor(0.0, requires_grad=True, device=original.device)


def train_model(model, train_loader, val_loader, config):
    """
    Train the model with early stopping on the masked reconstruction loss.

    Args:
        model: module whose forward(batch, mask_ratio=...) returns
            (context_score, reconstructed, mask).
        train_loader: DataLoader yielding training batches.
        val_loader: DataLoader yielding validation batches.
        config: dict; 'learning_rate' is required. Optional keys and defaults:
            'weight_decay' (0.01), 'max_epochs' (200), 'patience' (10),
            'mask_ratio' (0.2).

    Returns: model, metrics_dict
        model carries the weights of the best-validation epoch (if any epoch
        improved). metrics_dict has 'train_loss', 'val_loss' (both taken at
        the best epoch), 'overfit_ratio' (val / train) and 'epochs_trained'.

    Raises:
        ValueError: if either loader yields no batches.
    """
    n_train_batches = len(train_loader)
    n_val_batches = len(val_loader)
    # Guard the per-epoch averages below against division by zero.
    if n_train_batches == 0 or n_val_batches == 0:
        raise ValueError(
            "train_loader and val_loader must each yield at least one batch "
            f"(got {n_train_batches} and {n_val_batches})"
        )

    model = model.to(device)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config.get('weight_decay', 0.01),
        betas=(0.9, 0.999),
        eps=1e-8
    )

    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=20, T_mult=2
    )

    best_val_loss = float('inf')
    best_train_loss = float('inf')
    patience_counter = 0
    best_state = None
    # Tracked explicitly so the metrics are defined even when max_epochs == 0
    # (the loop variable would be unbound in that case).
    epochs_trained = 0

    max_epochs = config.get('max_epochs', 200)
    patience = config.get('patience', 10)
    mask_ratio = config.get('mask_ratio', 0.2)

    for epoch in range(max_epochs):
        epochs_trained = epoch + 1

        # Train
        model.train()
        train_loss = 0
        for batch in train_loader:
            batch = batch.to(device)

            optimizer.zero_grad()
            _, reconstructed, mask = model(batch, mask_ratio=mask_ratio)
            loss = masked_reconstruction_loss(batch, reconstructed, mask)

            loss.backward()
            # Gradient clipping
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()

        train_loss /= n_train_batches

        # Validate. The model still draws a random mask here, so the
        # validation loss varies slightly from epoch to epoch.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                _, reconstructed, mask = model(batch, mask_ratio=mask_ratio)
                loss = masked_reconstruction_loss(batch, reconstructed, mask)
                val_loss += loss.item()

        val_loss /= n_val_batches

        # Step scheduler (once per epoch)
        scheduler.step()

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_train_loss = train_loss
            patience_counter = 0
            # Snapshot on CPU so the copy does not hold accelerator memory.
            best_state = {k: v.clone().cpu() for k, v in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"   Early stopping at epoch {epoch+1}")
                break

        if (epoch + 1) % 20 == 0:
            print(f"   Epoch {epoch+1}/{max_epochs}: train_loss={train_loss:.6f}, val_loss={val_loss:.6f}")

    # Restore best weights (None means no epoch ever improved)
    if best_state is not None:
        model.load_state_dict(best_state)

    # Small epsilon keeps the ratio finite when the train loss is exactly 0.
    overfit_ratio = best_val_loss / (best_train_loss + 1e-10)

    metrics = {
        'train_loss': best_train_loss,
        'val_loss': best_val_loss,
        'overfit_ratio': overfit_ratio,
        'epochs_trained': epochs_trained
    }

    return model, metrics

print("Training functions defined.")

## Cell 7: Optuna HPO

In [None]:
def run_hpo(train_data, val_data, n_trials=30, timeout=1800):
    """
    Run hyperparameter optimization using Optuna.

    Each trial windows the data, trains a fresh TemporalContextTransformer
    with early stopping and is scored by its best validation loss plus a
    penalty when validation loss exceeds 1.5x the training loss.

    Args:
        train_data: standardized training DataFrame.
        val_data: standardized validation DataFrame.
        n_trials: maximum number of trials.
        timeout: wall-clock budget in seconds.

    Returns: best_params, best_value, n_completed_trials
        n_completed_trials counts only trials that finished with a value.
    """
    print("\n" + "="*80)
    print("HYPERPARAMETER OPTIMIZATION (OPTUNA)")
    print("="*80)

    def objective(trial):
        # Sample hyperparameters
        window_size = trial.suggest_categorical('window_size', [5, 10, 15, 20])

        # Flatten parameter space: (d_model, n_heads) combinations, so every
        # sampled pair has d_model divisible by n_heads.
        # NOTE(review): Optuna documents categorical choices as None/bool/int/
        # float/str; tuples work with the in-memory storage used here but
        # cannot be persisted. Kept because downstream code unpacks the tuple.
        model_config = trial.suggest_categorical('model_config', [
            (16, 2),
            (24, 2),
            (24, 4),
            (32, 2),
            (32, 4)
        ])
        d_model, n_heads = model_config

        n_layers = trial.suggest_int('n_layers', 1, 2)
        ffn_ratio = trial.suggest_categorical('ffn_ratio', [2, 3])
        dropout = trial.suggest_float('dropout', 0.1, 0.3)
        mask_ratio = trial.suggest_float('mask_ratio', 0.15, 0.30)
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 3e-3, log=True)
        weight_decay = trial.suggest_float('weight_decay', 0.01, 0.1, log=True)
        patience = trial.suggest_categorical('patience', [7, 10, 15])

        config = {
            'window_size': window_size,
            'd_model': d_model,
            'n_heads': n_heads,
            'n_layers': n_layers,
            'ffn_ratio': ffn_ratio,
            'dropout': dropout,
            'mask_ratio': mask_ratio,
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'patience': patience,
            'max_epochs': 200,
            'batch_size': 64
        }

        # Create windows
        try:
            train_windows, _ = create_windows(train_data, window_size)
            val_windows, _ = create_windows(val_data, window_size)
        except ValueError as e:
            # A split shorter than the window: score the trial as worst.
            print(f"   Trial failed: {e}")
            return float('inf')

        # Create dataloaders
        train_dataset = WindowDataset(train_windows)
        val_dataset = WindowDataset(val_windows)

        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=False)
        val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)

        # Build model
        model = TemporalContextTransformer(
            input_dim=14,
            d_model=d_model,
            n_heads=n_heads,
            n_layers=n_layers,
            ffn_ratio=ffn_ratio,
            dropout=dropout,
            max_seq_len=20
        )

        # Train
        model, metrics = train_model(model, train_loader, val_loader, config)

        val_loss = metrics['val_loss']
        overfit_ratio = metrics['overfit_ratio']

        # No pruning here: the trial has already trained to completion, so
        # calling report()/should_prune() at this point saved no compute and
        # only discarded finished results. Real pruning needs the training
        # loop to report an intermediate value every epoch.

        # Penalize overfitting
        penalty = max(0, overfit_ratio - 1.5) * 0.1

        return val_loss + penalty

    # Run optimization
    study = optuna.create_study(
        direction='minimize',
        sampler=TPESampler(seed=SEED),
        # Inactive until the objective reports intermediate values.
        pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10)
    )

    study.optimize(objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)

    # Count only trials that actually finished with an objective value.
    n_completed = sum(
        1 for t in study.trials
        if t.state == optuna.trial.TrialState.COMPLETE
    )

    print("\n" + "="*80)
    print("OPTUNA RESULTS")
    print("="*80)
    print(f"Number of completed trials: {n_completed}")
    print(f"Best trial value: {study.best_value:.6f}")
    print(f"Best parameters:")
    for key, value in study.best_params.items():
        print(f"   {key}: {value}")

    return study.best_params, study.best_value, n_completed

print("HPO function defined.")

## Cell 8: Run HPO

In [None]:
# Search budget: at most 30 trials or 30 minutes, whichever is hit first.
hpo_budget = {'n_trials': 30, 'timeout': 1800}

# Run hyperparameter optimization on the train/validation splits
best_params, best_value, n_completed = run_hpo(train_data, val_data, **hpo_budget)

print(f"\n[OK] HPO complete. Best validation loss: {best_value:.6f}")

## Cell 9: Final Training with Best Parameters

In [None]:
print(f"\n{'=' * 80}")
print("FINAL TRAINING WITH BEST PARAMETERS")
print("=" * 80)

# Unpack the winning hyperparameters into the globals later cells read.
d_model, n_heads = best_params['model_config']
window_size = best_params['window_size']
n_layers = best_params['n_layers']
ffn_ratio = best_params['ffn_ratio']
dropout = best_params['dropout']
mask_ratio = best_params['mask_ratio']
learning_rate = best_params['learning_rate']
weight_decay = best_params['weight_decay']
patience = best_params['patience']

# Final fit uses train+val together; the validation windows are only used
# to monitor training and drive early stopping.
train_val_data = pd.concat([train_data, val_data])
train_val_windows, train_val_dates = create_windows(train_val_data, window_size)
val_windows, val_dates = create_windows(val_data, window_size)

# Wrap the windows for batching (time order kept, no shuffling).
train_val_dataset = WindowDataset(train_val_windows)
train_val_loader = DataLoader(train_val_dataset, batch_size=64, shuffle=False)
val_dataset = WindowDataset(val_windows)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Build final model from the tuned architecture settings
model_kwargs = {
    'd_model': d_model,
    'n_heads': n_heads,
    'n_layers': n_layers,
    'ffn_ratio': ffn_ratio,
    'dropout': dropout,
}
final_model = TemporalContextTransformer(input_dim=14, max_seq_len=20, **model_kwargs)

# Count parameters and compare against the size budget
n_params = sum(weights.numel() for weights in final_model.parameters())
size_status = '✓ OK' if 3000 <= n_params <= 10000 else '⚠ WARNING'
print(f"\nFinal model parameters: {n_params:,}")
print("Target range: 3,000-10,000")
print(f"Status: {size_status}")

# Train final model with the tuned optimisation settings
final_config = {
    'learning_rate': learning_rate,
    'weight_decay': weight_decay,
    'patience': patience,
    'mask_ratio': mask_ratio,
    'max_epochs': 200,
    'batch_size': 64
}
final_model, final_metrics = train_model(
    final_model, train_val_loader, val_loader, final_config
)

print(f"\n{'=' * 80}")
print("FINAL TRAINING METRICS")
print("=" * 80)
for metric_name, metric_value in final_metrics.items():
    print(f"{metric_name}: {metric_value}")

# Move model to CPU for inference and switch off dropout
final_model = final_model.cpu()
final_model.eval()

print("\n[OK] Final training complete.")

## Cell 10: Generate Submodel Output for Full Dataset

In [None]:
print(f"\n{'=' * 80}")
print("GENERATING SUBMODEL OUTPUT")
print("=" * 80)

# Window the full (train + val + test) standardized dataset
full_windows, full_dates = create_windows(full_data, window_size)

print(f"Full windows: {full_windows.shape}")
print(f"Date range: {full_dates[0]} to {full_dates[-1]}")

# Score every window in inference mode, in chunks to bound memory use
final_model.eval()
batch_size = 256
score_chunks = []
with torch.no_grad():
    for start in range(0, len(full_windows), batch_size):
        chunk = full_windows[start:start + batch_size]
        score_chunks.append(final_model.extract(chunk).cpu().numpy())

# (n_windows, 1) per chunk -> flat vector of scores
context_scores = np.concatenate(score_chunks, axis=0).flatten()

# One row per window, keyed by the window's end date
output = pd.DataFrame({
    'date': full_dates,
    'temporal_context_score': context_scores
})
score_column = output['temporal_context_score']

print(f"\nOutput shape: {output.shape}")
print(f"Output columns: {list(output.columns)}")
print("\nOutput statistics:")
print(score_column.describe())

# Sanity checks on the generated feature
n_nan = score_column.isna().sum()
n_inf = np.isinf(score_column).sum()
is_constant = score_column.std() < 1e-10
in_unit_range = score_column.min() >= 0 and score_column.max() <= 1

nan_status = '✓ OK' if n_nan == 0 else '✗ FAIL'
inf_status = '✓ OK' if n_inf == 0 else '✗ FAIL'
constant_status = '✗ FAIL' if is_constant else '✓ OK'
range_status = '✓ OK' if in_unit_range else '✗ FAIL'

print("\nQuality checks:")
print(f"   NaN values: {n_nan} {nan_status}")
print(f"   Inf values: {n_inf} {inf_status}")
print(f"   Constant output: {constant_status}")
print(f"   Range [0,1]: {range_status}")

print("\n[OK] Submodel output generated.")

## Cell 11: Save Results

In [None]:
print(f"\n{'=' * 80}")
print("SAVING RESULTS")
print("=" * 80)

# 1) Feature values, one row per window end date
output.to_csv("submodel_output.csv", index=False)
print("[OK] Saved: submodel_output.csv")

# 2) Weights plus the settings needed to rebuild the architecture
model_config_to_save = {
    'd_model': d_model,
    'n_heads': n_heads,
    'n_layers': n_layers,
    'ffn_ratio': ffn_ratio,
    'dropout': dropout,
    'window_size': window_size
}
torch.save(
    {'model_state': final_model.state_dict(), 'config': model_config_to_save},
    "model.pt",
)
print("[OK] Saved: model.pt")

# 3) Run summary for the evaluator
score_series = output['temporal_context_score']
score_min = score_series.min()
score_max = score_series.max()

output_statistics = {
    "mean": float(score_series.mean()),
    "std": float(score_series.std()),
    "min": float(score_min),
    "max": float(score_max),
    "median": float(score_series.median())
}
data_info = {
    "train_samples": len(train_data),
    "val_samples": len(val_data),
    "test_samples": len(test_data),
    "full_samples": len(full_data),
    "window_size": window_size,
    "windowed_samples": len(output)
}
quality_checks = {
    "nan_count": int(n_nan),
    "inf_count": int(n_inf),
    "is_constant": bool(is_constant),
    "in_range_0_1": bool(score_min >= 0 and score_max <= 1)
}

result = {
    "feature": "temporal_context",
    "attempt": 1,
    "timestamp": datetime.now().isoformat(),
    "best_params": best_params,
    "metrics": final_metrics,
    "optuna_trials_completed": n_completed,
    "optuna_best_value": best_value,
    "model_param_count": n_params,
    "output_shape": list(output.shape),
    "output_columns": list(output.columns),
    "output_statistics": output_statistics,
    "data_info": data_info,
    "quality_checks": quality_checks
}

# default=str stringifies any value json cannot serialize natively
with open("training_result.json", "w") as f:
    json.dump(result, f, indent=2, default=str)

print("[OK] Saved: training_result.json")

print(f"\n{'=' * 80}")
print("TRAINING COMPLETE")
print("=" * 80)
print(f"Finished: {datetime.now().isoformat()}")
print("\nOutputs:")
print(f"  1. submodel_output.csv ({len(output)} rows)")
print(f"  2. model.pt ({n_params:,} parameters)")
print("  3. training_result.json")
print("\nReady for evaluator.")