# Gold Prediction SubModel: real_rate Attempt 3

**Feature**: Multi-Country Real Interest Rate Dynamics  
**Architecture**: Compact Transformer Autoencoder with GPU Support  
**Generated by**: builder_model agent  
**Date**: 2026-02-14

## Overview

This notebook trains a Transformer-based autoencoder on multi-country interest rate data to extract global monetary policy regime context. The model compresses 25 input features (2 US TIPS, 18 from six other countries, 4 cross-country aggregates, and 1 VIX) covering 7 countries into 4-6 latent semantic dimensions.

**Key Design Principles**:
- Self-contained: All data fetching and processing in this notebook
- GPU-enabled: Utilizes CUDA if available
- Compact architecture: 8-45K parameters to prevent overfitting on ~253 monthly samples
- Pre-norm Transformer: More stable training than post-norm
- Monthly→daily forward-fill: Outputs regime indicators at daily frequency

In [None]:
# === CELL 1: Imports and GPU Detection ===

import os
import json
import warnings
from datetime import datetime
import subprocess
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import optuna
from optuna.pruners import MedianPruner

warnings.filterwarnings('ignore')

# Install fredapi if not available
try:
    from fredapi import Fred
except ImportError:
    print("Installing fredapi...")
    subprocess.run([sys.executable, "-m", "pip", "install", "fredapi"], check=True)
    from fredapi import Fred

# Install yfinance if not available
try:
    import yfinance as yf
except ImportError:
    print("Installing yfinance...")
    subprocess.run([sys.executable, "-m", "pip", "install", "yfinance"], check=True)
    import yfinance as yf

# Device selection: use CUDA when PyTorch reports a GPU, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print("=" * 60)
print("DEVICE CONFIGURATION")
print("=" * 60)
print(f"Using device: {device}")
if not torch.cuda.is_available():
    print("No GPU available, using CPU")
else:
    # Report the first visible GPU (index 0) and the CUDA build PyTorch was compiled with.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"CUDA Version: {torch.version.cuda}")
print("=" * 60)

# Seed every RNG this notebook draws from so repeated runs start from the same state.
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print("\nImports complete. Random seeds set.")

In [None]:
# === CELL 2: Data Fetching (Self-Contained) ===

def fetch_multi_country_features():
    """
    Fetch and process multi-country interest rate features.

    Data is pulled from FRED (US TIPS, long-term nominal yields and CPI series
    for six countries, VIX) and from Yahoo Finance (gold futures, used only as
    a calendar). Everything is aligned on the month-start index of the
    resampled US TIPS series.

    Returns:
        tuple: (df, gold)
            df: pd.DataFrame with a month-start DatetimeIndex holding the
                model features plus a 'gold_price' column.
            gold: pd.Series of daily gold closes as downloaded.

    Columns of df (26 when all six countries are fetched):
    - US TIPS (2): level + change
    - 6 countries x 3 (18): nominal level + change + lagged CPI
    - Cross-country aggregates (4): two dispersions + mean CPI + spread
    - VIX (1): monthly average
    - Gold price (1): for calendar alignment, not a model feature

    Raises:
        RuntimeError: if no FRED API key can be found, or if fewer than two
            countries could be fetched (the dispersion features would be
            all-NaN and every row would be dropped).
    """

    print("\n" + "=" * 60)
    print("DATA FETCHING")
    print("=" * 60)

    # Get FRED API key from Kaggle Secrets
    try:
        from kaggle_secrets import UserSecretsClient
        secrets = UserSecretsClient()
        api_key = secrets.get_secret("FRED_API_KEY")
        print("[OK] FRED API key loaded from Kaggle Secrets")
    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still propagate.
        # Fallback to environment variable (for local testing)
        api_key = os.environ.get('FRED_API_KEY')
        if api_key is None:
            raise RuntimeError("FRED_API_KEY not found in Kaggle Secrets or environment")
        print("[OK] FRED API key loaded from environment")

    fred = Fred(api_key=api_key)

    # === 1. Fetch US TIPS (daily) ===
    print("\n1/9: Fetching US TIPS (DFII10)...")
    us_tips = fred.get_series('DFII10', observation_start='2003-01-01')
    print(f"  -> {len(us_tips)} daily observations")

    # === 2. Fetch 6 countries nominal yields + CPI ===
    # Each entry maps country -> (nominal yield series id, CPI series id).
    countries = {
        'germany': ('IRLTLT01DEM156N', 'CPALTT01DEM659N'),
        'uk': ('IRLTLT01GBM156N', 'CPALTT01GBM659N'),
        'canada': ('IRLTLT01CAM156N', 'CPALTT01CAM659N'),
        'switzerland': ('IRLTLT01CHM156N', 'CPALTT01CHM659N'),
        'norway': ('IRLTLT01NOM156N', 'CPALTT01NOM659N'),
        'sweden': ('IRLTLT01SEM156N', 'CPALTT01SEM659N')
    }

    country_data = {}
    for i, (country, (nominal_id, cpi_id)) in enumerate(countries.items(), start=2):
        print(f"{i}/9: Fetching {country.upper()} data...")
        try:
            nominal = fred.get_series(nominal_id, observation_start='2003-01-01')
            cpi = fred.get_series(cpi_id, observation_start='2003-01-01')
            country_data[country] = {'nominal': nominal, 'cpi': cpi}
            print(f"  -> {len(nominal)} nominal, {len(cpi)} CPI observations")
        except Exception as e:
            # Best effort: a single failing country is skipped, not fatal.
            print(f"  WARNING: Failed to fetch {country}: {e}")
            continue

    # The cross-country standard deviations need at least two countries;
    # with fewer they are NaN on every row and dropna() below would silently
    # return an empty frame.
    if len(country_data) < 2:
        raise RuntimeError(
            f"Only {len(country_data)} of {len(countries)} countries could be fetched; "
            "at least 2 are required to compute cross-country dispersion features"
        )

    # === 3. Fetch VIX (daily) ===
    print("8/9: Fetching VIX (VIXCLS)...")
    vix = fred.get_series('VIXCLS', observation_start='2003-01-01')
    print(f"  -> {len(vix)} daily observations")

    # === 4. Fetch Gold price for calendar alignment ===
    print("9/9: Fetching Gold price (GC=F) for calendar...")
    gold = yf.download('GC=F', start='2003-01-01', progress=False)['Close']
    # Depending on the yfinance version, ['Close'] is either a Series or a
    # one-column DataFrame keyed by ticker; normalise to a Series.
    if isinstance(gold, pd.DataFrame):
        gold = gold.iloc[:, 0]
    print(f"  -> {len(gold)} daily observations")

    print("\n" + "=" * 60)
    print("FEATURE ENGINEERING")
    print("=" * 60)

    # === 5. Resample to monthly (month-start) ===
    # NOTE(review): 'MS' labels each bucket with the first day of the month
    # while .last()/.mean() use observations from the whole month, so a row
    # dated YYYY-MM-01 carries information from later in that month. Confirm
    # that downstream consumers lag these values appropriately.
    print("\nStep 1: Resampling to month-start...")
    us_tips_monthly = us_tips.resample('MS').last()
    vix_monthly = vix.resample('MS').mean()
    gold_monthly = gold.resample('MS').last()

    # === 6. Build feature DataFrame ===
    print("Step 2: Building feature matrix...")
    df = pd.DataFrame(index=us_tips_monthly.index)

    # US TIPS features
    df['us_tips_level'] = us_tips_monthly
    df['us_tips_change'] = us_tips_monthly.diff()

    # Country features (assignment aligns each series on df's index)
    for country, data in country_data.items():
        nominal = data['nominal']
        cpi = data['cpi']

        df[f'{country}_nominal_level'] = nominal
        df[f'{country}_nominal_change'] = nominal.diff()
        df[f'{country}_cpi_lagged'] = cpi.shift(1)  # 1-observation lag

    # Cross-country aggregates
    country_list = list(country_data.keys())
    level_cols = [f'{c}_nominal_level' for c in country_list]
    change_cols = [f'{c}_nominal_change' for c in country_list]
    cpi_cols = [f'{c}_cpi_lagged' for c in country_list]

    df['yield_dispersion'] = df[level_cols].std(axis=1)
    df['yield_change_dispersion'] = df[change_cols].std(axis=1)
    df['mean_cpi_change'] = df[cpi_cols].mean(axis=1)
    df['us_vs_global_spread'] = df['us_tips_level'] - df[level_cols].mean(axis=1)

    # VIX
    df['vix_monthly'] = vix_monthly

    # Gold (for alignment only, not a feature)
    df['gold_price'] = gold_monthly

    # === 7. Handle missing data ===
    print("Step 3: Cleaning missing data...")
    print(f"  Before: {df.shape[0]} rows, {df.isna().sum().sum()} NaN values")

    # Bridge gaps of up to three months, then drop rows that are still incomplete.
    df = df.ffill(limit=3)
    df = df.dropna()

    print(f"  After: {df.shape[0]} rows, {df.isna().sum().sum()} NaN values")

    # === 8. Validation ===
    print("\nStep 4: Data validation...")

    # Feature count check
    feature_cols = [c for c in df.columns if c != 'gold_price']
    expected_features = 2 + (len(country_data) * 3) + 4 + 1  # 2 US + 3 per country + 4 agg + 1 VIX
    actual_features = len(feature_cols)

    print(f"  Features: {actual_features} (expected {expected_features})")
    if actual_features != expected_features:
        print("  WARNING: Feature count mismatch!")

    # Synthetic real rate vs TIPS correlation check (EXPECTED to be low ~0.49).
    # Germany is used as the proxy, so the check only runs if Germany was fetched.
    if 'germany' in country_data:
        us_synthetic = df['germany_nominal_level'] - df['germany_cpi_lagged']
        corr = us_synthetic.corr(df['us_tips_level'])
        print(f"  Synthetic vs TIPS correlation: {corr:.3f} (expected ~0.49, low is OK)")
    else:
        print("  Synthetic vs TIPS correlation: skipped (Germany data unavailable)")

    print(f"  Date range: {df.index.min()} to {df.index.max()}")
    print(f"  Total months: {len(df)}")

    print("\n[OK] Data fetching complete")
    return df, gold

# Fetch data once when this cell runs. `data_monthly` is the monthly table
# returned by the function (features plus the 'gold_price' column);
# `gold_calendar` is the daily gold series whose index is used later as the
# trading-day calendar.
data_monthly, gold_calendar = fetch_multi_country_features()

In [None]:
# === CELL 3: Dataset Class ===

class SlidingWindowDataset(Dataset):
    """
    Creates sliding windows from monthly time series data.

    Sample ``i`` is rows ``i .. i + window_size - 1`` of the input, i.e. a
    tensor of shape [window_size, n_features]. Consecutive samples overlap by
    ``window_size - 1`` rows.
    """
    def __init__(self, data_array, window_size):
        """
        Args:
            data_array: np.ndarray of shape [n_months, n_features]
            window_size: int, number of months in each window (>= 1)

        Raises:
            ValueError: if window_size is smaller than 1.
        """
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        self.data = torch.FloatTensor(data_array)
        self.window_size = window_size

    def __len__(self):
        # Clamp at zero: when the series is shorter than one window there are
        # no samples, and a negative value would make len() itself raise.
        return max(0, len(self.data) - self.window_size + 1)

    def __getitem__(self, idx):
        """Return the window starting at row ``idx`` (negative indices count from the end)."""
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        # Without this check an out-of-range index would silently yield a
        # window with fewer than window_size rows.
        if not 0 <= idx < n_windows:
            raise IndexError(f"window index out of range for {n_windows} windows")
        # Return window of shape [window_size, n_features]
        return self.data[idx:idx + self.window_size]

# Cell-completion marker so the notebook log shows this cell ran.
print("[OK] Dataset class defined")

In [None]:
# === CELL 4: Transformer Autoencoder Model ===

import math

class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding for monthly time series.

    A fixed table of shape [1, max_len, d_model] is precomputed, with sine in
    the even columns and cosine in the odd columns. It is added to the input
    and the sum is passed through dropout. The table is registered as a
    buffer, so it follows the module across devices but is not trained.

    Args:
        d_model: embedding width (even or odd).
        max_len: longest sequence length the table covers.
        dropout: dropout probability applied after the addition.
    """
    def __init__(self, d_model, max_len=100, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so trim the frequencies to match; for even d_model this is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: tensor of shape [batch, seq_len, d_model]

        Returns:
            Tensor of the same shape with positional encoding added and
            dropout applied.

        Raises:
            ValueError: if seq_len exceeds the max_len the table was built for.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)


class MultiCountryTransformerAutoencoder(nn.Module):
    """
    Compact Transformer Autoencoder for multi-country real rate dynamics.

    The encoder projects each time step to d_model, runs a pre-norm
    Transformer encoder stack, mean-pools over time and maps the result to a
    Tanh-bounded latent vector. The decoder broadcasts that latent vector
    across the sequence, adds positional information, runs a shallower
    Transformer stack and projects back to the feature space.

    Design notes (from the original author):
    - Small model (8-45K params) to prevent overfitting on ~253 samples
    - Pre-norm Transformer (more stable training)
    - Mean pooling for temporal aggregation
    - Tanh on latent output to bound range

    Input: [batch, seq_len, n_features]
    Latent: [batch, latent_dim]
    Output: [batch, seq_len, n_features] (reconstruction)

    Args:
        n_features: number of input features per time step.
        d_model: Transformer embedding width.
        nhead: number of attention heads.
        num_encoder_layers: encoder depth; the decoder uses one layer fewer
            (at least one).
        dim_feedforward: hidden width of the Transformer feed-forward blocks.
        latent_dim: size of the bottleneck vector.
        dropout: dropout probability used throughout.
        max_seq_len: longest sequence the positional tables are built for.
    """
    def __init__(self, n_features=25, d_model=32, nhead=2, num_encoder_layers=2,
                 dim_feedforward=64, latent_dim=4, dropout=0.3, max_seq_len=48):
        super().__init__()
        self.n_features = n_features
        self.d_model = d_model
        self.latent_dim = latent_dim

        # NOTE: sub-module names and construction order are kept stable so
        # that saved state_dicts and seeded initialisation stay compatible.

        # Input projection
        self.input_proj = nn.Sequential(
            nn.Linear(n_features, d_model),
            nn.LayerNorm(d_model),
            nn.Dropout(dropout)
        )

        # Positional encoding
        self.pos_encoder = PositionalEncoding(d_model, max_len=max_seq_len, dropout=dropout)

        # Transformer encoder (pre-norm)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            norm_first=True  # Pre-norm for stability
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers,
            norm=nn.LayerNorm(d_model)
        )

        # Latent projection with Tanh activation (bounds each latent to (-1, 1))
        self.latent_proj = nn.Sequential(
            nn.Linear(d_model, latent_dim),
            nn.Tanh()
        )

        # Decoder
        self.latent_expand = nn.Sequential(
            nn.Linear(latent_dim, d_model),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        self.pos_decoder = PositionalEncoding(d_model, max_len=max_seq_len, dropout=dropout)

        # The decoder reuses the encoder layer type (self-attention only).
        decoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            norm_first=True
        )
        self.transformer_decoder = nn.TransformerEncoder(
            decoder_layer,
            num_layers=max(1, num_encoder_layers - 1),
            norm=nn.LayerNorm(d_model)
        )

        self.output_proj = nn.Linear(d_model, n_features)

    def encode(self, x):
        """x: [batch, seq_len, n_features] -> z: [batch, latent_dim]"""
        h = self.input_proj(x)
        h = self.pos_encoder(h)
        h = self.transformer_encoder(h)
        h = h.mean(dim=1)  # Mean pooling over time
        z = self.latent_proj(h)
        return z

    def decode(self, z, seq_len):
        """z: [batch, latent_dim] -> reconstruction: [batch, seq_len, n_features]"""
        h = self.latent_expand(z)
        # Broadcast the same vector to every time step. expand() creates a
        # view instead of a copy; the positional-encoding addition that
        # follows produces a fresh tensor, so no in-place aliasing occurs.
        h = h.unsqueeze(1).expand(-1, seq_len, -1)
        h = self.pos_decoder(h)
        h = self.transformer_decoder(h)
        reconstruction = self.output_proj(h)
        return reconstruction

    def forward(self, x):
        """Full forward pass for training. Returns (reconstruction, latent)."""
        seq_len = x.size(1)
        z = self.encode(x)
        reconstruction = self.decode(z, seq_len)
        return reconstruction, z

    def transform(self, x):
        """
        Generate latent features (inference mode).

        NOTE: this switches the module to eval mode and leaves it there; call
        .train() afterwards if training is to continue.
        """
        self.eval()
        with torch.no_grad():
            z = self.encode(x)
        return z

# Cell-completion marker so the notebook log shows this cell ran.
print("[OK] Transformer Autoencoder model defined")

In [None]:
# === CELL 5: Training Function ===

def train_epoch(model, dataloader, optimizer, scheduler, device):
    """
    Train for one epoch and return the mean reconstruction loss per sample.

    Args:
        model: module whose forward returns (reconstruction, latent).
        dataloader: iterable of input batches (first dimension = batch).
        optimizer: optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per epoch.
        device: device the batches are moved to.

    Returns:
        float: MSE averaged over samples (each batch is weighted by its size,
        so a short final batch does not skew the result).

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    model.train()
    loss_sum = 0.0
    n_seen = 0

    for batch in dataloader:
        batch = batch.to(device)

        optimizer.zero_grad()
        reconstruction, _ = model(batch)
        loss = nn.functional.mse_loss(reconstruction, batch)

        loss.backward()
        # Gradient clipping (essential for Transformers)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        batch_n = batch.size(0)
        loss_sum += loss.item() * batch_n
        n_seen += batch_n

    if n_seen == 0:
        raise ValueError(
            "train_epoch received an empty dataloader "
            "(fewer windows than batch_size with drop_last=True?)"
        )

    scheduler.step()
    return loss_sum / n_seen


def evaluate(model, dataloader, device):
    """
    Evaluate on a validation/test set and return the mean loss per sample.

    The model is put in eval mode and gradients are disabled. Each batch's
    MSE is weighted by its batch size, so a smaller final batch does not bias
    the average.

    Args:
        model: module whose forward returns (reconstruction, latent).
        dataloader: iterable of input batches (first dimension = batch).
        device: device the batches are moved to.

    Returns:
        float: reconstruction MSE averaged over samples.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    model.eval()
    loss_sum = 0.0
    n_seen = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            reconstruction, _ = model(batch)
            loss = nn.functional.mse_loss(reconstruction, batch)
            batch_n = batch.size(0)
            loss_sum += loss.item() * batch_n
            n_seen += batch_n

    if n_seen == 0:
        raise ValueError("evaluate received an empty dataloader")

    return loss_sum / n_seen


def train_model(model, train_loader, val_loader, config, device):
    """
    Full training loop with early stopping on validation loss.

    Args:
        model: autoencoder to train (modified in place).
        train_loader: dataloader of training windows.
        val_loader: dataloader of validation windows.
        config: dict with keys 'learning_rate', 'weight_decay', 'max_epochs'
            and 'patience'.
        device: device used for training and evaluation.

    Returns:
        tuple: (model, metrics). The model carries the weights from the epoch
        with the best validation loss (if any epoch improved). metrics has
        'train_loss' and 'val_loss' at that epoch, 'overfit_ratio'
        (val / train) and 'epochs_trained'.
    """
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config['weight_decay']
    )

    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=50,
        T_mult=2
    )

    best_val_loss = float('inf')
    best_train_loss = float('inf')
    patience_counter = 0
    best_state = None
    # Tracked explicitly so the metrics are well-defined even if the loop
    # body never runs (max_epochs == 0).
    epochs_trained = 0

    for epoch in range(config['max_epochs']):
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        val_loss = evaluate(model, val_loader, device)
        epochs_trained = epoch + 1

        # Early stopping: require an improvement larger than 1e-6
        if val_loss < best_val_loss - 1e-6:
            best_val_loss = val_loss
            best_train_loss = train_loss
            patience_counter = 0
            # Snapshot on CPU so the copy does not occupy device memory.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= config['patience']:
                break

    # Restore best weights
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, {
        'train_loss': best_train_loss,
        'val_loss': best_val_loss,
        # Small epsilon guards against division by a zero training loss.
        'overfit_ratio': best_val_loss / (best_train_loss + 1e-10),
        'epochs_trained': epochs_trained
    }

# Cell-completion marker so the notebook log shows this cell ran.
print("[OK] Training functions defined")

In [None]:
# === CELL 6: Optuna HPO ===

def create_dataloaders(features_standardized, window_size, batch_size, train_idx, val_idx, device):
    """
    Create train and validation dataloaders of sliding windows.

    Rows [0, train_idx) form the training segment and rows
    [train_idx, val_idx) the validation segment; windows are built inside
    each segment separately, so no window spans the boundary.

    Args:
        features_standardized: DataFrame of standardized features (rows = months).
        window_size: number of months per window.
        batch_size: batch size for both loaders.
        train_idx: row position where the training segment ends.
        val_idx: row position where the validation segment ends.
        device: unused here; kept for interface compatibility.

    Returns:
        tuple: (train_loader, val_loader)
    """
    train_data = features_standardized[:train_idx].values
    val_data = features_standardized[train_idx:val_idx].values

    train_dataset = SlidingWindowDataset(train_data, window_size)
    val_dataset = SlidingWindowDataset(val_data, window_size)

    # Incomplete trailing batches are dropped during training, but only when
    # at least one full batch exists; otherwise the loader would yield nothing
    # and the epoch would have no data at all.
    drop_incomplete = len(train_dataset) >= batch_size

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=drop_incomplete
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False
    )

    return train_loader, val_loader


def run_optuna_hpo(features_standardized, train_idx, val_idx, n_features, device, n_trials=30, timeout=3600):
    """
    Search model and optimizer hyperparameters with Optuna.

    Each trial builds dataloaders and a fresh autoencoder from the sampled
    values, trains with early stopping and reports the validation loss every
    epoch so that the median pruner can stop weak trials.

    Args:
        features_standardized: DataFrame of standardized features.
        train_idx: row position where the training segment ends.
        val_idx: row position where the validation segment ends.
        n_features: number of model input features.
        device: device to train on.
        n_trials: maximum number of trials.
        timeout: time budget in seconds passed to study.optimize.

    Returns:
        tuple: (best_params, best_value, number of trials in the study)
    """
    rule = "=" * 60

    print("\n" + rule)
    print("OPTUNA HYPERPARAMETER OPTIMIZATION")
    print(rule)
    print(f"Trials: {n_trials}")
    print(f"Timeout: {timeout}s")
    print(f"Device: {device}")

    # Per-trial training budget.
    max_epochs = 200
    patience = 20

    def objective(trial):
        # Sample hyperparameters. Dict literals are evaluated top to bottom,
        # so the suggest_* calls happen in this exact order.
        hp = {
            'window_size': trial.suggest_categorical('window_size', [12, 18, 24]),
            'd_model': trial.suggest_categorical('d_model', [24, 32, 48]),
            'nhead': trial.suggest_categorical('nhead', [2, 4]),
            'num_encoder_layers': trial.suggest_categorical('num_encoder_layers', [2, 3]),
            'latent_dim': trial.suggest_categorical('latent_dim', [4, 5, 6]),
            'dropout': trial.suggest_float('dropout', 0.3, 0.5),
            'learning_rate': trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True),
            'weight_decay': trial.suggest_float('weight_decay', 1e-3, 5e-2, log=True),
            'batch_size': trial.suggest_categorical('batch_size', [16, 32]),
        }

        # Data first, then the model, matching the original construction order.
        train_loader, val_loader = create_dataloaders(
            features_standardized, hp['window_size'], hp['batch_size'],
            train_idx, val_idx, device
        )

        model = MultiCountryTransformerAutoencoder(
            n_features=n_features,
            d_model=hp['d_model'],
            nhead=hp['nhead'],
            num_encoder_layers=hp['num_encoder_layers'],
            dim_feedforward=2 * hp['d_model'],
            latent_dim=hp['latent_dim'],
            dropout=hp['dropout'],
            max_seq_len=hp['window_size']
        ).to(device)

        optimizer = optim.AdamW(
            model.parameters(),
            lr=hp['learning_rate'],
            weight_decay=hp['weight_decay']
        )
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=50, T_mult=2)

        best_val_loss = float('inf')
        stale_epochs = 0

        for epoch in range(max_epochs):
            train_epoch(model, train_loader, optimizer, scheduler, device)
            val_loss = evaluate(model, val_loader, device)

            # Report to Optuna so the pruner can act on intermediate values.
            trial.report(val_loss, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

            # Early stopping: an improvement resets the counter.
            if val_loss < best_val_loss - 1e-6:
                best_val_loss = val_loss
                stale_epochs = 0
                continue

            stale_epochs += 1
            if stale_epochs >= patience:
                break

        return best_val_loss

    # Seeded TPE sampler plus a median pruner.
    pruner = MedianPruner(n_startup_trials=7, n_warmup_steps=30)
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction='minimize', pruner=pruner, sampler=sampler)

    study.optimize(objective, n_trials=n_trials, timeout=timeout)

    print("\n" + rule)
    print("OPTUNA RESULTS")
    print(rule)
    print(f"Best value: {study.best_value:.6f}")
    print(f"Best params: {study.best_params}")

    states = [t.state for t in study.trials]
    n_complete = states.count(optuna.trial.TrialState.COMPLETE)
    n_pruned = states.count(optuna.trial.TrialState.PRUNED)
    print(f"Completed trials: {n_complete}")
    print(f"Pruned trials: {n_pruned}")

    return study.best_params, study.best_value, len(study.trials)

# Cell-completion marker so the notebook log shows this cell ran.
print("[OK] Optuna HPO function defined")

In [None]:
# === CELL 7: Main Execution ===

print("\n" + "=" * 60)
print("MAIN EXECUTION: REAL_RATE ATTEMPT 3")
print("=" * 60)
print(f"Started: {datetime.now().isoformat()}")

# === 1. Prepare features (exclude gold_price column) ===
# gold_price is kept in data_monthly only for calendar alignment; every other
# column is a model input.
feature_cols = [name for name in data_monthly.columns if name != 'gold_price']
features = data_monthly.loc[:, feature_cols].copy()
n_features = len(feature_cols)

print(f"\nFeatures: {n_features} columns")
print(f"Samples: {len(features)} months")
print(f"Date range: {features.index.min()} to {features.index.max()}")

# === 2. Time-series split (70/15/15) ===
# Chronological split: the first 70% of rows train, the next 15% validate,
# the remainder is held out.
n_samples = len(features)
train_idx, val_idx = int(n_samples * 0.70), int(n_samples * 0.85)

print(f"\nSplit: train={train_idx}, val={val_idx-train_idx}, test={n_samples-val_idx}")

# === 3. Standardization (fit on train only) ===
# Statistics come from the training rows only so that later rows do not
# influence the scaling; the epsilon protects against constant columns.
train_mean, train_std = features.iloc[:train_idx].mean(), features.iloc[:train_idx].std()

features_standardized = (features - train_mean) / (train_std + 1e-8)

print("\n[OK] Features standardized (train statistics)")

# === 4. Run Optuna HPO ===
best_params, best_value, n_trials_completed = run_optuna_hpo(
    features_standardized, train_idx, val_idx, n_features, device,
    n_trials=30, timeout=3600
)

# === 5. Final training with best params ===
print("\n" + "=" * 60)
print("FINAL MODEL TRAINING")
print("=" * 60)

window_size, batch_size = best_params['window_size'], best_params['batch_size']

# Rebuild the loaders with the winning window and batch size.
train_loader, val_loader = create_dataloaders(
    features_standardized, window_size, batch_size, train_idx, val_idx, device
)

# Fresh model configured from the best trial; the feed-forward width is tied
# to d_model (2x) exactly as during the search.
final_model = MultiCountryTransformerAutoencoder(
    n_features=n_features,
    d_model=best_params['d_model'],
    nhead=best_params['nhead'],
    num_encoder_layers=best_params['num_encoder_layers'],
    dim_feedforward=2 * best_params['d_model'],
    latent_dim=best_params['latent_dim'],
    dropout=best_params['dropout'],
    max_seq_len=window_size,
).to(device)

# Total number of parameter elements in the model.
n_params = sum(param.numel() for param in final_model.parameters())
print(f"Model parameters: {n_params:,}")

config = dict(
    max_epochs=200,
    patience=20,
    learning_rate=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
)

# Train with early stopping; train_model returns the model and its metrics.
final_model, metrics = train_model(final_model, train_loader, val_loader, config, device)

print("\n[OK] Final model trained")
print(f"Train loss: {metrics['train_loss']:.6f}")
print(f"Val loss: {metrics['val_loss']:.6f}")
print(f"Overfit ratio: {metrics['overfit_ratio']:.3f}")
print(f"Epochs: {metrics['epochs_trained']}")

# === 6. Generate latent features for ALL data ===
print("\n" + "=" * 60)
print("GENERATING SUBMODEL OUTPUT")
print("=" * 60)

final_model.eval()
all_latents = []

# Create dataset from all standardized features (train, validation and
# held-out rows alike), in chronological order.
all_dataset = SlidingWindowDataset(features_standardized.values, window_size)
all_loader = DataLoader(all_dataset, batch_size=32, shuffle=False)

with torch.no_grad():
    for batch in all_loader:
        batch = batch.to(device)
        latent = final_model.encode(batch)
        all_latents.append(latent.cpu().numpy())

all_latents = np.vstack(all_latents)
print(f"\nRaw latent shape: {all_latents.shape}")

# Align dates: window i covers rows i .. i + window_size - 1, so each latent
# is stamped with the date of the LAST row in its window; the first window
# therefore ends at index window_size - 1.
monthly_dates = features.index[window_size-1:window_size-1+len(all_latents)]

# === 7. Apply first-difference postprocessing (at monthly level) ===
print("\nApplying first-difference postprocessing...")
latent_diff = np.diff(all_latents, axis=0)
# Prepend NaN for first window (it has no predecessor to difference against)
latent_output = np.vstack([np.full((1, all_latents.shape[1]), np.nan), latent_diff])

# Create monthly output DataFrame
output_monthly = pd.DataFrame(
    latent_output,
    index=monthly_dates,
    columns=[f'real_rate_sem_{i}' for i in range(best_params['latent_dim'])]
)

print(f"Monthly output shape: {output_monthly.shape}")

# === 8. Expand to daily using forward-fill ===
# NOTE(review): monthly rows carry month-start dates, so forward-filling
# applies a month's value from the first day of that month. If the monthly
# features include observations from later in the same month, daily rows
# would see information from the future; confirm the intended lag.
print("\nExpanding to daily frequency (forward-fill)...")

# Get gold trading calendar for alignment
gold_daily_dates = gold_calendar.index
date_range = pd.date_range(
    output_monthly.index.min(),
    output_monthly.index.max(),
    freq='D'
)

# Reindex to daily and forward-fill
output_daily = output_monthly.reindex(date_range).ffill()

# Align to gold calendar (keep only trading days); the second ffill carries
# the last monthly value to trading days after the final monthly date.
output_daily = output_daily.reindex(gold_daily_dates).ffill()

# Drop any remaining NaN rows (dates before the first differenced value)
output_daily = output_daily.dropna()

print(f"Daily output shape: {output_daily.shape}")
print(f"Date range: {output_daily.index.min()} to {output_daily.index.max()}")

# === 9. Save outputs ===
print("\n" + "=" * 60)
print("SAVING OUTPUTS")
print("=" * 60)

# Daily latent features, one row per gold trading day.
output_daily.to_csv('submodel_output.csv')
print("[OK] Saved: submodel_output.csv")

# Checkpoint: weights plus everything needed to rebuild the model and to
# standardize new inputs the same way.
torch.save(
    dict(
        model_state=final_model.state_dict(),
        config=best_params,
        train_mean=train_mean.to_dict(),
        train_std=train_std.to_dict(),
    ),
    'model.pt',
)
print("[OK] Saved: model.pt")

# Run summary written as JSON; key order matches the saved file layout.
result = dict(
    feature='real_rate',
    attempt=3,
    timestamp=datetime.now().isoformat(),
    architecture='MultiCountryTransformerAutoencoder',
    device=str(device),
    best_params=best_params,
    metrics=metrics,
    model_parameters=n_params,
    optuna_trials_completed=n_trials_completed,
    optuna_best_value=float(best_value),
    output_shape=list(output_daily.shape),
    output_columns=list(output_daily.columns),
    data_info=dict(
        n_features=n_features,
        n_samples_monthly=len(features),
        train_samples=train_idx,
        val_samples=val_idx - train_idx,
        test_samples=n_samples - val_idx,
        window_size=window_size,
        output_samples_daily=len(output_daily),
    ),
)

with open('training_result.json', 'w') as f:
    json.dump(result, f, indent=2)
print("[OK] Saved: training_result.json")

print("\n" + "=" * 60)
print("TRAINING COMPLETE")
print("=" * 60)
print(f"Finished: {datetime.now().isoformat()}")
print(f"\nOutput summary:")
n_rows, n_cols = output_daily.shape[0], output_daily.shape[1]
print(f"  - Submodel output: {n_rows} rows x {n_cols} columns")
print(f"  - Latent dimensions: {best_params['latent_dim']}")
print(f"  - Model parameters: {n_params:,}")
print(f"  - Overfit ratio: {metrics['overfit_ratio']:.3f}")
print("\n[SUCCESS] All files saved to Kaggle output directory")