# Gold Prediction SubModel Training - real_rate Attempt 2

**Feature**: real_rate (FRED series DFII10: 10-year TIPS yield)  
**Attempt**: 2  
**Architecture**: GRU Autoencoder (Sequence-to-Sequence)  
**Generated by**: builder_model agent  
**Date**: 2026-02-14

## Key Changes from Attempt 1

- Architecture: MLP → **GRU Autoencoder**
- Latent dimensions: 4 → **2** (tighter compression)
- Dropout: 0.13 → **0.3-0.5** (stronger regularization)
- Weight decay: 1e-6 → **1e-4 to 1e-2** (100-10000x stronger)
- Window size: 20 → **40-80** (longer context)
- Optuna trials: 5 → **20** (better exploration)
- **New**: First-difference postprocessing of the latent features to break their lag-1 autocorrelation (>0.99 before differencing)

## Expected Improvements

- Overfit ratio: 2.69 → **1.1-1.4**
- Autocorrelation: >0.995 → **<0.5** (after differencing)
- Gate 2 MI increase: maintained at **15-20%**
- Gate 3: Direction accuracy **+0.3-0.5%**

## Cell 1: Imports and Device Configuration

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
from scipy.stats import percentileofscore

import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

# Reproducibility: seed the global NumPy and PyTorch generators with the same value.
np.random.seed(42)
torch.manual_seed(42)

# All computation in this notebook is pinned to the CPU.
device = torch.device("cpu")

# Startup banner: device, library version and wall-clock start time.
print(
    f"Using device: {device}",
    f"PyTorch version: {torch.__version__}",
    f"Started: {datetime.now().isoformat()}",
    sep="\n",
)

## Cell 2: Data Fetching (Self-Contained)

In [None]:
def fetch_real_rate_features():
    """
    Download DFII10 and build the 8 hand-crafted real-rate features.

    Self-contained so the notebook runs without the repository checked out
    (embedded from scripts/fetch_real_rate_features.py).

    Steps:
      1. Import `fredapi` and `yfinance`, pip-installing them on demand.
      2. Read FRED_API_KEY from Kaggle Secrets, falling back to the
         environment variable of the same name.
      3. Fetch DFII10 from FRED starting 2013-06-01 and reindex it onto the
         GC=F (gold futures) trading calendar from yfinance, forward-filling
         gaps of at most 5 days.
      4. Compute the features and drop rows left NaN by the rolling windows.
      5. Slice to the schema range 2015-01-30 .. 2025-02-12.

    Returns:
        DataFrame indexed by date with the columns, in this order:
        level, change_1d, velocity_20d, velocity_60d, accel_20d,
        rolling_std_20d, regime_percentile, autocorr_20d.

    Raises:
        RuntimeError: if no FRED API key can be found, or if yfinance
            returns no gold trading days to align against.
        ValueError: if the first fully-populated row is later than the
            schema start date (not enough warm-up history).
        subprocess.CalledProcessError: if an on-demand pip install fails.
    """
    import importlib
    import subprocess
    import sys

    def _import_or_install(package):
        """Import `package`, pip-installing it into the running interpreter if missing."""
        try:
            return importlib.import_module(package)
        except ImportError:
            print(f"Installing {package}...")
            # Use this interpreter's pip so the package lands in the kernel's
            # environment rather than whichever `pip` is first on PATH.
            subprocess.run([sys.executable, "-m", "pip", "install", package], check=True)
            importlib.invalidate_caches()
            return importlib.import_module(package)

    Fred = _import_or_install("fredapi").Fred
    yf = _import_or_install("yfinance")

    # === Load API key from Kaggle Secrets ===
    try:
        from kaggle_secrets import UserSecretsClient
        secrets = UserSecretsClient()
        api_key = secrets.get_secret("FRED_API_KEY")
        print("Loaded FRED_API_KEY from Kaggle Secrets")
    except Exception as e:
        # Deliberately broad: outside Kaggle the import itself fails, and on
        # Kaggle a missing secret raises a client-specific error. Either way
        # we fall through to the environment variable.
        print(f"Failed to load FRED_API_KEY from Kaggle Secrets: {e}")
        # Fallback to environment variable (for local testing)
        api_key = os.environ.get('FRED_API_KEY')
        if api_key is None:
            raise RuntimeError(
                "FRED_API_KEY not found. Please add it to Kaggle Secrets or environment."
            )

    fred = Fred(api_key=api_key)

    # === Fetch DFII10 from FRED ===
    print("Fetching DFII10 from FRED...")
    # Fetch from 2013-06-01 to allow 252-day rolling windows before schema start (2015-01-30)
    series = fred.get_series('DFII10', observation_start='2013-06-01')

    df = pd.DataFrame({'level': series})
    df.index = pd.to_datetime(df.index)
    df.index.name = 'Date'

    # Drop NaN rows (holidays)
    df = df.dropna()
    print(f"FRED data shape after dropping NaN: {df.shape}")

    # === Align with gold trading days ===
    print("Fetching gold trading days from yfinance...")
    gold = yf.download('GC=F', start='2013-06-01', progress=False)
    gold_dates = pd.DatetimeIndex(gold.index)
    if len(gold_dates) == 0:
        # Without a calendar every later step would silently yield an empty frame.
        raise RuntimeError("yfinance returned no rows for GC=F; cannot align to gold trading days")

    # Reindex to gold trading days and forward-fill (max 5 days)
    df = df.reindex(gold_dates)
    df['level'] = df['level'].ffill(limit=5)
    df = df.dropna()
    print(f"Data shape after alignment to gold trading days: {df.shape}")

    # === Feature Engineering ===
    print("Computing 8 hand-crafted features...")

    # Feature 1 is the raw level itself.
    # Feature 2: Daily change
    df['change_1d'] = df['level'].diff()

    # Rolling std of daily changes (60-day window), used to scale both velocities
    rolling_std_60d = df['change_1d'].rolling(60, min_periods=60).std()

    # Feature 3: Velocity 20-day (normalized)
    df['velocity_20d'] = (df['level'] - df['level'].shift(20)) / rolling_std_60d

    # Feature 4: Velocity 60-day (normalized)
    df['velocity_60d'] = (df['level'] - df['level'].shift(60)) / rolling_std_60d

    # Feature 5: Acceleration (change in 20-day velocity)
    df['accel_20d'] = df['velocity_20d'] - df['velocity_20d'].shift(20)

    # Feature 6: Rolling std 20-day
    df['rolling_std_20d'] = df['change_1d'].rolling(20, min_periods=20).std()

    # Feature 7: Regime percentile (252-day rolling percentile rank of the latest level)
    def percentile_rank(x):
        if len(x) < 2:
            return np.nan
        return percentileofscore(x, x.iloc[-1]) / 100.0

    df['regime_percentile'] = df['level'].rolling(252, min_periods=252).apply(percentile_rank, raw=False)

    # Feature 8: Autocorrelation of daily changes (60-day window, lag 1)
    def rolling_autocorr(x):
        if len(x) < 2:
            return np.nan
        try:
            # raw=False means `x` is already a Series.
            return x.autocorr(lag=1)
        except Exception:
            # Best effort: a degenerate window becomes NaN (and is dropped
            # below). Unlike a bare `except`, this does not swallow
            # KeyboardInterrupt / SystemExit during the long rolling apply.
            return np.nan

    # NOTE: the column is named `autocorr_20d` although the window is 60 days;
    # the name is kept as-is because it is part of the returned schema.
    df['autocorr_20d'] = df['change_1d'].rolling(60, min_periods=60).apply(rolling_autocorr, raw=False)

    # Drop rows with NaN from rolling calculations
    initial_rows = len(df)
    df = df.dropna()
    print(f"Dropped {initial_rows - len(df)} rows due to rolling window NaN")

    # === Align to schema date range ===
    schema_start = '2015-01-30'
    schema_end = '2025-02-12'

    # First check if we have data covering the schema range
    if df.index.min() > pd.Timestamp(schema_start):
        raise ValueError(f"Insufficient data buffer. First valid date: {df.index.min()}, "
                        f"but schema starts at {schema_start}")

    df = df.loc[schema_start:schema_end]

    # Verify NO NaN in final output
    if df.isna().any().any():
        nan_count = df.isna().sum().sum()
        print(f"WARNING: {nan_count} NaN values remain in schema range")
        print("Dropping rows with NaN...")
        df = df.dropna()

    print(f"Final shape after schema alignment: {df.shape}")
    print(f"Date range: {df.index.min()} to {df.index.max()}")
    print(f"NaN count: {df.isna().sum().sum()}")

    # Verify we have 8 feature columns (internal invariant: all are created above)
    feature_cols = ['level', 'change_1d', 'velocity_20d', 'velocity_60d',
                    'accel_20d', 'rolling_std_20d', 'regime_percentile', 'autocorr_20d']
    assert all(col in df.columns for col in feature_cols), "Missing feature columns"

    return df[feature_cols]

# Run the fetch once at cell level; later cells read the result from `full_data`.
print("=" * 60, "Real Rate Feature Fetching - Attempt 2", "=" * 60, sep="\n")

full_data = fetch_real_rate_features()

# Quick summary of what came back.
print(f"\nFetched data shape: {full_data.shape}")
print(f"Columns: {[col for col in full_data.columns]}")

## Cell 3: Dataset Class for GRU Input

In [None]:
class SlidingWindowDataset(Dataset):
    """
    Dataset of overlapping, stride-1 windows for the GRU autoencoder.

    Item `i` is rows `i .. i + window_size - 1` of the input, returned with
    shape [window_size, n_features] (NOT flattened).

    This is different from Attempt 1 MLP which flattened to [seq_len * n_features].
    GRU processes the temporal sequence natively.
    """
    def __init__(self, data_array, window_size):
        """
        Args:
            data_array: numpy array of shape [total_samples, n_features]
            window_size: int, length of sliding window (must be >= 1)

        Raises:
            ValueError: if window_size is smaller than 1.
        """
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        # Stored as float32 (same dtype the legacy torch.FloatTensor produced).
        self.data = torch.as_tensor(data_array, dtype=torch.float32)
        self.window_size = window_size

    def __len__(self):
        # Clamp at 0: with fewer rows than one window there are no items, and
        # a negative value would make len() itself raise.
        return max(0, len(self.data) - self.window_size + 1)

    def __getitem__(self, idx):
        """
        Return window `idx` with shape [window_size, n_features].

        Negative indices count from the end. An out-of-range index raises
        IndexError instead of returning a truncated or empty slice, which
        also lets plain `for window in dataset` iteration terminate.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(f"window index out of range for dataset with {n_windows} windows")
        # NO flattening (unlike MLP Attempt 1)
        return self.data[idx:idx + self.window_size]

print("Dataset class defined successfully")

## Cell 4: GRU Autoencoder Model

In [None]:
class RealRateGRUAutoencoder(nn.Module):
    """
    Sequence-to-sequence GRU autoencoder for real-rate feature windows.

    Shapes:
        input          [batch, seq_len, n_features]
        latent code    [batch, latent_dim]
        reconstruction [batch, seq_len, n_features]

    Encoder: (optionally stacked / bidirectional) GRU -> dropout -> linear -> tanh.
    Decoder: linear -> ReLU -> dropout, the result tiled over every time step
    and fed to a single-layer GRU whose outputs are projected back to
    n_features.

    Compared with the Attempt 1 MLP the window is processed as a sequence
    (no flattening) and dropout lives in explicit layers, because nn.GRU's own
    dropout only acts between stacked layers.
    """
    def __init__(self, n_features=8, gru_hidden_dim=32, gru_num_layers=1,
                 latent_dim=2, dropout=0.3, bidirectional=False):
        super().__init__()

        # Keep the configuration on the instance.
        self.n_features = n_features
        self.gru_hidden_dim = gru_hidden_dim
        self.gru_num_layers = gru_num_layers
        self.latent_dim = latent_dim
        self.bidirectional = bidirectional
        if bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1

        # ---------------- encoder ----------------
        # Inter-layer GRU dropout is meaningless with a single layer, so pass
        # 0.0 in that case and rely on the explicit dropout layer below.
        inter_layer_dropout = 0.0 if gru_num_layers <= 1 else dropout
        self.encoder_gru = nn.GRU(
            input_size=n_features,
            hidden_size=gru_hidden_dim,
            num_layers=gru_num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=bidirectional,
        )
        # Applied to the final hidden state before the bottleneck.
        self.encoder_dropout = nn.Dropout(dropout)
        # Bottleneck projection; the input is twice as wide when bidirectional.
        self.encoder_fc = nn.Linear(gru_hidden_dim * self.num_directions, latent_dim)

        # ---------------- decoder ----------------
        self.decoder_fc = nn.Linear(latent_dim, gru_hidden_dim)
        self.decoder_dropout = nn.Dropout(dropout)
        # The decoder GRU is always a single unidirectional layer.
        self.decoder_gru = nn.GRU(
            input_size=gru_hidden_dim,
            hidden_size=gru_hidden_dim,
            num_layers=1,
            batch_first=True,
        )
        self.decoder_output = nn.Linear(gru_hidden_dim, n_features)

    def encode(self, x):
        """
        Compress a batch of windows into latent codes.

        Args:
            x: [batch, seq_len, n_features]
        Returns:
            z: [batch, latent_dim], squashed into (-1, 1) by tanh
        """
        # Second return value: [num_layers * num_directions, batch, gru_hidden_dim]
        final_states = self.encoder_gru(x)[1]

        if not self.bidirectional:
            # Top layer's final state.
            summary = final_states[-1]
        else:
            # Top layer's forward (-2) and backward (-1) states, side by side.
            summary = torch.cat((final_states[-2], final_states[-1]), dim=-1)

        return torch.tanh(self.encoder_fc(self.encoder_dropout(summary)))

    def decode(self, z, seq_len):
        """
        Expand latent codes back into full sequences.

        Args:
            z: [batch, latent_dim]
            seq_len: int, number of time steps to reconstruct
        Returns:
            reconstruction: [batch, seq_len, n_features]
        """
        expanded = self.decoder_dropout(torch.relu(self.decoder_fc(z)))
        # The same expanded vector is the decoder input at every time step.
        tiled = expanded.unsqueeze(1).repeat(1, seq_len, 1)
        step_outputs, _ = self.decoder_gru(tiled)
        return self.decoder_output(step_outputs)

    def forward(self, x):
        """
        Encode then decode; used for training.

        Args:
            x: [batch, seq_len, n_features]
        Returns:
            reconstruction: [batch, seq_len, n_features]
            z: [batch, latent_dim]
        """
        latent = self.encode(x)
        return self.decode(latent, x.size(1)), latent

    def transform(self, x):
        """
        Produce latent codes for inference, without tracking gradients.

        Switches the module to eval mode and leaves it there.

        Args:
            x: [batch, seq_len, n_features]
        Returns:
            z: [batch, latent_dim]
        """
        self.eval()
        with torch.no_grad():
            return self.encode(x)

print("GRU Autoencoder model class defined successfully")

## Cell 5: Training Function

In [None]:
def train_model(model, train_loader, val_loader, config, verbose=False):
    """
    Train the GRU autoencoder with early stopping.

    Each epoch minimises MSE reconstruction loss with Adam (gradient norm
    clipped to 1.0), then evaluates on the validation loader. The learning
    rate is halved after 7 epochs without validation improvement, and
    training stops after `early_stop_patience` such epochs. The weights from
    the best validation epoch are restored before returning.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on any notebook-level `device` variable.

    Args:
        model: RealRateGRUAutoencoder instance (any module whose forward
            returns a `(reconstruction, latent)` pair works)
        train_loader: DataLoader for training
        val_loader: DataLoader for validation
        config: dict with 'learning_rate' and 'weight_decay' (required) and
            'max_epochs' (default 150), 'early_stop_patience' (default 15)
        verbose: bool, whether to print progress every 10 epochs

    Returns:
        model: trained model (best checkpoint restored)
        metrics: dict with
            'train_loss'     mean per-batch train loss at the best-val epoch
            'val_loss'       best mean per-batch validation loss
            'overfit_ratio'  val_loss / train_loss at that epoch
            'epochs_trained' number of epochs actually run

    Raises:
        ValueError: if either loader yields no batches (e.g. the dataset is
            smaller than one batch with drop_last=True).
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        # Fail with a clear message instead of a ZeroDivisionError when averaging.
        raise ValueError(
            f"train_loader and val_loader must each yield at least one batch "
            f"(got {len(train_loader)} and {len(val_loader)})"
        )

    optimizer = optim.Adam(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config['weight_decay']
    )

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        patience=7,
        factor=0.5,
        min_lr=1e-6
    )

    # Feed batches to wherever the model lives.
    model_device = next(model.parameters()).device

    best_val_loss = float('inf')
    best_train_loss = float('inf')
    patience_counter = 0
    best_state = None
    epochs_run = 0  # stays 0 if max_epochs <= 0, so the return below is always defined

    max_epochs = config.get('max_epochs', 150)
    early_stop_patience = config.get('early_stop_patience', 15)

    for epoch in range(max_epochs):
        epochs_run = epoch + 1

        # === TRAIN ===
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            batch = batch.to(model_device)

            optimizer.zero_grad()
            reconstruction, _ = model(batch)
            loss = F.mse_loss(reconstruction, batch)
            loss.backward()

            # Gradient clipping (essential for GRU stability)
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)

        # === VALIDATE ===
        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(model_device)
                reconstruction, _ = model(batch)
                loss = F.mse_loss(reconstruction, batch)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        # Learning rate scheduler step
        scheduler.step(val_loss)

        # Overfit ratio for monitoring. Train loss is measured in train mode
        # (dropout active), validation loss in eval mode.
        overfit_ratio = val_loss / (train_loss + 1e-10)

        if verbose and epoch % 10 == 0:
            print(f"Epoch {epoch:3d}: train_loss={train_loss:.6f}, val_loss={val_loss:.6f}, "
                  f"overfit_ratio={overfit_ratio:.3f}")

        # === EARLY STOPPING ===
        # Require an improvement of more than 1e-6 to reset patience.
        if val_loss < best_val_loss - 1e-6:
            best_val_loss = val_loss
            best_train_loss = train_loss  # Record train_loss at same epoch for fair comparison
            patience_counter = 0
            # Clone: state_dict() tensors alias the live parameters.
            best_state = {k: v.clone() for k, v in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                if verbose:
                    print(f"Early stopping at epoch {epoch}")
                break

    # Restore best weights
    if best_state is not None:
        model.load_state_dict(best_state)

    return model, {
        'train_loss': best_train_loss,
        'val_loss': best_val_loss,
        'overfit_ratio': best_val_loss / (best_train_loss + 1e-10),
        'epochs_trained': epochs_run
    }

print("Training function defined successfully")

## Cell 6: Optuna HPO Function

In [None]:
def run_hpo(train_data, val_data, n_trials=20, timeout=1800):
    """
    Run Optuna hyperparameter optimization for the GRU autoencoder.

    Every trial samples a configuration, trains a fresh model with
    `train_model`, and reports the best validation loss. Trials whose
    overfit ratio (val/train loss) exceeds 2.0 are pruned.

    Args:
        train_data: numpy array [n_samples, n_features]
        val_data: numpy array [n_samples, n_features]
        n_trials: number of Optuna trials
        timeout: timeout in seconds

    Returns:
        best_params: dict with best hyperparameters
        best_value: best validation loss
        n_completed: number of trials that finished in state COMPLETE
            (pruned and failed trials are not counted)

    Raises:
        ValueError: if no trial completed, e.g. every trial was pruned for
            overfitting, so there is no best trial to report.
    """
    # Take the feature count from the data instead of hard-coding 8.
    n_features = train_data.shape[1]

    def objective(trial):
        # === HYPERPARAMETER SEARCH SPACE ===
        window_size = trial.suggest_categorical('window_size', [40, 60, 80])
        gru_hidden_dim = trial.suggest_categorical('gru_hidden_dim', [16, 32, 64])
        gru_num_layers = trial.suggest_categorical('gru_num_layers', [1, 2])
        dropout = trial.suggest_float('dropout', 0.3, 0.5)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
        weight_decay = trial.suggest_float('weight_decay', 1e-4, 1e-2, log=True)
        bidirectional = trial.suggest_categorical('bidirectional', [True, False])
        batch_size = trial.suggest_categorical('batch_size', [16, 32])

        # === CREATE DATASETS ===
        # Windows are built per split, so validation windows never contain training rows.
        train_dataset = SlidingWindowDataset(train_data, window_size)
        val_dataset = SlidingWindowDataset(val_data, window_size)

        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,  # Shuffle batches (temporal order within window preserved)
            drop_last=True  # Avoid batch size 1
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
            drop_last=False
        )

        # === CREATE MODEL ===
        model = RealRateGRUAutoencoder(
            n_features=n_features,
            gru_hidden_dim=gru_hidden_dim,
            gru_num_layers=gru_num_layers,
            latent_dim=2,  # FIXED at 2
            dropout=dropout,
            bidirectional=bidirectional
        ).to(device)

        # === TRAIN ===
        config = {
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'max_epochs': 150,
            'early_stop_patience': 15
        }

        _, metrics = train_model(model, train_loader, val_loader, config, verbose=False)

        # Prune if overfitting is severe (>2.0)
        if metrics['overfit_ratio'] > 2.0:
            raise optuna.TrialPruned()

        return metrics['val_loss']

    # === RUN STUDY ===
    # NOTE(review): the objective never calls trial.report()/should_prune(),
    # so the MedianPruner configured here has no intermediate values to act
    # on; the only pruning that happens is the overfit check above.
    study = optuna.create_study(
        direction='minimize',
        sampler=TPESampler(seed=42),
        pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=15)
    )

    study.optimize(objective, n_trials=n_trials, timeout=timeout)

    # study.trials also contains pruned/failed trials; count only completed ones.
    n_completed = sum(
        t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
    )
    if n_completed == 0:
        raise ValueError(
            f"No Optuna trial completed: all {len(study.trials)} trials were pruned "
            f"(overfit_ratio > 2.0) or failed, so there are no best parameters"
        )

    return study.best_params, study.best_value, n_completed

print("HPO function defined successfully")

## Cell 7: Main Execution

In [None]:
# Main pipeline, part 1: split and standardize the features, search
# hyperparameters with Optuna, then train the final model. Relies on names
# defined by earlier cells: full_data, run_hpo, SlidingWindowDataset,
# RealRateGRUAutoencoder, train_model and device.
print("=" * 60)
print("MAIN EXECUTION: Training Pipeline")
print("=" * 60)

# === 1. PREPARE DATA ===
print("\n[1/7] Preparing data...")

# Split into train/val/test (70/15/15, chronological)
# Positional slices with no shuffling: each split follows the previous one in
# row order (assumes full_data is sorted by date - TODO confirm).
n = len(full_data)
train_end = int(n * 0.70)
val_end = int(n * 0.85)

train_df = full_data.iloc[:train_end]
val_df = full_data.iloc[train_end:val_end]
# NOTE: the test split is only reported and counted; nothing in this script
# evaluates the model on it.
test_df = full_data.iloc[val_end:]

print(f"Train: {len(train_df)} samples ({train_df.index.min()} to {train_df.index.max()})")
print(f"Val:   {len(val_df)} samples ({val_df.index.min()} to {val_df.index.max()})")
print(f"Test:  {len(test_df)} samples ({test_df.index.min()} to {test_df.index.max()})")

# === 2. STANDARDIZE FEATURES ===
print("\n[2/7] Standardizing features (fit on train only)...")

# Compute mean/std from train split ONLY
# (per-column statistics; later splits must not influence the scaler)
train_mean = train_df.mean()
train_std = train_df.std()

# Apply to all splits
train_data = ((train_df - train_mean) / train_std).values
val_data = ((val_df - train_mean) / train_std).values
test_data = ((test_df - train_mean) / train_std).values
# The whole series, scaled with the same train statistics; used later to
# generate a latent vector for every date.
full_data_normalized = ((full_data - train_mean) / train_std).values

print(f"Train mean range: [{train_mean.min():.4f}, {train_mean.max():.4f}]")
print(f"Train std range:  [{train_std.min():.4f}, {train_std.max():.4f}]")

# === 3. HYPERPARAMETER OPTIMIZATION ===
print("\n[3/7] Running Optuna HPO (20 trials, 30 min timeout)...")
print("This may take 15-25 minutes...")

# The search stops at 20 trials or 1800 seconds, whichever comes first.
best_params, best_value, n_completed = run_hpo(
    train_data,
    val_data,
    n_trials=20,
    timeout=1800
)

print(f"\nOptuna completed {n_completed} trials")
print(f"Best validation loss: {best_value:.6f}")
print(f"Best hyperparameters:")
for key, value in best_params.items():
    print(f"  {key}: {value}")

# === 4. TRAIN FINAL MODEL ===
print("\n[4/7] Training final model with best hyperparameters...")

# run_hpo returns only hyperparameters, not weights, so the final model is
# initialised afresh and trained from scratch with the winning configuration.
window_size = best_params['window_size']
batch_size = best_params['batch_size']

# Create datasets with best window size
# (windows are built per split, so validation windows contain no training rows)
train_dataset = SlidingWindowDataset(train_data, window_size)
val_dataset = SlidingWindowDataset(val_data, window_size)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

# Create model with best hyperparameters
# (n_features and latent_dim are fixed here; they are not part of the search space)
final_model = RealRateGRUAutoencoder(
    n_features=8,
    gru_hidden_dim=best_params['gru_hidden_dim'],
    gru_num_layers=best_params['gru_num_layers'],
    latent_dim=2,
    dropout=best_params['dropout'],
    bidirectional=best_params['bidirectional']
).to(device)

# Same epoch budget and patience as the HPO trials.
config = {
    'learning_rate': best_params['learning_rate'],
    'weight_decay': best_params['weight_decay'],
    'max_epochs': 150,
    'early_stop_patience': 15
}

final_model, metrics = train_model(final_model, train_loader, val_loader, config, verbose=True)

print(f"\nFinal model metrics:")
print(f"  Train loss: {metrics['train_loss']:.6f}")
print(f"  Val loss:   {metrics['val_loss']:.6f}")
print(f"  Overfit ratio: {metrics['overfit_ratio']:.3f}")
print(f"  Epochs trained: {metrics['epochs_trained']}")

# === 5. GENERATE LATENT FEATURES ===
print("\n[5/7] Generating latent features for all dates...")

# One window per row from position window_size-1 onward, in date order
# (shuffle=False and drop_last=False keep every window, in sequence).
full_dataset = SlidingWindowDataset(full_data_normalized, window_size)
full_loader = DataLoader(full_dataset, batch_size=64, shuffle=False, drop_last=False)

# Generate latent features (eval mode disables dropout; no gradients needed)
final_model.eval()
latent_features = []

with torch.no_grad():
    for batch in full_loader:
        batch = batch.to(device)
        z = final_model.encode(batch)
        latent_features.append(z.cpu().numpy())

latent_features = np.vstack(latent_features)
print(f"Raw latent features shape: {latent_features.shape}")

# === 6. FIRST-DIFFERENCE POSTPROCESSING ===
print("\n[6/7] Applying first-difference postprocessing...")

# Create DataFrame for raw latent features, one column per latent dimension.
# The frame is created as float64 up front: an empty DataFrame built from only
# index/columns is object-typed, which makes autocorr()/diff() below fragile
# across pandas versions.
latent_columns = [f'real_rate_latent_{i}' for i in range(latent_features.shape[1])]
output_df = pd.DataFrame(np.nan, index=full_data.index, columns=latent_columns, dtype=float)

# Window i ends at row i + window_size - 1, so its latent vector is dated there.
# First window_size-1 rows stay NaN (insufficient lookback).
output_df.iloc[window_size - 1:] = latent_features.astype(float)

# Drop NaN rows (insufficient lookback)
output_df_clean = output_df.dropna()

# Compute autocorrelation of raw latent (before differencing)
raw_autocorr = [output_df_clean[col].autocorr(lag=1) for col in output_df_clean.columns]
print(f"Raw latent autocorr (lag 1): {raw_autocorr}")

# Apply first-difference
output_diff = output_df_clean.diff().dropna()  # This removes one more row

# Compute autocorrelation of differenced latent
diff_autocorr = [output_diff[col].autocorr(lag=1) for col in output_diff.columns]
print(f"Differenced latent autocorr (lag 1): {diff_autocorr}")

print(f"\nFinal output shape: {output_diff.shape}")
print(f"Date range: {output_diff.index.min()} to {output_diff.index.max()}")
print(f"Columns: {list(output_diff.columns)}")

# === 7. SAVE RESULTS ===
# Writes three artifacts using relative paths, i.e. into the current working
# directory (presumably the Kaggle output directory when run there - TODO confirm).
print("\n[7/7] Saving results to Kaggle output directory...")

# Save submodel output (first-differenced latent features)
output_diff.to_csv("submodel_output.csv")
print(f"Saved: submodel_output.csv ({output_diff.shape[0]} rows, {output_diff.shape[1]} columns)")

# Save model weights
# Bundled with the tuned hyperparameters and the train-split mean/std used for
# standardization. n_features=8 and latent_dim=2 are not in best_params; they
# are fixed where the model is constructed.
torch.save({
    'model_state': final_model.state_dict(),
    'config': best_params,
    'standardization': {
        'mean': train_mean.to_dict(),
        'std': train_std.to_dict()
    }
}, "model.pt")
print(f"Saved: model.pt")

# Save training result JSON
# Summary of the run: hyperparameters, final-model metrics, HPO statistics,
# output shape, lag-1 autocorrelations before/after differencing, split sizes
# and the parameter count.
result = {
    "feature": "real_rate",
    "attempt": 2,
    "architecture": "GRU_Autoencoder",
    "timestamp": datetime.now().isoformat(),
    "best_params": best_params,
    "metrics": metrics,
    "optuna_trials_completed": n_completed,
    "optuna_best_value": best_value,
    "output_shape": list(output_diff.shape),
    "output_columns": list(output_diff.columns),
    "autocorrelation": {
        "raw_latent": raw_autocorr,
        "differenced_latent": diff_autocorr
    },
    "data_info": {
        "train_samples": len(train_data),
        "val_samples": len(val_data),
        "test_samples": len(test_data),
        "full_samples": len(full_data),
        "window_size": window_size,
        "latent_dim": 2
    },
    "model_params": sum(p.numel() for p in final_model.parameters())
}

# default=str: any value json cannot serialize natively is written as its str().
with open("training_result.json", "w") as f:
    json.dump(result, f, indent=2, default=str)
print(f"Saved: training_result.json")

# === FINAL SUMMARY ===
print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)
print(f"Finished: {datetime.now().isoformat()}")
print(f"\nKey Results:")
print(f"  Overfit ratio: {metrics['overfit_ratio']:.3f} (target: <1.5)")
# Only the first latent dimension's autocorrelation is shown in this summary line.
print(f"  Autocorr reduction: {raw_autocorr[0]:.3f} → {diff_autocorr[0]:.3f}")
print(f"  Output shape: {output_diff.shape}")
print(f"  Model parameters: {result['model_params']:,}")
print("\nFiles saved:")
print("  - submodel_output.csv")
print("  - model.pt")
print("  - training_result.json")