# Gold Prediction SubModel Training - real_rate Attempt 6

**Method**: Deterministic Bond Vol Regime + Rate Momentum Persistence

**Self-contained**: Data fetch → Feature computation → Optuna HPO → Save results

In [1]:
# Cell 1: Header + Libraries
import os
import subprocess
import sys

# Install with the interpreter that backs this kernel. A bare `pip` found on
# PATH may belong to a different environment than the one the imports below
# are resolved in.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'fredapi'])

import pandas as pd
import numpy as np
import json
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

# FRED API
from fredapi import Fred

# The API key is read from the environment so that the credential is never
# stored in the notebook itself. Any key that has ever been committed in plain
# text should be treated as compromised and rotated.
FRED_API_KEY = os.environ.get("FRED_API_KEY")
if not FRED_API_KEY:
    raise RuntimeError(
        "FRED_API_KEY is not set. Export it (or expose it as a notebook secret) "
        "before running this notebook."
    )
fred = Fred(api_key=FRED_API_KEY)

# Optuna
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# sklearn MI
from sklearn.metrics import mutual_info_score

print(f"Started: {datetime.now().isoformat()}")
print("Libraries loaded successfully")

Started: 2026-02-18T01:48:27.599400
Libraries loaded successfully


In [2]:
# Cell 2: Data Fetching
print("=" * 60, "DATA FETCHING", "=" * 60, sep="\n")

# Pull the FRED series DFII10 starting ahead of the schema range so the
# rolling windows used later have history to warm up on; missing observations
# are discarded before alignment.
dfii10_raw = fred.get_series('DFII10', observation_start='2014-01-01').dropna()
print(f"DFII10: {len(dfii10_raw)} observations, {dfii10_raw.index[0]} to {dfii10_raw.index[-1]}")

# Gold futures prices supply the trading calendar everything is aligned to
import yfinance as yf
gold = yf.download('GC=F', start='2014-01-01', auto_adjust=True, progress=False)
gold_dates = gold.index

# Put DFII10 on the gold calendar, carrying the last known value forward onto
# gold dates that have no DFII10 observation, then take day-over-day changes.
dfii10 = dfii10_raw.reindex(gold_dates, method='ffill')
dfii10_change = dfii10.diff()

print(
    f"Aligned: {len(dfii10)} trading days",
    f"dfii10 range: [{dfii10.min():.4f}, {dfii10.max():.4f}]",
    f"dfii10_change range: [{dfii10_change.min():.4f}, {dfii10_change.max():.4f}]",
    sep="\n",
)

DATA FETCHING


DFII10: 3029 observations, 2014-01-02 00:00:00 to 2026-02-12 00:00:00


Aligned: 3048 trading days
dfii10 range: [-1.1900, 2.5200]
dfii10_change range: [-0.4500, 0.3900]


In [3]:
# Cell 3: Target variable
# The MI target is the percentage change in the gold close, shifted back one
# row so that each date carries the return realised on the following row.
gold_close = gold['Close'].squeeze()
gold_return = 100 * gold_close.pct_change()  # percentage
gold_return_next = gold_return.shift(-1)  # next-day return

# Keep only dates on which both the rate level and the target are available
dates_with_rate = dfii10.dropna().index
dates_with_target = gold_return_next.dropna().index
common_dates = dates_with_rate.intersection(dates_with_target)
print(f"Common dates: {len(common_dates)}, {common_dates[0]} to {common_dates[-1]}")

Common dates: 3047, 2014-01-02 00:00:00 to 2026-02-13 00:00:00


In [4]:
# Cell 4: Feature computation functions
def compute_bond_vol_z(dfii10_change, vol_window, zscore_window):
    """Z-score of the rolling standard deviation of DFII10 changes.

    Args:
        dfii10_change: Series of DFII10 changes.
        vol_window: Number of observations in the rolling standard deviation;
            a value is produced only once the window is full.
        zscore_window: Number of lagged volatility observations used for the
            baseline mean and standard deviation; also requires a full window.

    Returns:
        Series on the same index as the input, clipped to [-4, 4]. Entries are
        NaN until both windows have filled.
    """
    vol = dfii10_change.rolling(window=vol_window, min_periods=vol_window).std()

    # The baseline is built from volatility lagged by one row, so today's
    # reading is compared against past readings only (no look-ahead).
    baseline = vol.shift(1).rolling(window=zscore_window, min_periods=zscore_window)

    standardized = (vol - baseline.mean()) / baseline.std()
    return standardized.clip(-4, 4)

def compute_momentum_z(dfii10_change, autocorr_window, zscore_window):
    """Z-score of the rolling lag-1 autocorrelation of DFII10 changes.

    Args:
        dfii10_change: Series of DFII10 changes.
        autocorr_window: Number of observations over which each lag-1
            autocorrelation is measured; requires a full window.
        zscore_window: Number of lagged autocorrelation observations used for
            the baseline mean and standard deviation; requires a full window.

    Returns:
        Series on the same index as the input, clipped to [-4, 4]. Entries are
        NaN until both windows have filled.
    """
    def _lag1_autocorr(window_values):
        # Each rolling window arrives as a Series because raw=False is used
        return pd.Series(window_values).autocorr(lag=1)

    windows = dfii10_change.rolling(window=autocorr_window, min_periods=autocorr_window)
    persistence = windows.apply(_lag1_autocorr, raw=False)

    # Baseline statistics use the series lagged by one row to avoid look-ahead
    baseline = persistence.shift(1).rolling(window=zscore_window, min_periods=zscore_window)

    standardized = (persistence - baseline.mean()) / baseline.std()
    return standardized.clip(-4, 4)

def compute_mi(feature, target, n_bins=20):
    """Mutual information between a feature and a target after quantile binning.

    Rows where either input is NaN are dropped first. If fewer than 100 rows
    remain the function returns 0.0 without estimating anything. Otherwise both
    inputs are cut into at most ``n_bins`` quantile buckets (duplicate bin
    edges are dropped) and the bucket codes are passed to
    ``mutual_info_score``.

    Args:
        feature: Series of feature values.
        target: Series of target values, indexed like ``feature``.
        n_bins: Requested number of quantile buckets per variable.

    Returns:
        The mutual information score, or 0.0 when there are too few valid rows.

    NOTE(review): a binned estimate over n_bins x n_bins cells is presumably
    biased upward when there are few samples per cell, so scores from splits
    of very different sizes may not be directly comparable -- confirm against
    a shuffled-target baseline before interpreting absolute values.
    """
    def _quantile_codes(values):
        return pd.qcut(values, q=n_bins, labels=False, duplicates='drop')

    both_present = feature.notna() & target.notna()
    x = feature[both_present]
    y = target[both_present]

    # Too few paired observations to estimate anything meaningful
    if len(x) < 100:
        return 0.0

    return mutual_info_score(_quantile_codes(x), _quantile_codes(y))

print("Feature computation functions defined")

Feature computation functions defined


In [5]:
# Cell 5: Data split
print("=" * 60, "DATA SPLIT", "=" * 60, sep="\n")

# Schema date range: fixed start, end is the day the notebook is run
SCHEMA_START = '2015-01-30'
SCHEMA_END = datetime.now().strftime('%Y-%m-%d')

# Restrict the common dates to the schema range before splitting
in_schema = (common_dates >= SCHEMA_START) & (common_dates <= SCHEMA_END)
schema_dates = common_dates[in_schema]

# Chronological 70% / 15% / remainder split; the test set absorbs any rows
# left over from integer truncation.
n = len(schema_dates)
n_train = int(n * 0.70)
n_val = int(n * 0.15)
val_stop = n_train + n_val

train_dates = schema_dates[:n_train]
val_dates = schema_dates[n_train:val_stop]
test_dates = schema_dates[val_stop:]

print(f"Total: {n}, Train: {len(train_dates)} ({train_dates[0]}~{train_dates[-1]})")
print(f"Val: {len(val_dates)} ({val_dates[0]}~{val_dates[-1]})")
print(f"Test: {len(test_dates)} ({test_dates[0]}~{test_dates[-1]})")

DATA SPLIT
Total: 2776, Train: 1943 (2015-01-30 00:00:00~2022-10-20 00:00:00)
Val: 416 (2022-10-21 00:00:00~2024-06-18 00:00:00)
Test: 417 (2024-06-20 00:00:00~2026-02-13 00:00:00)


In [6]:
# Cell 6: Optuna HPO
print("=" * 60, "OPTUNA HYPERPARAMETER OPTIMIZATION", "=" * 60, sep="\n")

def optuna_objective(trial):
    """Score one window configuration by its validation-set mutual information.

    Four window lengths are drawn from the trial, both z-scored features are
    rebuilt from ``dfii10_change`` with them, and each feature's mutual
    information with ``gold_return_next`` is measured on ``val_dates`` only.

    Per-feature MI and the lag-1 autocorrelation of each feature on the
    validation dates are stored as trial user attributes for later inspection.

    Returns:
        The sum of the two per-feature mutual information scores.
    """
    vol_window = trial.suggest_categorical('vol_window', [10, 15, 20])
    vol_zscore_window = trial.suggest_categorical('vol_zscore_window', [60, 120])
    autocorr_window = trial.suggest_categorical('autocorr_window', [5, 10, 15])
    autocorr_zscore_window = trial.suggest_categorical('autocorr_zscore_window', [30, 60])

    # Features are computed on the full history, then cut to the validation dates
    vol_z_val = compute_bond_vol_z(dfii10_change, vol_window, vol_zscore_window).loc[val_dates]
    mom_z_val = compute_momentum_z(dfii10_change, autocorr_window, autocorr_zscore_window).loc[val_dates]

    # MI on validation set only
    target_val = gold_return_next.loc[val_dates]
    mi_vol = compute_mi(vol_z_val, target_val)
    mi_mom = compute_mi(mom_z_val, target_val)

    # Log for reference
    trial.set_user_attr('mi_vol', mi_vol)
    trial.set_user_attr('mi_mom', mi_mom)
    trial.set_user_attr('autocorr_vol', vol_z_val.autocorr(lag=1))
    trial.set_user_attr('autocorr_mom', mom_z_val.autocorr(lag=1))

    return mi_vol + mi_mom

# The search space is a small categorical grid (3 * 2 * 3 * 2 = 36 points), so
# it is enumerated exhaustively. A TPE sampler given 36 trials may evaluate
# the same combination more than once and leave others untried; a grid sampler
# visits every combination exactly once.
# NOTE: these lists must mirror the choices passed to suggest_categorical in
# optuna_objective.
search_space = {
    'vol_window': [10, 15, 20],
    'vol_zscore_window': [60, 120],
    'autocorr_window': [5, 10, 15],
    'autocorr_zscore_window': [30, 60],
}
n_grid_points = int(np.prod([len(choices) for choices in search_space.values()]))

study = optuna.create_study(
    direction='maximize',
    # The seed fixes the order in which grid points are visited
    sampler=optuna.samplers.GridSampler(search_space, seed=42)
)
study.optimize(optuna_objective, n_trials=n_grid_points, timeout=300)

# The timeout can stop the run before the grid is exhausted; make that visible
# rather than silently reporting the best of a partial search.
n_completed = sum(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials)
if n_completed < n_grid_points:
    print(f"WARNING: only {n_completed}/{n_grid_points} grid points were evaluated before the timeout")

print(f"\nBest MI sum: {study.best_value:.6f}")
print(f"Best params: {study.best_params}")
print(f"Best MI vol: {study.best_trial.user_attrs['mi_vol']:.6f}")
print(f"Best MI mom: {study.best_trial.user_attrs['mi_mom']:.6f}")

OPTUNA HYPERPARAMETER OPTIMIZATION



Best MI sum: 1.093802
Best params: {'vol_window': 20, 'vol_zscore_window': 120, 'autocorr_window': 15, 'autocorr_zscore_window': 60}
Best MI vol: 0.571211
Best MI mom: 0.522591


In [7]:
# Cell 7: Generate final features with best params
print("=" * 60, "GENERATING FINAL FEATURES", "=" * 60, sep="\n")

best = study.best_params
print(f"Generating final features with: {best}")

# Rebuild both features over the full history with the selected windows
rr_bond_vol_z = compute_bond_vol_z(
    dfii10_change,
    vol_window=best['vol_window'],
    zscore_window=best['vol_zscore_window'],
)
rr_momentum_z = compute_momentum_z(
    dfii10_change,
    autocorr_window=best['autocorr_window'],
    zscore_window=best['autocorr_zscore_window'],
)

# Assemble the output frame and trim it to the schema range
feature_columns = {
    'rr_bond_vol_z': rr_bond_vol_z,
    'rr_momentum_z': rr_momentum_z,
}
output_with_nan = pd.DataFrame(feature_columns, index=dfii10_change.index).loc[SCHEMA_START:SCHEMA_END]

# Any NaN left from rolling-window warmup becomes 0.0 (z-score neutral); the
# count is recorded first so it can be reported.
nan_count_before = output_with_nan.isna().sum().sum()
output = output_with_nan.fillna(0.0)

print(f"Output shape: {output.shape}")
print(f"NaN filled: {nan_count_before}")
print(f"Date range: {output.index[0]} to {output.index[-1]}")

GENERATING FINAL FEATURES
Generating final features with: {'vol_window': 20, 'vol_zscore_window': 120, 'autocorr_window': 15, 'autocorr_zscore_window': 60}


Output shape: (2777, 2)
NaN filled: 0
Date range: 2015-01-30 00:00:00 to 2026-02-17 00:00:00


In [8]:
# Cell 8: Validation checks
print("=" * 60, "VALIDATION CHECKS", "=" * 60, sep="\n")

# Each feature must satisfy three gates: lag-1 autocorrelation below 0.95,
# standard deviation above 0.1, and no NaN values. One miss on any feature
# flips the overall flag.
all_pass = True
for col in ['rr_bond_vol_z', 'rr_momentum_z']:
    column = output[col]
    autocorr_val, std_val, nan_val = column.autocorr(lag=1), column.std(), column.isna().sum()

    ac_pass, std_pass, nan_pass = autocorr_val < 0.95, std_val > 0.1, nan_val == 0

    print(f"\n{col}:")
    print(f"  autocorr(1) = {autocorr_val:.4f} {'PASS' if ac_pass else 'FAIL'} (< 0.95)")
    print(f"  std = {std_val:.4f} {'PASS' if std_pass else 'FAIL'} (> 0.1)")
    print(f"  NaN count = {nan_val} {'PASS' if nan_pass else 'FAIL'} (== 0)")
    print(f"  mean = {output[col].mean():.4f}")
    print(f"  min = {output[col].min():.4f}, max = {output[col].max():.4f}")

    # all_pass stays a plain Python bool: it is only ever assigned True/False
    if not all((ac_pass, std_pass, nan_pass)):
        all_pass = False

# Cross-correlation between the two output features
cross_corr = output['rr_bond_vol_z'].corr(output['rr_momentum_z'])
print(f"\nCross-correlation: {cross_corr:.4f}")
print(f"\nOverall: {'ALL PASS' if all_pass else 'SOME CHECKS FAILED'}")

VALIDATION CHECKS

rr_bond_vol_z:
  autocorr(1) = 0.9605 FAIL (< 0.95)
  std = 1.2125 PASS (> 0.1)
  NaN count = 0 PASS (== 0)
  mean = -0.0158
  min = -3.3520, max = 4.0000

rr_momentum_z:
  autocorr(1) = 0.8516 PASS (< 0.95)
  std = 1.1318 PASS (> 0.1)
  NaN count = 0 PASS (== 0)
  mean = -0.0103
  min = -4.0000, max = 4.0000

Cross-correlation: 0.0155

Overall: SOME CHECKS FAILED


In [9]:
# Cell 9: Detailed diagnostics (train/val/test splits)
print("=" * 60, "DIAGNOSTICS BY SPLIT", "=" * 60, sep="\n")

diagnostic_cols = ['rr_bond_vol_z', 'rr_momentum_z']
diagnostic_splits = {'Train': train_dates, 'Val': val_dates, 'Test': test_dates}

for split_name, split_dates in diagnostic_splits.items():
    # Rows of the output frame that fall on this split's dates
    split_data = output[output.index.isin(split_dates)]
    print(f"\n--- {split_name} ({len(split_data)} rows) ---")

    # Summary statistics for every feature come first ...
    for col in diagnostic_cols:
        s = split_data[col]
        print(f"  {col}: mean={s.mean():.4f}, std={s.std():.4f}, "
              f"min={s.min():.4f}, max={s.max():.4f}, autocorr={s.autocorr(lag=1):.4f}")

    # ... followed by each feature's MI with the target on the same dates
    target_split = gold_return_next[gold_return_next.index.isin(split_dates)]
    for col in diagnostic_cols:
        mi = compute_mi(split_data[col], target_split)
        print(f"  MI({col}, target) = {mi:.6f}")

DIAGNOSTICS BY SPLIT

--- Train (1943 rows) ---
  rr_bond_vol_z: mean=0.1181, std=1.2476, min=-3.3520, max=4.0000, autocorr=0.9595
  rr_momentum_z: mean=-0.0127, std=1.1364, min=-4.0000, max=4.0000, autocorr=0.8514
  MI(rr_bond_vol_z, target) = 0.102258
  MI(rr_momentum_z, target) = 0.098818

--- Val (416 rows) ---
  rr_bond_vol_z: mean=-0.3873, std=0.9464, min=-2.6581, max=1.9228, autocorr=0.9547
  rr_momentum_z: mean=0.0668, std=1.1051, min=-3.0046, max=3.1942, autocorr=0.8584
  MI(rr_bond_vol_z, target) = 0.571211
  MI(rr_momentum_z, target) = 0.522591

--- Test (417 rows) ---
  rr_bond_vol_z: mean=-0.2694, std=1.1682, min=-2.1743, max=4.0000, autocorr=0.9608
  rr_momentum_z: mean=-0.0731, std=1.1353, min=-3.1603, max=3.1096, autocorr=0.8450
  MI(rr_bond_vol_z, target) = 0.523664
  MI(rr_momentum_z, target) = 0.525942


In [10]:
# Cell 10: Save outputs
print("=" * 60, "SAVING OUTPUTS", "=" * 60, sep="\n")

# 1. Save submodel output CSV (index column is labelled 'date' in the file)
output_csv = output.rename_axis('date')
output_csv.to_csv('submodel_output.csv')
print(f"Saved submodel_output.csv: {output_csv.shape}")

# 2. Save training_result.json
# Nested sections are assembled first; the top-level dict below fixes the key
# order of the written file.
best_trial_attrs = study.best_trial.user_attrs

per_feature_mi = {
    "rr_bond_vol_z": best_trial_attrs['mi_vol'],
    "rr_momentum_z": best_trial_attrs['mi_mom']
}

autocorrelation = {
    "rr_bond_vol_z": float(output['rr_bond_vol_z'].autocorr(lag=1)),
    "rr_momentum_z": float(output['rr_momentum_z'].autocorr(lag=1))
}

output_stats = {}
for col in ['rr_bond_vol_z', 'rr_momentum_z']:
    output_stats[col] = {
        "mean": float(output[col].mean()),
        "std": float(output[col].std()),
        "min": float(output[col].min()),
        "max": float(output[col].max())
    }

split_info = {
    "train_rows": len(train_dates),
    "val_rows": len(val_dates),
    "test_rows": len(test_dates)
}

result = {
    "feature": "real_rate",
    "attempt": 6,
    "method": "deterministic_bond_vol_momentum_zscore",
    "timestamp": datetime.now().isoformat(),
    "best_params": best,
    "validation_mi_sum": study.best_value,
    "per_feature_mi": per_feature_mi,
    "autocorrelation": autocorrelation,
    "cross_correlation": float(cross_corr),
    "output_shape": list(output.shape),
    "output_columns": ["rr_bond_vol_z", "rr_momentum_z"],
    "output_stats": output_stats,
    "nan_filled_rows": int(nan_count_before),
    "n_optuna_trials": len(study.trials),
    "total_combinations": 36,
    "all_checks_passed": all_pass,
    "split_info": split_info
}

with open('training_result.json', 'w') as f:
    f.write(json.dumps(result, indent=2))
print(f"Saved training_result.json")

print(f"\nFinished: {datetime.now().isoformat()}")
print("Training complete!")

SAVING OUTPUTS
Saved submodel_output.csv: (2777, 2)
Saved training_result.json

Finished: 2026-02-18T01:48:56.202921
Training complete!
