# Gold SubModel: Yield Curve - Attempt 4
**Approach**: Pure deterministic feature engineering - acceleration + structural decomposition (2nd-order dynamics)

**Key differences from Attempt 2/3**:
- No HMM, no PyTorch, no neural networks
- Attempt 2 captured 'how fast' the curve moves (1st derivative)
- Attempt 4 captures 'how the movement is accelerating', 'which end is driving', and 'how volatility structure is shifting' (2nd derivative + structural)

**Output features**:
1. `yc_spread_accel_z`: Z-score of 2nd derivative of 10Y-3M spread (autocorr=-0.496)
2. `yc_curv_change_z`: Z-score of daily change in 2Y-5Y-10Y butterfly (autocorr=-0.149)
3. `yc_mom_divergence_z`: Difference of z-scored long-end vs short-end momentum (autocorr=0.733)
4. `yc_vol_ratio_chg_z`: Z-score of daily CHANGE in (DGS3MO vol / DGS10 vol) (autocorr=0.121)

In [None]:
import subprocess
import sys

# Install fredapi (not pre-installed on Kaggle)
# check_call raises CalledProcessError when pip exits non-zero, so a failed
# install stops the run here instead of at the later `from fredapi import Fred`.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'fredapi', '-q'])

import numpy as np
import pandas as pd
import yfinance as yf
import optuna
import json
import os
import warnings
from datetime import datetime
from sklearn.metrics import mutual_info_score

# Only show Optuna messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Global filter: suppresses every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Seeds NumPy's global RNG only; the Optuna sampler receives its own seed
# where the study is created.
np.random.seed(42)

# Run banner with a start timestamp (local time, no timezone attached).
print('=== Gold SubModel Training: yield_curve attempt 4 ===')
print('Approach: Acceleration + Structural Decomposition (2nd-order dynamics)')
print(f'Started: {datetime.now().isoformat()}')

In [None]:
# Resolve the FRED API key: Kaggle Secrets first, then the process
# environment. A missing key is not an error - the data-loading cell reads the
# key-less public FRED CSV endpoint whenever FRED_API_KEY is falsy.
FRED_API_KEY = None
try:
    from kaggle_secrets import UserSecretsClient
    FRED_API_KEY = UserSecretsClient().get_secret('FRED_API_KEY')
    print('FRED_API_KEY loaded from Kaggle Secrets')
except Exception:
    # Deliberately broad: any failure on the Kaggle path (module missing,
    # secret lookup error, ...) falls back to the environment variable.
    FRED_API_KEY = os.environ.get('FRED_API_KEY')
    print(
        'FRED_API_KEY loaded from environment'
        if FRED_API_KEY
        else 'FRED_API_KEY not found - will use public FRED CSV endpoint (no key required)'
    )

In [None]:
# Locate the bigbigzabuton/gold-prediction-submodels dataset mount.
# A candidate directory is accepted only when it exists AND holds at least one
# probe file; directories that exist without a probe file are printed to help
# debugging.
import glob as _glob

DATASET_SLUG = 'gold-prediction-submodels'
PROBE_FILES = ['base_features.csv', 'base_features_raw.csv', 'vix.csv']

candidates = [
    f'/kaggle/input/{DATASET_SLUG}',
    f'/kaggle/input/datasets/bigbigzabuton/{DATASET_SLUG}',
]

DATASET_PATH = None
for c in candidates:
    if not os.path.isdir(c):
        continue
    files_in_dir = os.listdir(c)
    if not any(probe in files_in_dir for probe in PROBE_FILES):
        print(f'Dir exists but missing probe files: {c} -> {files_in_dir[:5]}')
        continue
    DATASET_PATH = c
    print(f'Dataset found: {DATASET_PATH}')
    print(f'  Files ({len(files_in_dir)}): {sorted(files_in_dir)[:10]}')
    break

if DATASET_PATH is None:
    # Nothing matched. The glob results are diagnostics for the error message
    # only; they are not tried as additional candidates.
    found = _glob.glob('/kaggle/input/*gold*') + _glob.glob('/kaggle/input/datasets/*/*gold*')
    all_inputs = os.listdir('/kaggle/input') if os.path.exists('/kaggle/input') else 'N/A'
    raise RuntimeError(
        f'Dataset not found.\n'
        f'Tried: {candidates}\n'
        f'Glob /kaggle/input/*gold*: {found}\n'
        f'All /kaggle/input/: {all_inputs}'
    )

In [None]:
START = '2014-10-01'
END = '2026-01-31'
TICKERS = ['DGS10', 'DGS2', 'DGS5', 'DGS3MO']

# Download each yield series from FRED. With an API key the fredapi client is
# used; without one the public CSV endpoint (no API key required) is read.
# Either way, gaps are forward-filled for at most 3 consecutive rows.
series = {}
if FRED_API_KEY:
    from fredapi import Fred
    fred = Fred(api_key=FRED_API_KEY)
    for ticker in TICKERS:
        fetched = fred.get_series(ticker, observation_start=START, observation_end=END)
        series[ticker] = fetched.ffill(limit=3)
        print(f'{ticker}: {len(series[ticker].dropna())} obs (via fredapi)')
else:
    range_start, range_end = pd.Timestamp(START), pd.Timestamp(END)
    for ticker in TICKERS:
        url = f'https://fred.stlouisfed.org/graph/fredgraph.csv?id={ticker}'
        # '.' entries in the download are parsed as NaN.
        table = pd.read_csv(url, index_col=0, parse_dates=True, na_values='.')
        values = table.iloc[:, 0].rename(ticker)  # single value column
        in_range = (values.index >= range_start) & (values.index <= range_end)
        series[ticker] = values[in_range].ffill(limit=3)
        print(f'{ticker}: {len(series[ticker].dropna())} obs (via public CSV)')

# One column per ticker; keep only dates where all four yields are present.
df = pd.DataFrame(series)
df.index = pd.to_datetime(df.index)
df = df.dropna()
print(f'Combined yields: {len(df)} obs, {df.index[0].date()} to {df.index[-1].date()}')

# Gold futures prices from Yahoo Finance; abort on an empty or tiny download.
gold = yf.download('GC=F', start=START, end=END, auto_adjust=True, progress=False)
if gold.empty or len(gold) < 100:
    raise ValueError(f'GC=F download returned insufficient data: {len(gold)} rows')

# The download may come back with MultiIndex columns; reduce 'Close' to a
# single Series in both layouts.
close_part = gold['Close']
gold_close = (
    close_part.iloc[:, 0]
    if isinstance(gold.columns, pd.MultiIndex)
    else close_part.squeeze()
)

# Daily % return, shifted back one row so each date carries the NEXT
# observation's return (the prediction target).
gold_ret = gold_close.pct_change() * 100
gold_ret_next = gold_ret.shift(-1).rename('gold_return_next')
gold_ret_next.index = pd.to_datetime(gold_ret_next.index)

# Restrict yields and target to the dates present in both.
common_idx = df.index.intersection(gold_ret_next.index)
df = df.loc[common_idx]
gold_ret_next = gold_ret_next.loc[common_idx]
print(f'Aligned: {len(df)} obs')

In [None]:
def rolling_zscore(x, window):
    """Rolling z-score of ``x`` over a trailing window, with NaN-safe handling.

    Args:
        x: pandas Series to standardize.
        window: Rolling window length, in observations.

    Returns:
        Series aligned with ``x`` holding ``(x - rolling mean) / rolling std``.
        Values are NaN until the warm-up count of observations is available,
        wherever the rolling std is exactly zero, and wherever the division
        would yield +/-inf.
    """
    # Warm-up: at least half the window and never fewer than 10 observations,
    # capped at the window itself. pandas rejects min_periods > window, which
    # the uncapped rule produced for any window below 10.
    min_p = min(window, max(window // 2, 10))
    roll = x.rolling(window, min_periods=min_p)
    m = roll.mean()
    s = roll.std()
    # A zero std would divide by zero; treat those points as undefined.
    z = (x - m) / s.replace(0, np.nan)
    return z.replace([np.inf, -np.inf], np.nan)


def generate_spread_accel(dgs10, dgs3mo, zscore_window):
    """Z-scored second difference of the 10Y-3M spread, clipped to [-4, 4].

    The spread is differenced twice (change of the daily change), standardized
    with ``rolling_zscore`` over ``zscore_window`` and then clipped.

    Reading the sign: a positive value means the spread's daily change rose
    relative to its recent norm (steepening accelerating, or flattening
    decelerating); a negative value means the opposite. The notebook header
    reports a lag-1 autocorrelation of about -0.496 for this feature.

    Args:
        dgs10: 10-year yield series.
        dgs3mo: 3-month yield series.
        zscore_window: Window passed to ``rolling_zscore``.

    Returns:
        Series of clipped z-scores.
    """
    second_diff = (dgs10 - dgs3mo).diff().diff()
    return rolling_zscore(second_diff, zscore_window).clip(-4, 4)


def generate_curv_change(dgs5, dgs2, dgs10, zscore_window):
    """Z-scored daily change of the 2Y-5Y-10Y butterfly, clipped to [-4, 4].

    Curvature is ``2 * dgs5 - dgs10 - dgs2``; its one-step difference is
    standardized with ``rolling_zscore`` over ``zscore_window`` and clipped.

    Reading the sign: positive means the belly (5Y) moved up relative to the
    wings (2Y and 10Y); negative means it moved down relative to them. The
    notebook header reports a lag-1 autocorrelation of about -0.149.

    Args:
        dgs5: 5-year yield series.
        dgs2: 2-year yield series.
        dgs10: 10-year yield series.
        zscore_window: Window passed to ``rolling_zscore``.

    Returns:
        Series of clipped z-scores.
    """
    butterfly = 2 * dgs5 - dgs10 - dgs2
    daily_move = butterfly.diff()
    return rolling_zscore(daily_move, zscore_window).clip(-4, 4)


def generate_mom_divergence(dgs10, dgs3mo, momentum_window, zscore_window):
    """Long-end minus short-end momentum z-score, clipped to [-6, 6].

    Momentum is the rolling sum of daily yield changes over
    ``momentum_window``; each leg is standardized separately with
    ``rolling_zscore`` before the 3M leg is subtracted from the 10Y leg.

    Reading the sign: positive means 10Y momentum is stronger than 3M
    momentum on a standardized basis; negative means the short end is leading.

    The original author's guidance is to keep ``momentum_window`` at 10 or
    below (autocorrelation reportedly reaches 0.92 at 20). This function does
    not enforce that limit.

    Args:
        dgs10: 10-year yield series.
        dgs3mo: 3-month yield series.
        momentum_window: Number of daily changes summed into momentum.
        zscore_window: Window passed to ``rolling_zscore``.

    Returns:
        Series holding the clipped difference of the two z-scores.
    """
    def _momentum_z(yields):
        # min_periods=1: partial sums are emitted during the warm-up rows.
        momentum = yields.diff().rolling(momentum_window, min_periods=1).sum()
        return rolling_zscore(momentum, zscore_window)

    long_end_z = _momentum_z(dgs10)
    short_end_z = _momentum_z(dgs3mo)
    return (long_end_z - short_end_z).clip(-6, 6)


def generate_vol_ratio_chg(dgs10, dgs3mo, vol_window, zscore_window):
    """Z-scored daily change of the short-end / long-end volatility ratio.

    Volatility is the rolling mean of absolute daily yield changes over
    ``vol_window``. The 3M/10Y ratio of those volatilities is differenced once,
    standardized with ``rolling_zscore`` and clipped to [-4, 4]. The CHANGE in
    the ratio is used on purpose: the original author notes that the level is
    highly persistent (autocorrelation ~0.95+), while the notebook header
    reports about 0.121 for this feature.

    Reading the sign: positive means short-end volatility gained on long-end
    volatility; negative means long-end volatility gained.

    Args:
        dgs10: 10-year yield series.
        dgs3mo: 3-month yield series.
        vol_window: Window for the rolling mean absolute change.
        zscore_window: Window passed to ``rolling_zscore``.

    Returns:
        Series of clipped z-scores.
    """
    # At least half the window and never fewer than 5 observations, capped at
    # the window itself. pandas rejects min_periods > window, which the
    # uncapped rule produced for any vol_window below 5.
    min_p = min(vol_window, max(vol_window // 2, 5))
    dgs10_vol = dgs10.diff().abs().rolling(vol_window, min_periods=min_p).mean()
    dgs3mo_vol = dgs3mo.diff().abs().rolling(vol_window, min_periods=min_p).mean()
    # A zero long-end vol would divide by zero; treat the ratio as undefined.
    vol_ratio = dgs3mo_vol / dgs10_vol.replace(0, np.nan)
    vol_ratio_change = vol_ratio.diff()
    z = rolling_zscore(vol_ratio_change, zscore_window)
    return z.clip(-4, 4)


def compute_mi(feature, target, n_bins=20):
    """Mutual information between a feature and a target after quantile binning.

    Both series are restricted to index labels where neither is NaN, cut into
    up to ``n_bins`` quantile bins (duplicate bin edges are dropped) and scored
    with ``mutual_info_score``.

    Args:
        feature: pandas Series of feature values.
        target: pandas Series of target values.
        n_bins: Requested number of quantile bins for each series.

    Returns:
        The MI as a float. Returns 0.0 when fewer than 50 usable observations
        remain, or when binning or scoring raises any exception.
    """
    shared = feature.dropna().index.intersection(target.dropna().index)
    if len(shared) < 50:
        return 0.0

    feat_vals = feature[shared]
    targ_vals = target[shared]
    try:
        feat_bins = pd.qcut(feat_vals, q=n_bins, labels=False, duplicates='drop')
        targ_bins = pd.qcut(targ_vals, q=n_bins, labels=False, duplicates='drop')
        usable = feat_bins.notna() & targ_bins.notna()
        if usable.sum() >= 50:
            return float(mutual_info_score(feat_bins[usable], targ_bins[usable]))
        return 0.0
    except Exception:
        # Best effort: a failed estimate counts as "no information".
        return 0.0


# Progress marker printed once the cell defining the feature helpers has run.
print('Feature generation functions defined')

In [None]:
# Find the base-features file (either filename variant); its index defines the
# set of dates this submodel must cover.
_base_names = ('base_features.csv', 'base_features_raw.csv')
base_path = next(
    (
        path
        for path in (os.path.join(DATASET_PATH, nm) for nm in _base_names)
        if os.path.exists(path)
    ),
    None,
)

if base_path is None:
    files_in_dir = os.listdir(DATASET_PATH)
    raise FileNotFoundError(
        f'base_features[_raw].csv not found in {DATASET_PATH}.\n'
        f'Available files: {sorted(files_in_dir)}'
    )
print(f'Using base features file: {os.path.basename(base_path)}')

base = pd.read_csv(base_path, index_col=0, parse_dates=True)
TARGET_DATES = base.index
print(f'TARGET_DATES: {len(TARGET_DATES)} dates, {TARGET_DATES[0].date()} to {TARGET_DATES[-1].date()}')

# Keep only target dates, then only the dates shared by yields and gold.
df_aligned = df.loc[df.index.isin(TARGET_DATES)].copy()
gold_aligned = gold_ret_next.loc[gold_ret_next.index.isin(TARGET_DATES)].copy()
common = df_aligned.index.intersection(gold_aligned.index)
df_aligned = df_aligned.loc[common]
gold_aligned = gold_aligned.loc[common]

# Chronological 70 / 15 / 15 split expressed as boolean masks by row position.
n = len(df_aligned)
train_end_idx = int(n * 0.70)
val_end_idx = int(n * 0.85)

_row_pos = np.arange(n)
train_mask = pd.Series(_row_pos < train_end_idx, index=df_aligned.index)
val_mask = pd.Series(
    (_row_pos >= train_end_idx) & (_row_pos < val_end_idx),
    index=df_aligned.index,
)
test_mask = pd.Series(_row_pos >= val_end_idx, index=df_aligned.index)

_dates = df_aligned.index
print(f'Total: {n}, Train: {train_mask.sum()}, Val: {val_mask.sum()}, Test: {test_mask.sum()}')
print(f'Train: {_dates[0].date()} to {_dates[train_end_idx-1].date()}')
print(f'Val:   {_dates[train_end_idx].date()} to {_dates[val_end_idx-1].date()}')
print(f'Test:  {_dates[val_end_idx].date()} to {_dates[-1].date()}')

In [None]:
# Pull the four aligned yield columns out as individual Series; the Optuna
# objective and the final feature build both read these module-level names.
dgs10, dgs2, dgs5, dgs3mo = (
    df_aligned[col] for col in ('DGS10', 'DGS2', 'DGS5', 'DGS3MO')
)


def objective(trial):
    """Optuna objective: summed validation-set MI of the four features.

    Samples one window per tunable setting, rebuilds all four features from
    the module-level yield series, and scores each against next-day gold
    return on the validation rows only.

    Args:
        trial: Optuna trial used to sample the categorical window choices.

    Returns:
        Sum of the four per-feature ``compute_mi`` values on the validation
        split (higher is better).
    """
    zscore_choices = [30, 45, 60, 90]

    # Parameters are suggested in a fixed order: spread-accel, curvature,
    # momentum (window then z-score), volatility (window then z-score).
    accel_zw = trial.suggest_categorical('accel_zscore_window', zscore_choices)
    curv_zw = trial.suggest_categorical('curv_zscore_window', zscore_choices)
    mom_w = trial.suggest_categorical('momentum_window', [3, 5, 7, 10])
    mom_zw = trial.suggest_categorical('mom_zscore_window', zscore_choices)
    vol_w = trial.suggest_categorical('vol_window', [10, 15, 20, 30])
    vol_zw = trial.suggest_categorical('vol_zscore_window', zscore_choices)

    trial_features = (
        generate_spread_accel(dgs10, dgs3mo, accel_zw),
        generate_curv_change(dgs5, dgs2, dgs10, curv_zw),
        generate_mom_divergence(dgs10, dgs3mo, mom_w, mom_zw),
        generate_vol_ratio_chg(dgs10, dgs3mo, vol_w, vol_zw),
    )

    val_target = gold_aligned[val_mask]
    mi_total = 0
    for feat in trial_features:
        mi_total += compute_mi(feat[val_mask], val_target)
    return mi_total


# Hyper-parameter search: maximise validation MI with a seeded TPE sampler.
# The search stops after 50 trials or 600 seconds, whichever comes first.
print('Running Optuna HPO (50 trials, timeout=600s)...')
_tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=_tpe_sampler)
study.optimize(objective, n_trials=50, timeout=600)

best_params = study.best_params
print(f'Optuna complete: {len(study.trials)} trials')
print(f'Best params: {best_params}')
print(f'Best MI sum (val): {study.best_value:.6f}')

In [None]:
# Rebuild the four features using the windows Optuna selected.
f1 = generate_spread_accel(dgs10, dgs3mo, best_params['accel_zscore_window'])
f2 = generate_curv_change(dgs5, dgs2, dgs10, best_params['curv_zscore_window'])
f3 = generate_mom_divergence(
    dgs10, dgs3mo,
    best_params['momentum_window'],
    best_params['mom_zscore_window'],
)
f4 = generate_vol_ratio_chg(
    dgs10, dgs3mo,
    best_params['vol_window'],
    best_params['vol_zscore_window'],
)

features_dict = dict(
    zip(
        ('yc_spread_accel_z', 'yc_curv_change_z', 'yc_mom_divergence_z', 'yc_vol_ratio_chg_z'),
        (f1, f2, f3, f4),
    )
)

# Gate 1 safety: lag-1 autocorrelation on the training rows, flagged when its
# magnitude exceeds 0.95.
print('Autocorrelations (Gate 1 threshold: 0.95):')
autocorr_results = {}
for name, feat in features_dict.items():
    train_vals = feat[train_mask].dropna().values
    if len(train_vals) > 1:
        # Correlate each value with its successor among the non-NaN rows.
        ac = float(np.corrcoef(train_vals[:-1], train_vals[1:])[0, 1])
    else:
        ac = 0.0
    autocorr_results[name] = round(ac, 4)
    status = 'WARN' if abs(ac) > 0.95 else 'OK'
    print(f'  {name}: {ac:.4f} [{status}]')

# Output table over the aligned date range; gaps are forward-filled for at
# most 5 consecutive rows.
output = pd.DataFrame(features_dict, index=df_aligned.index).ffill(limit=5)

print(f'\nOutput shape: {output.shape}')
print(f'NaN counts: {output.isna().sum().to_dict()}')
print('\nOutput summary:')
print(output.describe())

In [None]:
# Final evaluation: per-feature MI against next-day gold return on the
# held-out test rows.
test_mi = {
    name: round(compute_mi(feat[test_mask], gold_aligned[test_mask]), 6)
    for name, feat in features_dict.items()
}

print('Test set MI per feature:')
for name, mi in test_mi.items():
    print(f'  {name}: {mi:.6f}')
print(f'  Test MI sum: {sum(test_mi.values()):.6f}')

# Same measurement on the validation rows, recomputed from the final features
# so it can be stored alongside the test figures.
val_mi = {
    name: round(compute_mi(feat[val_mask], gold_aligned[val_mask]), 6)
    for name, feat in features_dict.items()
}

print('\nVal set MI per feature:')
for name, mi in val_mi.items():
    print(f'  {name}: {mi:.6f}')
print(f'  Val MI sum: {sum(val_mi.values()):.6f}')

In [None]:
# Persist the feature table to the Kaggle working directory.
output.to_csv('/kaggle/working/submodel_output.csv')
print('Saved: submodel_output.csv')

# Assemble the run summary. Key order is kept stable because it determines
# the layout of the JSON written below.
_best_val = round(study.best_value, 6)
_metrics = {
    'mi_sum_val': _best_val,
    'mi_individual_val': val_mi,
    'mi_individual_test': test_mi,
    'mi_sum_test': round(sum(test_mi.values()), 6),
    'autocorr': autocorr_results,
    'optuna_best_value': _best_val,
    'optuna_trials_completed': len(study.trials),
}
_data_info = {
    'total_samples': n,
    'train_samples': int(train_mask.sum()),
    'val_samples': int(val_mask.sum()),
    'test_samples': int(test_mask.sum()),
    'date_range_start': str(df_aligned.index.min().date()),
    'date_range_end': str(df_aligned.index.max().date()),
}
result = {
    'feature': 'yield_curve',
    'attempt': 4,
    'timestamp': datetime.now().isoformat(),
    'best_params': best_params,
    'metrics': _metrics,
    'output_shape': list(output.shape),
    'output_columns': list(output.columns),
    'data_info': _data_info,
}

# default=str stringifies any value the json module cannot encode natively.
with open('/kaggle/working/training_result.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=2, default=str)
print('Saved: training_result.json')

print('\n=== Training complete! ===')
print(f'Finished: {datetime.now().isoformat()}')
print(f'Output columns: {list(output.columns)}')
print(f'Output shape: {output.shape}')
print(json.dumps(result, indent=2, default=str))