# Gold Prediction SubModel Training — Yield Curve Attempt 7

**Approach**: 2Y Policy Velocity + 2Y-10Y Slope Velocity

The 2-year Treasury yield is the most Fed-policy-sensitive point on the curve and captures the expected monetary policy trajectory over the next 2 years — the most actionable window for gold positioning. Unlike att2's spread (DGS10 − DGS3MO), which anchors to the 3-month bill and therefore tracks the current overnight policy rate, this approach anchors to the policy-expectation-driven 2Y yield.

**Features**:
1. `yc_2y_vel_z`: Z-scored daily change in DGS2 (2Y Treasury yield velocity)
2. `yc_2y10y_vel_z`: Z-scored daily change in (DGS10 − DGS2) slope (classic steepener/flattener)

**Notebook flow**: Data fetch → Feature engineering → Optuna HPO (z-score window) → Quality checks → Save output

In [None]:
# === 1. Libraries ===
import numpy as np
import pandas as pd
from sklearn.metrics import mutual_info_score
import optuna
import json
import os
from datetime import datetime

# Only show Optuna messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# === 2. Constants ===
# Identifiers written to the result JSON and echoed in the log output.
FEATURE_NAME, ATTEMPT = "yield_curve", 7
# Names of the two engineered columns this notebook emits.
OUTPUT_COLUMNS = ["yc_2y_vel_z", "yc_2y10y_vel_z"]
# Lower / upper bounds applied to every z-scored feature.
CLIP_RANGE = (-4, 4)

for banner_line in (
    f"Yield Curve Submodel — Attempt {ATTEMPT}",
    "Approach: 2Y Policy Velocity + 2Y-10Y Slope Velocity",
    f"Output columns: {OUTPUT_COLUMNS}",
):
    print(banner_line)

In [None]:
# === 3. Dataset Path Resolution (standard block from MEMORY.md) ===
import glob as _glob

# Files whose presence identifies a directory as the expected dataset.
PROBE_FILES = ['base_features.csv', 'base_features_raw.csv', 'vix.csv']
# Candidate mount points, tried in order; the first one containing a probe file wins.
candidates = [
    '/kaggle/input/gold-prediction-submodels',
    '/kaggle/input/datasets/bigbigzabuton/gold-prediction-submodels'
]
DATASET_PATH = None
for c in candidates:
    if not os.path.isdir(c):
        continue
    entries = os.listdir(c)  # list once; reused for the probe and the diagnostic
    if any(f in entries for f in PROBE_FILES):
        DATASET_PATH = c
        break
    print(f'Dir exists but probe files missing: {c} -> {entries[:5]}')

if DATASET_PATH is None:
    # The listing of /kaggle/input is purely diagnostic. Guard it so that a
    # missing root (e.g. when running outside Kaggle) does not replace the
    # intended RuntimeError with an unrelated FileNotFoundError.
    try:
        input_listing = os.listdir("/kaggle/input")
    except OSError as listing_err:
        input_listing = f'<unavailable: {listing_err}>'
    raise RuntimeError(
        f'Dataset not found. Tried: {candidates}. '
        f'/kaggle/input/: {input_listing}'
    )
print(f"DATASET_PATH = {DATASET_PATH}")
print(f"Contents: {os.listdir(DATASET_PATH)[:10]}")

In [None]:
# === 4. FRED API Key ===
# Do NOT raise if key is missing — will fall back to FRED public CSV endpoints (no key required).
FRED_API_KEY = None
try:
    # First choice: Kaggle Secrets.
    from kaggle_secrets import UserSecretsClient
    FRED_API_KEY = UserSecretsClient().get_secret("FRED_API_KEY")
    print("FRED_API_KEY loaded from Kaggle Secrets")
except Exception as e:
    # Deliberately broad: any failure here simply moves on to the next source.
    FRED_API_KEY = os.environ.get('FRED_API_KEY')
    if FRED_API_KEY:
        print("FRED_API_KEY loaded from environment variable")
    else:
        print(f"WARNING: FRED_API_KEY not available ({e}). Will use FRED public CSV endpoints (no key required).")

try:
    from fredapi import Fred
except ImportError:
    import subprocess
    import sys
    try:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'fredapi', '-q'])
        from fredapi import Fred
    except (subprocess.CalledProcessError, OSError, ImportError) as install_err:
        # fredapi is optional: the public CSV endpoint needs neither the
        # package nor a key, so a failed install must not abort the notebook.
        # With Fred set to None, any attempt to call it raises inside
        # fetch_fred_series' try block, which then falls back to the CSV path.
        Fred = None
        print(f"WARNING: fredapi could not be installed ({install_err}). Will use FRED public CSV endpoints.")

if FRED_API_KEY and Fred is not None:
    print("FRED API: will use fredapi with key")
elif FRED_API_KEY:
    print("FRED API: will use public CSV fallback (fredapi unavailable)")
else:
    print("FRED API: will use public CSV fallback (no key)")

In [None]:
# === 5. Fetch Yield Data from FRED ===

def fetch_fred_series(ticker, start, api_key=None):
    """Download one FRED series, keeping observations from ``start`` onward.

    When ``api_key`` is truthy the series is requested through ``fredapi``.
    If that attempt raises, or if no key was given, the series is read from
    the public ``fredgraph.csv`` endpoint instead (no key needed).

    Args:
        ticker: FRED series id, e.g. ``'DGS2'``.
        start: Earliest observation date to keep.
        api_key: Optional FRED API key.

    Returns:
        A pandas Series of the observations. NaN rows are not dropped here;
        they are only excluded from the printed observation count.
    """
    if api_key:
        try:
            series = Fred(api_key=api_key).get_series(ticker, observation_start=start)
            n_obs = len(series.dropna())
            print(f"  {ticker}: loaded via fredapi ({n_obs} obs)")
            return series
        except Exception as e:
            # Best-effort: any fredapi problem falls through to the CSV path.
            print(f"  {ticker}: fredapi failed ({e}), falling back to public CSV...")

    # Fallback: FRED public CSV endpoint (no API key required).
    url = f'https://fred.stlouisfed.org/graph/fredgraph.csv?id={ticker}'
    frame = pd.read_csv(url, index_col=0, parse_dates=True, na_values='.')
    series = frame.iloc[:, 0]
    series.index = pd.to_datetime(series.index)
    cutoff = pd.Timestamp(start)
    series = series[series.index >= cutoff]
    n_obs = len(series.dropna())
    print(f"  {ticker}: loaded via public CSV ({n_obs} obs)")
    return series


print("Fetching FRED yield data...")
DATA_START = '2014-06-01'  # Buffer for rolling window warmup

# Download each tenor and forward-fill at most 3 consecutive missing
# observations (weekend/holiday gaps in the FRED data).
dgs10 = fetch_fred_series('DGS10', DATA_START, FRED_API_KEY).ffill(limit=3)
dgs2 = fetch_fred_series('DGS2', DATA_START, FRED_API_KEY).ffill(limit=3)
dgs3m = fetch_fred_series('DGS3MO', DATA_START, FRED_API_KEY).ffill(limit=3)

# Combine on the date index and keep only rows where all three tenors are present.
yields_df = pd.DataFrame({'dgs10': dgs10, 'dgs2': dgs2, 'dgs3m': dgs3m}).dropna()

first_day, last_day = yields_df.index[0], yields_df.index[-1]
print(f"Yield data: {len(yields_df)} rows")
print(f"  Date range: {first_day.date()} to {last_day.date()}")
for label, column in (("DGS10", "dgs10"), ("DGS2 ", "dgs2"), ("DGS3MO", "dgs3m")):
    tenor = yields_df[column]
    print(f"  {label} range: {tenor.min():.2f}% to {tenor.max():.2f}%")

# Sanity check: 10Y minus 2Y spread.
spread_2y10y = yields_df['dgs10'] - yields_df['dgs2']
print(f"  10Y-2Y spread range: {spread_2y10y.min():.2f}% to {spread_2y10y.max():.2f}%")
print(f"  Current 10Y-2Y: {spread_2y10y.iloc[-1]:.2f}%")

In [None]:
# === 6. Fetch Gold Price for Target ===
import yfinance as yf
print("Fetching gold price data...")

gold = yf.download('GC=F', start=DATA_START, auto_adjust=True, progress=False)
n_gold_rows = len(gold)
if gold.empty or n_gold_rows < 100:
    raise ValueError(f"GC=F download returned insufficient data: {n_gold_rows} rows")

# Newer yfinance versions return MultiIndex columns; in that case 'Close' is a
# frame and its first column is taken, otherwise the selection is squeezed.
close_selection = gold['Close']
gold_close = (
    close_selection.iloc[:, 0]
    if isinstance(gold.columns, pd.MultiIndex)
    else close_selection.squeeze()
)

# Next-day percentage return: today's row holds tomorrow's return.
gold_ret_next = gold_close.pct_change().shift(-1).mul(100)
gold_ret_next.index = pd.to_datetime(gold_ret_next.index).tz_localize(None)
gold_ret_next.name = 'gold_return_next'

valid_gold = gold_ret_next.dropna()
print(f"Gold data: {len(valid_gold)} valid observations")
print(f"  Date range: {valid_gold.index[0].date()} to {valid_gold.index[-1].date()}")

In [None]:
# === 7. Load Base Features for Date Alignment ===
# Prefer base_features.csv; fall back to the raw variant when it is absent.
bf_path = os.path.join(DATASET_PATH, 'base_features.csv')
if not os.path.exists(bf_path):
    bf_path = os.path.join(DATASET_PATH, 'base_features_raw.csv')
    print("Using base_features_raw.csv")

# Only the Date index is needed downstream, for aligning the output rows.
base_features = pd.read_csv(bf_path, parse_dates=['Date'], index_col='Date')
bf_first, bf_last = base_features.index[0], base_features.index[-1]
print(f"Base features: {len(base_features)} rows")
print(f"  Date range: {bf_first.date()} to {bf_last.date()}")
print(f"  Columns: {list(base_features.columns[:5])}...")

In [None]:
# === 8. Feature Generation Functions ===

def rolling_zscore(x, window):
    """Return the rolling z-score of ``x`` over a trailing ``window``.

    Each value is standardised with the mean and standard deviation of the
    trailing window ending at that observation.

    Args:
        x: pandas Series to standardise.
        window: Window length in observations.

    Returns:
        Series aligned to ``x``. Entries are NaN until enough observations
        are available, and wherever the division produced +/-inf
        (zero standard deviation).
    """
    # Require half a window (at least 10 observations) before emitting a
    # value, but never more than the window itself: pandas raises ValueError
    # when min_periods > window, which previously broke every window < 10.
    min_per = min(window, max(window // 2, 10))
    roller = x.rolling(window, min_periods=min_per)
    z = (x - roller.mean()) / roller.std()
    return z.replace([np.inf, -np.inf], np.nan)


def generate_features(yields_df, zscore_window):
    """Build the two velocity z-score features from the yield table.

    Columns produced:

    * ``yc_2y_vel_z`` - z-scored daily change in DGS2. Captures the velocity
      of Fed policy expectations: rising means the Fed is expected to hike
      more (gold-negative), falling means cuts are expected (gold-positive).
    * ``yc_2y10y_vel_z`` - z-scored daily change in (DGS10 - DGS2), the
      classic steepener/flattener signal. Steepening reflects growth optimism
      or recession recovery (complex gold signal); flattening reflects
      recession fears or Fed overtightening (gold-positive, safe haven).

    Args:
        yields_df: DataFrame with ``'dgs2'`` and ``'dgs10'`` columns.
        zscore_window: Rolling window length passed to ``rolling_zscore``.

    Returns:
        DataFrame on ``yields_df``'s index with both features, each clipped
        to ``CLIP_RANGE``.
    """
    lower, upper = CLIP_RANGE
    slope = yields_df['dgs10'] - yields_df['dgs2']

    # Raw daily changes, keyed by the output column they feed.
    daily_changes = {
        'yc_2y_vel_z': yields_df['dgs2'].diff(),
        'yc_2y10y_vel_z': slope.diff(),
    }

    features = pd.DataFrame(index=yields_df.index)
    for column, change in daily_changes.items():
        features[column] = rolling_zscore(change, zscore_window).clip(lower, upper)
    return features


print("Feature generation functions defined")
print(f"Features: {OUTPUT_COLUMNS}")

In [None]:
# === 9. MI Computation Helper ===

def compute_mi(feature, target, n_bins=20):
    """Estimate mutual information between two series via quantile binning.

    Both series are restricted to the index labels where neither is NaN,
    cut into up to ``n_bins`` quantile bins (duplicate edges dropped), and
    scored with ``mutual_info_score``.

    Args:
        feature: pandas Series of feature values.
        target: pandas Series of target values.
        n_bins: Number of quantile bins requested for each series.

    Returns:
        The MI as a float, or 0.0 when fewer than 50 overlapping observations
        exist or when binning/scoring raises.
    """
    valid = feature.dropna().index.intersection(target.dropna().index)
    if len(valid) < 50:
        return 0.0

    feature_obs, target_obs = feature[valid], target[valid]
    try:
        bin_codes = [
            pd.qcut(values, q=n_bins, labels=False, duplicates='drop')
            for values in (feature_obs, target_obs)
        ]
        return float(mutual_info_score(*bin_codes))
    except Exception:
        # Best-effort metric: an unscorable pair counts as no information.
        return 0.0

print("MI computation helper defined")

In [None]:
# === 10. Data Split ===
# Dates present in the yield table, the base features and the non-NaN gold target.
common_dates = yields_df.index
for other_index in (base_features.index, gold_ret_next.dropna().index):
    common_dates = common_dates.intersection(other_index)
common_dates = common_dates.sort_values()

# Chronological 70 / 15 / 15 split.
n = len(common_dates)
train_end, val_end = int(n * 0.70), int(n * 0.85)

train_dates = common_dates[:train_end]
val_dates = common_dates[train_end:val_end]
test_dates = common_dates[val_end:]


def _date_span(dates):
    """Format the first and last date of ``dates`` as 'start to end'."""
    return f"{dates[0].date()} to {dates[-1].date()}"


print(f"Common dates: {n} total")
print(f"  Train: {len(train_dates)} ({_date_span(train_dates)})")
print(f"  Val:   {len(val_dates)}   ({_date_span(val_dates)})")
print(f"  Test:  {len(test_dates)}  ({_date_span(test_dates)})")

In [None]:
# === 11. Optuna HPO ===

# Next-day gold returns restricted to the validation dates (the HPO target).
in_validation = common_dates.isin(val_dates)
val_target = gold_ret_next.reindex(common_dates)[in_validation]

def objective(trial):
    """Optuna objective: summed validation MI of the output features.

    Samples a z-score window, rebuilds the features with it, and adds up the
    mutual information between each output column and ``val_target`` on the
    validation dates.

    Args:
        trial: Optuna trial used to sample ``zscore_window``.

    Returns:
        Sum of the per-column MI values (to be maximised).
    """
    zscore_window = trial.suggest_categorical('zscore_window', [20, 30, 45, 60, 90, 120])
    candidate = generate_features(yields_df, zscore_window)
    return sum(
        (compute_mi(candidate[col].reindex(val_dates), val_target) for col in OUTPUT_COLUMNS),
        0.0,
    )


print("Running Optuna HPO (z-score window selection)...")
# Seeded sampler so repeated runs propose the same sequence of windows.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(objective, n_trials=25, timeout=300, show_progress_bar=False)

best_params, best_value = study.best_params, study.best_value

print(f"\nOptuna complete: {len(study.trials)} trials")
print(f"Best params: {best_params}")
print(f"Best MI sum (val): {best_value:.4f}")

print("\n=== All Optuna Trials (sorted by MI sum) ===")
# Trials without a value sort as 0.
ranked_trials = sorted(study.trials, key=lambda x: x.value or 0, reverse=True)
for t in ranked_trials:
    print(f"  window={t.params['zscore_window']:>4d}  MI_sum={t.value:.4f}")

In [None]:
# === 12. Generate Final Features with Best Params ===
# Rebuild the features once more using the window chosen by the HPO study.
best_window = best_params['zscore_window']
print(f"\nGenerating final features with zscore_window={best_window}...")
final_features = generate_features(yields_df, zscore_window=best_window)

print(f"Generated features shape: {final_features.shape}")
print("Sample (last 5 rows):")
print(final_features.tail().to_string())

In [None]:
# === 13. Quality Checks ===
print("\n=== GATE 1: Quality Checks ===")

# Lag-1 autocorrelation per feature; passes when |autocorr| < 0.95.
autocorr_results = {}
for col in OUTPUT_COLUMNS:
    ac = final_features[col].dropna().autocorr(lag=1)
    autocorr_results[col] = float(ac)
    if abs(ac) < 0.95:
        status = "PASS"
    else:
        status = "FAIL"
    print(f"  Autocorr(lag=1) {col}: {ac:.4f} [{status}]")

# Rows where every output column is present; shared by the two checks below.
complete_rows = final_features[OUTPUT_COLUMNS].dropna()

# Internal correlation
corr_matrix = complete_rows.corr()
internal_corr = float(corr_matrix.iloc[0, 1])
print(f"\n  Internal correlation ({OUTPUT_COLUMNS[0]} vs {OUTPUT_COLUMNS[1]}): {internal_corr:.4f}")

# VIF check: the diagonal of the inverse correlation matrix; passes when < 10.
from numpy.linalg import inv as np_inv
X = complete_rows.values
cm = np.corrcoef(X.T)
try:
    inv_cm = np_inv(cm)
    vif_values = np.diag(inv_cm)
    for col, v in zip(OUTPUT_COLUMNS, vif_values):
        status = "FAIL" if not v < 10 else "PASS"
        print(f"  VIF {col}: {v:.3f} [{status}]")
    vif_max = float(np.max(vif_values))
except Exception as e:
    print(f"  VIF calculation failed: {e}")
    vif_max = None

# NaN check
print("\n  NaN counts:")
n_total = len(final_features)
for col in OUTPUT_COLUMNS:
    n_nan = final_features[col].isna().sum()
    print(f"    {col}: {n_nan}/{n_total} ({100*n_nan/n_total:.1f}%)")

# Descriptive stats
print("\n  Descriptive statistics:")
summary_table = final_features[OUTPUT_COLUMNS].describe().round(4)
print(summary_table.to_string())

In [None]:
# === 14. MI per Feature (Validation Set) ===
print("\n=== GATE 2: Information Content ===")

# Mutual information of each final feature with the next-day gold return,
# measured on the validation dates only.
individual_mi = {}
for col in OUTPUT_COLUMNS:
    feat_val = final_features[col].reindex(val_dates)
    mi = compute_mi(feat_val, val_target)
    individual_mi[col] = mi
    print(f"  MI({col}, gold_return_next) = {mi:.4f}")

print(f"  Total MI sum: {sum(individual_mi.values()):.4f}")

# Rolling correlation stability check
print("\n  Rolling correlation std (stability):")
stability = {}
for col in OUTPUT_COLUMNS:
    # Pair feature and target and drop incomplete rows BEFORE rolling.
    # rolling(63).corr() defaults to min_periods == window, so a single NaN
    # target (a yield date with no gold return, or the last shifted row)
    # would otherwise turn every 63-row window that touches it into NaN and
    # leave the std computed from few or no values.
    paired = pd.DataFrame({
        'feature': final_features[col],
        'target': gold_ret_next.reindex(final_features.index),
    }).dropna()
    rolling_corr = paired['feature'].rolling(63).corr(paired['target'])
    std = float(rolling_corr.std())
    stability[col] = std
    status = "PASS" if std < 0.15 else "FAIL"
    print(f"    {col}: {std:.4f} [{status}]")

In [None]:
# === 15. Correlation vs Attempt 2 Features (for Gate 2 VIF) ===
print("\n=== Correlation with Attempt 2 yield_curve features ===")

# Load yield_curve.csv from dataset (contains att2 production features)
yc_path = os.path.join(DATASET_PATH, 'yield_curve.csv')
if os.path.exists(yc_path):
    yc_df = pd.read_csv(yc_path, parse_dates=['Date'], index_col='Date')
    print(f"  Loaded yield_curve.csv: {yc_df.shape}, columns: {list(yc_df.columns)}")

    # For each new feature, the largest absolute correlation against any
    # column of the file, using only dates where both are non-NaN and
    # requiring more than 100 such dates.
    max_corr_overall = 0.0
    for new_col in OUTPUT_COLUMNS:
        max_corr = 0.0
        for att2_col in yc_df.columns:
            common = final_features[new_col].dropna().index.intersection(yc_df[att2_col].dropna().index)
            if len(common) > 100:
                corr = abs(float(final_features[new_col][common].corr(yc_df[att2_col][common])))
                max_corr = max(max_corr, corr)
        print(f"  Max |corr| of {new_col} with att2 features: {max_corr:.4f}")
        max_corr_overall = max(max_corr_overall, max_corr)
else:
    print("  yield_curve.csv not found in dataset. Skipping att2 correlation check.")

# Also check correlation with att2's 3m-10y spread velocity proxy.
# This proxy is built from yields_df alone, so it runs whether or not
# yield_curve.csv exists (it used to be skipped along with the file check).
dgs3m_series = yields_df['dgs3m']
spread_3m10y = yields_df['dgs10'] - dgs3m_series
spread_3m10y_vel = spread_3m10y.diff()
att2_proxy = rolling_zscore(spread_3m10y_vel, best_params['zscore_window']).clip(*CLIP_RANGE)

for new_col in OUTPUT_COLUMNS:
    common = final_features[new_col].dropna().index.intersection(att2_proxy.dropna().index)
    corr = abs(float(final_features[new_col][common].corr(att2_proxy[common])))
    print(f"  Corr({new_col}, att2_proxy_3m10y_vel_z): {corr:.4f}")

In [None]:
# === 16. Align to Base Features Dates and Save ===
print("\n=== Aligning to base_features date range ===")

# Reindex onto the base-feature dates, forward-fill gaps of up to 3 rows
# (holidays etc.), then drop rows where every column is still NaN (warmup).
output = (
    final_features[OUTPUT_COLUMNS]
    .reindex(base_features.index)
    .ffill(limit=3)
    .dropna(how='all')
)
output.index.name = 'Date'

out_first, out_last = output.index[0], output.index[-1]
print(f"  Output shape: {output.shape}")
print(f"  Date range: {out_first.date()} to {out_last.date()}")
print("  NaN per column after alignment:")
n_rows = len(output)
for col in OUTPUT_COLUMNS:
    n_nan = output[col].isna().sum()
    print(f"    {col}: {n_nan} ({100*n_nan/n_rows:.1f}%)")

print("\n  Sample output (last 5 rows):")
print(output.tail().to_string())

In [None]:
# === 17. Save Outputs ===
# Destination files in the Kaggle working directory.
submodel_csv = "/kaggle/working/submodel_output.csv"
result_json = "/kaggle/working/training_result.json"

output.to_csv(submodel_csv)
print(f"Saved {submodel_csv} ({len(output)} rows x {len(OUTPUT_COLUMNS)} columns)")

# Gate metrics gathered by the earlier cells.
metrics = {
    "autocorrelations": autocorr_results,
    "individual_mi_val": individual_mi,
    "internal_correlation": internal_corr,
    "stability": stability,
    "vif_max": vif_max,
}
nan_counts = {col: int(output[col].isna().sum()) for col in OUTPUT_COLUMNS}

# Save training result JSON (key order is the order written to the file).
result = {
    "feature": FEATURE_NAME,
    "attempt": ATTEMPT,
    "timestamp": datetime.now().isoformat(),
    "approach": "2Y Policy Velocity + 2Y-10Y Slope Velocity",
    "description": (
        "Z-scored daily changes in the 2Y Treasury yield (yc_2y_vel_z: Fed policy expectations) "
        "and the 10Y-2Y spread (yc_2y10y_vel_z: classic steepener/flattener). "
        "Uses the policy-sensitive 2Y anchor vs att2's overnight 3M anchor. "
        "2 features, both velocity-based (daily changes), no level variables."
    ),
    "output_columns": OUTPUT_COLUMNS,
    "best_params": best_params,
    "n_trials": len(study.trials),
    "best_mi_sum_val": float(best_value),
    "metrics": metrics,
    "output_shape": list(output.shape),
    "output_nan_counts": nan_counts,
    "gate1_expected": "PASS (velocity features, near-zero autocorr expected)",
    "gate2_expected": "UNCERTAIN (2Y is different tenor than att2; some overlap possible)",
    "gate3_expected": "UNCERTAIN (2Y policy channel is key for gold; different from att6)",
}

# default=str stringifies any value json cannot serialise natively.
with open(result_json, "w", encoding="utf-8") as fh:
    json.dump(result, fh, indent=2, default=str)

print(f"Saved {result_json}")
for summary_line in (
    "\n=== Training Complete ===",
    f"Feature: {FEATURE_NAME} | Attempt: {ATTEMPT}",
    f"Output columns: {OUTPUT_COLUMNS}",
    f"Best zscore_window: {best_params['zscore_window']}",
    f"Best MI sum (val): {best_value:.4f}",
    f"Autocorr: {autocorr_results}",
):
    print(summary_line)