# Gold SubModel: Yield Curve - Attempt 6
**Approach**: Yield Decomposition Velocity (Breakeven + TIPS velocity z-scores)

**Key differences from previous attempts**:
- No ML model, no PyTorch, no HMM
- Decomposes nominal yield dynamics into real rate and inflation premium components
- Uses the velocity (daily change) of each component rather than its level, then z-scores it, to avoid high autocorrelation

**Output features**:
1. `yc_be_vel_z`: Z-scored daily change in 10Y breakeven inflation rate (DGS10 - DFII10)
2. `yc_tips_vel_z`: Z-scored daily change in 10Y TIPS real yield (DFII10)

**Economic logic**:
- When yields rise, gold responds differently depending on whether real rates or inflation expectations are driving the move
- `yc_be_vel_z` positive = inflation expectations rising = gold-positive
- `yc_tips_vel_z` positive = real rates rising = gold-negative (higher opportunity cost)

**Expected Gate 1**: autocorr max 0.046 (far below 0.95 threshold)
**Expected Gate 2**: VIF max 1.36 with att2 features. Max corr with att2 = 0.240.

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import optuna
import json
import os
import warnings
from datetime import datetime
from sklearn.metrics import mutual_info_score
from numpy.linalg import inv as np_inv

# Install fredapi if not available
try:
    from fredapi import Fred
except ImportError:
    import subprocess, sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'fredapi', '-q'])
    from fredapi import Fred

# Fix the NumPy seed and quieten third-party warnings/logging so the
# notebook output stays readable.
np.random.seed(42)
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Identity of this submodel run and the contract for its output columns.
FEATURE_NAME = 'yield_curve'
ATTEMPT = 6
OUTPUT_COLUMNS = ['yc_be_vel_z', 'yc_tips_vel_z']
CLIP_RANGE = (-4, 4)  # (lower, upper) bounds applied to the z-scored features

# Run banner with the start timestamp.
for banner_line in (
    f'=== Gold SubModel Training: {FEATURE_NAME} attempt {ATTEMPT} ===',
    'Approach: Yield Decomposition Velocity (Breakeven + TIPS velocity z-scores)',
    f'Started: {datetime.now().isoformat()}',
):
    print(banner_line)

In [None]:
# Resolve the FRED API key: Kaggle Secrets first, environment variable second.
# Any failure on the Kaggle path (module missing, secret missing, ...) drops
# through to the environment lookup.
FRED_API_KEY = None
try:
    from kaggle_secrets import UserSecretsClient
    FRED_API_KEY = UserSecretsClient().get_secret('FRED_API_KEY')
    print('FRED_API_KEY loaded from Kaggle Secrets')
except Exception:
    FRED_API_KEY = os.environ.get('FRED_API_KEY')
    print(
        'FRED_API_KEY loaded from environment'
        if FRED_API_KEY
        else 'WARNING: FRED_API_KEY not found in Kaggle Secrets or environment'
    )

# Without a key nothing downstream can run, so stop here with instructions.
if not FRED_API_KEY:
    raise RuntimeError('FRED_API_KEY not found in Kaggle Secrets or environment. '
                       'Add FRED_API_KEY to Kaggle Secrets at kaggle.com/settings.')

In [None]:
# Dynamic path resolution for bigbigzabuton/gold-prediction-submodels
# Strategy: check BOTH path AND file existence, then list files for debugging.
import glob as _glob

# File names whose presence confirms a directory really is the dataset root.
PROBE_FILES = ['base_features.csv', 'base_features_raw.csv', 'vix.csv']
# Mount points to try, in priority order.
candidates = [
    '/kaggle/input/gold-prediction-submodels',
    '/kaggle/input/datasets/bigbigzabuton/gold-prediction-submodels',
]

DATASET_PATH = None
for c in candidates:
    if not os.path.isdir(c):
        continue
    files_in_dir = os.listdir(c)
    if set(PROBE_FILES).isdisjoint(files_in_dir):
        # Directory exists but holds none of the expected files; keep looking.
        print(f'Dir exists but missing probe files: {c} -> {files_in_dir[:5]}')
        continue
    DATASET_PATH = c
    print(f'Dataset found: {DATASET_PATH}')
    print(f'  Files ({len(files_in_dir)}): {sorted(files_in_dir)[:10]}')
    break

if DATASET_PATH is None:
    # Collect what *is* mounted so the error message is actionable.
    found = _glob.glob('/kaggle/input/*gold*') + _glob.glob('/kaggle/input/datasets/*/*gold*')
    input_root = '/kaggle/input'
    all_inputs = os.listdir(input_root) if os.path.exists(input_root) else 'N/A'
    raise RuntimeError(
        'Dataset not found.\n'
        f'Tried: {candidates}\n'
        f'Glob /kaggle/input/*gold*: {found}\n'
        f'All /kaggle/input/: {all_inputs}'
    )

In [None]:
# Pull the two FRED series this submodel needs:
#   DGS10  - nominal 10Y yield
#   DFII10 - real 10Y TIPS yield
# Breakeven is derived locally as DGS10 - DFII10 (equivalent to T10YIE by
# identity), so no third API call is made.
START = '2014-06-01'

print('Fetching FRED yield data...')
fred = Fred(api_key=FRED_API_KEY)

dgs10_raw = fred.get_series('DGS10', observation_start=START)
dfii10_raw = fred.get_series('DFII10', observation_start=START)

# Report coverage of each raw series, ignoring missing observations.
for series_label, raw_series in (('DGS10', dgs10_raw), ('DFII10', dfii10_raw)):
    observed = raw_series.dropna()
    print(f'{series_label}: {len(observed)} obs, {observed.index[0].date()} to {observed.index[-1].date()}')

# Bridge short gaps (weekends/holidays) by carrying the last value forward,
# at most 3 consecutive rows.
dgs10_raw, dfii10_raw = (s.ffill(limit=3) for s in (dgs10_raw, dfii10_raw))

# Keep only the dates on which both series have a value.
yields_df = pd.DataFrame({'dgs10': dgs10_raw, 'dfii10': dfii10_raw}).dropna()
yields_df.index = pd.to_datetime(yields_df.index)
first_day, last_day = yields_df.index[0].date(), yields_df.index[-1].date()
print(f'Combined yields (inner join): {len(yields_df)} obs, {first_day} to {last_day}')

# Sanity check: breakeven should be mostly positive and in 0.5-3.5% range
breakeven_check = yields_df['dgs10'] - yields_df['dfii10']
print(f'Breakeven (DGS10 - DFII10): min={breakeven_check.min():.2f}%, max={breakeven_check.max():.2f}%, mean={breakeven_check.mean():.2f}%')
pct_positive = (breakeven_check > 0).mean() * 100
print(f'Breakeven positive: {pct_positive:.1f}% of observations')
for series_label, column in (('DGS10', 'dgs10'), ('DFII10', 'dfii10')):
    print(f'{series_label} range: {yields_df[column].min():.2f}% to {yields_df[column].max():.2f}%')

In [None]:
# Download gold futures (GC=F) from Yahoo Finance and build the prediction
# target: the next trading day's percentage return.
gold = yf.download('GC=F', start=START, auto_adjust=True, progress=False)

if gold.empty or len(gold) < 100:
    raise ValueError(f'GC=F download returned insufficient data: {len(gold)} rows')

# The download may come back with MultiIndex columns; in either layout,
# reduce 'Close' to a single Series.
gold_close = (
    gold['Close'].iloc[:, 0]
    if isinstance(gold.columns, pd.MultiIndex)
    else gold['Close'].squeeze()
)

gold_ret = gold_close.pct_change() * 100
# shift(-1) places tomorrow's return on today's row (the target).
gold_ret_next = gold_ret.shift(-1)
gold_ret_next.name = 'gold_return_next'
gold_ret_next.index = pd.to_datetime(gold_ret_next.index).tz_localize(None)

target_obs = gold_ret_next.dropna()
print(f'GC=F: {len(target_obs)} obs, {target_obs.index[0].date()} to {target_obs.index[-1].date()}')

In [None]:
# Load the base feature table; its index defines the dates the submodel
# output must align to. Two file-name variants are accepted, first match wins.
for fname in ['base_features.csv', 'base_features_raw.csv']:
    candidate = os.path.join(DATASET_PATH, fname)
    if os.path.exists(candidate):
        base_path = candidate
        print(f'Using base features file: {fname}')
        break
else:
    # Neither variant exists: list the directory so the error is actionable.
    raise FileNotFoundError(
        f'base_features[_raw].csv not found in {DATASET_PATH}.\n'
        f'Available files: {sorted(os.listdir(DATASET_PATH))}'
    )

base_features = pd.read_csv(base_path, index_col=0, parse_dates=True)
base_features.index = pd.to_datetime(base_features.index)
TARGET_DATES = base_features.index
print(f'Base features: {len(base_features)} rows, {TARGET_DATES[0].date()} to {TARGET_DATES[-1].date()}')

In [None]:
# Chronological 70/15/15 train/val/test split over the dates present in the
# yield data, the base feature table, and the (non-missing) gold target.
common_dates = (
    yields_df.index
    .intersection(TARGET_DATES)
    .intersection(gold_ret_next.dropna().index)
    .sort_values()
)

n = len(common_dates)
train_end_idx, val_end_idx = int(n * 0.70), int(n * 0.85)

train_dates = common_dates[:train_end_idx]
val_dates = common_dates[train_end_idx:val_end_idx]
test_dates = common_dates[val_end_idx:]

print(f'Total aligned: {n}')
for split_label, split_dates in (
    ('Train: ', train_dates),
    ('Val:   ', val_dates),
    ('Test:  ', test_dates),
):
    print(f'{split_label}{len(split_dates)} ({split_dates[0].date()} to {split_dates[-1].date()})')

# Boolean masks over the yields_df index (features are computed on that index).
val_mask = yields_df.index.isin(val_dates)
test_mask = yields_df.index.isin(test_dates)

# Gold target re-indexed onto the same rows as the features.
gold_aligned = gold_ret_next.reindex(yields_df.index)

In [None]:
# Feature generation functions

def rolling_zscore(x: pd.Series, window: int) -> pd.Series:
    """Return the rolling z-score of ``x`` over a ``window``-row rolling window.

    Each value is standardised with the mean and sample standard deviation of
    the rolling window that ends at (and includes) that row.

    Args:
        x: Input series.
        window: Rolling window length in rows.

    Returns:
        Series on the same index as ``x``. Entries are NaN where fewer than
        the required number of observations are available, where the window's
        standard deviation is zero, or where the ratio is infinite.
    """
    # Require at least half a window (and no fewer than 10 points) before
    # emitting a value, but never more than the window itself: pandas raises
    # ValueError when min_periods exceeds the window length.
    min_p = min(window, max(window // 2, 10))
    roll = x.rolling(window, min_periods=min_p)
    m = roll.mean()
    s = roll.std()
    # A zero standard deviation would divide by zero; map it to NaN instead.
    z = (x - m) / s.replace(0, np.nan)
    return z.replace([np.inf, -np.inf], np.nan)


def generate_all_features(yields_df, zscore_window):
    """
    Build the two yield-decomposition velocity features.

    The nominal 10Y yield is split into a real component (``dfii10``) and a
    breakeven component (``dgs10 - dfii10``). The daily change of each
    component is z-scored with ``rolling_zscore`` using ``zscore_window`` and
    clipped to ``CLIP_RANGE``.

    Columns returned (indexed like ``yields_df``):
    - yc_be_vel_z: z-scored daily change in breakeven
    - yc_tips_vel_z: z-scored daily change in DFII10

    Both columns share the same window so they are measured on the same
    time scale.
    """
    real_yield = yields_df['dfii10']

    # Day-over-day changes ("velocity") of each component, keyed by output column.
    velocities = {
        'yc_be_vel_z': (yields_df['dgs10'] - real_yield).diff(),
        'yc_tips_vel_z': real_yield.diff(),
    }

    lower, upper = CLIP_RANGE
    features = pd.DataFrame(index=yields_df.index)
    for column, velocity in velocities.items():
        features[column] = rolling_zscore(velocity, zscore_window).clip(lower, upper)

    return features


def compute_mi(feature, target, n_bins=20):
    """Mutual information between ``feature`` and ``target`` after quantile binning.

    Both series are restricted to the index labels where neither is NaN and
    cut into up to ``n_bins`` quantile bins (duplicate bin edges are dropped).
    Returns 0.0 when fewer than 50 usable observations remain, or when
    binning / scoring raises.
    """
    shared = feature.dropna().index.intersection(target.dropna().index)
    if len(shared) < 50:
        return 0.0

    feature_vals = feature[shared]
    target_vals = target[shared]
    try:
        feature_bins = pd.qcut(feature_vals, q=n_bins, labels=False, duplicates='drop')
        target_bins = pd.qcut(target_vals, q=n_bins, labels=False, duplicates='drop')
        usable = feature_bins.notna() & target_bins.notna()
        if usable.sum() >= 50:
            return float(mutual_info_score(feature_bins[usable], target_bins[usable]))
    except Exception:
        # Best-effort metric: any binning/scoring failure counts as "no information".
        pass
    return 0.0


print('Feature generation functions defined')

In [None]:
# Optuna HPO: maximize sum of MI between 2 features and gold_return_next on validation set
# Search space: zscore_window in {30, 45, 60, 90, 120}
# Only 5 unique values; 20 trials is intended to cover all of them (the objective is deterministic, so a repeated window yields an identical score).
# Both features share the same zscore_window for temporal consistency.

def objective(trial):
    """Optuna objective: summed validation-set MI of the output features.

    Picks a ``zscore_window`` from the categorical search space, regenerates
    the features on the full ``yields_df``, and scores each column in
    ``OUTPUT_COLUMNS`` against ``gold_aligned`` on the rows selected by
    ``val_mask``.
    """
    zscore_window = trial.suggest_categorical('zscore_window', [30, 45, 60, 90, 120])
    candidate_features = generate_all_features(yields_df, zscore_window)
    target_val = gold_aligned[val_mask]

    return sum(
        (compute_mi(candidate_features[col][val_mask], target_val) for col in OUTPUT_COLUMNS),
        0.0,
    )


# Candidate windows to evaluate up front. Keep in sync with the choices
# passed to trial.suggest_categorical() in objective().
ZSCORE_WINDOW_CANDIDATES = [30, 45, 60, 90, 120]

print('Running Optuna HPO (20 trials, timeout=180s, TPE sampler)...')
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)
# Sampling alone does not guarantee that every categorical value is tried
# within 20 trials. Enqueue each candidate once so the reported best window
# is the true maximum over the whole search space; the remaining trials are
# left to the sampler.
for _window in ZSCORE_WINDOW_CANDIDATES:
    study.enqueue_trial({'zscore_window': _window})
study.optimize(objective, n_trials=20, timeout=180)

# Only trials that finished successfully carry a value; failed or pruned
# trials have value None and must not be counted or formatted.
completed_trials = [
    t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
]

best_params = study.best_params
best_value = study.best_value
n_completed = len(completed_trials)
print(f'Optuna complete: {n_completed} trials')
print(f'Best params: {best_params}')
print(f'Best MI sum (val): {best_value:.6f}')

# Print all trial results for transparency, best score first.
print('\n=== All Optuna Trials ===')
for t in sorted(completed_trials, key=lambda t: t.value, reverse=True):
    print(f'  zscore_window={t.params["zscore_window"]:>4d}, MI_sum={t.value:.6f}')

In [None]:
# Rebuild the features once more using the window Optuna selected.
best_window = best_params['zscore_window']
print(f'Generating final features with zscore_window={best_window}...')
final_features = generate_all_features(yields_df, zscore_window=best_window)

print(f'Final features shape: {final_features.shape}')
print('NaN counts before alignment:')
for col, missing in final_features[OUTPUT_COLUMNS].isna().sum().items():
    print(f'  {col}: {missing}')

In [None]:
# Quality checks on the final features: lag-1 autocorrelation, missing
# values, correlation between the output columns, and VIF.
print('=== Quality Checks ===')

# Lag-1 autocorrelation per column (Gate 1 threshold: 0.95).
autocorr_results = {}
for col in OUTPUT_COLUMNS:
    non_missing = final_features[col].dropna()
    ac = float(non_missing.autocorr(lag=1)) if len(non_missing) > 1 else 0.0
    autocorr_results[col] = round(ac, 6)
    verdict = 'FAIL' if abs(ac) > 0.95 else 'PASS'
    print(f'Autocorr {col}: {ac:.6f} [{verdict}]')

# Missing values per column, as a count and as a share of all rows.
print('\nNaN per column:')
total_rows = len(final_features)
for col in OUTPUT_COLUMNS:
    nan_count = final_features[col].isna().sum()
    print(f'  {col}: {nan_count} ({nan_count / total_rows * 100:.1f}%)')

# Correlation between the output columns, on rows where all are present.
clean_features = final_features[OUTPUT_COLUMNS].dropna()
corr_matrix = clean_features.corr()
print('\nInternal correlation matrix:')
print(corr_matrix.round(4).to_string())

# Largest absolute off-diagonal entry (strict upper triangle).
corr_values = corr_matrix.values
off_diagonal = corr_values[np.triu_indices_from(corr_values, k=1)]
max_internal_corr = float(np.abs(off_diagonal).max())
print(f'\nMax internal |correlation|: {max_internal_corr:.4f}')

# VIF per column, read off the diagonal of the inverse correlation matrix.
print('\nVIF values:')
cm = np.corrcoef(clean_features.values.T)
max_vif = None
try:
    vif_values = np.diag(np_inv(cm))
    for col, vif in zip(OUTPUT_COLUMNS, vif_values):
        print(f'  VIF {col}: {vif:.4f}')
    max_vif = float(np.max(vif_values))
    print(f'  Max VIF: {max_vif:.4f}')
except Exception as e:
    # e.g. a singular correlation matrix; max_vif stays None.
    print(f'  VIF calculation failed: {e}')

# Descriptive statistics
print('\nDescriptive statistics:')
print(final_features[OUTPUT_COLUMNS].describe().round(4).to_string())

In [None]:
# Per-feature mutual information with the next-day gold return, computed
# separately on the validation rows and on the test rows.
val_mi = {
    col: round(compute_mi(final_features[col][val_mask], gold_aligned[val_mask]), 6)
    for col in OUTPUT_COLUMNS
}
test_mi = {
    col: round(compute_mi(final_features[col][test_mask], gold_aligned[test_mask]), 6)
    for col in OUTPUT_COLUMNS
}

print('MI on validation set:')
for col, mi in val_mi.items():
    print(f'  {col}: {mi:.6f}')
print(f'  Val MI sum: {sum(val_mi.values()):.6f}')

print('\nMI on test set:')
for col, mi in test_mi.items():
    print(f'  {col}: {mi:.6f}')
print(f'  Test MI sum: {sum(test_mi.values()):.6f}')

In [None]:
# Put the final features on the base_features dates and write them out.
output = final_features[OUTPUT_COLUMNS].reindex(base_features.index)
output.index.name = 'Date'

# Carry values forward across short gaps (at most 3 consecutive rows), e.g.
# base_features dates that have no FRED observation.
output = output.ffill(limit=3)

# Remove rows where every feature is NaN (such as the rolling-window warmup).
output = output.dropna(how='all')

first_date, last_date = output.index[0].date(), output.index[-1].date()
print(f'Output shape: {output.shape}')
print(f'Date range: {first_date} to {last_date}')
print('NaN per column after alignment and ffill:')
for col, nan_count in output[OUTPUT_COLUMNS].isna().sum().items():
    print(f'  {col}: {nan_count}')

# Save submodel output
output_csv_path = '/kaggle/working/submodel_output.csv'
output.to_csv(output_csv_path)
print(f'\nSaved: {output_csv_path}')
print(output.tail(5).to_string())

In [None]:
# Assemble and save the training result JSON.
# overfit_ratio is fixed at 1.0 and the losses at 0.0: this is deterministic
# feature engineering with no fitted model, and the evaluator's Gate 1 passes
# ratios below 1.5.

description = (
    'Z-scored daily changes in 10Y breakeven inflation rate and TIPS real yield. '
    '2 features: yc_be_vel_z (breakeven velocity = DGS10-DFII10 daily change z-score), '
    'yc_tips_vel_z (real rate velocity = DFII10 daily change z-score). '
    'Decomposes nominal yield dynamics into inflation premium and real rate components. '
    'Both use shared zscore_window for temporal consistency. '
    'Velocity not level avoids autocorrelation (level >0.99, velocity ~0.046).'
)

best_value_rounded = round(best_value, 6)

metrics = {
    'overfit_ratio': 1.0,
    'train_loss': 0.0,
    'val_loss': 0.0,
    'mi_sum_val': best_value_rounded,
    'mi_individual_val': val_mi,
    'mi_individual_test': test_mi,
    'mi_sum_test': round(sum(test_mi.values()), 6),
    'autocorrelations': autocorr_results,
    'max_internal_corr': round(max_internal_corr, 6),
    'max_vif': None if max_vif is None else round(max_vif, 4),
    'optuna_trials_completed': n_completed,
    'optuna_best_value': best_value_rounded,
    'output_nan_counts': {col: int(output[col].isna().sum()) for col in OUTPUT_COLUMNS},
}

data_info = {
    'total_aligned': n,
    'train_samples': len(train_dates),
    'val_samples': len(val_dates),
    'test_samples': len(test_dates),
    'date_range_start': str(output.index.min().date()),
    'date_range_end': str(output.index.max().date()),
    'fred_tickers': ['DGS10', 'DFII10'],
    'gold_ticker': 'GC=F',
}

# Key order is kept stable so the serialized JSON layout does not change.
result = {
    'feature': FEATURE_NAME,
    'attempt': ATTEMPT,
    'timestamp': datetime.now().isoformat(),
    'approach': 'Yield Decomposition Velocity',
    'description': description,
    'best_params': best_params,
    'metrics': metrics,
    'output_shape': list(output.shape),
    'output_columns': OUTPUT_COLUMNS,
    'data_info': data_info,
}

result_json_path = '/kaggle/working/training_result.json'
with open(result_json_path, 'w', encoding='utf-8') as result_file:
    json.dump(result, result_file, indent=2, default=str)

# Final run summary, followed by the full result payload.
print(f'Saved: {result_json_path}')
print('\n=== Training complete! ===')
print(f'Finished: {datetime.now().isoformat()}')
print(f'Output columns: {OUTPUT_COLUMNS}')
print(f'Best params: zscore_window={best_params["zscore_window"]}')
print(f'Best MI sum (val): {best_value:.6f}')
print(f'Autocorrelations: {autocorr_results}')
print(json.dumps(result, indent=2, default=str))