# Gold SubModel: Yield Curve - Attempt 5
**Approach**: Cross-Tenor Correlation Dynamics

**Key differences from previous attempts**:
- No ML model, no PyTorch, no HMM
- Captures the STRUCTURE of co-movement between yield tenors
- Uses the daily CHANGE in rolling correlation (not its level): the level is highly autocorrelated (~0.98), whereas the change is not (~0.05)

**Output features**:
1. `yc_corr_long_short_z`: Z-scored daily CHANGE in rolling corr(DGS10 changes, DGS3MO changes)
2. `yc_corr_long_mid_z`: Z-scored daily CHANGE in rolling corr(DGS10 changes, DGS2 changes)
3. `yc_corr_1y10y_z`: Z-scored daily CHANGE in rolling corr(DGS1 changes, DGS10 changes)

**Expected Gate 1**: autocorr max 0.053 (far below 0.95 threshold)
**Expected Gate 2**: VIF max 1.52 internal, 1.55 combined with att2. Max corr with att2 = 0.073.

In [None]:
import subprocess
import sys

# Install fredapi if not available
try:
    from fredapi import Fred
except ImportError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'fredapi', '-q'])
    from fredapi import Fred

import numpy as np
import pandas as pd
import yfinance as yf
import optuna
import json
import os
import warnings
from datetime import datetime
from sklearn.metrics import mutual_info_score
from numpy.linalg import inv as np_inv

# Run-wide identifiers and the output schema shared by the cells below.
FEATURE_NAME = 'yield_curve'
ATTEMPT = 5
OUTPUT_COLUMNS = [
    'yc_corr_long_short_z',
    'yc_corr_long_mid_z',
    'yc_corr_1y10y_z',
]
CLIP_RANGE = (-4, 4)  # (lower, upper) bounds applied to the z-scored features

# Seed NumPy's global RNG and silence Python warnings and Optuna's
# below-WARNING log output.
np.random.seed(42)
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

print(f'=== Gold SubModel Training: {FEATURE_NAME} attempt {ATTEMPT} ===')
print('Approach: Cross-Tenor Correlation Dynamics')
print(f'Started: {datetime.now().isoformat()}')

In [None]:
# Resolve the FRED API key: Kaggle Secrets first, environment variable second.
FRED_API_KEY = None
try:
    from kaggle_secrets import UserSecretsClient
    FRED_API_KEY = UserSecretsClient().get_secret('FRED_API_KEY')
    print('FRED_API_KEY loaded from Kaggle Secrets')
except Exception:
    # Any failure above (import or secret lookup) falls back to the environment.
    FRED_API_KEY = os.environ.get('FRED_API_KEY')
    if not FRED_API_KEY:
        raise RuntimeError('FRED_API_KEY not found in Kaggle Secrets or environment')
    print('FRED_API_KEY loaded from environment')

In [None]:
# Locate the bigbigzabuton/gold-prediction-submodels dataset directory.
# A candidate is accepted only when it is a directory AND contains at least
# one of the probe files; directory listings are printed to ease debugging.
import glob as _glob

PROBE_FILES = ['base_features.csv', 'base_features_raw.csv', 'vix.csv']
candidates = [
    '/kaggle/input/gold-prediction-submodels',
    '/kaggle/input/datasets/bigbigzabuton/gold-prediction-submodels',
]

DATASET_PATH = None
for c in candidates:
    if not os.path.isdir(c):
        continue
    files_in_dir = os.listdir(c)
    if set(PROBE_FILES).isdisjoint(files_in_dir):
        # Directory is present but holds none of the expected files.
        print(f'Dir exists but missing probe files: {c} -> {files_in_dir[:5]}')
        continue
    DATASET_PATH = c
    print(f'Dataset found: {DATASET_PATH}')
    print(f'  Files ({len(files_in_dir)}): {sorted(files_in_dir)[:10]}')
    break

if DATASET_PATH is None:
    # Collect whatever looks gold-related so the error message is actionable.
    found = _glob.glob('/kaggle/input/*gold*') + _glob.glob('/kaggle/input/datasets/*/*gold*')
    input_listing = os.listdir('/kaggle/input') if os.path.exists('/kaggle/input') else 'N/A'
    raise RuntimeError(
        f'Dataset not found.\n'
        f'Tried: {candidates}\n'
        f'Glob /kaggle/input/*gold*: {found}\n'
        f'All /kaggle/input/: {input_listing}'
    )

In [None]:
# Download the Treasury yield series from FRED.
# Tickers: DGS10, DGS2, DGS3MO, DGS1, observations from 2014-06-01 onward.
# The early start leaves room for rolling-window warmup:
# corr_window(90) + zscore_window(90) + buffer = ~200 trading days.
START = '2014-06-01'
TICKERS_FRED = ['DGS10', 'DGS2', 'DGS3MO', 'DGS1']

fred = Fred(api_key=FRED_API_KEY)

series = {}
for ticker in TICKERS_FRED:
    # Forward-fill runs of at most 3 consecutive missing observations.
    s = fred.get_series(ticker, observation_start=START).ffill(limit=3)
    series[ticker] = s
    observed = s.dropna()
    print(f'{ticker}: {len(observed)} obs, {observed.index[0].date()} to {observed.index[-1].date()}')

# One lower-cased column per tenor; keep only dates where every tenor has a value.
yields_df = pd.DataFrame({name.lower(): series[name] for name in TICKERS_FRED})
yields_df.index = pd.to_datetime(yields_df.index)
yields_df = yields_df.dropna()

first_day = yields_df.index[0].date()
last_day = yields_df.index[-1].date()
print(f'Combined yields: {len(yields_df)} obs, {first_day} to {last_day}')
print('Sample yields (last 3 rows):')
print(yields_df.tail(3))

In [None]:
# Download gold futures (GC=F) from Yahoo Finance to build the prediction target.
gold = yf.download('GC=F', start=START, auto_adjust=True, progress=False)

if gold.empty or len(gold) < 100:
    raise ValueError(f'GC=F download returned insufficient data: {len(gold)} rows')

# With MultiIndex columns, gold['Close'] is a frame: take its first column.
gold_close = (
    gold['Close'].iloc[:, 0]
    if isinstance(gold.columns, pd.MultiIndex)
    else gold['Close'].squeeze()
)

gold_ret = gold_close.pct_change() * 100  # daily return in percent
gold_ret_next = gold_ret.shift(-1)  # next-day return (target)
gold_ret_next.name = 'gold_return_next'
# Normalise to a timezone-naive DatetimeIndex.
gold_ret_next.index = pd.to_datetime(gold_ret_next.index).tz_localize(None)

target_observed = gold_ret_next.dropna()
print(f'GC=F: {len(target_observed)} obs, {target_observed.index[0].date()} to {target_observed.index[-1].date()}')

In [None]:
# Load base_features, whose index defines the target dates.
# Two filename variants are accepted; the first one that exists wins.
base_path = None
for fname in ('base_features.csv', 'base_features_raw.csv'):
    candidate = os.path.join(DATASET_PATH, fname)
    if not os.path.exists(candidate):
        continue
    base_path = candidate
    print(f'Using base features file: {fname}')
    break

if base_path is None:
    # Show what the dataset directory actually contains.
    files_in_dir = os.listdir(DATASET_PATH)
    raise FileNotFoundError(
        f'base_features[_raw].csv not found in {DATASET_PATH}.\n'
        f'Available files: {sorted(files_in_dir)}'
    )

base_features = pd.read_csv(base_path, index_col=0, parse_dates=True)
base_features.index = pd.to_datetime(base_features.index)
TARGET_DATES = base_features.index

first_target, last_target = TARGET_DATES[0].date(), TARGET_DATES[-1].date()
print(f'Base features: {len(base_features)} rows, {first_target} to {last_target}')

In [None]:
# Chronological 70/15/15 train/val/test split.
# Only dates present in the yields, in base_features and in the non-NaN gold
# target take part in the split.
common_dates = (
    yields_df.index
    .intersection(TARGET_DATES)
    .intersection(gold_ret_next.dropna().index)
    .sort_values()
)

n = len(common_dates)
train_end_idx = int(n * 0.70)
val_end_idx = int(n * 0.85)

train_dates = common_dates[:train_end_idx]
val_dates = common_dates[train_end_idx:val_end_idx]
test_dates = common_dates[val_end_idx:]

print(f'Total aligned: {n}')
print(f'Train: {len(train_dates)} ({train_dates[0].date()} to {train_dates[-1].date()})')
print(f'Val:   {len(val_dates)} ({val_dates[0].date()} to {val_dates[-1].date()})')
print(f'Test:  {len(test_dates)} ({test_dates[0].date()} to {test_dates[-1].date()})')

# Boolean masks are built over the full yields_df index (not common_dates) so
# they can index anything aligned to yields_df.
val_mask = yields_df.index.isin(val_dates)
test_mask = yields_df.index.isin(test_dates)

# Gold target re-indexed onto the yields_df dates (NaN where gold has no row).
gold_aligned = gold_ret_next.reindex(yields_df.index)

In [None]:
# Feature generation functions

def rolling_zscore(x, window):
    """Return the rolling z-score of ``x`` over ``window`` observations.

    Each value is standardised with the mean and standard deviation of the
    trailing window that ends at (and includes) that value.

    Args:
        x: pandas Series to standardise.
        window: rolling window length in observations.

    Returns:
        Series aligned to ``x``. Entries are NaN while fewer than the minimum
        number of observations are available, and wherever the rolling
        standard deviation is zero or the result would be infinite.
    """
    # At least half the window (and no fewer than 10 points) must be present,
    # but never more than the window itself: pandas raises ValueError when
    # min_periods > window, which used to break every window below 10.
    min_p = min(window, max(window // 2, 10))
    roller = x.rolling(window, min_periods=min_p)
    m = roller.mean()
    s = roller.std()
    # A zero standard deviation becomes NaN so the division cannot yield inf.
    z = (x - m) / s.replace(0, np.nan)
    return z.replace([np.inf, -np.inf], np.nan)


def compute_corr_change_z(series_a_chg, series_b_chg, corr_window, zscore_window):
    """
    Compute the z-scored daily change in rolling correlation
    between two yield change series.

    Steps:
    1. Rolling correlation between daily yield changes
    2. Sanitise the correlation (non-finite -> NaN, bounded to [-1, 1])
    3. First difference of correlation (avoids level autocorrelation ~0.98)
    4. Z-score over rolling window
    5. Clip to CLIP_RANGE

    Critical: use CHANGE not level. Level autocorr ~0.98; change autocorr ~0.05.

    Args:
        series_a_chg: Series of daily changes of the first yield.
        series_b_chg: Series of daily changes of the second yield.
        corr_window: window length of the rolling correlation.
        zscore_window: window length passed to ``rolling_zscore``.

    Returns:
        Series of clipped z-scores aligned to ``series_a_chg``.
    """
    # Require half the window (at least 15 points) but never more than the
    # window itself; pandas rejects min_periods > window with ValueError.
    min_p = min(corr_window, max(corr_window // 2, 15))
    corr = series_a_chg.rolling(corr_window, min_periods=min_p).corr(series_b_chg)
    # When a window is (almost) constant the rolling correlation can come out
    # as +/-inf or slightly outside [-1, 1]. Such values are not valid
    # correlations and would otherwise contaminate the diff and the whole
    # z-score window that follows, so they are neutralised here.
    corr = corr.replace([np.inf, -np.inf], np.nan).clip(-1.0, 1.0)
    corr_chg = corr.diff()
    z = rolling_zscore(corr_chg, zscore_window)
    return z.clip(*CLIP_RANGE)


def generate_all_features(yields_df, corr_window, zscore_window):
    """Build the three cross-tenor correlation-change z-score features.

    Args:
        yields_df: DataFrame with columns 'dgs10', 'dgs2', 'dgs3mo', 'dgs1'.
        corr_window: rolling-correlation window forwarded to
            ``compute_corr_change_z``.
        zscore_window: z-score window forwarded to ``compute_corr_change_z``.

    Returns:
        DataFrame indexed like ``yields_df`` with one column per feature.
    """
    # Daily change of every tenor used below.
    daily_chg = {
        tenor: yields_df[tenor].diff()
        for tenor in ('dgs10', 'dgs2', 'dgs3mo', 'dgs1')
    }

    # (output column, first tenor, second tenor)
    pair_spec = (
        # 10Y vs 3M (long-end vs short-end):
        # positive = curve synchronizing, negative = curve fragmenting.
        ('yc_corr_long_short_z', 'dgs10', 'dgs3mo'),
        # 10Y vs 2Y (long-end vs mid-range):
        # positive = moving in sync, negative = term premium shift.
        ('yc_corr_long_mid_z', 'dgs10', 'dgs2'),
        # 1Y vs 10Y (policy-end vs long-end):
        # decorrelation = policy/market disconnect.
        ('yc_corr_1y10y_z', 'dgs1', 'dgs10'),
    )

    features = pd.DataFrame(index=yields_df.index)
    for column, first, second in pair_spec:
        features[column] = compute_corr_change_z(
            daily_chg[first], daily_chg[second], corr_window, zscore_window
        )
    return features


def compute_mi(feature, target, n_bins=20):
    """Estimate mutual information between two Series via quantile binning.

    Both inputs are restricted to the index labels where neither is NaN and
    then discretised into up to ``n_bins`` quantile bins.

    Returns:
        The mutual information as a float, or 0.0 when fewer than 50 usable
        observations remain or when binning/scoring raises.
    """
    shared = feature.dropna().index.intersection(target.dropna().index)
    if len(shared) < 50:
        return 0.0

    feat_vals = feature[shared]
    targ_vals = target[shared]
    try:
        feat_bins = pd.qcut(feat_vals, q=n_bins, labels=False, duplicates='drop')
        targ_bins = pd.qcut(targ_vals, q=n_bins, labels=False, duplicates='drop')
        usable = feat_bins.notna() & targ_bins.notna()
        if usable.sum() >= 50:
            return float(mutual_info_score(feat_bins[usable], targ_bins[usable]))
        return 0.0
    except Exception:
        # Best effort: any failure counts as "no information".
        return 0.0


print('Feature generation functions defined')

In [None]:
# Optuna HPO: maximize sum of MI between 3 features and gold_return_next on validation set
# Search space: corr_window in {30, 45, 60, 90}, zscore_window in {30, 45, 60, 90}
# Total unique combinations: 4 x 4 = 16. 30 trials usually visit most of them, but the TPE sampler may repeat combinations, so exhaustive coverage is not guaranteed.

def objective(trial):
    """Optuna objective: summed validation MI of the three features.

    Samples ``corr_window`` and ``zscore_window`` from {30, 45, 60, 90},
    regenerates the features from ``yields_df`` and scores each output column
    against the next-day gold return on the validation rows only.
    """
    window_choices = [30, 45, 60, 90]
    corr_window = trial.suggest_categorical('corr_window', window_choices)
    zscore_window = trial.suggest_categorical('zscore_window', window_choices)

    candidate = generate_all_features(yields_df, corr_window, zscore_window)

    # Validation rows only.
    val_target = gold_aligned[val_mask]
    return sum(
        (compute_mi(candidate[col][val_mask], val_target) for col in OUTPUT_COLUMNS),
        0.0,
    )


# Run the hyper-parameter search and keep the best window pair.
print('Running Optuna HPO (30 trials, timeout=300s, TPE sampler)...')
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(objective, n_trials=30, timeout=300)

best_params, best_value = study.best_params, study.best_value
n_completed = len(study.trials)  # number of trials recorded on the study
print(f'Optuna complete: {n_completed} trials')
print(f'Best params: {best_params}')
print(f'Best MI sum (val): {best_value:.6f}')

In [None]:
# Rebuild the features once more using the window pair selected by Optuna.
print(f'Generating final features with corr_window={best_params["corr_window"]}, zscore_window={best_params["zscore_window"]}...')
final_features = generate_all_features(
    yields_df,
    corr_window=best_params['corr_window'],
    zscore_window=best_params['zscore_window'],
)

print(f'Final features shape: {final_features.shape}')
print('NaN counts before alignment:')
nan_per_column = final_features[OUTPUT_COLUMNS].isna().sum()
for col in OUTPUT_COLUMNS:
    print(f'  {col}: {nan_per_column[col]}')

In [None]:
# Quality checks: autocorrelation, NaN share, internal correlation, VIF
print('=== Quality Checks ===')

# Lag-1 autocorrelation per feature (Gate 1 threshold: 0.95)
autocorr_results = {}
for col in OUTPUT_COLUMNS:
    series_clean = final_features[col].dropna()
    if len(series_clean) > 1:
        ac = float(series_clean.autocorr(lag=1))
    else:
        ac = 0.0
    autocorr_results[col] = round(ac, 6)
    # A NaN autocorrelation cannot be compared with the threshold
    # (`abs(nan) > 0.95` is False), so it is reported as FAIL instead of
    # silently passing.
    status = 'PASS' if np.isfinite(ac) and abs(ac) <= 0.95 else 'FAIL'
    print(f'Autocorr {col}: {ac:.6f} [{status}]')

# NaN count and share per feature
print('\nNaN per column:')
for col in OUTPUT_COLUMNS:
    nan_count = final_features[col].isna().sum()
    nan_pct = nan_count / len(final_features) * 100
    print(f'  {col}: {nan_count} ({nan_pct:.1f}%)')

# Pairwise correlation between the features, on rows where all are present
clean_features = final_features[OUTPUT_COLUMNS].dropna()
corr_matrix = clean_features.corr()
print(f'\nInternal correlation matrix:')
print(corr_matrix.round(4).to_string())

# Largest absolute off-diagonal entry (strict upper triangle, k=1)
upper_tri_vals = corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)]
max_internal_corr = float(np.abs(upper_tri_vals).max())
print(f'\nMax internal |correlation|: {max_internal_corr:.4f}')

# VIF check: the diagonal of the inverse correlation matrix gives the VIFs
print('\nVIF values:')
X = clean_features.values
cm = np.corrcoef(X.T)
try:
    inv_cm = np_inv(cm)
    vif_values = np.diag(inv_cm)
    for col, v in zip(OUTPUT_COLUMNS, vif_values):
        print(f'  VIF {col}: {v:.4f}')
    max_vif = float(np.max(vif_values))
    print(f'  Max VIF: {max_vif:.4f}')
except Exception as e:
    # E.g. a singular correlation matrix; report and carry on without a VIF.
    print(f'  VIF calculation failed: {e}')
    max_vif = None

# Descriptive statistics
print(f'\nDescriptive statistics:')
print(final_features[OUTPUT_COLUMNS].describe().round(4).to_string())

In [None]:
# Per-feature mutual information with the target on the validation and test rows.
val_mi = {
    col: round(compute_mi(final_features[col][val_mask], gold_aligned[val_mask]), 6)
    for col in OUTPUT_COLUMNS
}
test_mi = {
    col: round(compute_mi(final_features[col][test_mask], gold_aligned[test_mask]), 6)
    for col in OUTPUT_COLUMNS
}

print('MI on validation set:')
for col, mi in val_mi.items():
    print(f'  {col}: {mi:.6f}')
print(f'  Val MI sum: {sum(val_mi.values()):.6f}')

print('\nMI on test set:')
for col, mi in test_mi.items():
    print(f'  {col}: {mi:.6f}')
print(f'  Test MI sum: {sum(test_mi.values()):.6f}')

In [None]:
# Put the features on the base_features dates and write them to disk.
output = final_features[OUTPUT_COLUMNS].reindex(base_features.index)
output.index.name = 'Date'

# Fill gaps of up to 3 consecutive rows from the last available value
# (FRED data has occasional missing trading days), then drop rows where every
# feature is still NaN (warmup period before the rolling windows are populated).
output = output.ffill(limit=3).dropna(how='all')

print(f'Output shape: {output.shape}')
print(f'Date range: {output.index[0].date()} to {output.index[-1].date()}')
print('NaN per column after alignment and ffill:')
for col in OUTPUT_COLUMNS:
    nan_count = output[col].isna().sum()
    print(f'  {col}: {nan_count}')

# Persist the submodel output
output.to_csv('/kaggle/working/submodel_output.csv')
print('\nSaved: /kaggle/working/submodel_output.csv')
print(output.tail(5).to_string())

In [None]:
# Save training result JSON
# overfit_ratio: set to 1.0 (deterministic, no train/val model; evaluator uses for Gate 1)
# autocorr: dict of {column_name: autocorr_value} for all 3 features
# mi_sum_val: best Optuna value
# optuna_trials_completed: number of completed trials

# Assemble the nested sections first, then the top-level record.
# Key order is kept so the serialised JSON layout does not change.
result_metrics = {
    'overfit_ratio': 1.0,
    'mi_sum_val': round(best_value, 6),
    'mi_individual_val': val_mi,
    'mi_individual_test': test_mi,
    'mi_sum_test': round(sum(test_mi.values()), 6),
    'autocorr': autocorr_results,
    'max_internal_corr': round(max_internal_corr, 6),
    'optuna_trials_completed': n_completed,
    'optuna_best_value': round(best_value, 6),
    'output_nan_counts': {col: int(output[col].isna().sum()) for col in OUTPUT_COLUMNS},
}

result_data_info = {
    'total_aligned': n,
    'train_samples': len(train_dates),
    'val_samples': len(val_dates),
    'test_samples': len(test_dates),
    'date_range_start': str(output.index.min().date()),
    'date_range_end': str(output.index.max().date()),
    'fred_tickers': TICKERS_FRED,
    'gold_ticker': 'GC=F',
}

result = {
    'feature': FEATURE_NAME,
    'attempt': ATTEMPT,
    'timestamp': datetime.now().isoformat(),
    'approach': 'Cross-Tenor Correlation Dynamics',
    'description': (
        'Z-scored daily changes in rolling cross-tenor yield correlations. '
        '3 features: 10Y-3M corr change, 10Y-2Y corr change, 1Y-10Y corr change. '
        'CHANGE not level avoids autocorrelation (level ~0.98, change ~0.05).'
    ),
    'best_params': best_params,
    'metrics': result_metrics,
    'output_shape': list(output.shape),
    'output_columns': OUTPUT_COLUMNS,
    'data_info': result_data_info,
}

# default=str stringifies any value json cannot serialise natively.
with open('/kaggle/working/training_result.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=2, default=str)

print('Saved: /kaggle/working/training_result.json')
print('\n=== Training complete! ===')
print(f'Finished: {datetime.now().isoformat()}')
print(f'Output columns: {OUTPUT_COLUMNS}')
print(f'Best params: corr_window={best_params["corr_window"]}, zscore_window={best_params["zscore_window"]}')
print(f'Best MI sum (val): {best_value:.6f}')
print(f'Autocorrelations: {autocorr_results}')
print(json.dumps(result, indent=2, default=str))