# Gold SubModel: Yield Curve - Attempt 2
**Approach**: Deterministic z-score features only (no HMM)
**Changes from Attempt 1**:
- Removed HMM (collapsed to single state in attempt 1 -> yc_regime_prob was constant)
- Added DGS3MO (3-month T-bill, not in base features) for new information
- 4 output columns: yc_curvature_z, yc_spread_velocity_z, yc_10y3m_velocity_z, yc_dgs3mo_velocity_z
- 50 Optuna trials (up from 30)

In [None]:
import subprocess
import sys

subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas-datareader', '-q'])

import numpy as np
import pandas as pd
import pandas_datareader.data as pdr
import yfinance as yf
import optuna
import json
import os
import warnings
from datetime import datetime
from sklearn.metrics import mutual_info_score

# Suppress every Python warning and restrict Optuna's logger to WARNING and above
# so the notebook output only contains the explicit progress prints below.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed NumPy's global RNG; the Optuna sampler receives its own seed where the study is created.
np.random.seed(42)

# Run banner: identifies the submodel/attempt and records the wall-clock start time.
print(f'=== Gold SubModel Training: yield_curve attempt 2 ===')
print(f'Approach: Deterministic z-score features (no HMM, no API key needed)')
print(f'Started: {datetime.now().isoformat()}')

In [None]:
# FRED series are read through pandas_datareader's 'fred' source (see fetch_data),
# so no FRED API key is configured in this cell.
print('Using pandas_datareader for FRED data (no API key required)')

In [None]:
def fetch_data():
    """
    Download Treasury yields from FRED and the GLD price series, then align them.

    Yields (DGS10, DGS2, DGS5, DGS3MO) are read via pandas_datareader starting
    2014-01-01 and divided by 100. Gaps of up to 5 consecutive rows are
    forward-filled and any row still incomplete is dropped. Three derived
    columns are added: spread_10y2y, spread_10y3m and curvature
    (dgs5 - 0.5 * (dgs2 + dgs10)).

    The target is built from the GLD close downloaded with yfinance:
    gold_return_next is the following row's percentage change of the close,
    expressed in percent. Rows without a next-day return are dropped.

    Returns:
        (df, target): two DataFrames indexed by 'date', restricted to the
        dates present in both and not earlier than 2015-01-01.

    Raises:
        ValueError: if the GLD download is empty or has fewer than 100 rows.
    """
    start = '2014-01-01'
    print('Fetching yield curve data from FRED via pandas_datareader...')

    # One request per FRED series; the column name is the lower-cased series id.
    fred_ids = ('DGS10', 'DGS2', 'DGS5', 'DGS3MO')
    yields = {
        series_id.lower(): pdr.DataReader(series_id, 'fred', start=start)[series_id] / 100
        for series_id in fred_ids
    }

    df = pd.DataFrame(yields)
    df.index = pd.to_datetime(df.index)
    df.index.name = 'date'

    # Bridge short gaps (at most 5 consecutive rows), then discard incomplete rows.
    df = df.ffill(limit=5).dropna()

    # Slope and curvature measures derived from the raw yields.
    df = df.assign(
        spread_10y2y=df['dgs10'] - df['dgs2'],
        spread_10y3m=df['dgs10'] - df['dgs3mo'],
        curvature=df['dgs5'] - 0.5 * (df['dgs2'] + df['dgs10']),
    )

    print(f'Yield data: {len(df)} rows from {df.index.min().date()} to {df.index.max().date()}')

    # Prediction target: next-day GLD return.
    print('Fetching GLD for gold target...')
    gld = yf.download('GLD', start='2015-01-01', progress=False)
    if gld.empty:
        raise ValueError('GLD download returned empty DataFrame')
    if len(gld) < 100:
        raise ValueError(f'GLD download too short: {len(gld)} rows')

    # With MultiIndex columns, 'Close' selects a frame; take its first column.
    close = gld['Close']
    gold_close = close.iloc[:, 0] if isinstance(gld.columns, pd.MultiIndex) else close

    target = pd.DataFrame({'gold_close': gold_close.values}, index=gld.index)
    target.index = pd.to_datetime(target.index)
    target.index.name = 'date'
    next_day_return = target['gold_close'].pct_change().shift(-1) * 100
    target['gold_return_next'] = next_day_return
    target = target.dropna(subset=['gold_return_next'])

    # Keep only dates present in both frames, from 2015-01-01 onwards.
    shared_dates = df.index.intersection(target.index)
    shared_dates = shared_dates[shared_dates >= pd.Timestamp('2015-01-01')]
    df, target = df.loc[shared_dates], target.loc[shared_dates]

    print(f'Aligned: {len(df)} rows from {df.index.min().date()} to {df.index.max().date()}')
    return df, target

In [None]:
def generate_features(df, change_window, velocity_zscore_window, curvature_zscore_window):
    """
    Build the 4 deterministic yield-curve features (no HMM).

    Every feature is a rolling z-score of a *change* in an input column:
    the series is differenced over a lag, standardised with the rolling mean
    and rolling standard deviation of those differences, clipped to [-4, 4],
    and finally forward-filled.

    Args:
        df: frame with columns 'curvature', 'spread_10y2y', 'spread_10y3m'
            and 'dgs3mo'; its index is reused for the result.
        change_window: differencing lag for the three velocity features.
        velocity_zscore_window: rolling window used to standardise the
            velocity features.
        curvature_zscore_window: rolling window used to standardise the
            1-row curvature change.

    Returns:
        DataFrame with columns, in this order:
          yc_curvature_z        : z-score of the 1-row change in curvature
          yc_spread_velocity_z  : z-score of the N-row change in the 10Y-2Y spread
          yc_10y3m_velocity_z   : z-score of the N-row change in the 10Y-3M spread
          yc_dgs3mo_velocity_z  : z-score of the N-row change in DGS3MO
        Leading rows stay NaN until the lag plus the rolling window is filled.
    """
    def _change_zscore(values, lag, lookback):
        delta = values.diff(lag)
        roller = delta.rolling(lookback)
        standardized = (delta - roller.mean()) / roller.std()
        # Clip extremes first, then carry the last valid score over any later NaN.
        return standardized.clip(-4, 4).ffill()

    # (output column, source column, differencing lag, normalisation window)
    specs = (
        ('yc_curvature_z', 'curvature', 1, curvature_zscore_window),
        ('yc_spread_velocity_z', 'spread_10y2y', change_window, velocity_zscore_window),
        ('yc_10y3m_velocity_z', 'spread_10y3m', change_window, velocity_zscore_window),
        ('yc_dgs3mo_velocity_z', 'dgs3mo', change_window, velocity_zscore_window),
    )

    features = pd.DataFrame(index=df.index)
    for out_name, source_col, lag, lookback in specs:
        features[out_name] = _change_zscore(df[source_col], lag, lookback)
    return features

In [None]:
def create_objective(df, target_df, train_end, val_end):
    """
    Build the Optuna objective used to tune the feature windows.

    The returned callable samples change_window, velocity_zscore_window and
    curvature_zscore_window, regenerates the features with generate_features,
    and scores them on the validation rows [train_end:val_end] as the sum,
    over feature columns, of the mutual information between the
    quantile-binned feature and the quantile-binned 'gold_return_next'.

    Args:
        df: input frame passed through to generate_features.
        target_df: frame holding the 'gold_return_next' column, row-aligned with df.
        train_end: positional index where the validation slice starts.
        val_end: positional index where the validation slice ends (exclusive).

    Returns:
        objective(trial) -> float. A column with fewer than 50 jointly
        non-NaN observations, or one that cannot be binned, contributes
        nothing. Any exception inside a trial is printed and scored as 0.0.
    """
    val_y = target_df['gold_return_next'].iloc[train_end:val_end].values

    def discretize(x, bins=20):
        # Quantile-bin x into integer codes; None when there is too little data
        # or pandas cannot form the bins.
        missing = np.isnan(x)
        if np.count_nonzero(~missing) < bins:
            return None
        filled = x.copy()
        filled[missing] = np.nanmedian(x)
        try:
            codes = pd.qcut(filled, bins, labels=False, duplicates='drop')
        except Exception:
            return None
        return codes

    def objective(trial):
        # Sampling order is kept fixed so the seeded sampler sees the same sequence.
        change_window = trial.suggest_categorical('change_window', [3, 5, 10])
        velocity_zscore_window = trial.suggest_categorical('velocity_zscore_window', [30, 60, 90, 120])
        curvature_zscore_window = trial.suggest_categorical('curvature_zscore_window', [30, 60, 90, 120])

        try:
            features = generate_features(df, change_window, velocity_zscore_window, curvature_zscore_window)
            val_X = features.iloc[train_end:val_end]

            mi_sum = 0.0
            for col in features.columns:
                column_vals = val_X[col].values
                # Keep rows where both the feature and the target are present.
                usable = ~(np.isnan(column_vals) | np.isnan(val_y))
                if usable.sum() < 50:
                    continue
                feat_codes = discretize(column_vals[usable])
                tgt_codes = discretize(val_y[usable])
                if feat_codes is None or tgt_codes is None:
                    continue
                mi_sum += mutual_info_score(feat_codes, tgt_codes)
            return mi_sum
        except Exception as e:
            # Best-effort: a failing parameter set is reported and scored as zero.
            print(f'Trial failed: {e}')
            return 0.0

    return objective

In [None]:
# === Fetch Data ===
# data_df holds the yield-curve inputs, target_df the gold target; both share one date index.
data_df, target_df = fetch_data()

# Positional 70% / 15% / 15% split into train / validation / test.
# NOTE(review): this is a time-ordered split only if the index is sorted by date - confirm.
# n, train_end and val_end are read again by the later cells.
n = len(data_df)
train_end = int(n * 0.70)
val_end = int(n * 0.85)

# Report the size and first/last date of each partition.
print(f'\nData split:')
print(f'  Train: {train_end} rows ({data_df.index[0].date()} - {data_df.index[train_end-1].date()})')
print(f'  Val:   {val_end - train_end} rows ({data_df.index[train_end].date()} - {data_df.index[val_end-1].date()})')
print(f'  Test:  {n - val_end} rows ({data_df.index[val_end].date()} - {data_df.index[-1].date()})')

In [None]:
# === Optuna HPO: 50 trials ===
print('Running Optuna HPO (50 trials)...')
# Maximise the objective (sum of validation mutual information) with a seeded TPE sampler.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)

# The search stops after 50 trials or when the 600 timeout (seconds, per the
# Optuna API) is reached, whichever comes first.
study.optimize(
    create_objective(data_df, target_df, train_end, val_end),
    n_trials=50,
    timeout=600
)

# best_params is later unpacked as keyword arguments into generate_features.
best_params = study.best_params
print(f'\nOptuna complete:')
print(f'  Best MI sum: {study.best_value:.4f}')
print(f'  Best params: {best_params}')
# len(study.trials) counts every recorded trial; the objective turns its own
# exceptions into a 0.0 score, so those trials are included in this number.
print(f'  Completed trials: {len(study.trials)}')

In [None]:
# === Generate Final Output with Best Params ===
print('\nGenerating final submodel output...')
# Recompute the features over the full date range with the tuned windows.
output = generate_features(data_df, **best_params)

print(f'Output shape: {output.shape}')
print(f'Output columns: {list(output.columns)}')
print(f'Date range: {output.index.min().date()} to {output.index.max().date()}')
print('\nOutput summary:')
print(output.describe())

# Gate 1 checks: std and autocorrelation
# Computed on the training rows only, after dropping NaN values per column.
print('\nGate 1 checks (training set):')
train_output = output.iloc[:train_end]
autocorr_results = {}
for col in output.columns:
    vals = train_output[col].dropna().values
    std_val = np.std(vals)
    if len(vals) > 1:
        # Lag-1 autocorrelation: correlate the series with itself shifted by one row.
        ac = np.corrcoef(vals[:-1], vals[1:])[0, 1]
    else:
        ac = 0.0
    autocorr_results[col] = float(ac)
    # A non-finite std (empty / all-NaN column) must not pass: NaN compares False
    # against every threshold, so without the isfinite guards it would read as OK.
    if not np.isfinite(std_val) or std_val < 1e-6:
        status = 'FAIL'
    elif not np.isfinite(ac) or abs(ac) > 0.95:
        # An autocorrelation that cannot be computed is flagged rather than passed.
        status = 'WARN'
    else:
        status = 'OK'
    print(f'  {col}: std={std_val:.4f}, autocorr={ac:.4f} [{status}]')

In [None]:
# === Gate 2 prep: MI on validation set ===
def discretize_final(x, bins=20):
    """
    Quantile-bin a 1-D float array into integer bin codes.

    NaN entries are replaced by the median of the non-NaN values before
    binning; the input array itself is left untouched.

    Args:
        x: NumPy array of values, possibly containing NaN.
        bins: number of quantile bins requested (duplicate edges are dropped).

    Returns:
        The bin codes produced by pd.qcut, or None when fewer than `bins`
        non-NaN values are available or pd.qcut raises.
    """
    missing = np.isnan(x)
    if np.count_nonzero(~missing) < bins:
        return None

    filled = x.copy()
    filled[missing] = np.nanmedian(x)
    try:
        codes = pd.qcut(filled, bins, labels=False, duplicates='drop')
    except Exception:
        return None
    return codes

# Validation slice of the final features and of the next-day gold return.
val_X = output.iloc[train_end:val_end]
val_y = target_df['gold_return_next'].iloc[train_end:val_end]

# Mutual information of each feature with the target; 0.0 when binning is not possible.
mi_results = {}
for col in output.columns:
    feat_val = val_X[col].values
    # Keep only rows where neither the feature nor the target is NaN.
    mask = ~(np.isnan(feat_val) | np.isnan(val_y.values))
    feat_disc, tgt_disc = discretize_final(feat_val[mask]), discretize_final(val_y.values[mask])
    if feat_disc is None or tgt_disc is None:
        mi_results[col] = 0.0
        continue
    mi_results[col] = float(mutual_info_score(feat_disc, tgt_disc))

mi_sum = sum(mi_results.values())
print(f'MI results (validation set):')
for col, mi in mi_results.items():
    print(f'  {col}: {mi:.4f}')
print(f'  MI Sum: {mi_sum:.4f}')

In [None]:
# === Save Results ===
print('\nSaving results...')

# Write the features as CSV with the date index turned into a leading 'Date' column.
output_with_date = output.reset_index()
output_with_date.columns = ['Date'] + list(output.columns)
output_with_date.to_csv('submodel_output.csv', index=False)

# Run summary: tuned parameters, Gate 1/Gate 2 metrics and the split sizes.
result = {
    'feature': 'yield_curve',
    'attempt': 2,
    'timestamp': datetime.now().isoformat(),
    'best_params': best_params,
    'metrics': {
        'mi_individual': mi_results,
        'mi_sum': mi_sum,
        'autocorr': autocorr_results,
        'optuna_best_value': float(study.best_value),
        'optuna_trials_completed': len(study.trials)
    },
    'output_shape': list(output.shape),
    'output_columns': list(output.columns),
    'data_info': {
        'total_samples': len(data_df),
        'train_samples': train_end,
        'val_samples': val_end - train_end,
        'test_samples': n - val_end,
        'date_range_start': str(data_df.index.min().date()),
        'date_range_end': str(data_df.index.max().date())
    }
}

# default=str makes json.dump fall back to str() for any value it cannot serialise natively.
with open('training_result.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=2, default=str)

print(f'=== Training complete! ===')
print(f'Finished: {datetime.now().isoformat()}')
print(f'Files: submodel_output.csv, training_result.json')
print(f'Output shape: {output.shape}')
print(f'Columns: {list(output.columns)}')