# Gold Prediction SubModel Training - Options Market Attempt 1

Self-contained notebook: Data fetch → Preprocessing → 2D HMM + Z-Score + Momentum → Optuna HPO → Save results

**Architecture:**
- Component 1: 2D HMM on [SKEW daily changes, GVZ daily changes]
- Component 2: SKEW tail risk z-score (rolling window)
- Component 3: SKEW momentum z-score (n-day change in the SKEW level, z-scored over a rolling window)

**Output:** 3 columns (options_risk_regime_prob, options_tail_risk_z, options_skew_momentum_z)

## 1. Install Dependencies

In [None]:
import subprocess
import sys

# Install the third-party packages this notebook imports that may not be
# preinstalled on the runtime image. Besides hmmlearn and fredapi, the notebook
# also imports yfinance and optuna, so they are requested here as well; pip
# skips requirements that are already satisfied.
print("Installing dependencies...")
subprocess.check_call(
    [
        sys.executable, '-m', 'pip', 'install', '-q',
        'hmmlearn', 'fredapi', 'yfinance', 'optuna',
    ]
)
print("Installation complete.")

## 2. Imports

In [None]:
import numpy as np
import pandas as pd
from fredapi import Fred
import yfinance as yf
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import mutual_info_score
import optuna
import json
import os
from datetime import datetime
import warnings

# Suppress every warning category for the rest of the session so library
# warnings do not clutter the cell output.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global RNG so NumPy-driven randomness is repeatable across runs.
np.random.seed(seed=42)

print("All imports successful.")

## 3. Data Fetching (Self-Contained)

In [None]:
def fetch_and_preprocess():
    """Download SKEW and GVZ, align them, and return chronological splits.

    SKEW closes are read from Yahoo Finance (``^SKEW``). GVZ closes are read
    from the FRED series ``GVZCLS``; if that request raises, Yahoo Finance
    ``^GVZ`` is used instead. The two series are inner-joined on date, gaps
    are forward-filled across at most 3 consecutive rows, rows that still
    contain NaN are dropped, and first differences are added as the
    ``skew_change`` and ``gvz_change`` columns.

    Returns:
        (train_df, val_df, test_df, full_df): the first 70%, next 15% and
        final 15% of rows (in time order), followed by the unsplit frame.

    Raises:
        RuntimeError: if no FRED API key can be found, if the SKEW download
            is empty, or if the Yahoo GVZ fallback download is empty.
    """
    yahoo_start = '2014-10-01'  # warmup buffer (90 days before 2015-01-30)
    yahoo_end = '2026-02-15'

    # Resolve the FRED key: Kaggle Secrets first, then the local environment.
    try:
        from kaggle_secrets import UserSecretsClient
        api_key = UserSecretsClient().get_secret("FRED_API_KEY")
    except Exception as secrets_error:
        api_key = os.environ.get('FRED_API_KEY')
        if api_key is None:
            raise RuntimeError(
                "FRED_API_KEY not found. "
                "Kaggle: register in Secrets / Local: set in .env"
            ) from secrets_error

    # ---- SKEW (Yahoo Finance) ----
    print("Fetching SKEW from Yahoo Finance...")
    skew_data = yf.Ticker("^SKEW").history(start=yahoo_start, end=yahoo_end)
    if not len(skew_data):
        raise RuntimeError("SKEW data fetch failed: no data returned from Yahoo Finance")

    skew_df = skew_data['Close'].to_frame(name='skew_close')
    # Strip the timezone so the index can be joined with the GVZ index.
    skew_df.index = pd.to_datetime(skew_df.index).tz_localize(None)
    skew_df = skew_df.sort_index()
    print(f"SKEW: {len(skew_df)} rows from {skew_df.index.min()} to {skew_df.index.max()}")

    # ---- GVZ (FRED first, Yahoo Finance as fallback) ----
    print("Fetching GVZ from FRED...")
    fred = Fred(api_key=api_key)
    try:
        gvz_series = fred.get_series('GVZCLS', observation_start='2014-10-01')
        gvz_df = gvz_series.to_frame(name='gvz_close')
        gvz_df.index = pd.to_datetime(gvz_df.index)
        print(f"GVZ (FRED): {len(gvz_df)} rows from {gvz_df.index.min()} to {gvz_df.index.max()}")
    except Exception as fred_error:
        print(f"FRED GVZ fetch failed: {fred_error}. Falling back to Yahoo Finance...")
        gvz_data = yf.Ticker("^GVZ").history(start=yahoo_start, end=yahoo_end)
        if not len(gvz_data):
            raise RuntimeError("GVZ data fetch failed from both FRED and Yahoo Finance")

        gvz_df = gvz_data['Close'].to_frame(name='gvz_close')
        gvz_df.index = pd.to_datetime(gvz_df.index).tz_localize(None)
        print(f"GVZ (Yahoo): {len(gvz_df)} rows from {gvz_df.index.min()} to {gvz_df.index.max()}")

    gvz_df = gvz_df.sort_index()

    # ---- Keep only dates present in both series ----
    print("Aligning SKEW and GVZ on common dates...")
    df = pd.merge(skew_df, gvz_df, left_index=True, right_index=True, how='inner')
    print(f"After alignment: {len(df)} rows from {df.index.min()} to {df.index.max()}")

    # ---- Fill short gaps (up to 3 rows), then discard what is still missing ----
    df = df.ffill(limit=3)
    rows_before = len(df)
    df = df.dropna()
    dropped = rows_before - len(df)
    if dropped > 0:
        print(f"Dropped {dropped} rows with NaN after forward-fill")

    # ---- First differences; the first row has no predecessor and is dropped ----
    for prefix in ('skew', 'gvz'):
        df[f'{prefix}_change'] = df[f'{prefix}_close'].diff()
    df = df.dropna()

    # ---- Descriptive statistics for each index ----
    for label, prefix in (('SKEW', 'skew'), ('GVZ', 'gvz')):
        level = df[f'{prefix}_close']
        change = df[f'{prefix}_change']
        print(f"\n=== {label} Statistics ===")
        print(f"Mean: {level.mean():.2f}")
        print(f"Std: {level.std():.2f}")
        print(f"Min: {level.min():.2f}")
        print(f"Max: {level.max():.2f}")
        print(f"Autocorr(1): {level.autocorr(lag=1):.4f}")
        print(f"Change Autocorr(1): {change.autocorr(lag=1):.4f}")

    change_corr = df[['skew_change', 'gvz_change']].corr()
    print("\n=== Change Correlation ===")
    print(f"SKEW change vs GVZ change: {change_corr.loc['skew_change', 'gvz_change']:.4f}")

    # ---- Chronological 70/15/15 split ----
    n = len(df)
    train_end = int(n * 0.70)
    val_end = int(n * 0.85)

    train_df = df.iloc[:train_end].copy()
    val_df = df.iloc[train_end:val_end].copy()
    test_df = df.iloc[val_end:].copy()

    print("\n=== Data Split ===")
    for split_name, part in (('Train', train_df), ('Val', val_df), ('Test', test_df)):
        print(f"{split_name}: {len(part)} rows ({part.index.min()} to {part.index.max()})")
    print(f"Total: {n} rows")

    return train_df, val_df, test_df, df

# Run the data fetch once and keep the three splits plus the full frame as
# notebook-level variables for the cells below.
banner = "=" * 80
print(banner)
print("DATA FETCHING")
print(banner)
splits = fetch_and_preprocess()
train_data, val_data, test_data, full_data = splits
print(f"\nData fetching complete. Full dataset shape: {full_data.shape}")

## 4. Feature Generation Functions

In [None]:
def generate_regime_feature(skew_changes, gvz_changes, n_components, n_init, train_size):
    """Fit a 2D Gaussian HMM and return the high-variance state probability.

    The HMM is fitted on the first ``train_size`` rows of
    ``[skew_changes, gvz_changes]`` only, then state posteriors are computed
    for every row.

    ``GaussianHMM`` has no ``n_init`` argument (passing it raises
    ``TypeError``), so the restarts are done explicitly here: the model is
    fitted ``n_init`` times with different seeds and the fit with the highest
    training log-likelihood is kept.

    Args:
        skew_changes: 1-D array-like of SKEW daily changes.
        gvz_changes: 1-D array-like of GVZ daily changes, same length.
        n_components: number of hidden states.
        n_init: number of random restarts (values below 1 are treated as 1).
        train_size: number of leading rows used for fitting.

    Returns:
        1-D array with, for each row, the posterior probability of the state
        whose covariance matrix has the largest trace.

    Raises:
        RuntimeError: if none of the restarts produced a usable fit.
    """
    # Prepare the 2D observation matrix and its training prefix.
    X = np.column_stack([skew_changes, gvz_changes])
    X_train = X[:train_size]

    best_model = None
    best_loglik = -np.inf
    last_error = None
    for restart in range(max(1, int(n_init))):
        candidate = GaussianHMM(
            n_components=n_components,
            covariance_type='full',
            n_iter=100,
            tol=1e-4,
            # Distinct, deterministic seed per restart; restart 0 uses 42.
            random_state=42 + restart,
        )
        try:
            candidate.fit(X_train)
            loglik = candidate.score(X_train)
        except (ValueError, np.linalg.LinAlgError) as err:
            # A degenerate initialization should not abort the other restarts.
            last_error = err
            continue
        if np.isfinite(loglik) and loglik > best_loglik:
            best_model, best_loglik = candidate, loglik

    if best_model is None:
        raise RuntimeError(
            f"GaussianHMM could not be fitted in any of {max(1, int(n_init))} restarts"
        ) from last_error

    # NOTE(review): predict_proba returns posteriors computed over the whole
    # sequence passed in, so the value at time t also depends on later
    # observations. Confirm this is acceptable for downstream use.
    probs = best_model.predict_proba(X)

    # The state with the largest covariance trace is taken as the most volatile.
    traces = [np.trace(cov) for cov in best_model.covars_]
    high_var_state = int(np.argmax(traces))

    return probs[:, high_var_state]


def generate_tail_risk_z(skew_levels, window):
    """Return the rolling z-score of the SKEW level, clipped to [-4, 4].

    Each value is compared with the mean and standard deviation of the
    trailing ``window`` observations (current one included). Positions with
    fewer than ``window`` observations available are NaN.

    Args:
        skew_levels: 1-D array-like of SKEW levels.
        window: rolling window length, also used as the minimum sample count.

    Returns:
        NumPy array of z-scores with the same length as ``skew_levels``.
    """
    levels = pd.Series(skew_levels)
    trailing = levels.rolling(window, min_periods=window)
    scores = levels.sub(trailing.mean()).div(trailing.std())
    # Cap extreme readings at four standard deviations either side.
    return scores.clip(lower=-4, upper=4).to_numpy()


def generate_skew_momentum_z(skew_levels, momentum_window, zscore_window=60):
    """Return the rolling z-score of the SKEW n-period change, clipped to [-4, 4].

    Momentum is the difference between the current level and the level
    ``momentum_window`` rows earlier. That difference is then standardized
    against its own trailing ``zscore_window`` mean and standard deviation.
    Positions without a full window of momentum values are NaN.

    Args:
        skew_levels: 1-D array-like of SKEW levels.
        momentum_window: lag, in rows, used for the difference.
        zscore_window: rolling window (and minimum sample count) for the z-score.

    Returns:
        NumPy array of z-scores with the same length as ``skew_levels``.
    """
    levels = pd.Series(skew_levels)
    delta = levels.diff(periods=momentum_window)
    trailing = delta.rolling(window=zscore_window, min_periods=zscore_window)
    scores = delta.sub(trailing.mean()).div(trailing.std())
    # Cap extreme readings at four standard deviations either side.
    return scores.clip(lower=-4, upper=4).to_numpy()

# Progress marker printed after the feature-generation definitions above.
print("Feature generation functions defined.")

## 5. Optuna Objective Function

In [None]:
def discretize(x, bins=20):
    """Bin a continuous array into integer codes for mutual-information use.

    NaN entries are replaced by the median of the non-NaN values on a copy;
    the input itself is left untouched. Quantile bins are tried first, with
    equal-width bins as the fallback if quantile binning raises ``ValueError``.

    Args:
        x: array of values, possibly containing NaN.
        bins: requested number of bins.

    Returns:
        Integer bin codes, or ``None`` when there are fewer non-NaN values
        than ``bins``.
    """
    missing = np.isnan(x)
    if (~missing).sum() < bins:
        return None

    filled = x.copy()
    filled[missing] = np.nanmedian(x)

    try:
        codes = pd.qcut(filled, bins, labels=False, duplicates='drop')
    except ValueError:
        # Quantile binning failed; fall back to equal-width bins.
        codes = pd.cut(filled, bins, labels=False, duplicates='drop')
    return codes


def objective(trial):
    """Optuna objective: sum of mutual information on the validation slice.

    Samples the four categorical hyperparameters, builds the three features
    over ``full_data``, and scores each feature's validation-period values
    against the validation-period GVZ daily change using mutual information
    on discretized values. Any exception raised while building or scoring
    features is printed and the trial is scored 0.0.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The summed mutual information (float), or 0.0 if the trial failed.
    """
    # Sample in a fixed order so seeded studies stay reproducible.
    search_space = (
        ('hmm_n_components', [2, 3]),
        ('hmm_n_init', [3, 5, 10]),
        ('skew_zscore_window', [40, 60, 90]),
        ('skew_momentum_window', [5, 10, 15]),
    )
    sampled = {
        name: trial.suggest_categorical(name, choices)
        for name, choices in search_space
    }

    try:
        skew_close = full_data['skew_close'].values
        gvz_change = full_data['gvz_change'].values

        features = (
            generate_regime_feature(
                full_data['skew_change'].values,
                gvz_change,
                sampled['hmm_n_components'],
                sampled['hmm_n_init'],
                len(train_data),
            ),
            generate_tail_risk_z(skew_close, sampled['skew_zscore_window']),
            generate_skew_momentum_z(skew_close, sampled['skew_momentum_window']),
        )

        # Validation rows sit directly after the training rows.
        val_rows = slice(len(train_data), len(train_data) + len(val_data))

        # No gold-return target is available in this notebook, so the GVZ daily
        # change over the validation period stands in as a proxy target.
        target_val = gvz_change[val_rows]

        mi_sum = 0.0
        for feature in features:
            feat_val = feature[val_rows]
            usable = ~(np.isnan(feat_val) | np.isnan(target_val))
            if usable.sum() <= 50:
                continue
            feat_disc = discretize(feat_val[usable])
            tgt_disc = discretize(target_val[usable])
            if feat_disc is None or tgt_disc is None:
                continue
            mi_sum += mutual_info_score(feat_disc, tgt_disc)

        return mi_sum
    except Exception as e:
        print(f"Trial failed: {e}")
        return 0.0

# Progress marker printed after the objective definitions above.
print("Optuna objective function defined.")

## 6. Run Hyperparameter Optimization

In [None]:
banner = "=" * 80
print(banner)
print("HYPERPARAMETER OPTIMIZATION")
print(banner)

# Seeded TPE sampler so the sequence of suggested parameters is repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)

# Stop after 30 trials or 300 seconds (5 minutes), whichever comes first.
study.optimize(objective, n_trials=30, timeout=300, show_progress_bar=True)

print("\nOptuna optimization complete.")
print(f"Best trial: {study.best_trial.number}")
print(f"Best value: {study.best_value:.6f}")
print(f"Best params: {study.best_params}")
print(f"Completed trials: {len(study.trials)}")

## 7. Generate Final Output with Best Parameters

In [None]:
banner = "=" * 80
print(banner)
print("FINAL FEATURE GENERATION")
print(banner)

best_params = study.best_params

# Rebuild each feature over the full dataset with the winning hyperparameters.
skew_close_values = full_data['skew_close'].values

options_risk_regime_prob = generate_regime_feature(
    full_data['skew_change'].values,
    full_data['gvz_change'].values,
    best_params['hmm_n_components'],
    best_params['hmm_n_init'],
    len(train_data),
)
options_tail_risk_z = generate_tail_risk_z(
    skew_close_values, best_params['skew_zscore_window']
)
options_skew_momentum_z = generate_skew_momentum_z(
    skew_close_values, best_params['skew_momentum_window']
)

# Assemble the three columns on the same date index as the input data.
output_df = pd.DataFrame(
    {
        'options_risk_regime_prob': options_risk_regime_prob,
        'options_tail_risk_z': options_tail_risk_z,
        'options_skew_momentum_z': options_skew_momentum_z,
    },
    index=full_data.index,
)

print(f"\nOutput shape: {output_df.shape}")
print(f"Output columns: {list(output_df.columns)}")
print(f"Date range: {output_df.index.min()} to {output_df.index.max()}")
print("\nOutput statistics:")
print(output_df.describe())

# Report missing values before filling.
nan_counts = output_df.isna().sum()
print("\nNaN counts:")
print(nan_counts)

# Forward-fill first, then back-fill so the leading warmup rows (which have
# no earlier value to carry forward) receive the first available value.
output_df = output_df.ffill().bfill()
print("\nAfter forward/back fill, NaN counts:")
print(output_df.isna().sum())

## 8. Calculate Final Metrics

In [None]:
banner = "=" * 80
print(banner)
print("FINAL METRICS")
print(banner)

# Per-column persistence (lag-1 and lag-5 autocorrelation) and summary stats.
autocorr_metrics = {
    column: {
        'autocorr_lag1': series.autocorr(lag=1),
        'autocorr_lag5': series.autocorr(lag=5),
        'mean': series.mean(),
        'std': series.std(),
        'min': series.min(),
        'max': series.max(),
    }
    for column, series in output_df.items()
}

print("\nAutocorrelation and summary statistics:")
for feature_name, stats in autocorr_metrics.items():
    print(f"\n{feature_name}:")
    for stat_name, stat_value in stats.items():
        print(f"  {stat_name}: {stat_value:.4f}")

# Pairwise correlation between the three output columns.
print("\nCross-correlation between output features:")
print(output_df.corr())

## 9. Save Results

In [None]:
print("=" * 80)
print("SAVING RESULTS")
print("=" * 80)

# Save submodel output CSV (the date index is written as the first column).
output_df.to_csv('submodel_output.csv')
print("Saved: submodel_output.csv")

# Collect run metadata, chosen hyperparameters and output diagnostics.
result = {
    "feature": "options_market",
    "attempt": 1,
    "timestamp": datetime.now().isoformat(),
    "best_params": best_params,
    "optuna_trials_completed": len(study.trials),
    "optuna_best_value": study.best_value,
    "output_shape": list(output_df.shape),
    "output_columns": list(output_df.columns),
    "data_info": {
        "train_samples": len(train_data),
        "val_samples": len(val_data),
        "test_samples": len(test_data),
        "full_samples": len(full_data),
        "date_range": {
            "start": str(full_data.index.min()),
            "end": str(full_data.index.max())
        }
    },
    "autocorrelation_metrics": autocorr_metrics,
    "output_statistics": output_df.describe().to_dict()
}

# default=str stringifies any value json cannot encode natively.
with open('training_result.json', 'w') as f:
    json.dump(result, f, indent=2, default=str)
print("Saved: training_result.json")

# A blank line followed by one 80-character rule. The previous
# `"\n=" * 80` repeated the two-character string 80 times, which printed
# 80 separate lines each holding a single "=".
print("\n" + "=" * 80)
print("TRAINING COMPLETE")
print("=" * 80)
print(f"Feature: options_market")
print(f"Attempt: 1")
print(f"Output shape: {output_df.shape}")
print(f"Output columns: {list(output_df.columns)}")
print(f"Best params: {best_params}")
print(f"Optuna best value: {study.best_value:.6f}")
print(f"\nFinished: {datetime.now().isoformat()}")
print("=" * 80)