# Gold Prediction SubModel Training - Options Market Attempt 1

Self-contained: Data fetch -> Preprocessing -> HMM + Z-Score + Momentum -> Optuna HPO -> Save results

**Data Sources**: Yahoo Finance ONLY (no FRED dependency)
- SKEW: ^SKEW (Yahoo)
- GVZ: ^GVZ (Yahoo)
- Gold futures: GC=F (Yahoo), used only to build the next-day return target

**Output**: 3 columns
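- options_risk_regime_prob: HMM posterior probability of the highest-variance state (largest covariance trace)
- options_tail_risk_z: rolling z-score of the SKEW level, clipped to [-4, 4]
- options_skew_momentum_z: rolling z-score of the SKEW change over the momentum window, clipped to [-4, 4]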

In [None]:
# === 1. Libraries ===
import subprocess
import sys

# Install through the interpreter that is running this notebook. A bare
# `pip` resolved from PATH may belong to a different environment, in which
# case the `hmmlearn` import below would still fail after a "successful" install.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'hmmlearn'])

import numpy as np
import pandas as pd
import yfinance as yf
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import mutual_info_score
import optuna
import json
import os
from datetime import datetime

np.random.seed(42)

In [None]:
# === 2. Data Fetching ===
print("Fetching data...")


def _to_calendar_dates(series):
    """Return a copy of *series* indexed by tz-naive, midnight-normalized dates.

    The series below are joined on their index. If the downloaded indexes
    carry different time zones or intraday offsets, the same trading day would
    not compare equal across symbols, so each series is reduced to plain
    calendar dates before alignment.
    """
    idx = pd.DatetimeIndex(series.index)
    if idx.tz is not None:
        # Drop the zone but keep the local wall-clock date.
        idx = idx.tz_localize(None)
    out = series.copy()
    out.index = idx.normalize()
    return out


# Fetch SKEW from Yahoo Finance
skew_ticker = yf.Ticker('^SKEW')
skew_data = skew_ticker.history(start='2014-10-01', end='2025-02-15')
skew_close = _to_calendar_dates(skew_data['Close'])
print(f"SKEW data: {len(skew_close)} rows")

# Fetch GVZ from Yahoo Finance
gvz_ticker = yf.Ticker('^GVZ')
gvz_data = gvz_ticker.history(start='2014-10-01', end='2025-02-15')
gvz_close = _to_calendar_dates(gvz_data['Close'])
print(f"GVZ data: {len(gvz_close)} rows")

# Fetch Gold for target variable: next-day percentage return
gold_ticker = yf.Ticker('GC=F')
gold_data = gold_ticker.history(start='2014-10-01', end='2025-02-15')
gold_close = _to_calendar_dates(gold_data['Close'])
gold_return = gold_close.pct_change() * 100
gold_return_next = gold_return.shift(-1)
print(f"Gold data: {len(gold_close)} rows")

# Align on common dates (merge on index)
df = pd.DataFrame({
    'skew': skew_close,
    'gvz': gvz_close
})
df = df.join(gold_return_next.to_frame(name="gold_return_next"), how="inner")

# Forward-fill gaps up to 3 days in the FEATURES only. The target must not be
# forward-filled: doing so would copy the previous day's label into rows whose
# next-day return is unknown (e.g. the final row produced by shift(-1)).
feature_cols = ['skew', 'gvz']
df[feature_cols] = df[feature_cols].ffill(limit=3)
df = df.dropna()

if df.empty:
    # Fail with a clear message instead of an IndexError on df.index[0] below.
    raise RuntimeError(
        "No overlapping SKEW/GVZ/Gold rows after alignment; check the Yahoo downloads."
    )

print(f"Aligned data: {len(df)} rows")
print(f"Date range: {df.index[0]} to {df.index[-1]}")
# Single quotes inside the f-string: reusing the outer quote character is a
# SyntaxError on Python versions before 3.12.
print(f"SKEW range: [{df['skew'].min():.2f}, {df['skew'].max():.2f}]")
print(f"GVZ range: [{df['gvz'].min():.2f}, {df['gvz'].max():.2f}]")

In [None]:
# === 3. Compute Changes ===
# First differences of both index levels; these feed the HMM later on.
for level_col in ('skew', 'gvz'):
    df[f'{level_col}_change'] = df[level_col].diff()

# The first row has no predecessor, so its differences are NaN: remove it.
df = df.dropna()

print(f"After computing changes: {len(df)} rows")
for label, change_col in (("SKEW", 'skew_change'), ("GVZ", 'gvz_change')):
    print(f"{label} change range: [{df[change_col].min():.2f}, {df[change_col].max():.2f}]")

In [None]:
# === 4. Data Split ===
# Chronological 70/15/15 partition into train/val/test (no shuffling).
n = len(df)
train_size = int(n * 0.70)
val_size = int(n * 0.15)
# Whatever is left after integer truncation goes to the test segment.
test_size = n - train_size - val_size

# Boolean row selectors built from positional indices.
row_positions = np.arange(n)
val_end = train_size + val_size
train_mask = row_positions < train_size
val_mask = (row_positions >= train_size) & (row_positions < val_end)
test_mask = row_positions >= val_end

for split_label, split_rows in (("Train", train_size), ("Val", val_size), ("Test", test_size)):
    print(f"{split_label}: {split_rows} rows")

In [None]:
# === 5. Feature Generation Functions ===

def generate_regime_feature(skew_changes, gvz_changes, n_components, train_size):
    """
    Fit a 2-feature Gaussian HMM and report the most volatile state's probability.

    The observations are the paired [SKEW change, GVZ change] rows. The model
    is fitted on the first `train_size` rows only and then scored on every row.

    Returns a 1-D array with, for each row, the posterior probability of the
    state whose covariance matrix has the largest trace.
    """
    observations = np.column_stack([skew_changes, gvz_changes])

    hmm = GaussianHMM(
        n_components=n_components,
        covariance_type='full',
        n_iter=100,
        tol=1e-4,
        random_state=42
    )
    hmm.fit(observations[:train_size])

    # NOTE(review): posteriors are computed over the whole sequence at once, so
    # a row's probability may depend on later observations - confirm this is
    # acceptable for downstream use.
    posteriors = hmm.predict_proba(observations)

    # Trace of each state's covariance matrix = total variance of that state.
    state_traces = np.trace(hmm.covars_, axis1=1, axis2=2)
    most_volatile = np.argmax(state_traces)
    return posteriors[:, most_volatile]

def generate_tail_risk_z(skew_levels, window):
    """
    Standardize the SKEW level against its own trailing window.

    Each value is compared with the mean and standard deviation of the last
    `window` observations (current one included). Rows without a full window
    come back as NaN; results are limited to the range [-4, 4].
    """
    levels = pd.Series(skew_levels)
    trailing = levels.rolling(window, min_periods=window)
    scores = levels.sub(trailing.mean()).div(trailing.std())
    return scores.clip(lower=-4, upper=4).values

def generate_skew_momentum_z(skew_levels, momentum_window, zscore_window=60):
    """
    Standardized SKEW momentum.

    Momentum is the change in SKEW over `momentum_window` rows. That change is
    then z-scored against its own trailing `zscore_window` rows. Rows lacking a
    full window are NaN; results are limited to the range [-4, 4].
    """
    levels = pd.Series(skew_levels)
    rate_of_change = levels.diff(periods=momentum_window)

    # Trailing statistics of the momentum series itself.
    trailing = rate_of_change.rolling(zscore_window, min_periods=zscore_window)
    standardized = (rate_of_change - trailing.mean()) / trailing.std()
    return standardized.clip(-4, 4).values

print("Feature generation functions defined.")

In [None]:
# === 6. Optuna Objective ===

def objective(trial):
    """
    Optuna objective: summed mutual information on the validation rows.

    Draws the HMM state count and the two SKEW windows from the trial, builds
    the three features on the full data, and adds up the mutual information
    between each quantile-binned feature and the quantile-binned next-day gold
    return over the validation rows. Any exception is reported and scored 0.0.
    """
    hmm_states = trial.suggest_categorical('hmm_n_components', [2, 3])
    z_window = trial.suggest_categorical('skew_zscore_window', [40, 60, 90])
    mom_window = trial.suggest_categorical('skew_momentum_window', [5, 10, 15])

    def quantile_codes(values, bins=20):
        # Integer quantile-bin labels, or None when there are too few points.
        finite = ~np.isnan(values)
        if finite.sum() < bins:
            return None
        filled = values.copy()
        filled[~finite] = np.nanmedian(values)
        return pd.qcut(filled, bins, labels=False, duplicates='drop')

    try:
        # Build the three candidate features over the full sample.
        regime_full = generate_regime_feature(
            df['skew_change'].values,
            df['gvz_change'].values,
            hmm_states,
            train_size
        )
        tail_full = generate_tail_risk_z(df['skew'].values, z_window)
        momentum_full = generate_skew_momentum_z(df['skew'].values, mom_window)

        # Restrict everything to the validation rows.
        val_features = [
            regime_full[val_mask],
            tail_full[val_mask],
            momentum_full[val_mask],
        ]
        val_target = df['gold_return_next'].values[val_mask]

        total_mi = 0.0
        for feature in val_features:
            keep = ~np.isnan(feature) & ~np.isnan(val_target)
            if not keep.sum() > 50:
                # Too few jointly observed rows to score this feature.
                continue
            feature_codes = quantile_codes(feature[keep])
            target_codes = quantile_codes(val_target[keep])
            if feature_codes is None or target_codes is None:
                continue
            total_mi += mutual_info_score(feature_codes, target_codes)

        return total_mi
    except Exception as e:
        print(f"Trial failed: {e}")
        return 0.0

print("Optuna objective defined.")

In [None]:
# === 7. Run Optuna HPO ===
print("Running Optuna HPO...")

# Seeded TPE sampler; the study maximizes the objective's return value.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)

# Stop after 30 trials or 300 seconds, whichever limit is reached first.
study.optimize(objective, n_trials=30, timeout=300)

for summary_line in (
    f"Best trial: {study.best_trial.number}",
    f"Best value: {study.best_value:.6f}",
    f"Best params: {study.best_params}",
):
    print(summary_line)

In [None]:
# === 8. Generate Final Output with Best Params ===
print("Generating final output with best params...")

best_params = study.best_params

# Rebuild each feature on the full sample with the winning hyperparameters.
regime_final = generate_regime_feature(
    df['skew_change'].values,
    df['gvz_change'].values,
    best_params['hmm_n_components'],
    train_size
)

tail_risk_z_final = generate_tail_risk_z(
    df['skew'].values,
    best_params['skew_zscore_window']
)

momentum_z_final = generate_skew_momentum_z(
    df['skew'].values,
    best_params['skew_momentum_window']
)

# Create output DataFrame
output = pd.DataFrame({
    'options_risk_regime_prob': regime_final,
    'options_tail_risk_z': tail_risk_z_final,
    'options_skew_momentum_z': momentum_z_final
}, index=df.index)

# Forward-fill interior NaN values. `DataFrame.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling and behaves the same. Forward-filling
# cannot fill the leading warm-up rows of the rolling z-scores (there is no
# earlier value to carry forward), so those rows remain NaN and show up in the
# NaN counts printed below.
output = output.ffill()

print(f"Output shape: {output.shape}")
print(f"Output columns: {list(output.columns)}")
print(f"NaN counts: {output.isna().sum().to_dict()}")

In [None]:
# === 9. Compute Metrics ===
print("Computing metrics...")

# Lag-1 autocorrelation of every output column, measured on the test rows only
test_output = output[test_mask]
autocorr = {}
for col in output.columns:
    autocorr[col] = test_output[col].autocorr(lag=1)

# MI on validation set
def compute_mi(feature, target, bins=20):
    """Mutual information between quantile-binned `feature` and `target`.

    Rows where either array is NaN are ignored. Returns 0.0 when fewer than
    `bins` rows remain.
    """
    usable = ~(np.isnan(feature) | np.isnan(target))
    if usable.sum() < bins:
        return 0.0
    feature_codes, target_codes = (
        pd.qcut(values[usable], bins, labels=False, duplicates='drop')
        for values in (feature, target)
    )
    return mutual_info_score(feature_codes, target_codes)

# Validation slice of the features and of the next-day return target
val_output = output[val_mask]
target_val = df['gold_return_next'].values[val_mask]

# One MI score per output column against the validation target
mi_scores = {}
for col in output.columns:
    mi_scores[col] = compute_mi(val_output[col].values, target_val)

print(f"Autocorrelation (lag 1): {autocorr}")
print(f"MI scores (validation): {mi_scores}")

In [None]:
# === 10. Save Results ===
print("Saving results...")

# Feature table, indexed like df
output.to_csv('submodel_output.csv')

# Assemble the run summary for training_result.json
result = {
    'feature': 'options_market',
    'attempt': 1,
    'timestamp': datetime.now().isoformat(),
    'best_params': best_params,
    'optuna_trials_completed': len(study.trials),
    'optuna_best_value': study.best_value,
    'output_shape': list(output.shape),
    'output_columns': list(output.columns),
    'data_info': dict(
        train_samples=train_size,
        val_samples=val_size,
        test_samples=test_size,
        full_samples=len(df),
    ),
    'metrics': dict(
        autocorr_lag1=autocorr,
        mi_scores_validation=mi_scores,
    ),
}

# Anything json cannot encode natively is written via str().
result_text = json.dumps(result, indent=2, default=str)
with open('training_result.json', 'w') as f:
    f.write(result_text)

print("=== Training complete! ===")
print(f"Finished: {datetime.now().isoformat()}")