# Gold Prediction SubModel Training - Options Market Attempt 2

**Self-contained training notebook**: Data fetch → Preprocessing → HMM Regime Detection → Optuna HPO → Save results

## Key Changes from Attempt 1

- **Output reduced from 3 columns to 1**: Only `options_risk_regime_prob` is retained
- **Dropped features**: `options_tail_risk_z` (MI=0.002, noise) and `options_skew_momentum_z` (MI=0.017, marginal)
- **Optuna objective**: Mutual information (MI) between the single retained column and next-day gold return (Attempt 1 used the sum of MI over 3 columns)
- **Added parameter**: `input_scaling` to optionally standardize HMM inputs

## Architecture

- 2D HMM on [SKEW daily changes, GVZ daily changes]
- Output: P(highest-variance regime) in [0, 1]
- Optuna search space: n_components (2-3), input_scaling (True/False)
- Up to 30 trials, capped by a 5-minute timeout (whichever limit is reached first)

## 1. Install Dependencies

In [None]:
import subprocess
import sys

# Install hmmlearn with pip, using the same interpreter that runs this notebook.
# check_call raises CalledProcessError if pip exits with a non-zero status.
print("Installing hmmlearn...")
pip_command = [sys.executable, '-m', 'pip', 'install', 'hmmlearn', '--quiet']
subprocess.check_call(pip_command)
print("hmmlearn installed successfully")

## 2. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import StandardScaler
import optuna
import json
import os
from datetime import datetime
import warnings
# Silence every warning category for the rest of the run. Note that this is a
# blanket filter, so warnings emitted by the imported libraries are hidden too.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

print("All libraries imported successfully")
# Log the wall-clock start time of this run.
print(f"Execution started: {datetime.now().isoformat()}")

## 3. Data Fetching

In [None]:
def _fetch_close(symbol, label, column, start_date, end_date):
    """
    Download one Yahoo Finance ticker and return its Close as a one-column frame.

    Args:
        symbol: Yahoo Finance ticker symbol (e.g. '^SKEW').
        label: short name used in progress and error messages.
        column: name given to the Close column in the returned frame.
        start_date: start of the requested date range (passed to yfinance).
        end_date: end of the requested date range (passed to yfinance).

    Returns:
        DataFrame with a single column `column`, indexed by tz-naive calendar dates.

    Raises:
        ValueError: if the download returns no rows.
        Exception: any error raised by yfinance is logged and re-raised unchanged.
    """
    try:
        history = yf.Ticker(symbol).history(start=start_date, end=end_date, auto_adjust=True)
        if history.empty:
            raise ValueError(f"{label} data is empty")
        frame = history[['Close']].rename(columns={'Close': column})
        print(f"  {label}: {len(frame)} rows fetched, range {frame.index.min()} to {frame.index.max()}")
    except Exception as e:
        print(f"  ERROR fetching {label}: {e}")
        raise

    # yfinance may return a tz-aware index in the exchange's local timezone.
    # Indices in different timezones are aligned on absolute instants when
    # joined, so "the same trading day" on two exchanges would not match.
    # Reduce every index to tz-naive calendar dates so the join is by date.
    index = pd.DatetimeIndex(frame.index)
    if index.tz is not None:
        index = index.tz_localize(None)
    frame.index = index.normalize()

    # Guard against two rows collapsing onto the same calendar date.
    return frame[~frame.index.duplicated(keep='last')]


def fetch_data():
    """
    Fetch SKEW, GVZ, and Gold price data.
    Returns aligned DataFrame with daily changes.

    The three series are inner-joined on calendar date, gaps are forward-filled
    (at most 3 consecutive rows), and first differences / returns are derived.

    Returns:
        DataFrame indexed by date with columns:
            skew_close, gvz_close, gold_close: closing levels
            skew_change, gvz_change: day-over-day differences of the levels
            gold_return: day-over-day percentage return of gold_close
            gold_return_next: gold_return shifted one row earlier (the target)
        Rows with NaN in skew_change, gvz_change or gold_return_next are dropped.

    Raises:
        ValueError: if any of the three downloads is empty.
        Exception: download errors from yfinance are re-raised after logging.
    """
    print("\n=== Fetching Data ===")

    # Date range: 2014-10-01 to 2025-02-15 (includes warmup buffer)
    start_date = '2014-10-01'
    end_date = '2025-02-15'

    # 1. Fetch SKEW from Yahoo Finance
    print("Fetching SKEW Index (^SKEW)...")
    skew_df = _fetch_close('^SKEW', 'SKEW', 'skew_close', start_date, end_date)

    # 2. Fetch GVZ from Yahoo Finance (primary, no FRED dependency)
    print("Fetching Gold Volatility Index (^GVZ)...")
    gvz_df = _fetch_close('^GVZ', 'GVZ', 'gvz_close', start_date, end_date)

    # 3. Fetch Gold price (GC=F) for target variable
    print("Fetching Gold futures (GC=F)...")
    gold_df = _fetch_close('GC=F', 'Gold', 'gold_close', start_date, end_date)

    # 4. Align data on common dates (inner join)
    print("\nAligning data on common dates...")
    df = skew_df.join(gvz_df, how='inner').join(gold_df, how='inner')
    print(f"  Aligned: {len(df)} rows")

    # 5. Forward-fill gaps up to 3 days
    print("Forward-filling gaps (max 3 days)...")
    df = df.ffill(limit=3)

    # 6. Compute daily changes
    print("Computing daily changes...")
    df['skew_change'] = df['skew_close'].diff()
    df['gvz_change'] = df['gvz_close'].diff()
    df['gold_return'] = df['gold_close'].pct_change() * 100  # Percentage return

    # 7. Create target: next-day gold return
    df['gold_return_next'] = df['gold_return'].shift(-1)

    # 8. Drop NaN rows from diff operations (first row) and the shift (last row)
    df = df.dropna(subset=['skew_change', 'gvz_change', 'gold_return_next'])

    print(f"\nFinal dataset: {len(df)} rows")
    print(f"Date range: {df.index.min()} to {df.index.max()}")
    print(f"\nData summary:")
    print(df[['skew_close', 'gvz_close', 'skew_change', 'gvz_change']].describe())

    # Quality checks
    print("\n=== Data Quality Checks ===")
    print(f"SKEW range: [{df['skew_close'].min():.1f}, {df['skew_close'].max():.1f}]")
    print(f"GVZ range: [{df['gvz_close'].min():.1f}, {df['gvz_close'].max():.1f}]")
    print(f"SKEW change std: {df['skew_change'].std():.2f}")
    print(f"GVZ change std: {df['gvz_change'].std():.2f}")
    print(f"Missing data: {df[['skew_change', 'gvz_change']].isna().sum().sum()} cells")

    return df

# Download and preprocess the data once; the cells below read this `data` frame.
data = fetch_data()

## 4. Data Splitting

In [None]:
# Chronological 70/15/15 partition (train/val/test): rows are not shuffled,
# so each later split follows the earlier one in row order.
n = len(data)
train_size = int(n * 0.70)
val_size = int(n * 0.15)
train_end = train_size
val_end = train_end + val_size
test_size = n - val_end  # the test split takes whatever is left after rounding

# Positional row indices for each split, cut at the two boundaries
train_mask, val_mask, test_mask = np.split(np.arange(n), [train_end, val_end])

print(f"\n=== Data Split ===")
print(f"Total samples: {n}")
print(f"Train: {len(train_mask)} samples ({train_mask[0]} to {train_mask[-1]})")
print(f"Val:   {len(val_mask)} samples ({val_mask[0]} to {val_mask[-1]})")
print(f"Test:  {len(test_mask)} samples ({test_mask[0]} to {test_mask[-1]})")
print(f"\nDate ranges:")
print(f"Train: {data.index[train_mask[0]]} to {data.index[train_mask[-1]]}")
print(f"Val:   {data.index[val_mask[0]]} to {data.index[val_mask[-1]]}")
print(f"Test:  {data.index[test_mask[0]]} to {data.index[test_mask[-1]]}")

## 5. Feature Generation Function

In [None]:
def generate_regime_feature(data, n_components, train_size, input_scaling=False):
    """
    Build the regime-probability feature from a Gaussian HMM on two inputs.

    The HMM (and, if requested, the scaler) is fitted on the first
    `train_size` rows only; state probabilities are then computed for every row.

    Args:
        data: DataFrame with skew_change and gvz_change columns
        n_components: number of HMM states (2 or 3)
        train_size: number of leading rows used for fitting
        input_scaling: whether to standardize inputs before HMM fit

    Returns:
        regime_prob: array with, for each row of `data`, the probability of
        the state whose covariance matrix has the largest trace

    NOTE(review): predict_proba is run over the whole sequence at once; if it
    returns smoothed posteriors, a row's value can depend on later rows.
    Confirm this is acceptable for a feature used to predict the next day.
    """
    # 2D observation matrix: one row per day, columns [skew_change, gvz_change]
    features = data[['skew_change', 'gvz_change']].values

    if input_scaling:
        # Mean/std are estimated on the training rows, then applied to all rows
        features = StandardScaler().fit(features[:train_size]).transform(features)

    # Single fit with a fixed seed for reproducibility
    # (per the original note, hmmlearn 0.3.3 does NOT support n_init)
    hmm = GaussianHMM(
        n_components=n_components,
        covariance_type='full',
        n_iter=100,
        tol=1e-4,
        random_state=42
    )
    hmm.fit(features[:train_size])

    # Per-row state probabilities for the full dataset
    posteriors = hmm.predict_proba(features)

    # The "high-variance" state is the one with the largest covariance trace
    state_traces = np.trace(hmm.covars_, axis1=1, axis2=2)
    riskiest_state = np.argmax(state_traces)

    return posteriors[:, riskiest_state]

print("Feature generation function defined")

## 6. Optuna Objective Function

In [None]:
def discretize(x, bins=20):
    """
    Map a continuous array to integer quantile-bin labels for MI estimation.

    NaN entries are replaced by the median of the non-NaN values before
    binning; the input array itself is left untouched. Duplicate bin edges
    are dropped, so fewer than `bins` distinct labels may be produced.

    Returns None when there are fewer non-NaN values than requested bins.
    """
    missing = np.isnan(x)
    n_present = missing.size - missing.sum()
    if n_present < bins:
        return None

    # Work on a copy so the caller's array keeps its NaNs
    filled = x.copy()
    filled[missing] = np.nanmedian(x)

    return pd.qcut(filled, q=bins, labels=False, duplicates='drop')


def objective(trial):
    """
    Optuna objective: MI between regime_prob and gold_return_next on the validation rows.

    Returns 0.0 when fewer than 50 usable validation rows remain, when either
    series cannot be discretized, or when any exception is raised (the
    exception message is printed rather than propagated).
    """
    # Hyperparameters (suggested in a fixed order)
    n_states = trial.suggest_categorical('hmm_n_components', [2, 3])
    use_scaling = trial.suggest_categorical('input_scaling', [True, False])

    try:
        regime = generate_regime_feature(
            data,
            n_components=n_states,
            train_size=train_size,
            input_scaling=use_scaling
        )

        # Restrict feature and target to the validation rows
        regime_val = regime[val_mask]
        target_val = data['gold_return_next'].values[val_mask]

        # Keep rows where neither the feature nor the target is NaN
        usable = ~(np.isnan(regime_val) | np.isnan(target_val))
        if usable.sum() < 50:
            return 0.0

        # Quantile-bin both series so a discrete MI estimate can be computed
        feat_disc = discretize(regime_val[usable])
        tgt_disc = discretize(target_val[usable])
        if feat_disc is None or tgt_disc is None:
            return 0.0

        return mutual_info_score(feat_disc, tgt_disc)

    except Exception as e:
        print(f"Trial failed: {e}")
        return 0.0

print("Optuna objective function defined")

## 7. Run Optuna HPO

In [None]:
# Describe the search before it starts (one line per entry, blank line last)
banner_lines = (
    "\n=== Running Optuna Hyperparameter Optimization ===",
    "Objective: Maximize MI(regime_prob, gold_return_next) on validation set",
    "Search space:",
    "  hmm_n_components: [2, 3]",
    "  input_scaling: [True, False]",
    "Trials: 30",
    "Timeout: 300 seconds (5 minutes)",
    "",
)
for banner_line in banner_lines:
    print(banner_line)

# Seeded TPE sampler so the sequence of suggested parameters is repeatable
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)

# Stops after 30 trials or 300 seconds
study.optimize(objective, n_trials=30, timeout=300)

# Keep the winning configuration for the cells that follow
best_params = study.best_params
best_value = study.best_value

print("\n=== Optuna Results ===")
print(f"Completed trials: {len(study.trials)}")
print(f"Best value (MI): {study.best_value:.6f}")
print(f"Best params: {study.best_params}")

## 8. Generate Final Output with Best Parameters

In [None]:
print("\n=== Generating Final Submodel Output ===")
print(f"Using best parameters: {best_params}")

# Recompute the regime feature using the configuration selected by Optuna
regime_prob = generate_regime_feature(
    data,
    input_scaling=best_params['input_scaling'],
    train_size=train_size,
    n_components=best_params['hmm_n_components']
)

# Single-column frame aligned to the input dates
output = pd.Series(
    regime_prob, index=data.index, name='options_risk_regime_prob'
).to_frame()

print(f"\nOutput shape: {output.shape}")
print(f"Output columns: {list(output.columns)}")
print(f"\nOutput statistics:")
print(output.describe())
print(f"\nAutocorrelation (lag 1): {output['options_risk_regime_prob'].autocorr(lag=1):.4f}")
print(f"NaN count: {output.isna().sum().sum()}")

# Sanity checks: one column, no NaNs, not constant, not a near-copy of its own lag
prob_column = output['options_risk_regime_prob']
assert output.shape[1] == 1, "Output must have exactly 1 column"
assert output.isna().sum().sum() == 0, "Output contains NaN values"
assert prob_column.std() > 0.01, "Output is nearly constant"
assert prob_column.autocorr(lag=1) < 0.99, "Output has excessive autocorrelation"

print("\nOutput validation: PASS")

## 9. Save Results

In [None]:
print("\n=== Saving Results ===")

# Write the feature column (with its date index) to CSV
output.to_csv('submodel_output.csv')
print("Saved: submodel_output.csv")

# Assemble the pieces of the JSON summary; values are cast to plain Python
# types so json.dump can serialize them.
prob_column = output['options_risk_regime_prob']

output_stats = {
    "mean": float(prob_column.mean()),
    "std": float(prob_column.std()),
    "min": float(prob_column.min()),
    "max": float(prob_column.max()),
    "autocorr_lag1": float(prob_column.autocorr(lag=1))
}

data_info = {
    "total_samples": int(n),
    "train_samples": int(len(train_mask)),
    "val_samples": int(len(val_mask)),
    "test_samples": int(len(test_mask)),
    "date_range_start": str(data.index.min()),
    "date_range_end": str(data.index.max())
}

# Fixed description of how this attempt differs from attempt 1
design_changes = {
    "output_columns": "Reduced from 3 to 1",
    "dropped_features": ["options_tail_risk_z (MI=0.002)", "options_skew_momentum_z (MI=0.017)"],
    "retained_feature": "options_risk_regime_prob (MI=0.031, rank #2/22)",
    "optuna_objective": "Single-column MI (was sum of 3)",
    "new_parameter": "input_scaling (standardize HMM inputs)"
}

result = {
    "feature": "options_market",
    "attempt": 2,
    "timestamp": datetime.now().isoformat(),
    "best_params": best_params,
    "optuna_trials_completed": len(study.trials),
    "optuna_best_value": float(best_value),
    "output_shape": list(output.shape),
    "output_columns": list(output.columns),
    "output_stats": output_stats,
    "data_info": data_info,
    "design_changes_from_attempt_1": design_changes
}

# Write the summary as indented JSON
with open('training_result.json', 'w') as f:
    json.dump(result, f, indent=2)
print("Saved: training_result.json")

print("\n=== Training Complete ===")
print(f"Finished: {datetime.now().isoformat()}")
print(f"\nFinal summary:")
print(f"  Feature: options_market")
print(f"  Attempt: 2")
print(f"  Output: 1 column (options_risk_regime_prob)")
print(f"  Best MI: {best_value:.6f}")
print(f"  Best params: {best_params}")
print(f"  Data: {n} samples ({data.index.min()} to {data.index.max()})")