# Gold Prediction SubModel Training - Options Market Attempt 3

Self-contained: Data fetch -> Preprocessing -> HMM + Momentum -> Optuna HPO -> Save results

## Key Changes from Attempt 2
1. EMA post-smoothing on regime probability (Optuna-tuned span 3-8)
2. Added second column: GVZ EMA momentum z-score (directional trend signal)
3. Total output: 2 columns (options_regime_smooth + options_gvz_momentum_z)
4. Optuna objective: sum of MI for both columns
5. Autocorrelation guard: reject trials with autocorr > 0.98

## Architecture
- Component 1: 2D HMM on [SKEW changes, GVZ changes], EMA post-smoothing on P(highest-variance state)
- Component 2: GVZ EMA momentum z-score (short EMA - long EMA, normalized)
- Optuna: 50 trials, 600s timeout, maximize sum of MI on validation set

## 1. Install Dependencies

In [None]:
import subprocess
import sys

# Install hmmlearn with the same interpreter that runs this notebook so the
# package lands in the active environment before it is imported below.
print('Installing hmmlearn...')
_pip_install_cmd = [sys.executable, '-m', 'pip', 'install', 'hmmlearn', '--quiet']
subprocess.check_call(_pip_install_cmd)  # raises CalledProcessError on a non-zero exit
print('hmmlearn installed successfully')

## 2. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from hmmlearn.hmm import GaussianHMM
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
import optuna
import json
import os
from datetime import datetime
import warnings
# Silence every Python warning for the rest of the session. Note that this
# is a blanket filter: warnings emitted by the libraries imported above are
# hidden as well.
warnings.filterwarnings('ignore')

# Only surface Optuna log records at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed NumPy's global RNG.
np.random.seed(42)

print('All libraries imported successfully')
print(f'Execution started: {datetime.now().isoformat()}')

## 3. Dataset Path Resolution

In [None]:
import glob as _glob

# Locate the Kaggle dataset directory: the first candidate that exists and
# contains at least one of the probe files wins.
# NOTE(review): DATASET_PATH is not referenced by any later cell visible in
# this notebook (data is fetched from Yahoo Finance) - confirm the hard
# failure below is still wanted.
PROBE_FILES = ['base_features.csv', 'base_features_raw.csv', 'vix.csv']
candidates = [
    '/kaggle/input/gold-prediction-submodels',
    '/kaggle/input/datasets/bigbigzabuton/gold-prediction-submodels'
]
DATASET_PATH = None
for c in candidates:
    if not os.path.isdir(c):
        continue
    # List the directory once and reuse the listing for probe and diagnostics.
    _entries = os.listdir(c)
    if any(f in _entries for f in PROBE_FILES):
        DATASET_PATH = c
        break
    print(f'Dir exists but probe files missing: {c} -> {_entries[:5]}')
if DATASET_PATH is None:
    # Build the diagnostic defensively: outside Kaggle '/kaggle/input' does not
    # exist, and an unguarded listdir would raise FileNotFoundError here and
    # hide the RuntimeError that actually explains the problem.
    try:
        _kaggle_input = os.listdir('/kaggle/input')
    except OSError as exc:
        _kaggle_input = f'<unavailable: {exc}>'
    raise RuntimeError(
        f'Dataset not found. Tried: {candidates}. '
        f'/kaggle/input/: {_kaggle_input}'
    )

print(f'DATASET_PATH: {DATASET_PATH}')
print(f'Files: {os.listdir(DATASET_PATH)[:10]}')

## 4. Data Fetching

In [None]:
def fetch_data():
    """
    Download SKEW, GVZ and gold futures closes from Yahoo Finance and build
    the modelling frame.

    The three close series are inner-joined on their (timezone-stripped)
    index, forward-filled for at most 3 consecutive rows, and extended with:
      - skew_change / gvz_change: first differences of the closes
      - gold_return: percentage change of the gold close, in percent
      - gold_return_next: gold_return shifted back one row (the target)
    Rows with NaN in skew_change, gvz_change or gold_return_next are dropped.

    Returns:
        DataFrame indexed by date with the close, change and target columns.

    Raises:
        ValueError: if Yahoo Finance returns no rows for any of the tickers.
    """
    print('\n=== Fetching Data ===')

    start_date = '2014-10-01'
    end_date = datetime.now().strftime('%Y-%m-%d')

    def _load_close(symbol, description, short_name, column):
        # Fetch one ticker and return its Close as a single-column frame
        # named `column`, with a timezone-naive index so the joins line up.
        print(f'Fetching {description} ({symbol})...')
        history = yf.Ticker(symbol).history(
            start=start_date, end=end_date, auto_adjust=True
        )
        if history.empty:
            raise ValueError(f'{short_name} data is empty. Check ticker {symbol}.')
        closes = history[['Close']].rename(columns={'Close': column})
        closes.index = closes.index.tz_localize(None)
        first_day, last_day = closes.index.min(), closes.index.max()
        print(f'  {short_name}: {len(closes)} rows, {first_day} to {last_day}')
        return closes

    # 1-3. One download per instrument, in the same order as the log output
    skew_df = _load_close('^SKEW', 'SKEW Index', 'SKEW', 'skew_close')
    gvz_df = _load_close('^GVZ', 'Gold Volatility Index', 'GVZ', 'gvz_close')
    gold_df = _load_close('GC=F', 'Gold futures', 'Gold', 'gold_close')

    # 4. Keep only dates present in all three series
    print('\nAligning data on common dates...')
    df = skew_df.join(gvz_df, how='inner')
    df = df.join(gold_df, how='inner')
    print(f'  Aligned: {len(df)} rows')

    # 5. Fill remaining NaN values from the previous row, at most 3 in a row
    df = df.ffill(limit=3)

    # 6. Day-over-day changes
    for level_col, change_col in (('skew_close', 'skew_change'),
                                  ('gvz_close', 'gvz_change')):
        df[change_col] = df[level_col].diff()
    df['gold_return'] = df['gold_close'].pct_change() * 100

    # 7. Target is the following row's gold return
    df['gold_return_next'] = df['gold_return'].shift(-1)

    # 8. Remove the rows left incomplete by diff (first) and shift (last)
    required = ['skew_change', 'gvz_change', 'gold_return_next']
    df = df.dropna(subset=required)

    print(f'\nFinal dataset: {len(df)} rows')
    print(f'Date range: {df.index.min()} to {df.index.max()}')

    # Sanity report on value ranges and residual gaps
    print('\n=== Data Quality Checks ===')
    skew_lo, skew_hi = df['skew_close'].min(), df['skew_close'].max()
    gvz_lo, gvz_hi = df['gvz_close'].min(), df['gvz_close'].max()
    print(f'SKEW range: [{skew_lo:.1f}, {skew_hi:.1f}] (expected [100, 200])')
    print(f'GVZ range: [{gvz_lo:.1f}, {gvz_hi:.1f}] (expected [5, 80])')
    skew_sd = df['skew_change'].std()
    gvz_sd = df['gvz_change'].std()
    print(f'SKEW change std: {skew_sd:.2f}')
    print(f'GVZ change std: {gvz_sd:.2f}')
    missing_cells = df[['skew_change', 'gvz_change']].isna().sum().sum()
    print(f'Missing data: {missing_cells} cells')

    return df


# Runs the download when the cell executes; `data` is read as a module-level
# global by the splitting, objective and output cells below.
data = fetch_data()

## 5. Data Splitting

In [None]:
# Chronological 70/15/15 partition (train/val/test); rows keep their order.
n = len(data)
train_size = int(n * 0.70)
val_size = int(n * 0.15)

train_end = train_size
val_end = train_end + val_size

# Despite the "_mask" names these are positional index arrays, one contiguous
# run per segment; the test segment absorbs the rounding remainder.
train_mask, val_mask, test_mask = np.split(np.arange(n), [train_end, val_end])

_segments = (('Train', train_mask), ('Val', val_mask), ('Test', test_mask))

print('\n=== Data Split ===')
print(f'Total samples: {n}')
for _label, _idx in _segments:
    _tag = f'{_label}:'.ljust(6)
    print(f'{_tag} {len(_idx)} samples ({_idx[0]} to {_idx[-1]})')
print('\nDate ranges:')
for _label, _idx in _segments:
    _tag = f'{_label}:'.ljust(6)
    print(f'{_tag} {data.index[_idx[0]]} to {data.index[_idx[-1]]}')

## 6. Feature Generation Functions

In [None]:
def generate_regime_feature(df, n_components, train_size, input_scaling, ema_span):
    """
    Generate EMA-smoothed regime probability using 2D HMM.

    Input: [SKEW changes, GVZ changes]
    The HMM (and the optional StandardScaler) is fit on the first
    `train_size` rows only. The tracked regime is the state whose covariance
    matrix has the largest trace (highest-variance state).

    State probabilities are *filtered*, i.e. P(state_t | x_1..x_t), computed
    with a forward pass over the fitted parameters. hmmlearn's
    `predict_proba` is deliberately not used: it returns forward-backward
    posteriors conditioned on the whole sequence, so every value would depend
    on future observations (including validation/test rows).
    The EMA post-smoothing is causal as well (adjust=False recursion).

    Args:
        df: DataFrame with 'skew_change' and 'gvz_change' columns.
        n_components: number of hidden states.
        train_size: number of leading rows used for fitting.
        input_scaling: standardize inputs with train-only statistics if True.
        ema_span: span of the EMA applied to the regime probability.

    Returns:
        numpy array of smoothed regime probabilities for full dataset
    """
    # Local import: scipy is not imported at notebook level.
    from scipy.stats import multivariate_normal

    X = df[['skew_change', 'gvz_change']].values

    if input_scaling:
        scaler = StandardScaler()
        scaler.fit(X[:train_size])
        X = scaler.transform(X)

    X_train = X[:train_size]

    model = GaussianHMM(
        n_components=n_components,
        covariance_type='full',
        n_iter=100,
        tol=1e-4,
        random_state=42
    )
    model.fit(X_train)

    # Select highest-variance state (highest trace of covariance matrix)
    traces = [np.trace(model.covars_[i]) for i in range(n_components)]
    high_var_state = int(np.argmax(traces))

    # Per-state emission log-densities, shape (n_samples, n_components)
    log_emission = np.column_stack([
        np.atleast_1d(
            multivariate_normal.logpdf(
                X, mean=model.means_[k], cov=model.covars_[k], allow_singular=True
            )
        )
        for k in range(n_components)
    ])

    # Zero start/transition probabilities legitimately map to -inf in log space.
    with np.errstate(divide='ignore'):
        log_start = np.log(model.startprob_)
        log_trans = np.log(model.transmat_)

    # Normalized forward recursion: each step uses only observations up to t.
    raw_regime = np.empty(len(X))
    log_alpha = None
    for t, log_b in enumerate(log_emission):
        if t == 0:
            log_alpha = log_start + log_b
        else:
            # log sum_i alpha_{t-1}(i) * A[i, j], then weight by emission at t
            log_alpha = np.logaddexp.reduce(log_alpha[:, None] + log_trans, axis=0) + log_b
        log_alpha = log_alpha - np.logaddexp.reduce(log_alpha)
        raw_regime[t] = np.exp(log_alpha[high_var_state])

    # EMA post-smoothing (causal, no lookahead)
    smoothed = pd.Series(raw_regime).ewm(span=ema_span, adjust=False).mean().values

    return smoothed


def generate_gvz_momentum_z(gvz_series, ema_short, ema_long, norm_window):
    """
    Rolling z-score of the GVZ EMA spread.

    spread = EMA(gvz, ema_short) - EMA(gvz, ema_long)
    z      = (spread - rolling mean) / (rolling std + 1e-8), clipped to [-3, 3]

    Both EMAs use the adjust=False recursion and the normalization uses a
    trailing window of `norm_window` rows, so each value depends only on
    current and past data. The first `norm_window - 1` entries are NaN
    because the rolling window is not yet full.

    Args:
        gvz_series: pandas Series of GVZ levels.
        ema_short: span of the fast EMA.
        ema_long: span of the slow EMA.
        norm_window: length of the rolling normalization window.

    Returns:
        numpy array of z-scores, same length as `gvz_series`
    """
    def _ema(span):
        return gvz_series.ewm(span=span, adjust=False).mean()

    spread = _ema(ema_short) - _ema(ema_long)

    window = spread.rolling(norm_window)
    centered = spread - window.mean()
    # The epsilon keeps a flat window (std == 0) from dividing by zero.
    scale = window.std() + 1e-8

    return (centered / scale).clip(lower=-3, upper=3).values


# Progress marker for the notebook log.
print('Feature generation functions defined')

## 7. Optuna Objective Function

In [None]:
def objective(trial):
    """
    Optuna objective: maximize sum of MI for both output columns on validation set.

    Reads the module-level `data`, `train_size` and `val_mask`.

    A trial scores 0.0 (rejected) when:
      - gvz_ema_long <= gvz_ema_short + 5 (EMA spans too close),
      - either column has a non-finite or > 0.98 lag-1 autocorrelation,
      - fewer than 50 validation rows remain after dropping NaN,
      - feature generation or MI estimation raises.
    In the last case the exception is printed and stored on the trial as the
    'error' user attribute, so crashed trials can be told apart from
    legitimately rejected ones instead of failing silently.

    Args:
        trial: optuna Trial used to sample the hyperparameters.

    Returns:
        Sum of the mutual information of both features with the next-day
        gold return on the validation rows, or 0.0 for a rejected trial.
    """
    # HMM parameters
    n_components = trial.suggest_categorical('hmm_n_components', [2, 3])
    input_scaling = trial.suggest_categorical('input_scaling', [True, False])
    ema_span = trial.suggest_int('ema_span', 3, 8)

    # GVZ momentum parameters
    gvz_ema_short = trial.suggest_int('gvz_ema_short', 3, 10)
    gvz_ema_long = trial.suggest_int('gvz_ema_long', 15, 50)
    gvz_norm_window = trial.suggest_int('gvz_norm_window', 30, 90)

    # Constraint: long EMA must be sufficiently larger than short EMA
    if gvz_ema_long <= gvz_ema_short + 5:
        return 0.0

    try:
        # Generate regime feature
        regime = generate_regime_feature(
            data, n_components, train_size, input_scaling, ema_span
        )

        # Generate GVZ momentum z-score
        momentum_z = generate_gvz_momentum_z(
            data['gvz_close'], gvz_ema_short, gvz_ema_long, gvz_norm_window
        )

        # Autocorrelation guard: reject trials exceeding threshold
        regime_ac = pd.Series(regime).dropna().autocorr(lag=1)
        mom_ac = pd.Series(momentum_z).dropna().autocorr(lag=1)

        if not np.isfinite(regime_ac) or regime_ac > 0.98:
            return 0.0
        if not np.isfinite(mom_ac) or mom_ac > 0.98:
            return 0.0

        # Extract validation portion
        regime_val = regime[val_mask]
        mom_val = momentum_z[val_mask]
        target_val = data['gold_return_next'].values[val_mask]

        # Remove NaN
        feat_df = pd.DataFrame({'regime': regime_val, 'momentum': mom_val})
        target_series = pd.Series(target_val)
        valid_mask = feat_df.notna().all(axis=1) & target_series.notna()

        if valid_mask.sum() < 50:
            return 0.0

        feat_valid = feat_df[valid_mask].values
        target_valid = target_series[valid_mask].values

        # Compute MI for both columns combined
        mi = mutual_info_regression(feat_valid, target_valid, random_state=42)
        return float(mi.sum())

    except Exception as e:
        # Best-effort: a crashing configuration must not abort the study, but
        # the cause is recorded and printed so systematic failures are visible.
        trial.set_user_attr('error', repr(e))
        print(f'Trial {trial.number} failed and was scored 0.0: {e!r}')
        return 0.0


# Progress marker for the notebook log.
print('Optuna objective function defined')

## 8. Run Optuna HPO

In [None]:
# Log the study configuration before starting; the ranges printed here mirror
# the suggest_* calls inside objective().
_banner_lines = (
    '\n=== Running Optuna Hyperparameter Optimization ===',
    'Objective: Maximize sum of MI for both output columns on validation set',
    'Search space:',
    '  hmm_n_components: [2, 3]',
    '  input_scaling: [True, False]',
    '  ema_span: [3, 8]',
    '  gvz_ema_short: [3, 10]',
    '  gvz_ema_long: [15, 50]',
    '  gvz_norm_window: [30, 90]',
    'Trials: 50, Timeout: 600s',
    'Autocorrelation guard: reject trials with autocorr > 0.98',
    '',
)
for _line in _banner_lines:
    print(_line)

# Seeded TPE sampler so the sequence of suggested parameters is repeatable.
_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=_sampler)

# Stops at 50 trials or 600 seconds, whichever is reached first.
study.optimize(objective, n_trials=50, timeout=600)

best_params = study.best_params
best_value = study.best_value

print('\n=== Optuna Results ===')
print(f'Completed trials: {len(study.trials)}')
print(f'Best value (sum of MI): {best_value:.6f}')
print(f'Best params: {best_params}')

## 9. Generate Final Output with Best Parameters

In [None]:
print('\n=== Generating Final Submodel Output ===')
print(f'Using best parameters: {best_params}')

# Component 1: EMA-smoothed regime probability, rebuilt with the winning settings
regime_prob = generate_regime_feature(
    data,
    best_params['hmm_n_components'],
    train_size,
    best_params['input_scaling'],
    best_params['ema_span'],
)

# Component 2: GVZ EMA momentum z-score, rebuilt with the winning settings
gvz_momentum = generate_gvz_momentum_z(
    data['gvz_close'],
    best_params['gvz_ema_short'],
    best_params['gvz_ema_long'],
    best_params['gvz_norm_window'],
)

# Assemble both features on the source index, then discard the leading rows
# where the rolling normalization has not produced a value yet.
output = pd.DataFrame(
    {
        'options_regime_smooth': regime_prob,
        'options_gvz_momentum_z': gvz_momentum,
    },
    index=data.index,
).dropna()

print(f'\nOutput shape: {output.shape}')
print(f'Output columns: {list(output.columns)}')
print('\nOutput statistics:')
print(output.describe())

# Report lag-1 autocorrelation of each feature
for _name, _series in output.items():
    print(f'{_name} autocorr(lag=1): {_series.autocorr(lag=1):.4f}')

# Validation gates, checked in a fixed order: shape, presence, NaN,
# variance, autocorrelation.
_expected_cols = ('options_regime_smooth', 'options_gvz_momentum_z')

_n_cols = output.shape[1]
assert _n_cols == 2, f'Output must have exactly 2 columns, got {_n_cols}'
for _name in _expected_cols:
    assert _name in output.columns, f'Missing {_name}'
_nan_total = output.isna().sum().sum()
assert _nan_total == 0, f'Output contains {_nan_total} NaN values'
for _name in _expected_cols:
    assert output[_name].std() > 0.01, f'{_name} is nearly constant'
for _name in _expected_cols:
    assert output[_name].autocorr(lag=1) < 0.99, f'{_name} has excessive autocorr'

print('\nOutput validation: PASS')

## 10. Compute Per-Column Statistics

In [None]:
# Summary numbers per output column; stored later in training_result.json.
def _summarize_column(series):
    # Plain Python floats so the dict can be dumped to JSON directly.
    return {
        'mean': float(series.mean()),
        'std': float(series.std()),
        'min': float(series.min()),
        'max': float(series.max()),
        'autocorr_lag1': float(series.autocorr(lag=1)),
        'nan_ratio': float(series.isna().sum() / len(series))
    }


col_stats = {name: _summarize_column(output[name]) for name in output.columns}

print('Per-column statistics:')
for _name, _summary in col_stats.items():
    print(f'  {_name}:')
    for _metric, _value in _summary.items():
        print(f'    {_metric}: {_value:.4f}')

## 11. Save Results

In [None]:
print('\n=== Saving Results ===')

# Feature table -> CSV in the working directory
output.to_csv('submodel_output.csv')
print('Saved: submodel_output.csv')


def _to_builtin(value):
    # NumPy integer/bool scalars are not JSON serializable; unwrap them and
    # pass every other value through unchanged.
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.bool_):
        return bool(value)
    return value


# Sample counts and date coverage of the input data and the emitted features
_data_info = {
    'total_samples': int(n),
    'train_samples': int(len(train_mask)),
    'val_samples': int(len(val_mask)),
    'test_samples': int(len(test_mask)),
    'date_range_start': str(data.index.min()),
    'date_range_end': str(data.index.max()),
    'output_date_range_start': str(output.index.min()),
    'output_date_range_end': str(output.index.max()),
    'output_rows': int(len(output))
}

# Free-text change log relative to the previous attempt
_design_changes = {
    'output_columns': 'Increased from 1 to 2',
    'column_1_change': 'options_risk_regime_prob (raw) -> options_regime_smooth (EMA post-smoothed)',
    'column_2_added': 'options_gvz_momentum_z (GVZ EMA momentum z-score)',
    'new_optuna_params': ['ema_span', 'gvz_ema_short', 'gvz_ema_long', 'gvz_norm_window'],
    'optuna_objective': 'Sum of MI for both columns (was single-column MI)',
    'autocorr_guard': 'Reject trials with autocorr > 0.98 for either column',
    'optuna_trials': '50 (was 30)'
}

# Training summary; key order is the order written to the JSON file
result = {
    'feature': 'options_market',
    'attempt': 3,
    'timestamp': datetime.now().isoformat(),
    'best_params': {key: _to_builtin(val) for key, val in best_params.items()},
    'optuna_trials_completed': len(study.trials),
    'optuna_best_value': float(best_value),
    'output_shape': list(output.shape),
    'output_columns': list(output.columns),
    'output_stats': col_stats,
    'data_info': _data_info,
    'design_changes_from_attempt_2': _design_changes
}

with open('training_result.json', 'w') as f:
    json.dump(result, f, indent=2)
print('Saved: training_result.json')

print('\n=== Training Complete ===')
print(f'Finished: {datetime.now().isoformat()}')
for _line in (
    '\nFinal summary:',
    '  Feature: options_market',
    '  Attempt: 3',
    '  Output: 2 columns (options_regime_smooth, options_gvz_momentum_z)',
    f'  Best Optuna MI sum: {best_value:.6f}',
    f'  Best params: {best_params}',
    f'  Output shape: {output.shape}',
    f'  Output date range: {output.index.min()} to {output.index.max()}',
):
    print(_line)