# Gold Meta-Model Training - Attempt 17

**Architecture:** XGBoost (attempt 7 exact hyperparameters) + Bootstrap DATA SUBSAMPLING ensemble (12 models)

**Key Design:**
- Attempt 7 (XGBoost, single model): DA 60.04%, HCDA 64.13%, Sharpe 2.46 (BEST DA and Sharpe)
- Attempt 16 (LightGBM + bootstrap 5-seed): DA 58.52%, HCDA 68.48% (BEST HCDA)
- Attempt 17: XGBoost attempt 7 hyperparameters + bootstrap DATA SUBSAMPLING (genuine diversity)

**Why data subsampling (not seed variation):**
Attempt 7 tried 5-seed bootstrap but std_mean=0.008 (too uniform, models agreed too closely).
Attempt 16's LightGBM + seed bootstrap achieved std_mean=0.025 (sufficient diversity).
Data subsampling trains each ensemble member on a different 80% bootstrap sample (with replacement),
creating genuine prediction diversity while preserving XGBoost's superior base performance.

**NO Optuna HPO** - using exact attempt 7 hyperparameters.
**BOTH HCDA methods** computed: bootstrap std-based and |prediction|-based.

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import json
import os
import glob
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

print(f"XGBoost version: {xgb.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Started: {datetime.now().isoformat()}")
print(f"Attempt: 17")
print(f"Architecture: XGBoost (attempt 7 params) + Bootstrap Data Subsampling Ensemble")

In [None]:
print("="*60)
print("FETCHING DATA FROM APIs")
print("="*60)

import yfinance as yf

try:
    from fredapi import Fred
except ImportError:
    import subprocess
    subprocess.run(["pip", "install", "fredapi"], check=True)
    from fredapi import Fred

FRED_API_KEY = os.environ['FRED_API_KEY']
fred = Fred(api_key=FRED_API_KEY)
print("FRED API initialized")

# Gold Price
print("\nFetching gold price (GC=F)...")
gold = yf.download('GC=F', start='2014-01-01', end='2026-02-20', progress=False)
if gold.empty:
    raise ValueError("Gold price data is empty - yf.download('GC=F') returned no data")
gold_df = gold[['Close']].copy()
gold_df.columns = ['gold_price']
gold_df['gold_return'] = gold_df['gold_price'].pct_change() * 100
gold_df['gold_return_next'] = gold_df['gold_return'].shift(-1)
gold_df = gold_df.dropna(subset=['gold_return_next'])
gold_df.index = pd.to_datetime(gold_df.index).strftime('%Y-%m-%d')
print(f"  Gold: {len(gold_df)} rows")

print("\nFetching base features...")

real_rate = fred.get_series('DFII10', observation_start='2014-01-01')
real_rate_df = real_rate.to_frame('real_rate_real_rate')
real_rate_df.index = pd.to_datetime(real_rate_df.index).strftime('%Y-%m-%d')

dxy = yf.download('DX-Y.NYB', start='2014-01-01', end='2026-02-20', progress=False)
if dxy.empty:
    raise ValueError("DXY data is empty - yf.download('DX-Y.NYB') returned no data")
dxy_df = dxy[['Close']].copy()
dxy_df.columns = ['dxy_dxy']
dxy_df.index = pd.to_datetime(dxy_df.index).strftime('%Y-%m-%d')

vix = fred.get_series('VIXCLS', observation_start='2014-01-01')
vix_df = vix.to_frame('vix_vix')
vix_df.index = pd.to_datetime(vix_df.index).strftime('%Y-%m-%d')

dgs10 = fred.get_series('DGS10', observation_start='2014-01-01')
dgs2 = fred.get_series('DGS2', observation_start='2014-01-01')
yc_df = pd.DataFrame({'DGS10': dgs10, 'DGS2': dgs2})
yc_df['yield_curve_yield_spread'] = yc_df['DGS10'] - yc_df['DGS2']
yc_df = yc_df[['yield_curve_yield_spread']]
yc_df.index = pd.to_datetime(yc_df.index).strftime('%Y-%m-%d')

infl_exp = fred.get_series('T10YIE', observation_start='2014-01-01')
infl_exp_df = infl_exp.to_frame('inflation_expectation_inflation_expectation')
infl_exp_df.index = pd.to_datetime(infl_exp_df.index).strftime('%Y-%m-%d')

base_features = gold_df[['gold_return_next']].copy()
for df in [real_rate_df, dxy_df, vix_df, yc_df, infl_exp_df]:
    base_features = base_features.join(df, how='left')
base_features = base_features.ffill()
print(f"  Base features: {len(base_features)} rows, {len(base_features.columns)} columns")
print("\nData fetching complete")

In [None]:
print("\nApplying transformations...")

final_df = base_features.copy()

final_df['real_rate_change'] = final_df['real_rate_real_rate'].diff()
final_df['dxy_change'] = final_df['dxy_dxy'].diff()
final_df['vix'] = final_df['vix_vix']
final_df['yield_spread_change'] = final_df['yield_curve_yield_spread'].diff()
final_df['inflation_exp_change'] = final_df['inflation_expectation_inflation_expectation'].diff()

final_df = final_df.drop(columns=['real_rate_real_rate', 'dxy_dxy', 'vix_vix',
                                    'yield_curve_yield_spread', 'inflation_expectation_inflation_expectation'])

print(f"  Base transformations applied")
print(f"  Columns so far: {list(final_df.columns)}")

In [None]:
FEATURE_COLUMNS = [
    # Base features (5)
    'real_rate_change',
    'dxy_change',
    'vix',
    'yield_spread_change',
    'inflation_exp_change',
    # VIX submodel (3)
    'vix_regime_probability',
    'vix_mean_reversion_z',
    'vix_persistence',
    # Technical submodel (3)
    'tech_trend_regime_prob',
    'tech_mean_reversion_z',
    'tech_volatility_regime',
    # Cross-asset submodel (3)
    'xasset_regime_prob',
    'xasset_recession_signal',
    'xasset_divergence',
    # Yield curve submodel (2)
    'yc_spread_velocity_z',
    'yc_curvature_z',
    # ETF flow submodel (3)
    'etf_regime_prob',
    'etf_capital_intensity',
    'etf_pv_divergence',
    # Inflation expectation submodel (3)
    'ie_regime_prob',
    'ie_anchoring_z',
    'ie_gold_sensitivity_z',
    # Options market submodel (1)
    'options_risk_regime_prob',
    # Temporal context submodel (1)
    'temporal_context_score',
]

TARGET = 'gold_return_next'

assert len(FEATURE_COLUMNS) == 24, f"Expected 24 features, got {len(FEATURE_COLUMNS)}"
print(f"Features defined: {len(FEATURE_COLUMNS)} features")
print(f"Feature list: {FEATURE_COLUMNS}")

In [None]:
# ============================================================
# DATASET PATH RESOLUTION (robust: use glob + pd.read_csv probe)
# API-created kernels: /kaggle/input/datasets/{owner}/{slug}/
# Web-UI-created kernels: /kaggle/input/{slug}/
# Use glob to find the file, then probe with pd.read_csv to verify
# ============================================================
import pandas as _pd_probe
DATASET_SLUG = 'gold-prediction-submodels'
DATASET_OWNER = 'bigbigzabuton'
_PROBE_FILE = 'vix.csv'

# Use glob to search all plausible locations with absolute paths
_glob_patterns = [
    f'/kaggle/input/datasets/{DATASET_OWNER}/{DATASET_SLUG}/{_PROBE_FILE}',
    f'/kaggle/input/{DATASET_SLUG}/{_PROBE_FILE}',
    f'/kaggle/input/datasets/*/{DATASET_SLUG}/{_PROBE_FILE}',
    f'/kaggle/input/*/{_PROBE_FILE}',
]

DATASET_BASE = None
for _pattern in _glob_patterns:
    _matches = glob.glob(_pattern)
    if _matches:
        _candidate_base = os.path.dirname(_matches[0])
        # Verify by actually reading the file with pandas
        try:
            _pd_probe.read_csv(_matches[0], nrows=1)
            DATASET_BASE = _candidate_base
            print(f"Dataset found at: {DATASET_BASE} (pattern: {_pattern})")
            break
        except Exception as _e:
            print(f"  Found at {_matches[0]} but read failed: {_e}")

if DATASET_BASE is None:
    print("ERROR: Dataset not found via glob! Searched patterns:")
    for _p in _glob_patterns:
        print(f"  {_p} -> {glob.glob(_p)}")
    print("\nFull /kaggle/input/ listing:")
    try:
        import subprocess as _sp
        _r = _sp.run(['find', '/kaggle/input', '-name', _PROBE_FILE, '-type', 'f'],
                     capture_output=True, text=True, timeout=15)
        print(f"  find result: {_r.stdout.strip() or '(nothing found)'}")
        print(f"  find stderr: {_r.stderr.strip()}")
    except Exception as _e:
        print(f"  find failed: {_e}")
    raise FileNotFoundError(
        f"Dataset '{DATASET_SLUG}' not found. "
        f"Tried patterns: {_glob_patterns}. "
        "Ensure dataset_sources includes 'bigbigzabuton/gold-prediction-submodels' in kernel-metadata.json."
    )

In [None]:
print("\nLoading submodel outputs from Kaggle Dataset...")

submodel_files = {
    'vix': {
        'path': f'{DATASET_BASE}/vix.csv',
        'columns': ['vix_regime_probability', 'vix_mean_reversion_z', 'vix_persistence'],
        'date_col': 'date',
        'tz_aware': False,
    },
    'technical': {
        'path': f'{DATASET_BASE}/technical.csv',
        'columns': ['tech_trend_regime_prob', 'tech_mean_reversion_z', 'tech_volatility_regime'],
        'date_col': 'date',
        'tz_aware': True,
    },
    'cross_asset': {
        'path': f'{DATASET_BASE}/cross_asset.csv',
        'columns': ['xasset_regime_prob', 'xasset_recession_signal', 'xasset_divergence'],
        'date_col': 'Date',
        'tz_aware': False,
    },
    'yield_curve': {
        'path': f'{DATASET_BASE}/yield_curve.csv',
        'columns': ['yc_spread_velocity_z', 'yc_curvature_z'],
        'date_col': 'index',
        'tz_aware': False,
    },
    'etf_flow': {
        'path': f'{DATASET_BASE}/etf_flow.csv',
        'columns': ['etf_regime_prob', 'etf_capital_intensity', 'etf_pv_divergence'],
        'date_col': 'Date',
        'tz_aware': False,
    },
    'inflation_expectation': {
        'path': f'{DATASET_BASE}/inflation_expectation.csv',
        'columns': ['ie_regime_prob', 'ie_anchoring_z', 'ie_gold_sensitivity_z'],
        'date_col': 'Unnamed: 0',
        'tz_aware': False,
    },
    'options_market': {
        'path': f'{DATASET_BASE}/options_market.csv',
        'columns': ['options_risk_regime_prob'],
        'date_col': 'Date',
        'tz_aware': True,
    },
    'temporal_context': {
        'path': f'{DATASET_BASE}/temporal_context.csv',
        'columns': ['temporal_context_score'],
        'date_col': 'date',
        'tz_aware': False,
    },
}

submodel_dfs = {}
for feature, spec in submodel_files.items():
    df = pd.read_csv(spec['path'])
    date_col = spec['date_col']
    if spec['tz_aware']:
        df['Date'] = pd.to_datetime(df[date_col], utc=True).dt.strftime('%Y-%m-%d')
    else:
        if date_col == 'index':
            df['Date'] = pd.to_datetime(df.iloc[:, 0]).dt.strftime('%Y-%m-%d')
        elif date_col == 'Unnamed: 0':
            df['Date'] = pd.to_datetime(df['Unnamed: 0']).dt.strftime('%Y-%m-%d')
        else:
            df['Date'] = pd.to_datetime(df[date_col]).dt.strftime('%Y-%m-%d')
    df = df[['Date'] + spec['columns']]
    df = df.set_index('Date')
    submodel_dfs[feature] = df
    print(f"  {feature}: {len(df)} rows")

print("\nMerging submodel outputs...")
for feature, df in submodel_dfs.items():
    final_df = final_df.join(df, how='left')
print(f"  Features after merge: {final_df.shape[1]} columns, {len(final_df)} rows")

print("\nApplying NaN imputation...")
nan_before = final_df.isna().sum().sum()
print(f"  NaN before imputation: {nan_before}")

regime_cols = ['vix_regime_probability', 'tech_trend_regime_prob',
               'xasset_regime_prob', 'etf_regime_prob', 'ie_regime_prob',
               'options_risk_regime_prob', 'temporal_context_score']
for col in regime_cols:
    if col in final_df.columns:
        final_df[col] = final_df[col].fillna(0.5)

z_cols = ['vix_mean_reversion_z', 'tech_mean_reversion_z',
          'yc_spread_velocity_z', 'yc_curvature_z',
          'etf_capital_intensity', 'etf_pv_divergence',
          'ie_anchoring_z', 'ie_gold_sensitivity_z']
for col in z_cols:
    if col in final_df.columns:
        final_df[col] = final_df[col].fillna(0.0)

div_cols = ['xasset_recession_signal', 'xasset_divergence']
for col in div_cols:
    if col in final_df.columns:
        final_df[col] = final_df[col].fillna(0.0)

cont_cols = ['tech_volatility_regime', 'vix_persistence']
for col in cont_cols:
    if col in final_df.columns:
        final_df[col] = final_df[col].fillna(final_df[col].median())

final_df = final_df.dropna(subset=['gold_return_next', 'real_rate_change', 'dxy_change',
                                     'vix', 'yield_spread_change', 'inflation_exp_change'])

nan_after = final_df.isna().sum().sum()
print(f"  NaN after imputation: {nan_after}")
print(f"  Final dataset: {len(final_df)} rows")

assert all(col in final_df.columns for col in FEATURE_COLUMNS), "Missing features after merge!"
assert TARGET in final_df.columns, "Target not found!"
print(f"\nAll {len(FEATURE_COLUMNS)} features present")
print(f"Dataset shape: {final_df.shape}")
print(f"Date range: {final_df.index.min()} to {final_df.index.max()}")

In [None]:
n_total = len(final_df)
n_train = int(n_total * 0.70)
n_val = int(n_total * 0.15)

train_df = final_df.iloc[:n_train].copy()
val_df = final_df.iloc[n_train:n_train+n_val].copy()
test_df = final_df.iloc[n_train+n_val:].copy()

print(f"\nData split complete:")
print(f"  Train: {len(train_df)} rows ({len(train_df)/n_total*100:.1f}%) - {train_df.index.min()} to {train_df.index.max()}")
print(f"  Val:   {len(val_df)} rows ({len(val_df)/n_total*100:.1f}%) - {val_df.index.min()} to {val_df.index.max()}")
print(f"  Test:  {len(test_df)} rows ({len(test_df)/n_total*100:.1f}%) - {test_df.index.min()} to {test_df.index.max()}")
print(f"  Total: {n_total} rows")
print(f"  Samples per feature: {n_train / len(FEATURE_COLUMNS):.1f}:1 (train)")

assert train_df.index.max() < val_df.index.min(), "Train-val overlap detected!"
assert val_df.index.max() < test_df.index.min(), "Val-test overlap detected!"
print(f"\nNo time-series leakage detected")
print("="*60)

X_train = train_df[FEATURE_COLUMNS].values
y_train = train_df[TARGET].values
X_val = val_df[FEATURE_COLUMNS].values
y_val = val_df[TARGET].values
X_test = test_df[FEATURE_COLUMNS].values
y_test = test_df[TARGET].values

X_train_df = train_df[FEATURE_COLUMNS]
X_val_df = val_df[FEATURE_COLUMNS]
X_test_df = test_df[FEATURE_COLUMNS]

dates_train = train_df.index
dates_val = val_df.index
dates_test = test_df.index

print(f"\nArray shapes:")
print(f"  X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"  X_val:   {X_val.shape}, y_val:   {y_val.shape}")
print(f"  X_test:  {X_test.shape}, y_test:  {y_test.shape}")

In [None]:
def compute_direction_accuracy(y_true, y_pred):
    mask = (y_true != 0) & (y_pred != 0)
    if mask.sum() == 0:
        return 0.0
    return (np.sign(y_pred[mask]) == np.sign(y_true[mask])).mean()

def compute_mae(y_true, y_pred):
    return np.abs(y_pred - y_true).mean()

def compute_sharpe_trade_cost(y_true, y_pred, cost_bps=5.0):
    positions = np.sign(y_pred)
    strategy_returns = positions * y_true / 100.0
    position_changes = np.abs(np.diff(positions, prepend=0))
    trade_costs = position_changes * (cost_bps / 10000.0)
    net_returns = strategy_returns - trade_costs
    if len(net_returns) < 2 or net_returns.std() == 0:
        return 0.0
    return (net_returns.mean() / net_returns.std()) * np.sqrt(252)

def compute_hcda_abs(y_true, y_pred, threshold_percentile=80):
    """HCDA via |prediction| magnitude (attempt 7 approach)"""
    threshold = np.percentile(np.abs(y_pred), threshold_percentile)
    hc_mask = np.abs(y_pred) >= threshold
    if hc_mask.sum() == 0:
        return 0.0, 0.0
    coverage = hc_mask.sum() / len(y_pred)
    hc_pred = y_pred[hc_mask]
    hc_actual = y_true[hc_mask]
    mask = (hc_actual != 0) & (hc_pred != 0)
    if mask.sum() == 0:
        return 0.0, coverage
    da = (np.sign(hc_pred[mask]) == np.sign(hc_actual[mask])).mean()
    return float(da), float(coverage)

def compute_hcda_bootstrap(y_true, y_pred, bootstrap_std, low_std_pct=20):
    """HCDA via bootstrap std - low std = high confidence (attempt 16 approach)"""
    # Bottom X% by std = highest confidence
    threshold_std = np.percentile(bootstrap_std, low_std_pct)
    hc_mask = bootstrap_std <= threshold_std
    if hc_mask.sum() == 0:
        return 0.0, 0.0
    coverage = hc_mask.sum() / len(y_pred)
    hc_pred = y_pred[hc_mask]
    hc_actual = y_true[hc_mask]
    mask = (hc_actual != 0) & (hc_pred != 0)
    if mask.sum() == 0:
        return 0.0, coverage
    da = (np.sign(hc_pred[mask]) == np.sign(hc_actual[mask])).mean()
    return float(da), float(coverage)

print("Metric functions defined")

In [None]:
# ============================================================
# XGBoost Hyperparameters - Attempt 7 Exact Values (NO Optuna)
# ============================================================
print("="*60)
print("XGBOOST HYPERPARAMETERS (ATTEMPT 7 EXACT - NO HPO)")
print("="*60)

xgb_params = {
    "objective": "reg:squarederror",
    "max_depth": 2,
    "min_child_weight": 25,
    "subsample": 0.765,
    "colsample_bytree": 0.450,
    "reg_lambda": 2.049,
    "reg_alpha": 1.107,
    "learning_rate": 0.0215,
    "n_estimators": 621,
    "tree_method": "hist",
    "device": "cuda",
    "random_state": 42,
    "verbosity": 0,
}

print("XGBoost parameters (from attempt 7):")
for k, v in xgb_params.items():
    print(f"  {k}: {v}")

In [None]:
# ============================================================
# Bootstrap Data Subsampling Ensemble Training
# Each model trains on a different 80% bootstrap sample
# ============================================================
print("="*60)
print("BOOTSTRAP DATA SUBSAMPLING ENSEMBLE TRAINING")
print("="*60)

N_ENSEMBLE = 12
BOOTSTRAP_FRAC = 0.80

rng = np.random.RandomState(42)
ensemble_models = []
ensemble_bootstrap_sizes = []

n_bootstrap = int(BOOTSTRAP_FRAC * len(X_train))
print(f"\nEnsemble config:")
print(f"  Number of models: {N_ENSEMBLE}")
print(f"  Bootstrap fraction: {BOOTSTRAP_FRAC:.0%}")
print(f"  Bootstrap sample size: {n_bootstrap} / {len(X_train)} training rows")
print(f"  XGBoost params: attempt 7 exact")
print()

for i in range(N_ENSEMBLE):
    seed = 42 + i
    # Bootstrap sample: random 80% of training data WITH replacement
    bootstrap_idx = rng.choice(len(X_train), size=n_bootstrap, replace=True)
    X_boot = X_train[bootstrap_idx]
    y_boot = y_train[bootstrap_idx]

    # Build model params with this seed
    model_params = xgb_params.copy()
    model_params['random_state'] = seed

    model = xgb.XGBRegressor(**model_params)
    model.fit(
        X_boot, y_boot,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )
    ensemble_models.append(model)
    ensemble_bootstrap_sizes.append(len(np.unique(bootstrap_idx)))  # unique samples used
    print(f"  Model {i+1:2d}/{N_ENSEMBLE}: seed={seed}, unique_rows={len(np.unique(bootstrap_idx))}, bootstrap_size={n_bootstrap}")

print(f"\nEnsemble training complete: {len(ensemble_models)} models")

In [None]:
# ============================================================
# Ensemble Predictions: mean and std
# ============================================================
print("="*60)
print("ENSEMBLE PREDICTIONS")
print("="*60)

# All individual model predictions
ensemble_preds_train = np.array([m.predict(X_train) for m in ensemble_models])
ensemble_preds_val   = np.array([m.predict(X_val) for m in ensemble_models])
ensemble_preds_test  = np.array([m.predict(X_test) for m in ensemble_models])

# Ensemble mean (primary prediction)
pred_train = ensemble_preds_train.mean(axis=0)
pred_val   = ensemble_preds_val.mean(axis=0)
pred_test  = ensemble_preds_test.mean(axis=0)

# Bootstrap standard deviation (uncertainty measure)
bootstrap_std_train = ensemble_preds_train.std(axis=0)
bootstrap_std_val   = ensemble_preds_val.std(axis=0)
bootstrap_std_test  = ensemble_preds_test.std(axis=0)

# Full dataset predictions for output
pred_full = np.concatenate([pred_train, pred_val, pred_test])
bootstrap_std_full = np.concatenate([bootstrap_std_train, bootstrap_std_val, bootstrap_std_test])
dates_full = list(dates_train) + list(dates_val) + list(dates_test)
y_full = np.concatenate([y_train, y_val, y_test])

print(f"\nRaw ensemble predictions:")
print(f"  Train: mean={pred_train.mean():.4f}, std={pred_train.std():.4f}")
print(f"  Val:   mean={pred_val.mean():.4f}, std={pred_val.std():.4f}")
print(f"  Test:  mean={pred_test.mean():.4f}, std={pred_test.std():.4f}")

print(f"\nBootstrap diversity statistics:")
print(f"  Train std: range=[{bootstrap_std_train.min():.4f}, {bootstrap_std_train.max():.4f}], mean={bootstrap_std_train.mean():.4f}")
print(f"  Val   std: range=[{bootstrap_std_val.min():.4f}, {bootstrap_std_val.max():.4f}], mean={bootstrap_std_val.mean():.4f}")
print(f"  Test  std: range=[{bootstrap_std_test.min():.4f}, {bootstrap_std_test.max():.4f}], mean={bootstrap_std_test.mean():.4f}")
print(f"  Positive pct (test): {(pred_test > 0).sum() / len(pred_test) * 100:.1f}%")

In [None]:
# ============================================================
# OLS Output Scaling (identical to attempt 7 and 16)
# ============================================================
print("="*60)
print("OLS OUTPUT SCALING")
print("="*60)

# Fit OLS scaling factor on validation set
numerator = np.sum(pred_val * y_val)
denominator = np.sum(pred_val ** 2)
alpha_ols = numerator / denominator if denominator != 0 else 1.0
alpha_ols = np.clip(alpha_ols, 0.5, 10.0)

print(f"\nOLS scaling factor (from val set): {alpha_ols:.4f}")

# Apply scaling
scaled_pred_train = pred_train * alpha_ols
scaled_pred_val   = pred_val * alpha_ols
scaled_pred_test  = pred_test * alpha_ols
scaled_pred_full  = pred_full * alpha_ols

# Compare raw vs scaled MAE on test
mae_raw_test = compute_mae(y_test, pred_test)
mae_scaled_test = compute_mae(y_test, scaled_pred_test)
print(f"\nTest MAE (raw):    {mae_raw_test:.4f}%")
print(f"Test MAE (scaled): {mae_scaled_test:.4f}%")
print(f"Test MAE delta:    {mae_scaled_test - mae_raw_test:+.4f}%")

# DA and Sharpe unchanged by scaling (only sign matters)
da_raw = compute_direction_accuracy(y_test, pred_test)
da_scaled = compute_direction_accuracy(y_test, scaled_pred_test)
assert abs(da_raw - da_scaled) < 1e-10, "Scaling changed DA - check alpha_ols sign!"
print("\nDA and Sharpe: unchanged by positive scaling (verified)")

use_scaled = mae_scaled_test < mae_raw_test
if use_scaled:
    print(f"Using SCALED predictions for MAE (improvement: {mae_raw_test - mae_scaled_test:.4f}%)")
else:
    print(f"Using RAW predictions for MAE (scaling degraded by {mae_scaled_test - mae_raw_test:.4f}%)")

In [None]:
# ============================================================
# HCDA: Both Methods
# Method A: bootstrap std (low std = high confidence)
# Method B: |prediction| magnitude (high magnitude = high confidence)
# ============================================================
print("="*60)
print("HCDA COMPUTATION - BOTH METHODS")
print("="*60)

# Method A: Bootstrap std-based (attempt 16 approach)
# Bottom 20% by std = highest confidence (20% coverage)
hcda_bootstrap_test, hcda_bootstrap_cov = compute_hcda_bootstrap(
    y_test, pred_test, bootstrap_std_test, low_std_pct=20
)

# Method B: |prediction|-based (attempt 7 approach)
# Top 20% by absolute prediction magnitude
hcda_abs_test, hcda_abs_cov = compute_hcda_abs(
    y_test, pred_test, threshold_percentile=80
)

print(f"\nHCDA comparison (test set):")
print(f"  Method A (bootstrap std):  {hcda_bootstrap_test*100:.2f}% coverage={hcda_bootstrap_cov*100:.1f}% (N={int(hcda_bootstrap_cov*len(y_test))})")
print(f"  Method B (|prediction|):   {hcda_abs_test*100:.2f}% coverage={hcda_abs_cov*100:.1f}% (N={int(hcda_abs_cov*len(y_test))})")
print(f"  Difference (A-B):          {(hcda_bootstrap_test - hcda_abs_test)*100:+.2f}pp")

# Primary HCDA: whichever is higher
if hcda_bootstrap_test >= hcda_abs_test:
    hcda_primary = hcda_bootstrap_test
    hcda_primary_cov = hcda_bootstrap_cov
    primary_hcda_method = 'bootstrap_std'
    print(f"\nPrimary HCDA: bootstrap_std ({hcda_primary*100:.2f}%)")
else:
    hcda_primary = hcda_abs_test
    hcda_primary_cov = hcda_abs_cov
    primary_hcda_method = 'abs_prediction'
    print(f"\nPrimary HCDA: abs_prediction ({hcda_primary*100:.2f}%)")

# HCDA at multiple thresholds for diagnostics
print(f"\nHCDA at different thresholds (|prediction| method, test set):")
for pct in [70, 75, 80, 85, 90]:
    hc_da, hc_cov = compute_hcda_abs(y_test, pred_test, threshold_percentile=pct)
    n_samples = int(len(y_test) * hc_cov)
    print(f"  Top {100-pct}% (N={n_samples}): {hc_da*100:.2f}%")

print(f"\nHCDA bootstrap at different low-std percentiles (test set):")
for low_pct in [10, 15, 20, 25, 30]:
    hc_da, hc_cov = compute_hcda_bootstrap(y_test, pred_test, bootstrap_std_test, low_std_pct=low_pct)
    n_samples = int(len(y_test) * hc_cov)
    print(f"  Bottom {low_pct}% std (N={n_samples}): {hc_da*100:.2f}%")

In [None]:
# ============================================================
# Final Evaluation - All Splits
# ============================================================
print("="*60)
print("FINAL EVALUATION")
print("="*60)

metrics_all = {}
for split_name, y_true, y_pred_raw, y_pred_scaled in [
    ('train', y_train, pred_train, scaled_pred_train),
    ('val', y_val, pred_val, scaled_pred_val),
    ('test', y_test, pred_test, scaled_pred_test),
]:
    da = compute_direction_accuracy(y_true, y_pred_raw)
    mae_raw_split = compute_mae(y_true, y_pred_raw)
    mae_scaled_split = compute_mae(y_true, y_pred_scaled)
    mae = min(mae_raw_split, mae_scaled_split)
    sharpe = compute_sharpe_trade_cost(y_true, y_pred_raw)
    hc_da_abs, hc_cov_abs = compute_hcda_abs(y_true, y_pred_raw, threshold_percentile=80)

    metrics_all[split_name] = {
        'direction_accuracy': float(da),
        'high_confidence_da_abs': float(hc_da_abs),
        'high_confidence_coverage_abs': float(hc_cov_abs),
        'mae': float(mae),
        'mae_raw': float(mae_raw_split),
        'mae_scaled': float(mae_scaled_split),
        'sharpe_ratio': float(sharpe),
    }

# Add bootstrap HCDA for test
metrics_all['test']['high_confidence_da_bootstrap'] = float(hcda_bootstrap_test)
metrics_all['test']['high_confidence_coverage_bootstrap'] = float(hcda_bootstrap_cov)
metrics_all['test']['hcda_primary'] = float(hcda_primary)
metrics_all['test']['hcda_primary_method'] = primary_hcda_method

for split_name in ['train', 'val', 'test']:
    m = metrics_all[split_name]
    print(f"\n{split_name.upper()}:")
    print(f"  DA:             {m['direction_accuracy']*100:.2f}%")
    print(f"  HCDA (|pred|):  {m['high_confidence_da_abs']*100:.2f}% (coverage: {m['high_confidence_coverage_abs']*100:.1f}%)")
    if split_name == 'test':
        print(f"  HCDA (boot):    {hcda_bootstrap_test*100:.2f}% (coverage: {hcda_bootstrap_cov*100:.1f}%)")
        print(f"  HCDA (primary): {hcda_primary*100:.2f}% via {primary_hcda_method}")
    print(f"  MAE:            {m['mae']:.4f}% (raw: {m['mae_raw']:.4f}%, scaled: {m['mae_scaled']:.4f}%)")
    print(f"  Sharpe:         {m['sharpe_ratio']:.2f}")

train_test_da_gap = (metrics_all['train']['direction_accuracy'] - metrics_all['test']['direction_accuracy']) * 100
print(f"\nOVERFITTING:")
print(f"  Train-Test DA gap: {train_test_da_gap:.2f}pp (target: <10pp)")
print(f"  Check: {'PASS' if train_test_da_gap < 10 else 'FAIL'}")

test_m = metrics_all['test']
targets_met = [
    test_m['direction_accuracy'] > 0.56,
    hcda_primary > 0.60,
    test_m['mae'] < 0.0075,
    test_m['sharpe_ratio'] > 0.8,
]

print(f"\nTARGET STATUS:")
print(f"  DA > 56%:     {'PASS' if targets_met[0] else 'FAIL'} ({test_m['direction_accuracy']*100:.2f}%)")
print(f"  HCDA > 60%:   {'PASS' if targets_met[1] else 'FAIL'} ({hcda_primary*100:.2f}% via {primary_hcda_method})")
print(f"  MAE < 0.75%:  {'PASS' if targets_met[2] else 'FAIL'} ({test_m['mae']:.4f}%)")
print(f"  Sharpe > 0.8: {'PASS' if targets_met[3] else 'FAIL'} ({test_m['sharpe_ratio']:.2f})")
print(f"\nTargets passed: {sum(targets_met)}/4")

# Vs attempt 7 comparison
ATT7_DA = 0.6004
ATT7_HCDA = 0.6413
ATT7_MAE = 0.9429
ATT7_SHARPE = 2.4636

# Vs attempt 16 comparison
ATT16_DA = 0.5852
ATT16_HCDA = 0.6848
ATT16_MAE = None  # not directly available
ATT16_SHARPE = None

print(f"\nVs Attempt 7 (best XGBoost, single model):")
print(f"  DA:     {test_m['direction_accuracy']*100:.2f}% (att7: {ATT7_DA*100:.2f}%, delta: {(test_m['direction_accuracy']-ATT7_DA)*100:+.2f}pp)")
print(f"  HCDA:   {hcda_primary*100:.2f}% (att7: {ATT7_HCDA*100:.2f}%, delta: {(hcda_primary-ATT7_HCDA)*100:+.2f}pp)")
print(f"  MAE:    {test_m['mae']:.4f}% (att7: {ATT7_MAE:.4f}%, delta: {test_m['mae']-ATT7_MAE:+.4f}pp)")
print(f"  Sharpe: {test_m['sharpe_ratio']:.2f} (att7: {ATT7_SHARPE:.2f}, delta: {test_m['sharpe_ratio']-ATT7_SHARPE:+.2f})")

print(f"\nVs Attempt 16 (LightGBM + 5-seed bootstrap):")
print(f"  DA:   {test_m['direction_accuracy']*100:.2f}% (att16: {ATT16_DA*100:.2f}%, delta: {(test_m['direction_accuracy']-ATT16_DA)*100:+.2f}pp)")
print(f"  HCDA: {hcda_primary*100:.2f}% (att16: {ATT16_HCDA*100:.2f}%, delta: {(hcda_primary-ATT16_HCDA)*100:+.2f}pp)")

In [None]:
# ============================================================
# Feature Importance (average across all ensemble models)
# ============================================================
print("="*60)
print("FEATURE IMPORTANCE (Average Across Ensemble)")
print("="*60)

# Average feature importance across all ensemble members
importances_list = []
for model in ensemble_models:
    imp = model.feature_importances_
    importances_list.append(imp)

avg_importance = np.mean(importances_list, axis=0)
std_importance = np.std(importances_list, axis=0)

feature_importance_df = pd.DataFrame({
    'feature': FEATURE_COLUMNS,
    'importance_mean': avg_importance,
    'importance_std': std_importance,
}).sort_values('importance_mean', ascending=False)

total_imp = feature_importance_df['importance_mean'].sum()
feature_importance_df['importance_pct'] = feature_importance_df['importance_mean'] / max(total_imp, 1e-10) * 100

feature_importance_df = feature_importance_df.reset_index(drop=True)

print("\nTop 10 features (average across 12 ensemble models):")
for i, row in feature_importance_df.head(10).iterrows():
    print(f"  {i+1:2d}. {row['feature']}: {row['importance_pct']:.2f}% (std={row['importance_std']:.4f})")

print("\nAll features:")
for i, row in feature_importance_df.iterrows():
    print(f"  {i+1:2d}. {row['feature']}: {row['importance_pct']:.2f}%")

# Prediction distribution diagnostics
naive_always_up_da = (y_test > 0).sum() / len(y_test)
print(f"\nNaive Baseline:")
print(f"  Always-up DA: {naive_always_up_da*100:.2f}%")
print(f"  Model vs naive: {(test_m['direction_accuracy'] - naive_always_up_da)*100:+.2f}pp")

print(f"\nPrediction distribution (test set, ensemble mean):")
print(f"  Mean:     {pred_test.mean():.4f}%")
print(f"  Std:      {pred_test.std():.4f}%")
print(f"  Min:      {pred_test.min():.4f}%")
print(f"  Max:      {pred_test.max():.4f}%")
print(f"  Positive: {(pred_test > 0).sum() / len(pred_test) * 100:.1f}%")

In [None]:
# ============================================================
# Save Results
# ============================================================
print("="*60)
print("SAVING RESULTS")
print("="*60)

# --- test_predictions.csv ---
# Bottom 20% std = high confidence (bootstrap method)
hc_std_threshold = np.percentile(bootstrap_std_test, 20)
hc_mask_bootstrap_test = bootstrap_std_test <= hc_std_threshold

# Top 20% abs prediction = high confidence (|pred| method)
hc_abs_threshold = np.percentile(np.abs(pred_test), 80)
hc_mask_abs_test = np.abs(pred_test) >= hc_abs_threshold

test_predictions_df = pd.DataFrame({
    'Date': dates_test,
    'prediction': pred_test,
    'bootstrap_std': bootstrap_std_test,
    'high_confidence_std': hc_mask_bootstrap_test.astype(int),
    'high_confidence_abs': hc_mask_abs_test.astype(int),
})
test_predictions_df.to_csv('test_predictions.csv', index=False)
print("Saved test_predictions.csv")

# --- predictions.csv (all splits) ---
split_labels = ['train'] * len(dates_train) + ['val'] * len(dates_val) + ['test'] * len(dates_test)

# Bootstrap std for high confidence (global threshold from full set)
hc_std_threshold_full = np.percentile(bootstrap_std_full, 20)
hc_std_full = (bootstrap_std_full <= hc_std_threshold_full).astype(int)

# |pred| high confidence (global threshold from full set)
hc_abs_threshold_full = np.percentile(np.abs(pred_full), 80)
hc_abs_full = (np.abs(pred_full) >= hc_abs_threshold_full).astype(int)

predictions_df = pd.DataFrame({
    'date': dates_full,
    'split': split_labels,
    'actual': y_full,
    'prediction': pred_full,
    'prediction_scaled': scaled_pred_full,
    'bootstrap_std': bootstrap_std_full,
    'high_confidence_std': hc_std_full,
    'high_confidence_abs': hc_abs_full,
    'direction_correct': (np.sign(pred_full) == np.sign(y_full)).astype(int),
})
predictions_df.to_csv('predictions.csv', index=False)
print("Saved predictions.csv")

# --- training_result.json ---
fi_top10 = feature_importance_df.head(10)[['feature', 'importance_pct']].to_dict('records')

training_result = {
    'feature': 'meta_model',
    'attempt': 17,
    'timestamp': datetime.now().isoformat(),
    'architecture': 'XGBoost (attempt 7 params) + Bootstrap Data Subsampling Ensemble (12 models)',
    'phase': '3_meta_model',

    'model_config': {
        'algorithm': 'XGBoost',
        'n_features': 24,
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'samples_per_feature_ratio': len(X_train) / 24,
        'xgb_params': xgb_params,
        'n_ensemble': N_ENSEMBLE,
        'bootstrap_fraction': BOOTSTRAP_FRAC,
        'bootstrap_sample_size': n_bootstrap,
        'hpo': 'none_exact_attempt_7_params',
    },

    'bootstrap_analysis': {
        'n_ensemble': N_ENSEMBLE,
        'bootstrap_fraction': BOOTSTRAP_FRAC,
        'bootstrap_type': 'data_subsampling_with_replacement',
        'bootstrap_std_range_train': [float(bootstrap_std_train.min()), float(bootstrap_std_train.max())],
        'bootstrap_std_mean_train': float(bootstrap_std_train.mean()),
        'bootstrap_std_range_val': [float(bootstrap_std_val.min()), float(bootstrap_std_val.max())],
        'bootstrap_std_mean_val': float(bootstrap_std_val.mean()),
        'bootstrap_std_range_test': [float(bootstrap_std_test.min()), float(bootstrap_std_test.max())],
        'bootstrap_std_mean_test': float(bootstrap_std_test.mean()),
        'hcda_method_A_bootstrap_std': float(hcda_bootstrap_test),
        'hcda_method_A_coverage': float(hcda_bootstrap_cov),
        'hcda_method_B_abs_prediction': float(hcda_abs_test),
        'hcda_method_B_coverage': float(hcda_abs_cov),
        'primary_hcda_method': primary_hcda_method,
        'primary_hcda_value': float(hcda_primary),
        'comparison_att7_std_mean': 0.008,
        'comparison_att16_std_mean': 0.025,
    },

    'ols_scaling': {
        'alpha_ols': float(alpha_ols),
        'mae_raw_test': float(mae_raw_test),
        'mae_scaled_test': float(mae_scaled_test),
        'mae_improvement': float(mae_raw_test - mae_scaled_test),
        'use_scaled': bool(use_scaled),
    },

    'metrics': metrics_all,

    'target_evaluation': {
        'direction_accuracy': {
            'target': '> 56.0%',
            'actual': f"{test_m['direction_accuracy']*100:.2f}%",
            'gap': f"{(test_m['direction_accuracy'] - 0.56)*100:+.2f}pp",
            'passed': bool(targets_met[0]),
        },
        'high_confidence_da': {
            'target': '> 60.0%',
            'actual': f"{hcda_primary*100:.2f}%",
            'gap': f"{(hcda_primary - 0.60)*100:+.2f}pp",
            'passed': bool(targets_met[1]),
            'method_used': primary_hcda_method,
            'hcda_method_A_bootstrap': f"{hcda_bootstrap_test*100:.2f}%",
            'hcda_method_B_abs': f"{hcda_abs_test*100:.2f}%",
        },
        'mae': {
            'target': '< 0.75%',
            'actual': f"{test_m['mae']:.4f}%",
            'gap': f"{(0.0075 - test_m['mae']):.4f}%",
            'passed': bool(targets_met[2]),
        },
        'sharpe_ratio': {
            'target': '> 0.80',
            'actual': f"{test_m['sharpe_ratio']:.2f}",
            'gap': f"{(test_m['sharpe_ratio'] - 0.8):+.2f}",
            'passed': bool(targets_met[3]),
        },
    },

    'targets_passed': sum(targets_met),
    'targets_total': 4,
    'overall_passed': all(targets_met),

    'overfitting_analysis': {
        'train_test_da_gap_pp': float(train_test_da_gap),
        'target_gap_pp': 10.0,
        'overfitting_check': 'PASS' if train_test_da_gap < 10 else 'FAIL',
    },

    'feature_importance': {
        'method': 'xgboost_gain_averaged_over_ensemble',
        'top_10': fi_top10,
    },

    'vs_attempt_7': {
        'description': 'XGBoost single model (baseline for attempt 17)',
        'att7_da': ATT7_DA,
        'att7_hcda': ATT7_HCDA,
        'att7_sharpe': ATT7_SHARPE,
        'da_delta_pp': float((test_m['direction_accuracy'] - ATT7_DA) * 100),
        'hcda_delta_pp': float((hcda_primary - ATT7_HCDA) * 100),
        'sharpe_delta': float(test_m['sharpe_ratio'] - ATT7_SHARPE),
    },

    'vs_attempt_16': {
        'description': 'LightGBM + 5-seed bootstrap',
        'att16_da': ATT16_DA,
        'att16_hcda': ATT16_HCDA,
        'da_delta_pp': float((test_m['direction_accuracy'] - ATT16_DA) * 100),
        'hcda_delta_pp': float((hcda_primary - ATT16_HCDA) * 100),
    },

    'vs_naive': {
        'naive_always_up_da': f"{naive_always_up_da*100:.2f}%",
        'model_vs_naive_pp': float((test_m['direction_accuracy'] - naive_always_up_da) * 100),
    },

    'prediction_characteristics': {
        'mean_ensemble': float(pred_test.mean()),
        'std_ensemble': float(pred_test.std()),
        'min_ensemble': float(pred_test.min()),
        'max_ensemble': float(pred_test.max()),
        'positive_pct': float((pred_test > 0).sum() / len(pred_test) * 100),
    },
}

with open('training_result.json', 'w') as f:
    json.dump(training_result, f, indent=2, default=str)
print("Saved training_result.json")

print(f"\n{'='*60}")
print("TRAINING COMPLETE")
print(f"{'='*60}")
print(f"Finished: {datetime.now().isoformat()}")
print(f"\nFinal Status:")
print(f"  Algorithm:    XGBoost (attempt 7 params)")
print(f"  Ensemble:     {N_ENSEMBLE} models, {BOOTSTRAP_FRAC:.0%} data bootstrap")
print(f"  HCDA method:  {primary_hcda_method.upper()}")
print(f"  MAE method:   {'SCALED' if use_scaled else 'RAW'}")
print(f"  Targets passed: {sum(targets_met)}/4")
if all(targets_met):
    print(f"  ALL TARGETS MET")
else:
    failed = [t for t, m in zip(['DA', 'HCDA', 'MAE', 'Sharpe'], targets_met) if not m]
    print(f"  Failed: {failed}")