# Gold Prediction SubModel Training - CNY Demand Proxy Attempt 1

**Self-contained**: Data fetch → Preprocessing → HMM + Z-Scores → Optuna HPO → Save results

**Architecture**: 2D HMM [CNY_return, CNY_vol] + momentum z-score + volatility regime z-score

**Feature**: cny_demand | **Attempt**: 1

Generated by builder_model agent

## 1. Library Installation and Imports

In [None]:
# Install hmmlearn (not pre-installed on Kaggle).
# pip is invoked through the running interpreter (`sys.executable -m pip`) so
# the package is installed into the environment this kernel imports from; a
# bare `pip` found on PATH may belong to a different Python installation.
import subprocess
import sys

subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'hmmlearn'])

print("Libraries installed successfully!")

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from hmmlearn.hmm import GaussianHMM
from sklearn.metrics import mutual_info_score
import optuna
import json
import os
from datetime import datetime

# Pin NumPy's global random state so NumPy-driven randomness repeats across runs
np.random.seed(seed=42)

print("Training started: " + datetime.now().isoformat())

## 2. Data Fetching and Preprocessing

In [None]:
def _extract_close(data, ticker):
    """Return the NaN-free 'Close' series for `ticker` from a downloaded frame.

    Handles both column layouts the download may produce: a (field, ticker)
    MultiIndex or flat field columns.

    Raises:
        RuntimeError: If no non-NaN close prices remain.
    """
    if isinstance(data.columns, pd.MultiIndex):
        close = data['Close'][ticker].copy()
    else:
        close = data['Close'].copy()

    close = close.dropna()
    if close.empty:
        raise RuntimeError(f"{ticker} download contains no valid Close prices")
    return close


def fetch_and_preprocess():
    """
    Fetch and preprocess CNY demand proxy data.
    Self-contained: No external file dependencies.

    Downloads CNY=X and GC=F closes from Yahoo Finance (from 2014-06-01),
    derives daily returns and a 5-day rolling volatility of the CNY return,
    keeps only dates on which every column is available, trims to
    >= 2015-01-30 and splits chronologically 70/15/15.

    Returns:
        tuple: (train_df, val_df, test_df, full_df)
               Each df contains: cny_close, cny_return, cny_vol_5d, gold_return

    Raises:
        RuntimeError: If a download is empty or has no valid closes, no rows
            survive alignment/trimming, a split would be empty, or the
            volatility column is constant or negative.
    """
    # === 1. Fetch CNY=X from Yahoo Finance ===
    # Start date: 2014-06-01 (buffer for 120-day warmup before 2015-01-30)
    print("Fetching CNY=X (CNY/USD exchange rate)...")
    cny_data = yf.download('CNY=X', start='2014-06-01', progress=False)

    if cny_data.empty:
        raise RuntimeError("Failed to fetch CNY=X data from Yahoo Finance")

    cny_close = _extract_close(cny_data, 'CNY=X')
    print(f"CNY=X data fetched: {len(cny_close)} rows from {cny_close.index[0]} to {cny_close.index[-1]}")

    # Validate CNY/USD range (reasonable onshore rate); warn only, do not abort
    if cny_close.min() < 5.5 or cny_close.max() > 8.0:
        print(f"WARNING: CNY/USD range [{cny_close.min():.2f}, {cny_close.max():.2f}] outside expected [5.5, 8.0]")

    # === 2. Fetch GC=F for gold returns ===
    print("Fetching GC=F (Gold Futures) for return computation...")
    gc_data = yf.download('GC=F', start='2014-06-01', progress=False)

    if gc_data.empty:
        raise RuntimeError("Failed to fetch GC=F data from Yahoo Finance")

    gc_close = _extract_close(gc_data, 'GC=F')
    print(f"GC=F data fetched: {len(gc_close)} rows from {gc_close.index[0]} to {gc_close.index[-1]}")

    # === 3. Compute derived quantities ===
    # Each series is differenced on its own calendar before alignment.
    # CNY daily return
    cny_return = cny_close.pct_change()

    # CNY 5-day rolling volatility (for initial HMM input)
    cny_vol_5d = cny_return.rolling(5).std()

    # Gold current-day return (for MI evaluation, not model input)
    gold_return = gc_close.pct_change()

    # === 4. Align dates ===
    # Building the frame from Series unions their indexes; the dropna() below
    # then keeps only dates present in every column (an effective inner join).
    df = pd.DataFrame({
        'cny_close': cny_close,
        'cny_return': cny_return,
        'cny_vol_5d': cny_vol_5d,
        'gold_return': gold_return
    })

    # Drop NaN rows from returns/volatility computation and calendar mismatches
    initial_rows = len(df)
    df = df.dropna()
    print(f"After alignment and NaN removal: {len(df)} rows (dropped {initial_rows - len(df)} rows)")

    if df.empty:
        raise RuntimeError("No overlapping CNY=X / GC=F rows after alignment and NaN removal")

    # === 5. Validate data quality ===
    # Check for extreme CNY returns (managed float should not have |return| > 0.05)
    extreme_returns = (df['cny_return'].abs() > 0.05).sum()
    if extreme_returns > 0:
        print(f"WARNING: {extreme_returns} CNY returns exceed 5% (max: {df['cny_return'].abs().max():.4f})")

    # Check for constant volatility (should be varying)
    if df['cny_vol_5d'].std() < 1e-6:
        raise RuntimeError("CNY volatility is constant (std < 1e-6)")

    # Check for zero/negative volatility (can occur in managed float periods)
    zero_vol_count = (df['cny_vol_5d'] == 0).sum()
    if zero_vol_count > 0:
        print(f"INFO: {zero_vol_count} periods with zero volatility (PBOC fixed rate)")

    if (df['cny_vol_5d'] < 0).any():
        raise RuntimeError("CNY volatility contains negative values (implementation error)")

    # === 6. Trim to base_features date range ===
    # Expected: 2015-01-30 to latest.
    # NOTE: rows before base_start only serve as warm-up for the return and
    # volatility columns computed above; they are not returned to the caller.
    df.index = pd.to_datetime(df.index)
    base_start = pd.Timestamp('2015-01-30')

    if df.index[0] > base_start:
        print(f"WARNING: CNY data starts at {df.index[0]}, later than expected {base_start}")

    df = df[df.index >= base_start]
    print(f"After trimming to base_features range (>= {base_start}): {len(df)} rows")

    # === 7. Data split (70/15/15, time-series order) ===
    n = len(df)
    train_end = int(n * 0.70)
    val_end = int(n * 0.85)

    train_df = df.iloc[:train_end].copy()
    val_df = df.iloc[train_end:val_end].copy()
    test_df = df.iloc[val_end:].copy()

    # Fail with a clear message instead of an IndexError in the prints below
    if min(len(train_df), len(val_df), len(test_df)) == 0:
        raise RuntimeError(f"Not enough rows ({n}) for a 70/15/15 train/val/test split")

    print(f"\nData split:")
    print(f"  Train: {len(train_df)} rows ({train_df.index[0]} to {train_df.index[-1]})")
    print(f"  Val:   {len(val_df)} rows ({val_df.index[0]} to {val_df.index[-1]})")
    print(f"  Test:  {len(test_df)} rows ({test_df.index[0]} to {test_df.index[-1]})")

    # === 8. Summary statistics ===
    print(f"\nSummary statistics (full dataset):")
    print(f"  CNY/USD range: [{df['cny_close'].min():.4f}, {df['cny_close'].max():.4f}]")
    print(f"  CNY return mean: {df['cny_return'].mean():.6f}, std: {df['cny_return'].std():.6f}")
    print(f"  CNY return range: [{df['cny_return'].min():.6f}, {df['cny_return'].max():.6f}]")
    print(f"  CNY vol_5d mean: {df['cny_vol_5d'].mean():.6f}, std: {df['cny_vol_5d'].std():.6f}")
    print(f"  Gold return mean: {df['gold_return'].mean():.6f}, std: {df['gold_return'].std():.6f}")

    # Autocorrelation check (lag 1)
    cny_return_autocorr = df['cny_return'].autocorr(lag=1)
    print(f"  CNY return autocorr(lag=1): {cny_return_autocorr:.4f}")

    return train_df, val_df, test_df, df


# Fetch data
# Runs the download/preprocess pipeline once at notebook scope. The four
# frames are bound as globals: later cells read full_df for feature generation
# and the three split frames for the sample counts in the training report.
train_df, val_df, test_df, full_df = fetch_and_preprocess()
print("\nData fetching complete!")

## 3. Feature Generation Functions

In [None]:
def generate_regime_feature(cny_return, cny_vol, n_components, covariance_type, train_size):
    """
    Fit 2D HMM on [CNY_return, CNY_vol] and return P(highest-return-variance state).
    Best-of-5 random restarts; the restart with the highest training
    log-likelihood is kept.

    Args:
        cny_return: Array of CNY daily returns
        cny_vol: Array of CNY volatility (rolling std), same length as cny_return
        n_components: Number of HMM states (2 or 3)
        covariance_type: 'full' or 'diag'
        train_size: Number of leading rows of the input arrays that form the
            training period. Rows inside that range with a NaN in either
            column are skipped; rows at or after train_size are never used
            for fitting.

    Returns:
        tuple: (regime_prob array, fitted HMM model)
            regime_prob has the same length as cny_return and is NaN wherever
            an input row contained NaN. If every restart fails, the array is
            all NaN and the model is None.
    """
    X = np.column_stack([cny_return, cny_vol])
    valid_mask = ~np.isnan(X).any(axis=1)
    X_valid = X[valid_mask]

    # Select training rows by their position in the ORIGINAL arrays. Slicing
    # X_valid[:train_size] instead would shift the window forward by the number
    # of dropped NaN rows (e.g. the rolling-volatility warm-up) and pull
    # validation-period rows into the fit.
    X_train = X[:train_size][valid_mask[:train_size]]

    best_score = -np.inf
    best_model = None

    # Best-of-5 restarts (hmmlearn lacks n_init parameter)
    for seed in [42, 123, 456, 789, 0]:
        model = GaussianHMM(
            n_components=n_components,
            covariance_type=covariance_type,
            n_iter=100,
            tol=1e-4,
            random_state=seed
        )
        try:
            model.fit(X_train)
            score = model.score(X_train)
            if score > best_score:
                best_score = score
                best_model = model
        except Exception as e:
            # A single failed restart is tolerated; remaining seeds are still tried
            print(f"  HMM seed {seed} failed: {e}")
            continue

    if best_model is None:
        print("WARNING: All HMM seeds failed. Returning NaN.")
        return np.full(len(cny_return), np.nan), None

    # Generate probabilities for valid data
    probs = best_model.predict_proba(X_valid)

    # Identify highest-return-variance state (first dimension = CNY_return).
    # The layout of `covars_` must not be assumed from covariance_type: the
    # attribute may be exposed as one full matrix per state even for 'diag',
    # in which case covars_[i][0] is a row vector and float() on it raises.
    # Flattening each state's entry and taking element 0 yields the return
    # variance for both the full-matrix and the per-state diagonal layouts.
    covars = np.asarray(best_model.covars_)
    state_vars = covars.reshape(n_components, -1)[:, 0]

    high_var_state = int(np.argmax(state_vars))

    # Map back to full array
    result = np.full(len(cny_return), np.nan)
    result[valid_mask] = probs[:, high_var_state]

    return result, best_model


def generate_momentum_feature(cny_return, baseline_window):
    """
    Z-score of the 5-day mean return relative to its own rolling baseline.

    Args:
        cny_return: Array of CNY daily returns
        baseline_window: Length of the rolling baseline used for the
            mean and std of the 5-day momentum (60 or 120)

    Returns:
        Array of momentum z-scores, limited to [-4, 4]; leading entries are
        NaN until both rolling windows are full
    """
    momentum_5d = pd.Series(cny_return).rolling(5).mean()
    baseline = momentum_5d.rolling(baseline_window)
    zscore = (momentum_5d - baseline.mean()) / baseline.std()
    return zscore.clip(-4, 4).values


def generate_vol_regime_feature(cny_return, vol_window, baseline_window):
    """
    Z-score of short-window return volatility relative to its rolling baseline.

    Args:
        cny_return: Array of CNY daily returns
        vol_window: Window of the short rolling std (5, 10, or 20)
        baseline_window: Length of the rolling baseline used for the
            mean and std of the short volatility (60 or 120)

    Returns:
        Array of volatility regime z-scores, limited to [-4, 4]; leading
        entries are NaN until both rolling windows are full
    """
    short_vol = pd.Series(cny_return).rolling(vol_window).std()
    baseline = short_vol.rolling(baseline_window)
    zscore = (short_vol - baseline.mean()) / baseline.std()
    return zscore.clip(-4, 4).values


print("Feature generation functions defined.")

## 4. Optuna Objective Function

In [None]:
def _quantile_bins(values, bins=20):
    """Quantile-bin a NaN-free 1-D array; return integer bin codes, or None if binning fails."""
    if len(values) < bins:
        return None
    try:
        return pd.qcut(values, bins, labels=False, duplicates='drop')
    except (ValueError, TypeError, IndexError):
        # Data that cannot be binned is skipped by the caller; anything else
        # is left to propagate to the trial-level handler.
        return None


def objective(trial, cny_return_full, target_full, train_size, val_start, val_end):
    """
    Optuna objective: Maximize MI sum on validation set.

    For the sampled hyperparameters, the three features (HMM regime
    probability, momentum z-score, volatility regime z-score) are generated
    over the full series, sliced to [val_start, val_end), quantile-binned and
    scored against the binned target with mutual information. A feature
    contributes only if more than 50 validation rows are non-NaN in both the
    feature and the target.

    Args:
        trial: Optuna trial object
        cny_return_full: Full CNY return array
        target_full: Full gold return array (for MI computation)
        train_size: Number of training samples
        val_start: Validation start index
        val_end: Validation end index

    Returns:
        MI sum on validation set; 0.0 if feature generation or scoring raises
    """
    # Suggest hyperparameters
    n_components = trial.suggest_categorical('hmm_n_components', [2, 3])
    covariance_type = trial.suggest_categorical('hmm_covariance_type', ['full', 'diag'])
    vol_window = trial.suggest_categorical('vol_short_window', [5, 10, 20])
    vol_baseline = trial.suggest_categorical('vol_baseline_window', [60, 120])
    mom_baseline = trial.suggest_categorical('momentum_baseline_window', [60, 120])

    try:
        # Recompute HMM input vol with trial's vol_window
        cny_vol = pd.Series(cny_return_full).rolling(vol_window).std().values

        # Generate features
        regime, _ = generate_regime_feature(
            cny_return_full, cny_vol, n_components, covariance_type, train_size
        )
        momentum = generate_momentum_feature(cny_return_full, mom_baseline)
        vol_regime = generate_vol_regime_feature(cny_return_full, vol_window, vol_baseline)

        target_val = target_full[val_start:val_end]

        # Compute MI sum over the validation slice of each feature
        mi_sum = 0.0
        for feature in (regime, momentum, vol_regime):
            feat_val = feature[val_start:val_end]
            mask = ~np.isnan(feat_val) & ~np.isnan(target_val)
            if mask.sum() > 50:
                # Inputs are already NaN-free here, so no imputation is needed
                feat_disc = _quantile_bins(feat_val[mask])
                tgt_disc = _quantile_bins(target_val[mask])
                if feat_disc is not None and tgt_disc is not None:
                    mi_sum += mutual_info_score(feat_disc, tgt_disc)

        return mi_sum

    except Exception as e:
        # Trial boundary: a failing configuration scores 0.0 rather than
        # aborting the whole study, and the reason is reported.
        print(f"  Trial failed: {e}")
        return 0.0


print("Optuna objective function defined.")

## 5. Hyperparameter Optimization

In [None]:
# Raw arrays handed to every Optuna trial
cny_return_full = full_df['cny_return'].to_numpy()
gold_return_full = full_df['gold_return'].to_numpy()

# Positional 70/15/15 boundaries on the full frame
n_total = len(full_df)
train_size = int(n_total * 0.70)
val_start, val_end = train_size, int(n_total * 0.85)

print("\nStarting Optuna HPO...")
print(f"Total samples: {n_total}")
print(f"Train size: {train_size}")
print(f"Val range: [{val_start}, {val_end})")

# Seeded TPE search that maximizes the validation MI sum
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)

# At most 30 trials or 300 seconds, whichever is reached first
study.optimize(
    lambda trial: objective(
        trial, cny_return_full, gold_return_full, train_size, val_start, val_end
    ),
    n_trials=30,
    timeout=300,
    show_progress_bar=True,
)

# Collect the winning configuration and the number of finished trials
best_params = study.best_params
best_value = study.best_value
n_completed = sum(
    t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
)

print("\nOptuna HPO complete!")
print(f"Completed trials: {n_completed}")
print(f"Best MI sum: {best_value:.6f}")
print("Best parameters:")
for name, value in best_params.items():
    print(f"  {name}: {value}")

## 6. Final Model Training with Best Parameters

In [None]:
print("\nGenerating final features with best parameters...")

# Unpack the winning configuration into the names used below
n_components = best_params['hmm_n_components']
covariance_type = best_params['hmm_covariance_type']
vol_window = best_params['vol_short_window']
vol_baseline = best_params['vol_baseline_window']
mom_baseline = best_params['momentum_baseline_window']

# HMM volatility input uses the selected short window
cny_vol_full = pd.Series(cny_return_full).rolling(vol_window).std().values

# Feature 1: probability of the high-variance HMM state
print(f"Fitting HMM ({n_components} states, {covariance_type} covariance)...")
regime_prob, best_hmm = generate_regime_feature(
    cny_return_full, cny_vol_full, n_components, covariance_type, train_size
)

# Feature 2: momentum z-score
print(f"Computing momentum z-score (5d momentum, {mom_baseline}d baseline)...")
momentum_z = generate_momentum_feature(cny_return_full, mom_baseline)

# Feature 3: volatility regime z-score
print(f"Computing volatility regime z-score ({vol_window}d vol, {vol_baseline}d baseline)...")
vol_regime_z = generate_vol_regime_feature(cny_return_full, vol_window, vol_baseline)

# Assemble the three features on the date index of the full frame
output_df = pd.DataFrame(
    {
        'cny_regime_prob': regime_prob,
        'cny_momentum_z': momentum_z,
        'cny_vol_regime_z': vol_regime_z,
    },
    index=full_df.index,
)

print(f"\nOutput shape: {output_df.shape}")
print(f"Output columns: {list(output_df.columns)}")
print("\nOutput summary:")
print(output_df.describe())

# Per-column missing-value counts
nan_counts = output_df.isna().sum()
print("\nNaN counts:")
print(nan_counts)

# Per-column dispersion
stds = output_df.std()
print("\nStandard deviations:")
print(stds)

# Flag columns whose spread is negligible
for col, sd in stds.items():
    if sd < 1e-6:
        print(f"WARNING: {col} is effectively constant (std={sd:.2e})")

## 7. Compute Training Metrics

In [None]:
# Lag-1 autocorrelation per output column (NaN when 10 or fewer valid rows)
autocorrs = {}
for col in output_df.columns:
    series = output_df[col].dropna()
    autocorrs[col] = series.autocorr(lag=1) if len(series) > 10 else np.nan

print("\nAutocorrelations (lag 1):")
for col, ac in autocorrs.items():
    print(f"  {col}: {ac:.4f}")
    if ac > 0.99:
        print("    WARNING: Autocorrelation > 0.99 (likely leak)")

# Compute MI on validation set
def compute_mi(feature, target, bins=20):
    """Compute mutual information between feature and target.

    Both inputs are quantile-binned (up to `bins` bins, duplicate edges
    dropped) on the rows where neither is NaN, and the binned codes are
    scored with mutual_info_score.

    Args:
        feature: 1-D array-like of feature values (may contain NaN)
        target: 1-D array-like of target values, same length as feature
        bins: Number of quantile bins requested for each variable

    Returns:
        Mutual information as a float; 0.0 when fewer than `bins` rows are
        jointly valid or when the data cannot be binned/scored.
    """
    feature = np.asarray(feature, dtype=float)
    target = np.asarray(target, dtype=float)

    mask = ~np.isnan(feature) & ~np.isnan(target)
    if mask.sum() < bins:
        return 0.0

    try:
        feat_disc = pd.qcut(feature[mask], bins, labels=False, duplicates='drop')
        tgt_disc = pd.qcut(target[mask], bins, labels=False, duplicates='drop')
        return mutual_info_score(feat_disc, tgt_disc)
    except (ValueError, TypeError, IndexError):
        # Only data-related failures fall back to 0.0; programming errors and
        # interrupts propagate instead of being silently reported as "no MI".
        return 0.0

# Slice features and target to the validation window
val_features = output_df.iloc[val_start:val_end]
val_target = gold_return_full[val_start:val_end]

# MI of every output column against the gold return on that window
mi_scores = {
    col: compute_mi(val_features[col].values, val_target)
    for col in output_df.columns
}
mi_total = sum(mi_scores.values())

print("\nMutual Information (validation set):")
for col, score in mi_scores.items():
    print(f"  {col}: {score:.6f}")
print(f"  Total: {mi_total:.6f}")

# Bundle everything reported about the generated features
metrics = dict(
    autocorrelations=autocorrs,
    mutual_information=mi_scores,
    mi_total=mi_total,
    output_std=stds.to_dict(),
    output_nan_pct=(nan_counts / len(output_df)).to_dict(),
)

print("\nMetrics computed.")

## 8. Save Results

In [None]:
print("\nSaving results...")

# Feature table, indexed by date
output_df.to_csv('submodel_output.csv')
print("  Saved: submodel_output.csv")

# HMM parameters as plain JSON lists (None for each entry when no model was fitted)
model_params = {
    'n_components': n_components,
    'covariance_type': covariance_type,
}
if best_hmm:
    model_params['means'] = best_hmm.means_.tolist()
    model_params['covars'] = (
        best_hmm.covars_.tolist() if hasattr(best_hmm, 'covars_') else None
    )
    model_params['transmat'] = best_hmm.transmat_.tolist()
    model_params['startprob'] = best_hmm.startprob_.tolist()
else:
    model_params.update(means=None, covars=None, transmat=None, startprob=None)

with open('model_params.json', 'w') as f:
    json.dump(model_params, f, indent=2)
print("  Saved: model_params.json")

# Full training report
date_range = {
    'start': str(full_df.index[0]),
    'end': str(full_df.index[-1])
}
data_info = {
    'train_samples': len(train_df),
    'val_samples': len(val_df),
    'test_samples': len(test_df),
    'full_samples': len(full_df),
    'date_range': date_range
}
result = {
    'feature': 'cny_demand',
    'attempt': 1,
    'timestamp': datetime.now().isoformat(),
    'best_params': best_params,
    'metrics': metrics,
    'optuna_trials_completed': n_completed,
    'optuna_best_value': best_value,
    'output_shape': list(output_df.shape),
    'output_columns': list(output_df.columns),
    'data_info': data_info
}

with open('training_result.json', 'w') as f:
    json.dump(result, f, indent=2, default=str)
print("  Saved: training_result.json")

n_rows, n_cols = output_df.shape
print("\n=== Training Complete! ===")
print(f"Finished: {datetime.now().isoformat()}")
print("\nOutput files:")
print(f"  - submodel_output.csv ({n_rows} rows x {n_cols} columns)")
print("  - model_params.json (HMM parameters)")
print("  - training_result.json (full training report)")