# Gold Prediction SubModel Training - CNY Demand Proxy Attempt 2

**Self-contained**: Data fetch -> Deterministic momentum z-score -> Optuna window HPO -> Save results

**Key changes from Attempt 1:**
1. Single output column (was 3)
2. No HMM, no hmmlearn (pure deterministic)
3. Optimized momentum z-score with finer window grid

**Feature**: cny_demand | **Attempt**: 2

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from scipy.stats import pearsonr
from sklearn.metrics import mutual_info_score
import optuna
from optuna.samplers import TPESampler
import json
import os
from datetime import datetime

# Seed NumPy's global RNG so that any NumPy-based randomness in this run is
# repeatable. The Optuna sampler is seeded separately (TPESampler(seed=42)).
np.random.seed(42)
# Log the wall-clock start time of the run in ISO-8601 format.
print(f"Training started: {datetime.now().isoformat()}")

## Data Fetching and Preprocessing

In [None]:
def _extract_close(data, ticker):
    """Return the NaN-free Close series for `ticker` from a yfinance frame.

    Handles both column layouts the download can produce: a MultiIndex
    (field, ticker) or a flat index of field names.

    Raises:
        RuntimeError: if no non-NaN Close price remains.
    """
    if isinstance(data.columns, pd.MultiIndex):
        close = data['Close'][ticker].copy()
    else:
        close = data['Close'].copy()
    close = close.dropna()
    if close.empty:
        raise RuntimeError(f"Failed to fetch {ticker}: no non-NaN Close prices")
    return close


def fetch_and_preprocess():
    """
    Fetch CNY=X and GC=F, compute returns, align dates and define the split.

    GC=F is for the Optuna objective only, not a feature input.

    Returns:
        tuple: (df, df_output, train_end, val_end)
            df: aligned frame with columns cny_close, cny_return and
                gold_return_next, starting from the download start date
                (rows before 2015-01-30 serve as rolling-window warmup).
            df_output: df restricted to dates >= 2015-01-30.
            train_end: positional end (exclusive) of the train split in
                df_output (70% of rows).
            val_end: positional end (exclusive) of the validation split in
                df_output (85% of rows).

    Raises:
        RuntimeError: if a download is empty, has no usable Close prices, or
            no aligned rows remain on/after 2015-01-30.
        ValueError: if the CNY=X close falls outside [5.5, 8.0].
    """
    # Fetch CNY=X
    print("Fetching CNY=X (CNY/USD exchange rate)...")
    cny_data = yf.download('CNY=X', start='2014-06-01', progress=False)
    if cny_data.empty:
        raise RuntimeError("Failed to fetch CNY=X")
    cny_close = _extract_close(cny_data, 'CNY=X')
    print(f"CNY=X: {len(cny_close)} rows ({cny_close.index[0]} to {cny_close.index[-1]})")

    # Validate range with an explicit raise: an `assert` would be stripped
    # when Python runs with -O, silently disabling the sanity check.
    cny_min, cny_max = cny_close.min(), cny_close.max()
    if not (cny_min >= 5.5 and cny_max <= 8.0):
        raise ValueError(f"CNY/USD out of range: [{cny_min:.2f}, {cny_max:.2f}]")

    # Fetch GC=F (for Optuna objective only)
    print("Fetching GC=F (Gold Futures)...")
    gc_data = yf.download('GC=F', start='2014-06-01', progress=False)
    if gc_data.empty:
        raise RuntimeError("Failed to fetch GC=F")
    gc_close = _extract_close(gc_data, 'GC=F')
    print(f"GC=F: {len(gc_close)} rows ({gc_close.index[0]} to {gc_close.index[-1]})")

    # Compute returns
    cny_return = cny_close.pct_change()
    gold_return = gc_close.pct_change()
    # Next-day gold return: shift(-1) places tomorrow's return on today's row.
    gold_return_next = gold_return.shift(-1)

    # Align dates: the constructor unions the indexes, dropna() then keeps
    # only dates where all three columns are present (inner join).
    df = pd.DataFrame({
        'cny_close': cny_close,
        'cny_return': cny_return,
        'gold_return_next': gold_return_next
    }).dropna()

    # Trim to base_features start date
    df.index = pd.to_datetime(df.index)
    base_start = pd.Timestamp('2015-01-30')
    df_output = df[df.index >= base_start]
    if df_output.empty:
        # Fail with a clear message instead of an IndexError on index[0] below.
        raise RuntimeError(f"No aligned rows on or after {base_start.date()}")

    print(f"\nAligned dataset: {len(df_output)} rows ({df_output.index[0]} to {df_output.index[-1]})")

    # Data split 70/15/15 (chronological, by row position)
    n = len(df_output)
    train_end = int(n * 0.70)
    val_end = int(n * 0.85)

    print(f"Split: train={train_end}, val={val_end - train_end}, test={n - val_end}")

    return df, df_output, train_end, val_end

# Run the fetch once and keep the results as notebook-level state:
#   df_full    - aligned frame from the download start date (includes warmup rows)
#   df_aligned - df_full restricted to dates >= 2015-01-30
#   train_end / val_end - positional split boundaries into df_aligned (70% / 85%)
df_full, df_aligned, train_end, val_end = fetch_and_preprocess()
print("\nData ready!")

## Feature Generation Function

In [None]:
def generate_momentum_z(cny_return, momentum_window, baseline_window):
    """
    Build the z-scored momentum feature from daily returns.

    Momentum is the rolling sum of the last `momentum_window` returns. It is
    standardised with the rolling mean and standard deviation of that
    momentum over `baseline_window` observations.

    Args:
        cny_return: pd.Series of daily CNY returns
        momentum_window: int, number of days summed into the momentum (3-10)
        baseline_window: int, number of days in the z-score baseline (30-120)

    Returns:
        pd.Series: z-scored momentum clipped to [-4, 4]; NaN where either
        rolling window is incomplete or the baseline std is zero.
    """
    cumulative = cny_return.rolling(momentum_window).sum()
    baseline = cumulative.rolling(baseline_window)
    # A zero spread would divide by zero; turn it into NaN instead.
    spread = baseline.std().replace(0, np.nan)
    standardized = (cumulative - baseline.mean()) / spread
    return standardized.clip(-4, 4)

# Smoke-test the feature on the aligned data with a 5-day momentum window
# and a 60-day baseline, then report basic statistics.
test_z = generate_momentum_z(
    df_aligned['cny_return'], momentum_window=5, baseline_window=60
)
print(
    f"Test feature (5d/60d): non-NaN={test_z.notna().sum()}, "
    f"mean={test_z.mean():.4f}, std={test_z.std():.4f}"
)
print(f"Autocorrelation (lag-1): {test_z.autocorr(1):.4f}")
print("Feature generation function defined.")

## Optuna Hyperparameter Optimization

In [None]:
def objective(trial):
    """
    Maximize abs(Pearson correlation) + 10 * MI on the validation set.

    Reads the notebook-level `df_aligned`, `train_end` and `val_end`.

    Args:
        trial: optuna trial used to sample `momentum_window` (3-10) and
            `baseline_window` (30-120 in steps of 10).

    Returns:
        float: the score, or 0.0 when fewer than 50 validation rows have
        both a feature value and a target. The score is never NaN: an
        undefined correlation contributes 0.
    """
    momentum_window = trial.suggest_int('momentum_window', 3, 10)
    baseline_window = trial.suggest_int('baseline_window', 30, 120, step=10)

    # Generate feature for full aligned dataset
    z = generate_momentum_z(df_aligned['cny_return'], momentum_window, baseline_window)

    # Extract validation period
    z_val = z.iloc[train_end:val_end]
    target_val = df_aligned['gold_return_next'].iloc[train_end:val_end]

    # Drop NaN
    valid = z_val.notna() & target_val.notna()
    if valid.sum() < 50:
        return 0.0

    z_clean = z_val[valid].values
    target_clean = target_val[valid].values

    # Pearson correlation. It is NaN when either input is constant; count
    # that as "no linear signal" so the trial score stays finite.
    corr, _ = pearsonr(z_clean, target_clean)
    abs_corr = 0.0 if np.isnan(corr) else abs(corr)

    # Mutual information on 20-quantile bins (fewer if bin edges collide)
    try:
        z_disc = pd.qcut(z_clean, 20, labels=False, duplicates='drop')
        t_disc = pd.qcut(target_clean, 20, labels=False, duplicates='drop')
        mi = mutual_info_score(z_disc, t_disc)
    except Exception as exc:
        # Best effort: fall back to MI = 0 but keep the reason on the trial
        # so a failing discretisation is not silently hidden.
        trial.set_user_attr('mi_error', repr(exc))
        mi = 0.0

    score = abs_corr + mi * 10

    # Log trial details
    trial.set_user_attr('correlation', float(corr))
    trial.set_user_attr('abs_corr', float(abs_corr))
    trial.set_user_attr('mi', float(mi))
    trial.set_user_attr('score', float(score))

    return score


# Run Optuna: seeded TPE search over the window sizes, capped at 30 trials
# or 300 seconds, whichever comes first.
print("Starting Optuna HPO (30 trials, 300s timeout)...")
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42)
)
study.optimize(objective, n_trials=30, timeout=300, show_progress_bar=True)

# Results
best_params = study.best_params
best_value = study.best_value
n_completed = sum(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials)

print("\nOptuna complete!")
print(f"Completed trials: {n_completed}")
print(f"Best score: {best_value:.6f}")
print("Best parameters:")
for k, v in best_params.items():
    print(f"  {k}: {v}")
print("\nBest trial details:")
bt = study.best_trial
# A trial that returned early from the objective (too few validation rows)
# carries no user attributes; fall back to NaN instead of raising KeyError.
_missing = float('nan')
print(f"  Correlation: {bt.user_attrs.get('correlation', _missing):.6f}")
print(f"  Abs correlation: {bt.user_attrs.get('abs_corr', _missing):.6f}")
print(f"  MI: {bt.user_attrs.get('mi', _missing):.6f}")

# Show top 5 trials. Trials without a value (failed or pruned) sort last;
# `is not None` keeps a legitimate 0.0 score distinct from "no value".
print("\nTop 5 trials:")
sorted_trials = sorted(
    study.trials,
    key=lambda t: t.value if t.value is not None else float('-inf'),
    reverse=True,
)
for t in sorted_trials[:5]:
    if t.value is None:
        continue
    print(f"  #{t.number}: score={t.value:.6f}, mom={t.params['momentum_window']}, base={t.params['baseline_window']}, corr={t.user_attrs.get('correlation', 'N/A')}")

## Generate Final Output

In [None]:
print("Generating final output with best parameters...")
momentum_window, baseline_window = (
    best_params[name] for name in ('momentum_window', 'baseline_window')
)

# Compute the feature on df_full (starts 2014-06-01) so the rolling windows
# are already warmed up by the first aligned date.
cny_return_full = df_full['cny_return']
z_full = generate_momentum_z(cny_return_full, momentum_window, baseline_window)

# Keep only the aligned dates (2015-01-30+)
z_aligned = z_full.reindex(df_aligned.index)

# Single-column output frame indexed by date
output_df = pd.DataFrame(
    data={'cny_demand_momentum_z': z_aligned.values},
    index=df_aligned.index,
)
output_df.index.name = 'Date'

print(f"\nOutput shape: {output_df.shape}")
print(f"Output column: {list(output_df.columns)}")
print("\nOutput summary:")
print(output_df.describe())

# Quality checks on the output column
feature_col = output_df['cny_demand_momentum_z']
nan_count = feature_col.isna().sum()
nan_pct = nan_count / len(output_df) * 100
std_val = feature_col.std()
autocorr = feature_col.dropna().autocorr(1)

print("\nQuality checks:")
print(f"  NaN count: {nan_count} ({nan_pct:.2f}%)")
print(f"  Std: {std_val:.4f}")
print(f"  Autocorrelation (lag-1): {autocorr:.4f}")
print(f"  Min: {feature_col.min():.4f}")
print(f"  Max: {feature_col.max():.4f}")

# Flag a degenerate (constant) feature
if std_val < 1e-6:
    print("WARNING: Feature is essentially constant!")
# Flag a near-unit-root feature
if autocorr > 0.99:
    print("WARNING: Autocorrelation > 0.99 (potential leak)")

## Compute Metrics

In [None]:
# Compute correlation and MI on each chronological split of the output.
# The bounds are row positions into df_aligned / output_df.
splits = {
    'train': (0, train_end),
    'val': (train_end, val_end),
    'test': (val_end, len(df_aligned))
}

metrics = {}
for split_name, (start, end) in splits.items():
    z_split = output_df['cny_demand_momentum_z'].iloc[start:end]
    target_split = df_aligned['gold_return_next'].iloc[start:end]

    valid = z_split.notna() & target_split.notna()
    n_valid = int(valid.sum())
    # Same minimum as the Optuna objective, which rejects only < 50 rows.
    if n_valid < 50:
        metrics[split_name] = {'error': 'insufficient data'}
        print(f"\n{split_name.upper()}: insufficient data")
        continue

    z_clean = z_split[valid].values
    t_clean = target_split[valid].values

    corr, p_value = pearsonr(z_clean, t_clean)

    # Mutual information on 20-quantile bins (fewer if bin edges collide).
    try:
        z_disc = pd.qcut(z_clean, 20, labels=False, duplicates='drop')
        t_disc = pd.qcut(t_clean, 20, labels=False, duplicates='drop')
        mi = mutual_info_score(z_disc, t_disc)
    except Exception:
        # Best effort: report MI = 0 when discretisation fails. A bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit.
        mi = 0.0

    metrics[split_name] = {
        'correlation': float(corr),
        'p_value': float(p_value),
        'mi': float(mi),
        'n_samples': n_valid,
        # At least 50 non-NaN feature values are guaranteed at this point,
        # so lag-1 autocorrelation is always computed.
        'autocorrelation': float(z_split.dropna().autocorr(1))
    }

    print(f"\n{split_name.upper()}:")
    print(f"  Samples: {n_valid}")
    print(f"  Correlation: {corr:.6f} (p={p_value:.4f})")
    print(f"  MI: {mi:.6f}")
    print(f"  Autocorrelation: {metrics[split_name]['autocorrelation']:.4f}")

print("\nMetrics computed.")

## Save Results

In [None]:
print("\nSaving results...")

# 1. Submodel output: the single feature column, indexed by date
output_df.to_csv('submodel_output.csv')
print("  Saved: submodel_output.csv")

# 2. Training result: run metadata, tuned parameters, metrics and checks
feature_col = output_df['cny_demand_momentum_z']
n_rows, n_cols = output_df.shape

quality_checks = {
    'nan_count': int(nan_count),
    'nan_pct': float(nan_pct),
    'std': float(std_val),
    'autocorrelation_lag1': float(autocorr),
    'min': float(feature_col.min()),
    'max': float(feature_col.max())
}

data_info = {
    'train_samples': train_end,
    'val_samples': val_end - train_end,
    'test_samples': len(df_aligned) - val_end,
    'full_samples': len(df_aligned),
    'date_range': {
        'start': str(df_aligned.index[0]),
        'end': str(df_aligned.index[-1])
    }
}

result = {
    'feature': 'cny_demand',
    'attempt': 2,
    'timestamp': datetime.now().isoformat(),
    'architecture': 'Deterministic momentum z-score (single output)',
    'key_changes': [
        'Single output column (was 3 in attempt 1)',
        'No HMM, no hmmlearn (pure deterministic)',
        'Optimized momentum and baseline windows via Optuna'
    ],
    'best_params': best_params,
    'optuna_trials_completed': n_completed,
    'optuna_best_value': float(best_value),
    'metrics': metrics,
    'output_shape': [n_rows, n_cols],
    'output_columns': list(output_df.columns),
    'quality_checks': quality_checks,
    'data_info': data_info
}

# default=str stringifies anything json cannot encode natively
with open('training_result.json', 'w') as f:
    json.dump(result, f, indent=2, default=str)
print("  Saved: training_result.json")

print("\n=== Training Complete! ===")
print(f"Finished: {datetime.now().isoformat()}")
print(f"\nOutput: submodel_output.csv ({n_rows} rows x {n_cols} column)")
print(f"Best params: momentum_window={best_params['momentum_window']}, baseline_window={best_params['baseline_window']}")