# Gold Prediction SubModel Training - CNY Demand Proxy Attempt 2

**Self-contained**: Data fetch -> CNY-CNH spread change z-score -> Optuna window HPO -> Save results

**Architecture**: Deterministic spread change z-score (single output)
- Compute onshore-offshore CNY spread: CNY=X (onshore) - CNH=F (offshore futures)
- Daily spread change momentum, z-scored against rolling baseline
- Captures capital control tension and cross-border flow pressure

**Feature**: cny_demand | **Attempt**: 2

In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from scipy.stats import pearsonr
from sklearn.metrics import mutual_info_score
import optuna
from optuna.samplers import TPESampler
import json
import os
from datetime import datetime

# Seed NumPy's global RNG so any NumPy-based randomness in this run is repeatable.
# NOTE(review): the Optuna sampler receives its own seed where the study is created.
np.random.seed(42)
# Log the wall-clock start time (ISO-8601) so the run can be traced later.
print(f"Training started: {datetime.now().isoformat()}")

Training started: 2026-02-18T01:32:19.508940


  from .autonotebook import tqdm as notebook_tqdm


## Data Fetching and Preprocessing

In [2]:
def _fetch_close(ticker, label, start='2014-01-01', valid_range=None):
    """
    Download one ticker via yfinance and return its NaN-free Close series.

    Args:
        ticker: Yahoo Finance symbol passed to yf.download.
        label: human-readable name used in the out-of-range error message.
        start: first date requested from yf.download.
        valid_range: optional (low, high) sanity bounds for the Close values.

    Returns:
        pd.Series: Close prices with NaN rows dropped.

    Raises:
        RuntimeError: if the download is empty or has no usable Close values.
        AssertionError: if the Close values fall outside valid_range.
    """
    data = yf.download(ticker, start=start, progress=False)
    if data.empty:
        raise RuntimeError(f"Failed to fetch {ticker}")

    # The download may come back with flat or MultiIndex columns; handle both.
    if isinstance(data.columns, pd.MultiIndex):
        close = data['Close'][ticker].copy()
    else:
        close = data['Close'].copy()
    close = close.dropna()
    if close.empty:
        # Without this guard the index[0] lookup below fails with IndexError.
        raise RuntimeError(f"Failed to fetch {ticker}: no non-NaN Close values")
    print(f"{ticker}: {len(close)} rows ({close.index[0]} to {close.index[-1]})")

    if valid_range is not None:
        low, high = valid_range
        if not (close.min() >= low and close.max() <= high):
            # Raised explicitly (rather than via `assert`) so the check still
            # runs under `python -O`; the exception type is unchanged.
            raise AssertionError(
                f"{label} out of range: [{close.min():.2f}, {close.max():.2f}]"
            )
    return close


def fetch_and_preprocess():
    """
    Fetch CNY=X (onshore), CNH=F (offshore futures), and GC=F.
    Compute CNY-CNH spread and spread change.
    GC=F is for Optuna objective only, not feature input.

    Returns:
        tuple: (df, df_output, train_end, val_end) where
            df is the aligned frame over all fetched dates (columns: cny, cnh,
                spread, spread_change, gold_return_next),
            df_output is df restricted to dates on or after 2015-01-30,
            train_end / val_end are the positional row indices in df_output
                that end the 70% train and 85% train+val segments.

    Raises:
        RuntimeError: if a download fails or no rows survive alignment.
        AssertionError: if CNY=X or CNH=F closes fall outside [5.5, 8.0].
    """
    # Onshore and offshore rates share the same sanity bounds.
    print("Fetching CNY=X (onshore CNY/USD exchange rate)...")
    cny_close = _fetch_close('CNY=X', 'CNY/USD', valid_range=(5.5, 8.0))

    print("Fetching CNH=F (offshore CNH futures)...")
    cnh_close = _fetch_close('CNH=F', 'CNH futures', valid_range=(5.5, 8.0))

    # Gold is only used to build the next-day target, so no range check.
    print("Fetching GC=F (Gold Futures)...")
    gc_close = _fetch_close('GC=F', 'GC=F')

    # Align CNY and CNH on common dates (union of indexes, then drop rows
    # missing either side, which amounts to an inner join).
    spread_df = pd.DataFrame({
        'cny': cny_close,
        'cnh': cnh_close
    }).dropna()

    # Compute spread (onshore - offshore) and its day-over-day change
    spread_df['spread'] = spread_df['cny'] - spread_df['cnh']
    spread_df['spread_change'] = spread_df['spread'].diff()

    # Report the spread range; warn only when it breaches the wide sanity bounds.
    spread_min = spread_df['spread'].min()
    spread_max = spread_df['spread'].max()
    print(f"\nSpread range: [{spread_min:.4f}, {spread_max:.4f}]")
    if spread_min < -0.5 or spread_max > 0.5:
        print("WARNING: Spread outside sanity bounds [-0.50, +0.50] "
              "(typical range: [-0.25, +0.15])")

    # Next-day gold return: shift(-1) places tomorrow's return on today's row.
    gold_return = gc_close.pct_change()
    gold_return_next = gold_return.shift(-1)

    # Align all data; dropna keeps only dates present in every column.
    df = pd.DataFrame({
        'cny': spread_df['cny'],
        'cnh': spread_df['cnh'],
        'spread': spread_df['spread'],
        'spread_change': spread_df['spread_change'],
        'gold_return_next': gold_return_next
    }).dropna()

    # Trim to base_features start date
    df.index = pd.to_datetime(df.index)
    base_start = pd.Timestamp('2015-01-30')
    # Boolean indexing returns a new frame. No forward-fill is applied because
    # df has already had every NaN row dropped, so there is nothing to fill.
    df_output = df[df.index >= base_start]
    if df_output.empty:
        raise RuntimeError(f"No aligned rows on or after {base_start.date()}")

    print(f"\nAligned dataset: {len(df_output)} rows ({df_output.index[0]} to {df_output.index[-1]})")

    # Chronological data split 70/15/15 (positional indices into df_output)
    n = len(df_output)
    train_end = int(n * 0.70)
    val_end = int(n * 0.85)

    print(f"Split: train={train_end}, val={val_end - train_end}, test={n - val_end}")

    return df, df_output, train_end, val_end

# Run the data pipeline once at cell level. The results are bound as globals:
#   df_full     - aligned frame before the start-date trim
#   df_aligned  - the same frame restricted to the modelling period
#   train_end / val_end - positional split boundaries within df_aligned
df_full, df_aligned, train_end, val_end = fetch_and_preprocess()
print("\nData ready!")

Fetching CNY=X (onshore CNY/USD exchange rate)...


CNY=X: 3156 rows (2014-01-01 00:00:00 to 2026-02-17 00:00:00)
Fetching CNH=F (offshore CNH futures)...


CNH=F: 3035 rows (2014-01-02 00:00:00 to 2026-02-17 00:00:00)
Fetching GC=F (Gold Futures)...


GC=F: 3048 rows (2014-01-02 00:00:00 to 2026-02-17 00:00:00)

Spread range: [-0.2235, 0.1238]

Aligned dataset: 2765 rows (2015-01-30 00:00:00 to 2026-02-13 00:00:00)
Split: train=1935, val=415, test=415

Data ready!


## Feature Generation Function

In [3]:
def generate_spread_z(spread_change, momentum_window, baseline_window):
    """
    Turn daily spread changes into a clipped rolling z-score.

    The daily changes are summed over a trailing momentum_window, and that
    momentum series is standardised with the mean and standard deviation of
    its own trailing baseline_window. Rows where either rolling window is
    incomplete, or where the baseline standard deviation is zero, come out
    as NaN.

    Args:
        spread_change: pd.Series of daily CNY-CNH spread changes
        momentum_window: int, number of rows summed into the momentum value
        baseline_window: int, number of rows used for the z-score baseline

    Returns:
        pd.Series: z-score of the momentum series, limited to [-4, 4]
    """
    # Trailing sum of the daily changes
    momentum = spread_change.rolling(window=momentum_window).sum()

    # Baseline statistics come from one shared rolling view of the momentum
    baseline = momentum.rolling(window=baseline_window)
    center = baseline.mean()
    # A zero spread would divide by zero, so mark it as missing instead
    scale = baseline.std().replace(0, np.nan)

    # Standardise, then cap extreme values at +/-4
    return momentum.sub(center).div(scale).clip(lower=-4, upper=4)

# Smoke-test the feature function on the modelling-period data with fixed
# parameters (5-row momentum, 120-row baseline) before running the search.
test_z = generate_spread_z(df_aligned['spread_change'], 5, 120)
# Summary stats: count of usable rows plus mean/std of the z-score.
print(f"Test feature (5d/120d): non-NaN={test_z.notna().sum()}, mean={test_z.mean():.4f}, std={test_z.std():.4f}")
# Lag-1 autocorrelation shows how persistent the feature is from row to row.
print(f"Autocorrelation (lag-1): {test_z.autocorr(1):.4f}")
print("Feature generation function defined.")

Test feature (5d/120d): non-NaN=2642, mean=-0.0015, std=1.0176
Autocorrelation (lag-1): 0.3106
Feature generation function defined.


## Optuna Hyperparameter Optimization

In [4]:
def objective(trial):
    """
    Maximize abs(Pearson correlation) + 10 * MI on validation set.
    Objective compares spread_z with gold_return_next.

    Reads the module-level df_aligned, train_end and val_end.

    Args:
        trial: Optuna trial used to sample momentum_window (3-10) and
            baseline_window (30-120 in steps of 10) and to store diagnostics.

    Returns:
        float: abs(correlation) + 10 * MI, or 0.0 when the validation slice
        has fewer than 50 usable rows or the correlation is not finite.
    """
    def record(corr_value, mi_value, score_value):
        # Store diagnostics on every path so later reporting code can read
        # these keys from any finished trial.
        trial.set_user_attr('correlation', float(corr_value))
        trial.set_user_attr('abs_corr', float(abs(corr_value)))
        trial.set_user_attr('mi', float(mi_value))
        trial.set_user_attr('score', float(score_value))

    momentum_window = trial.suggest_int('momentum_window', 3, 10)
    baseline_window = trial.suggest_int('baseline_window', 30, 120, step=10)

    # Generate feature for full aligned dataset
    z = generate_spread_z(df_aligned['spread_change'], momentum_window, baseline_window)

    # Extract validation period
    z_val = z.iloc[train_end:val_end]
    target_val = df_aligned['gold_return_next'].iloc[train_end:val_end]

    # Keep only rows where both feature and target are present
    valid = z_val.notna() & target_val.notna()
    if valid.sum() < 50:
        record(float('nan'), 0.0, 0.0)
        return 0.0

    z_clean = z_val[valid].values
    target_clean = target_val[valid].values

    # Pearson correlation
    corr, _ = pearsonr(z_clean, target_clean)
    if not np.isfinite(corr):
        # Undefined correlation (e.g. a constant input) must not reach Optuna
        # as NaN; score it like the insufficient-data case instead.
        record(float('nan'), 0.0, 0.0)
        return 0.0
    abs_corr = abs(corr)

    # Mutual information on 20-quantile bins. This is deliberately
    # best-effort: any binning failure counts as zero information.
    try:
        z_disc = pd.qcut(z_clean, 20, labels=False, duplicates='drop')
        t_disc = pd.qcut(target_clean, 20, labels=False, duplicates='drop')
        mi = mutual_info_score(z_disc, t_disc)
    except Exception:
        mi = 0.0

    score = abs_corr + mi * 10

    # Log trial details
    record(corr, mi, score)

    return score


# Run Optuna: seeded TPE search, capped at 30 trials or 300 seconds.
print("Starting Optuna HPO (30 trials, 300s timeout)...")
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42)
)
study.optimize(objective, n_trials=30, timeout=300, show_progress_bar=True)

# Results
best_params = study.best_params
best_value = study.best_value
n_completed = sum(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials)

print("\nOptuna complete!")
print(f"Completed trials: {n_completed}")
print(f"Best score: {best_value:.6f}")
print("Best parameters:")
for k, v in best_params.items():
    print(f"  {k}: {v}")
print("\nBest trial details:")
bt = study.best_trial
# .get with a NaN default: a trial that returned early may carry no attrs.
print(f"  Correlation: {bt.user_attrs.get('correlation', float('nan')):.6f}")
print(f"  Abs correlation: {bt.user_attrs.get('abs_corr', float('nan')):.6f}")
print(f"  MI: {bt.user_attrs.get('mi', float('nan')):.6f}")

# Show top 5 trials. Only trials that produced a value are ranked; the check
# is `is not None` so a legitimate score of 0.0 is not mistaken for missing.
print("\nTop 5 trials:")
sorted_trials = sorted(
    (t for t in study.trials if t.value is not None),
    key=lambda t: t.value,
    reverse=True,
)
for t in sorted_trials[:5]:
    corr_attr = t.user_attrs.get('correlation', 'N/A')
    corr_str = f"{corr_attr:.6f}" if isinstance(corr_attr, (int, float)) else corr_attr
    print(f"  #{t.number}: score={t.value:.6f}, mom={t.params['momentum_window']}, base={t.params['baseline_window']}, corr={corr_str}")

[32m[I 2026-02-18 01:32:23,000][0m A new study created in memory with name: no-name-0161e6e8-e734-4dd8-b124-c81ec9f13395[0m


Starting Optuna HPO (30 trials, 300s timeout)...


  0%|          | 0/30 [00:00<?, ?it/s]

                                      



  0%|          | 0/30 [00:00<?, ?it/s]

Best trial: 0. Best value: 5.02325:   0%|          | 0/30 [00:00<?, ?it/s]

Best trial: 0. Best value: 5.02325:   3%|▎         | 1/30 [00:00<00:00, 102.32it/s, 0.01/300 seconds]

                                                                                                     



Best trial: 0. Best value: 5.02325:   3%|▎         | 1/30 [00:00<00:00, 53.46it/s, 0.01/300 seconds]

Best trial: 1. Best value: 5.35923:   3%|▎         | 1/30 [00:00<00:00, 51.66it/s, 0.01/300 seconds]

Best trial: 1. Best value: 5.35923:   7%|▋         | 2/30 [00:00<00:00, 101.63it/s, 0.02/300 seconds]

                                                                                                     



Best trial: 1. Best value: 5.35923:   7%|▋         | 2/30 [00:00<00:00, 74.52it/s, 0.02/300 seconds]

Best trial: 1. Best value: 5.35923:   7%|▋         | 2/30 [00:00<00:00, 72.51it/s, 0.02/300 seconds]

Best trial: 1. Best value: 5.35923:  10%|█         | 3/30 [00:00<00:00, 107.64it/s, 0.03/300 seconds]

                                                                                                     



Best trial: 1. Best value: 5.35923:  10%|█         | 3/30 [00:00<00:00, 82.31it/s, 0.03/300 seconds]

Best trial: 3. Best value: 5.39258:  10%|█         | 3/30 [00:00<00:00, 80.86it/s, 0.03/300 seconds]

Best trial: 3. Best value: 5.39258:  13%|█▎        | 4/30 [00:00<00:00, 107.03it/s, 0.04/300 seconds]

                                                                                                     



Best trial: 3. Best value: 5.39258:  13%|█▎        | 4/30 [00:00<00:00, 89.60it/s, 0.04/300 seconds]

Best trial: 3. Best value: 5.39258:  13%|█▎        | 4/30 [00:00<00:00, 88.32it/s, 0.04/300 seconds]

Best trial: 3. Best value: 5.39258:  17%|█▋        | 5/30 [00:00<00:00, 109.73it/s, 0.05/300 seconds]

                                                                                                     



Best trial: 3. Best value: 5.39258:  17%|█▋        | 5/30 [00:00<00:00, 92.44it/s, 0.05/300 seconds]

Best trial: 5. Best value: 5.65733:  17%|█▋        | 5/30 [00:00<00:00, 91.36it/s, 0.05/300 seconds]

Best trial: 5. Best value: 5.65733:  20%|██        | 6/30 [00:00<00:00, 109.01it/s, 0.05/300 seconds]

                                                                                                     



Best trial: 5. Best value: 5.65733:  20%|██        | 6/30 [00:00<00:00, 94.79it/s, 0.05/300 seconds]

Best trial: 5. Best value: 5.65733:  20%|██        | 6/30 [00:00<00:00, 93.82it/s, 0.05/300 seconds]

Best trial: 5. Best value: 5.65733:  23%|██▎       | 7/30 [00:00<00:00, 109.02it/s, 0.06/300 seconds]

                                                                                                     



Best trial: 5. Best value: 5.65733:  23%|██▎       | 7/30 [00:00<00:00, 97.07it/s, 0.06/300 seconds]

Best trial: 5. Best value: 5.65733:  23%|██▎       | 7/30 [00:00<00:00, 96.21it/s, 0.06/300 seconds]

Best trial: 5. Best value: 5.65733:  27%|██▋       | 8/30 [00:00<00:00, 109.54it/s, 0.07/300 seconds]

                                                                                                     



Best trial: 5. Best value: 5.65733:  27%|██▋       | 8/30 [00:00<00:00, 98.53it/s, 0.07/300 seconds]

Best trial: 5. Best value: 5.65733:  27%|██▋       | 8/30 [00:00<00:00, 97.91it/s, 0.07/300 seconds]

Best trial: 5. Best value: 5.65733:  30%|███       | 9/30 [00:00<00:00, 109.77it/s, 0.08/300 seconds]

                                                                                                     



Best trial: 5. Best value: 5.65733:  30%|███       | 9/30 [00:00<00:00, 100.24it/s, 0.08/300 seconds]

Best trial: 5. Best value: 5.65733:  30%|███       | 9/30 [00:00<00:00, 99.53it/s, 0.08/300 seconds] 

Best trial: 5. Best value: 5.65733:  33%|███▎      | 10/30 [00:00<00:00, 110.29it/s, 0.09/300 seconds]

                                                                                                      



Best trial: 5. Best value: 5.65733:  33%|███▎      | 10/30 [00:00<00:00, 98.88it/s, 0.09/300 seconds]

Best trial: 10. Best value: 5.85714:  33%|███▎      | 10/30 [00:00<00:00, 98.10it/s, 0.09/300 seconds]

Best trial: 10. Best value: 5.85714:  37%|███▋      | 11/30 [00:00<00:00, 107.59it/s, 0.09/300 seconds]

Best trial: 10. Best value: 5.85714:  37%|███▋      | 11/30 [00:00<00:00, 107.59it/s, 0.10/300 seconds]

                                                                                                       



Best trial: 10. Best value: 5.85714:  37%|███▋      | 11/30 [00:00<00:00, 107.59it/s, 0.10/300 seconds]

Best trial: 10. Best value: 5.85714:  37%|███▋      | 11/30 [00:00<00:00, 107.59it/s, 0.10/300 seconds]

Best trial: 10. Best value: 5.85714:  40%|████      | 12/30 [00:00<00:00, 107.59it/s, 0.11/300 seconds]

                                                                                                       



Best trial: 10. Best value: 5.85714:  40%|████      | 12/30 [00:00<00:00, 107.59it/s, 0.11/300 seconds]

Best trial: 10. Best value: 5.85714:  40%|████      | 12/30 [00:00<00:00, 107.59it/s, 0.11/300 seconds]

Best trial: 10. Best value: 5.85714:  43%|████▎     | 13/30 [00:00<00:00, 107.59it/s, 0.13/300 seconds]

                                                                                                       



Best trial: 10. Best value: 5.85714:  43%|████▎     | 13/30 [00:00<00:00, 107.59it/s, 0.13/300 seconds]

Best trial: 10. Best value: 5.85714:  43%|████▎     | 13/30 [00:00<00:00, 107.59it/s, 0.13/300 seconds]

Best trial: 10. Best value: 5.85714:  47%|████▋     | 14/30 [00:00<00:00, 107.59it/s, 0.14/300 seconds]

                                                                                                       



Best trial: 10. Best value: 5.85714:  47%|████▋     | 14/30 [00:00<00:00, 107.59it/s, 0.14/300 seconds]

Best trial: 10. Best value: 5.85714:  47%|████▋     | 14/30 [00:00<00:00, 107.59it/s, 0.14/300 seconds]

Best trial: 10. Best value: 5.85714:  50%|█████     | 15/30 [00:00<00:00, 107.59it/s, 0.15/300 seconds]

                                                                                                       



Best trial: 10. Best value: 5.85714:  50%|█████     | 15/30 [00:00<00:00, 107.59it/s, 0.15/300 seconds]

Best trial: 10. Best value: 5.85714:  50%|█████     | 15/30 [00:00<00:00, 107.59it/s, 0.15/300 seconds]

Best trial: 10. Best value: 5.85714:  53%|█████▎    | 16/30 [00:00<00:00, 107.59it/s, 0.16/300 seconds]

[32m[I 2026-02-18 01:32:23,009][0m Trial 0 finished with value: 5.023253588170481 and parameters: {'momentum_window': 5, 'baseline_window': 120}. Best is trial 0 with value: 5.023253588170481.[0m
[32m[I 2026-02-18 01:32:23,019][0m Trial 1 finished with value: 5.3592271771033095 and parameters: {'momentum_window': 8, 'baseline_window': 80}. Best is trial 1 with value: 5.3592271771033095.[0m
[32m[I 2026-02-18 01:32:23,028][0m Trial 2 finished with value: 4.615843041830676 and parameters: {'momentum_window': 4, 'baseline_window': 40}. Best is trial 1 with value: 5.3592271771033095.[0m
[32m[I 2026-02-18 01:32:23,037][0m Trial 3 finished with value: 5.392576724753878 and parameters: {'momentum_window': 3, 'baseline_window': 110}. Best is trial 3 with value: 5.392576724753878.[0m
[32m[I 2026-02-18 01:32:23,046][0m Trial 4 finished with value: 5.235573436392504 and parameters: {'momentum_window': 7, 'baseline_window': 100}. Best is trial 3 with value: 5.392576724753878.[0m
[32

                                                                                                       



Best trial: 10. Best value: 5.85714:  53%|█████▎    | 16/30 [00:00<00:00, 107.59it/s, 0.16/300 seconds]

Best trial: 10. Best value: 5.85714:  53%|█████▎    | 16/30 [00:00<00:00, 107.59it/s, 0.16/300 seconds]

Best trial: 10. Best value: 5.85714:  57%|█████▋    | 17/30 [00:00<00:00, 107.59it/s, 0.17/300 seconds]

                                                                                                       



Best trial: 10. Best value: 5.85714:  57%|█████▋    | 17/30 [00:00<00:00, 107.59it/s, 0.17/300 seconds]

Best trial: 10. Best value: 5.85714:  57%|█████▋    | 17/30 [00:00<00:00, 107.59it/s, 0.17/300 seconds]

Best trial: 10. Best value: 5.85714:  60%|██████    | 18/30 [00:00<00:00, 107.59it/s, 0.18/300 seconds]

[32m[I 2026-02-18 01:32:23,171][0m Trial 16 finished with value: 5.510551203366233 and parameters: {'momentum_window': 10, 'baseline_window': 70}. Best is trial 10 with value: 5.857140659853468.[0m
[32m[I 2026-02-18 01:32:23,183][0m Trial 17 finished with value: 5.42789210518233 and parameters: {'momentum_window': 9, 'baseline_window': 90}. Best is trial 10 with value: 5.857140659853468.[0m


                                                                                                       



Best trial: 10. Best value: 5.85714:  60%|██████    | 18/30 [00:00<00:00, 107.59it/s, 0.18/300 seconds]

Best trial: 10. Best value: 5.85714:  60%|██████    | 18/30 [00:00<00:00, 107.59it/s, 0.18/300 seconds]

Best trial: 10. Best value: 5.85714:  63%|██████▎   | 19/30 [00:00<00:00, 107.59it/s, 0.20/300 seconds]

[32m[I 2026-02-18 01:32:23,196][0m Trial 18 finished with value: 5.641047946095097 and parameters: {'momentum_window': 8, 'baseline_window': 110}. Best is trial 10 with value: 5.857140659853468.[0m


                                                                                                       



Best trial: 10. Best value: 5.85714:  63%|██████▎   | 19/30 [00:00<00:00, 107.59it/s, 0.20/300 seconds]

Best trial: 10. Best value: 5.85714:  63%|██████▎   | 19/30 [00:00<00:00, 107.59it/s, 0.20/300 seconds]

Best trial: 10. Best value: 5.85714:  67%|██████▋   | 20/30 [00:00<00:00, 107.59it/s, 0.21/300 seconds]

[32m[I 2026-02-18 01:32:23,207][0m Trial 19 finished with value: 4.917468228868728 and parameters: {'momentum_window': 7, 'baseline_window': 60}. Best is trial 10 with value: 5.857140659853468.[0m


                                                                                                       



Best trial: 10. Best value: 5.85714:  67%|██████▋   | 20/30 [00:00<00:00, 107.59it/s, 0.21/300 seconds]

Best trial: 10. Best value: 5.85714:  67%|██████▋   | 20/30 [00:00<00:00, 107.59it/s, 0.21/300 seconds]

Best trial: 10. Best value: 5.85714:  70%|███████   | 21/30 [00:00<00:00, 107.59it/s, 0.22/300 seconds]

                                                                                                       



Best trial: 10. Best value: 5.85714:  70%|███████   | 21/30 [00:00<00:00, 107.59it/s, 0.22/300 seconds]

Best trial: 10. Best value: 5.85714:  70%|███████   | 21/30 [00:00<00:00, 107.59it/s, 0.22/300 seconds]

Best trial: 10. Best value: 5.85714:  73%|███████▎  | 22/30 [00:00<00:00, 93.09it/s, 0.22/300 seconds] 

Best trial: 10. Best value: 5.85714:  73%|███████▎  | 22/30 [00:00<00:00, 93.09it/s, 0.23/300 seconds]

                                                                                                      



Best trial: 10. Best value: 5.85714:  73%|███████▎  | 22/30 [00:00<00:00, 93.09it/s, 0.23/300 seconds]

Best trial: 10. Best value: 5.85714:  73%|███████▎  | 22/30 [00:00<00:00, 93.09it/s, 0.23/300 seconds]

Best trial: 10. Best value: 5.85714:  77%|███████▋  | 23/30 [00:00<00:00, 93.09it/s, 0.24/300 seconds]

                                                                                                      



Best trial: 10. Best value: 5.85714:  77%|███████▋  | 23/30 [00:00<00:00, 93.09it/s, 0.24/300 seconds]

Best trial: 10. Best value: 5.85714:  77%|███████▋  | 23/30 [00:00<00:00, 93.09it/s, 0.24/300 seconds]

Best trial: 10. Best value: 5.85714:  80%|████████  | 24/30 [00:00<00:00, 93.09it/s, 0.26/300 seconds]

                                                                                                      



Best trial: 10. Best value: 5.85714:  80%|████████  | 24/30 [00:00<00:00, 93.09it/s, 0.26/300 seconds]

Best trial: 10. Best value: 5.85714:  80%|████████  | 24/30 [00:00<00:00, 93.09it/s, 0.26/300 seconds]

Best trial: 10. Best value: 5.85714:  83%|████████▎ | 25/30 [00:00<00:00, 93.09it/s, 0.27/300 seconds]

                                                                                                      



Best trial: 10. Best value: 5.85714:  83%|████████▎ | 25/30 [00:00<00:00, 93.09it/s, 0.27/300 seconds]

Best trial: 10. Best value: 5.85714:  83%|████████▎ | 25/30 [00:00<00:00, 93.09it/s, 0.27/300 seconds]

Best trial: 10. Best value: 5.85714:  87%|████████▋ | 26/30 [00:00<00:00, 93.09it/s, 0.28/300 seconds]

                                                                                                      



Best trial: 10. Best value: 5.85714:  87%|████████▋ | 26/30 [00:00<00:00, 93.09it/s, 0.28/300 seconds]

Best trial: 10. Best value: 5.85714:  87%|████████▋ | 26/30 [00:00<00:00, 93.09it/s, 0.28/300 seconds]

Best trial: 10. Best value: 5.85714:  90%|█████████ | 27/30 [00:00<00:00, 93.09it/s, 0.29/300 seconds]

                                                                                                      



Best trial: 10. Best value: 5.85714:  90%|█████████ | 27/30 [00:00<00:00, 93.09it/s, 0.29/300 seconds]

Best trial: 10. Best value: 5.85714:  90%|█████████ | 27/30 [00:00<00:00, 93.09it/s, 0.29/300 seconds]

Best trial: 10. Best value: 5.85714:  93%|█████████▎| 28/30 [00:00<00:00, 93.09it/s, 0.30/300 seconds]

                                                                                                      



Best trial: 10. Best value: 5.85714:  93%|█████████▎| 28/30 [00:00<00:00, 93.09it/s, 0.30/300 seconds]

Best trial: 10. Best value: 5.85714:  93%|█████████▎| 28/30 [00:00<00:00, 93.09it/s, 0.30/300 seconds]

Best trial: 10. Best value: 5.85714:  97%|█████████▋| 29/30 [00:00<00:00, 93.09it/s, 0.31/300 seconds]

                                                                                                      



Best trial: 10. Best value: 5.85714:  97%|█████████▋| 29/30 [00:00<00:00, 93.09it/s, 0.31/300 seconds]

Best trial: 10. Best value: 5.85714:  97%|█████████▋| 29/30 [00:00<00:00, 93.09it/s, 0.31/300 seconds]

Best trial: 10. Best value: 5.85714: 100%|██████████| 30/30 [00:00<00:00, 93.09it/s, 0.33/300 seconds]

Best trial: 10. Best value: 5.85714: 100%|██████████| 30/30 [00:00<00:00, 92.01it/s, 0.33/300 seconds]

[32m[I 2026-02-18 01:32:23,219][0m Trial 20 finished with value: 5.42789210518233 and parameters: {'momentum_window': 9, 'baseline_window': 90}. Best is trial 10 with value: 5.857140659853468.[0m
[32m[I 2026-02-18 01:32:23,232][0m Trial 21 finished with value: 5.857140659853468 and parameters: {'momentum_window': 10, 'baseline_window': 100}. Best is trial 10 with value: 5.857140659853468.[0m
[32m[I 2026-02-18 01:32:23,244][0m Trial 22 finished with value: 5.317062069141176 and parameters: {'momentum_window': 10, 'baseline_window': 110}. Best is trial 10 with value: 5.857140659853468.[0m
[32m[I 2026-02-18 01:32:23,256][0m Trial 23 finished with value: 5.857140659853468 and parameters: {'momentum_window': 10, 'baseline_window': 100}. Best is trial 10 with value: 5.857140659853468.[0m
[32m[I 2026-02-18 01:32:23,268][0m Trial 24 finished with value: 5.42789210518233 and parameters: {'momentum_window': 9, 'baseline_window': 90}. Best is trial 10 with value: 5.857140659853468.




## Generate Final Output

In [5]:
print("Generating final output with best parameters...")
# Window sizes chosen by the Optuna search above.
momentum_window = best_params['momentum_window']
baseline_window = best_params['baseline_window']

# Generate the feature on df_full rather than df_aligned so the rolling
# windows can warm up on rows that precede the modelling period.
# NOTE(review): df_full's first row depends on what survived alignment and
# dropna in fetch_and_preprocess, not only on the requested download start.
spread_change_full = df_full['spread_change']
z_full = generate_spread_z(spread_change_full, momentum_window, baseline_window)

# Keep only the modelling-period dates (df_aligned's index)
z_aligned = z_full.reindex(df_aligned.index)

# Create the single-column output DataFrame, indexed by date
output_df = pd.DataFrame({
    'cny_demand_spread_z': z_aligned.values
}, index=df_aligned.index)
# NOTE(review): output_df is built on df_aligned's index object, so renaming
# the index here may also rename df_aligned's index - confirm that is harmless.
output_df.index.name = 'Date'

print(f"\nOutput shape: {output_df.shape}")
print(f"Output column: {list(output_df.columns)}")
print(f"\nOutput summary:")
print(output_df.describe())

# Quality checks: missing values, dispersion and lag-1 persistence
nan_count = output_df['cny_demand_spread_z'].isna().sum()
nan_pct = nan_count / len(output_df) * 100
std_val = output_df['cny_demand_spread_z'].std()
autocorr = output_df['cny_demand_spread_z'].dropna().autocorr(1)

print(f"\nQuality checks:")
print(f"  NaN count: {nan_count} ({nan_pct:.2f}%)")
print(f"  Std: {std_val:.4f}")
print(f"  Autocorrelation (lag-1): {autocorr:.4f}")
print(f"  Min: {output_df['cny_demand_spread_z'].min():.4f}")
print(f"  Max: {output_df['cny_demand_spread_z'].max():.4f}")

# A near-zero std means the feature carries no information.
if std_val < 1e-6:
    print("WARNING: Feature is essentially constant!")
# Near-perfect persistence is flagged as suspicious; a NaN autocorr compares
# False here and therefore prints no warning.
if autocorr > 0.99:
    print("WARNING: Autocorrelation > 0.99 (potential leak)")

Generating final output with best parameters...

Output shape: (2765, 1)
Output column: ['cny_demand_spread_z']

Output summary:
       cny_demand_spread_z
count          2765.000000
mean             -0.001576
std               1.018041
min              -4.000000
25%              -0.568307
50%              -0.028607
75%               0.554254
max               4.000000

Quality checks:
  NaN count: 0 (0.00%)
  Std: 1.0180
  Autocorrelation (lag-1): 0.3941
  Min: -4.0000
  Max: 4.0000


## Compute Metrics

In [6]:
# Compute correlation and MI on each split.
# Splits are positional slices of df_aligned / output_df, in time order.
splits = {
    'train': (0, train_end),
    'val': (train_end, val_end),
    'test': (val_end, len(df_aligned))
}

metrics = {}
for split_name, (start, end) in splits.items():
    z_split = output_df['cny_demand_spread_z'].iloc[start:end]
    target_split = df_aligned['gold_return_next'].iloc[start:end]

    # Rows usable for scoring need both a feature value and a target.
    valid = z_split.notna() & target_split.notna()
    # Same minimum as the Optuna objective, which rejects fewer than 50 rows.
    if valid.sum() >= 50:
        z_clean = z_split[valid].values
        t_clean = target_split[valid].values

        corr, p_value = pearsonr(z_clean, t_clean)

        # Mutual information on 20-quantile bins; best-effort, so a binning
        # failure counts as zero. `except Exception` (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        try:
            z_disc = pd.qcut(z_clean, 20, labels=False, duplicates='drop')
            t_disc = pd.qcut(t_clean, 20, labels=False, duplicates='drop')
            mi = mutual_info_score(z_disc, t_disc)
        except Exception:
            mi = 0.0

        # Lag-1 autocorrelation is only computed with more than 10 values.
        autocorr_split = z_split.dropna().autocorr(1) if z_split.notna().sum() > 10 else None

        metrics[split_name] = {
            'correlation': float(corr),
            'p_value': float(p_value),
            'mi': float(mi),
            'n_samples': int(valid.sum()),
            'autocorrelation': float(autocorr_split) if autocorr_split is not None else None
        }

        print(f"\n{split_name.upper()}:")
        print(f"  Samples: {valid.sum()}")
        print(f"  Correlation: {corr:.6f} (p={p_value:.4f})")
        print(f"  MI: {mi:.6f}")
        if autocorr_split is not None:
            print(f"  Autocorrelation: {autocorr_split:.4f}")
    else:
        metrics[split_name] = {'error': 'insufficient data'}
        print(f"\n{split_name.upper()}: insufficient data")

print("\nMetrics computed.")


TRAIN:
  Samples: 1935
  Correlation: 0.036890 (p=0.1048)
  MI: 0.113854
  Autocorrelation: 0.3798

VAL:
  Samples: 415
  Correlation: 0.061900 (p=0.2082)
  MI: 0.579524
  Autocorrelation: 0.3342

TEST:
  Samples: 415
  Correlation: -0.091031 (p=0.0639)
  MI: 0.555532
  Autocorrelation: 0.5211

Metrics computed.


## Save Results

In [7]:
print("\nSaving results...")

# 1. Save submodel output
output_df.to_csv('submodel_output.csv')
print("  Saved: submodel_output.csv")

# Build the run summary from values measured in this run, so the saved
# record cannot drift away from the metrics stored next to it.
p_value_summary = ', '.join(
    f"{split_name}={split_metrics['p_value']:.3f}"
    for split_name, split_metrics in metrics.items()
    if 'p_value' in split_metrics
) or 'n/a'

# 2. Save training result
result = {
    'feature': 'cny_demand',
    'attempt': 2,
    'timestamp': datetime.now().isoformat(),
    'architecture': 'Deterministic CNY-CNH spread change z-score (single output)',
    'key_changes': [
        'NEW: CNY-CNH spread change approach (was CNY momentum only in attempt 1)',
        'Captures onshore-offshore capital control tension',
        'Single output column: cny_demand_spread_z',
        f'Correlation p-values by split (measured this run): {p_value_summary}',
        f'Lag-1 autocorrelation {autocorr:.2f} (attempt 1 reported 0.74)'
    ],
    'best_params': best_params,
    'optuna_trials_completed': n_completed,
    'optuna_best_value': float(best_value),
    'metrics': metrics,
    'output_shape': list(output_df.shape),
    'output_columns': list(output_df.columns),
    'quality_checks': {
        'nan_count': int(nan_count),
        'nan_pct': float(nan_pct),
        'std': float(std_val),
        'autocorrelation_lag1': float(autocorr),
        'min': float(output_df['cny_demand_spread_z'].min()),
        'max': float(output_df['cny_demand_spread_z'].max())
    },
    'data_info': {
        'train_samples': train_end,
        'val_samples': val_end - train_end,
        'test_samples': len(df_aligned) - val_end,
        'full_samples': len(df_aligned),
        'date_range': {
            'start': str(df_aligned.index[0]),
            'end': str(df_aligned.index[-1])
        }
    }
}

# default=str stringifies anything json cannot encode natively.
with open('training_result.json', 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=2, default=str)
print("  Saved: training_result.json")

print("\n=== Training Complete! ===")
print(f"Finished: {datetime.now().isoformat()}")
print(f"\nOutput: submodel_output.csv ({output_df.shape[0]} rows x {output_df.shape[1]} column)")
print(f"Best params: momentum_window={best_params['momentum_window']}, baseline_window={best_params['baseline_window']}")


Saving results...
  Saved: submodel_output.csv
  Saved: training_result.json

=== Training Complete! ===
Finished: 2026-02-18T01:32:23.383111

Output: submodel_output.csv (2765 rows x 1 column)
Best params: momentum_window=10, baseline_window=100
