In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import os
import torch
import joblib
import numpy as np

from TransformerModel import TimeSeriesTransformer
from LSTMModel import TimeSeriesLSTM
from torch.utils.data import TensorDataset
from Util import create_sequences, sample_dataset, plot_metric, plot_multiple_curves, grid_search, safe_inverse_transform
from torch.utils.data import random_split


from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [None]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Prepare the data. Per the original note, k-fold CV is applied to the training
# split later, when the models are trained.

# Training split. os.path.join replaces the old '..\'-style literal, whose "\S"
# is an invalid escape sequence and which only resolved on Windows.
df_train = pd.read_csv(os.path.join('..', 'TockFormerData', 'SPY_1hour_train.csv'))

# create_sequences returns the feature/label tensors together with the scaler and
# the target column indices; both are reused for the test and validation splits.
X, y, scaler, target_indices = create_sequences(
    df_train,
    seq_length=32,
    target_cols=['open', 'high', 'low', 'close'],
    scale=True,
)
train_dataset = TensorDataset(X, y)

# The result must stay bound to `sample_dataset` because later cells read that
# name, but doing so shadows the Util function of the same name. Importing the
# function under a private alias keeps this cell re-runnable.
# NOTE(review): 0.1 is presumably the fraction of samples to keep — confirm in Util.
from Util import sample_dataset as _sample_dataset_fn
sample_dataset = _sample_dataset_fn(train_dataset, 0.1)

In [None]:
# Test split. The path is built with os.path.join because the previous literal
# contained a backslash ("\S" is an invalid escape and not portable off Windows).
df_test = pd.read_csv(os.path.join('..', 'TockFormerData', 'SPY_1hour_test.csv'))

# Reuse the scaler obtained from the training split instead of fitting a new one.
X_t, y_t, _, _ = create_sequences(
    df_test,
    seq_length=32,
    target_cols=['open', 'high', 'low', 'close'],
    scaler=scaler,
)
test_dataset = TensorDataset(X_t, y_t)

In [None]:
# Validation split. The path is built with os.path.join because the previous literal
# contained a backslash ("\S" is an invalid escape and not portable off Windows).
df_val = pd.read_csv(os.path.join('..', 'TockFormerData', 'SPY_1hour_validate.csv'))

# Reuse the scaler obtained from the training split instead of fitting a new one.
X_val, y_val, _, _ = create_sequences(
    df_val,
    seq_length=32,
    target_cols=['open', 'high', 'low', 'close'],
    scaler=scaler,
)
val_dataset = TensorDataset(X_val, y_val)

In [None]:
# Report how many samples each dataset holds (one line per dataset).
print(
    f"Train dataset size: {len(train_dataset)}",
    f"Sample dataset size: {len(sample_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    f"Val dataset size: {len(val_dataset)}",
    sep="\n",
)

In [None]:
# Grid search over the time-series Transformer architecture to pick hyperparameters.

# Constructor arguments that stay fixed for every candidate.
init_args = dict(input_dim=49, output_dim=4, seq_length=32, dropout=0.1)

# Architecture hyperparameters to explore.
param_grid = dict(
    model_dim=[64, 128],
    num_heads=[2, 4],
    num_layers=[2, 3],
)

# The search runs on the sampled subset with cv=3; the scaler and target indices
# are forwarded to grid_search.
best_params, best_score = grid_search(
    TimeSeriesTransformer,
    init_args,
    sample_dataset,
    param_grid,
    cv=3,
    scaler=scaler,
    target_indices=target_indices,
)

print('Best architecture:', best_params)
print('Best avg CV MSE:', best_score)

In [None]:
# Parameters used to build the final Transformer.
# NOTE(review): the grid-search result presumably holds only the architecture keys,
# so the fixed constructor arguments (input_dim, output_dim, seq_length, dropout)
# would have to be merged in before it can be used directly — confirm in Util.

# For convenient testing, pre-tuned values are hard-coded here instead.
best_params_tock = dict(
    model_dim=64,
    num_heads=4,
    num_layers=3,
    input_dim=49,
    output_dim=4,
    seq_length=32,
    dropout=0.1,
)


best_params_tock

In [None]:
# If the cross-validated performance looks good, train the Transformer ("tock").
# Build it from best_params_tock, the complete pre-tuned set defined in the previous
# cell. The raw grid-search result `best_params` appears to lack the fixed constructor
# arguments and does not exist at all when the grid-search cell is skipped.
tock = TimeSeriesTransformer(**best_params_tock)
train_loss, mse_list_train, r2_list_train = tock.train_model(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    num_epochs=50,
    batch_size=32,
    learning_rate=1e-4,
    scaler=scaler,  # the regression task needs the scaler for inverse scaling
    target_indices=target_indices,  # indices of the target columns
    patience=10,
    log=True,
)

In [None]:
# Plot the training curves.
# Column 3 is the fourth target; given target_cols = open/high/low/close this is
# presumably 'close' — confirm against what train_model returns.
mse_list = np.array(mse_list_train)
r2_list = np.array(r2_list_train)

plot_metric(train_loss, y_label="loss", title="Train Loss", color='red')
plot_metric(mse_list[:, 3], y_label="mse", title="Val MSE", color='green')
plot_metric(r2_list[:, 3], y_label="r2", title="Val R²", color='blue')

In [None]:
# Evaluate the Transformer on the test split.
mse_list_test, r2_list_test, preds_test, targets = tock.evaluate_model(
    test_dataset,
    batch_size=32,
    scaler=scaler,
    target_indices=target_indices,
)


print("测试集 MSE: ", mse_list_test)
print("测试集 R²: ", r2_list_test)

# Compare predictions with targets for column 3 (presumably 'close').
curve_dict = {
    'predicts': preds_test[:, 3],
    'targets': targets[:, 3],
}
plot_multiple_curves(curve_dict, x_label='interval', y_label='price')

In [None]:
# Grid search over the LSTM hyperparameters.

# Constructor arguments that stay fixed for every candidate.
init_args_lstm = dict(input_dim=49, output_dim=4, seq_length=32, dropout=0.1)

# Architecture hyperparameters to explore.
param_grid_lstm = dict(
    hidden_dim=[64, 128],
    num_layers=[2, 3],
)

# Same setup as the Transformer search: sampled subset, cv=3.
best_params_lstm, best_score_lstm = grid_search(
    TimeSeriesLSTM,
    init_args_lstm,
    sample_dataset,
    param_grid=param_grid_lstm,
    cv=3,
    scaler=scaler,
    target_indices=target_indices,
)

print('Best architecture:', best_params_lstm)
print('Best avg CV MSE:', best_score_lstm)

In [None]:
# Parameters used to build the final LSTM.
# NOTE(review): the grid-search result presumably holds only the architecture keys,
# so the fixed constructor arguments (input_dim, output_dim, seq_length, dropout)
# would have to be merged in before it can be used directly — confirm in Util.

# For convenient testing, pre-tuned values are hard-coded here; this replaces the
# best_params_lstm produced by the grid-search cell.
best_params_lstm = dict(
    hidden_dim=64,
    input_dim=49,
    output_dim=4,
    num_layers=3,
    seq_length=32,
    dropout=0.1,
)

best_params_lstm

In [None]:
# If the cross-validated performance looks good, train the LSTM.
lstm = TimeSeriesLSTM(**best_params_lstm)
train_loss, mse_list_train, r2_list_train = lstm.train_model(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    num_epochs=50,
    batch_size=32,
    learning_rate=1e-4,
    scaler=scaler,  # the regression task needs the scaler for inverse scaling
    target_indices=target_indices,  # indices of the target columns
    patience=10,
    log=True,
)

# Plot the training curves; column 3 is the fourth target (presumably 'close').
# Note that train_loss / mse_list / r2_list now hold the LSTM run, replacing the
# Transformer values assigned by the earlier training cells.
mse_list = np.array(mse_list_train)
r2_list = np.array(r2_list_train)

plot_metric(train_loss, y_label="loss", title="Train Loss", color='red')
plot_metric(mse_list[:, 3], y_label="mse", title="Val MSE", color='green')
plot_metric(r2_list[:, 3], y_label="r2", title="Val R²", color='blue')

In [None]:
# Evaluate the LSTM on the test split.
mse_list_lstm, r2_list_lstm, preds_lstm, targets = lstm.evaluate_model(
    test_dataset,
    batch_size=32,
    scaler=scaler,
    target_indices=target_indices,
)

print("测试集 MSE: ", mse_list_lstm)
print("测试集 R²: ", r2_list_lstm)

# Compare predictions with targets for column 3 (presumably 'close').
curve_dict = {
    'predicts': preds_lstm[:, 3],
    'targets': targets[:, 3],
}
plot_multiple_curves(curve_dict, x_label='interval', y_label='price')

In [None]:
# Define the blender (stacking) setup.
# NOTE(review): the original header was tagged "all killed" — presumably this
# section is abandoned; confirm before relying on it.
#
# Original plan noted here: use a custom blender rather than sklearn, train the
# base models on train, train the blender on the base models' predictions for val,
# then test the whole thing on test. The code below uses sklearn's
# StackingRegressor instead.

# 1) Pull the numpy arrays out of the tensor datasets.
X_train = train_dataset.tensors[0].numpy()
y_train = train_dataset.tensors[1].numpy()
X_test = test_dataset.tensors[0].numpy()
y_test = test_dataset.tensors[1].numpy()

# 2) Fresh, untrained PyTorch models that serve as stacking templates.
# They get their own names on purpose: rebinding `tock` / `lstm` here would replace
# the models trained in the earlier cells, and since scikit-learn clones estimators
# before fitting, later cells calling tock.predict_model / tock.evaluate_model
# would then run on an untrained network.
tock_stack = TimeSeriesTransformer(**best_params_tock)
lstm_stack = TimeSeriesLSTM(**best_params_lstm)

# 3) Wrap them in sklearn-compatible adapters.
# NOTE(review): SklearnAdapter is not imported anywhere in the visible notebook —
# add the import from its defining module, otherwise this cell raises NameError.
tock_adapter = SklearnAdapter(
    model=tock_stack,
    scaler=scaler,
    val_dataset=val_dataset,
    bias_corrector=None,
    target_indices=target_indices,
    num_epochs=5,
    batch_size=32,
    learning_rate=1e-4,
    patience=10,
)

lstm_adapter = SklearnAdapter(
    model=lstm_stack,
    scaler=scaler,
    val_dataset=val_dataset,
    bias_corrector=None,
    target_indices=target_indices,
    num_epochs=20,
    batch_size=32,
    learning_rate=1e-4,
    patience=10,
)

# 4) Define the stacking regressor with a Ridge meta-learner.
base_estimators = [
    ("tock", tock_adapter),
    ("lstm", lstm_adapter),
]
stack = StackingRegressor(
    estimators=base_estimators,
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    passthrough=False
)

# 5) Wrap in MultiOutputRegressor so that y of shape (n_samples, 4) is supported.
multi_stack = MultiOutputRegressor(stack, n_jobs=1)



multi_stack

In [None]:
# 6) Fit the stacked ensemble on the training arrays.
multi_stack.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test arrays; per the original note, preds.shape == (n_samples, 4).
preds = multi_stack.predict(X_test)

# 7) Evaluate.
# By default mean_squared_error averages over the 4 outputs;
# multioutput="raw_values" reports each output separately.
mse_all = mean_squared_error(y_true=y_test, y_pred=preds)
mse_per_dim = mean_squared_error(y_true=y_test, y_pred=preds, multioutput="raw_values")

print("Overall MSE:", mse_all)
print("Per-dim MSE:", mse_per_dim)

In [None]:
# Fit the residual (bias-correction) module on the validation split.

from BiasCorrector import BiasCorrector
from sklearn.metrics import mean_squared_error, r2_score


x_tensor = val_dataset.tensors[0]  # input features
y_tensor = val_dataset.tensors[1]  # matching labels

# Validation labels passed back through the scaler. They are stored under their own
# name so the training label tensor `y` from the data-preparation cell is not
# overwritten (re-running that cell out of order would otherwise pair X with these).
# NOTE(review): Util also exports a module-level safe_inverse_transform; confirm the
# model really exposes it as a method.
y_val_true = tock.safe_inverse_transform(y_tensor.numpy(), scaler=scaler, target_indices=target_indices)

# Base-model predictions on the validation inputs (original note: predict in
# batches if the model is large). Expected shape per the original note: (N, num_targets).
preds_val = tock.predict_model(x_tensor, scaler=scaler, bias_corrector=None, target_indices=target_indices)


# NOTE(review): scale is the string 'None', not the None object — confirm that this
# is what BiasCorrector expects.
mlp_corrector = BiasCorrector(mode='mean', scale='None')
mlp_corrector.fit(preds_val, y_val_true)


In [None]:
# Evaluate the Transformer on the test split again, this time with the fitted bias corrector.
mse_list_cor, r2_list_cor, preds_cor, targets = tock.evaluate_model(
    test_dataset,
    batch_size=32,
    scaler=scaler,
    target_indices=target_indices,
    bias_corrector=mlp_corrector,
)

print("测试集 MSE: ", mse_list_cor)
print("测试集 R²: ", r2_list_cor)

# Compare corrected predictions with targets for column 3 (presumably 'close').
curve_dict = {
    'predicts': preds_cor[:, 3],
    'targets': targets[:, 3],
}
plot_multiple_curves(curve_dict, x_label='interval', y_label='price')

In [None]:
# If the test performance is good, persist the model weights and the scaler.
# The two directory placeholders must be filled in before running this cell.
model_dir = "./xxx/xxx/"
scaler_dir = "./xxx/xxx/"

# torch.save / joblib.dump need file paths, not directories, so explicit file names
# are appended to the output directories.
model_out = os.path.join(model_dir, "tock_state_dict.pt")
scaler_out = os.path.join(scaler_dir, "scaler.joblib")

os.makedirs(os.path.dirname(model_out), exist_ok=True)
os.makedirs(os.path.dirname(scaler_out), exist_ok=True)

print("=" * 10 + " 保存模型Essentials... " + "=" * 10)
# Save the Transformer held in `tock`; the previously referenced name `model` is not
# defined anywhere in this notebook. Swap in `lstm` to persist the LSTM instead.
torch.save(tock.state_dict(), model_out)
joblib.dump(scaler, scaler_out)