In [12]:
import pandas as pd
import os
import torch
import joblib
import numpy as np

from TransformerModel import TimeSeriesTransformer, grid_search
from torch.utils.data import TensorDataset
from Util import create_sequences, sample_dataset, plot_metric, plot_multiple_curves
from torch.utils.data import random_split

In [13]:
# Prepare the data. Only the train and test sets are built here; the k-fold
# split of the training set is done when the model is trained.
df_train = pd.read_csv('readyData_ohlc/SPY_1hour_train.csv')
X, y, scaler, target_indices = create_sequences(
    df_train,
    seq_length=32,
    target_cols=['open', 'high', 'low', 'close'],
    scale=True,
)
train_dataset = TensorDataset(X, y)
# Bind the sample to its own name. Assigning it to `sample_dataset` would
# rebind the helper imported from Util, so re-running this cell would call the
# previous result instead of the helper.
# 0.1 is presumably the fraction of train_dataset to keep — confirm in Util.
train_sample = sample_dataset(train_dataset, 0.1)

In [14]:
# Build the held-out test set with the same window length and target columns
# as the training set.
# NOTE(review): the scaler returned by this call is discarded. If
# create_sequences fits a fresh scaler whenever scale=True, the test features
# are scaled with test-set statistics rather than the training ones — confirm.
df_test = pd.read_csv('readyData_ohlc/SPY_1hour_test.csv')
X_t, y_t, _, _ = create_sequences(
    df_test,
    seq_length=32,
    target_cols=['open', 'high', 'low', 'close'],
    scale=True,
)
test_dataset = TensorDataset(X_t, y_t)

In [None]:
# Grid search for the best architecture hyperparameters.

# Constructor arguments held fixed for every candidate model.
# input_dim=49 presumably matches the feature count of X — TODO confirm.
init_args = dict(
    input_dim=49,
    output_dim=4,
    seq_length=32,
    dropout=0.1,
)

# Architecture hyperparameter search space.
param_grid = dict(
    model_dim=[32, 64, 128],
    num_heads=[2, 4],
    num_layers=[1, 2, 3],
)

# Positional arguments are kept in the original order because grid_search's
# parameter names are not visible from this notebook.
best_params, best_score = grid_search(
    TimeSeriesTransformer,
    init_args,
    train_dataset,
    param_grid,
    cv=3,
    scaler=scaler,
    target_indices=target_indices,
)

print('Best architecture:', best_params)
print('Best avg CV MSE:', best_score)

Testing architecture params: {'model_dim': 32, 'num_heads': 2, 'num_layers': 1}
=== Fold 1/3 ===
Using device: cpu
Epoch 1/50, Train Loss: 0.064071
Epoch 1/50, Val MSEs: [2903.636962890625, 3011.084716796875, 3130.41650390625, 2669.9873046875], R²: [0.4474952816963196, 0.4265255331993103, 0.40415364503860474, 0.49177104234695435]
Epoch 2/50, Train Loss: 0.021717
Epoch 2/50, Val MSEs: [809.752685546875, 1108.137451171875, 1316.832763671875, 821.048095703125], R²: [0.8459200859069824, 0.7889503240585327, 0.7493528127670288, 0.8437144756317139]
Epoch 3/50, Train Loss: 0.013009
Epoch 3/50, Val MSEs: [421.9873962402344, 580.1016235351562, 731.0242309570312, 489.33013916015625], R²: [0.9197041392326355, 0.889517068862915, 0.8608561754226685, 0.9068565964698792]
Epoch 4/50, Train Loss: 0.009319
Epoch 4/50, Val MSEs: [241.0908203125, 331.43292236328125, 448.5807189941406, 253.02056884765625], R²: [0.954125165939331, 0.9368771314620972, 0.9146167039871216, 0.9518378376960754]
Epoch 5/50, Train 

In [5]:
# Apply the best parameters and cross-validate on the whole training set to
# check performance.
# grid_search was given the fixed constructor arguments (init_args) separately
# from the searched grid, and best_params appears to hold only the searched
# architecture keys. Merge both so the rebuilt model matches the configuration
# that was scored; best_params wins on any key clash.
model = TimeSeriesTransformer(**{**init_args, **best_params})
result = model.cross_validate(train_dataset, k=5, scaler=scaler, target_indices=target_indices)

SyntaxError: incomplete input (4164696866.py, line 3)

In [None]:
# Show the value returned by model.cross_validate in the previous cell.
print (result)

In [10]:
# Re-split the data: hold out 10% for validation, train on the remainder.

total_len = len(train_dataset)
val_len = int(0.1 * total_len)
train_len = total_len - val_len

# Split with random_split, using a seeded generator so the same samples land
# in the validation subset on every run (Restart & Run All stays reproducible).
split_generator = torch.Generator().manual_seed(42)
train_subset, val_subset = random_split(
    train_dataset,
    [train_len, val_len],
    generator=split_generator,
)

In [None]:
# If the cross-validation performance is good, train on the whole training set
# (minus the held-out validation subset).
# grid_search was given the fixed constructor arguments (init_args) separately
# from the searched grid, and best_params appears to hold only the searched
# architecture keys. Merge both so the model is built with the configuration
# that was scored; best_params wins on any key clash.
model = TimeSeriesTransformer(**{**init_args, **best_params})
train_loss, mse_list_train, r2_list_train = model.train_model(
    train_subset,
    val_dataset=val_subset,
    num_epochs=50,
    batch_size=32,
    learning_rate=1e-4,
    scaler=scaler,  # the regression task needs the scaler for inverse scaling
    target_indices=target_indices,  # target column indices for the regression task
    patience=10,
)

In [None]:
# Plot the training results.
mse_list = np.array(mse_list_train)
r2_list = np.array(r2_list_train)

# Only one target column of the validation metrics is plotted. Index 3 is
# presumably 'close', the last entry of target_cols — TODO confirm.
target_col = 3
val_mse_curve = mse_list[:, target_col]
val_r2_curve = r2_list[:, target_col]

plot_metric(train_loss, y_label="loss", title="Train Loss", color='red')
plot_metric(val_mse_curve, y_label="mse", title="Val MSE", color='green')
plot_metric(val_r2_curve, y_label="r2", title="Val R²", color='blue')

In [14]:
# Evaluate the model on the test set.
# NOTE(review): `scaler` is the one returned for the training data, while
# test_dataset came from a separate create_sequences(..., scale=True) call
# whose scaler was discarded — confirm both use the same scaling.
mse_list_test, r2_list_test, preds_test, targets = model.evaluate_model (test_dataset, batch_size=32,
                                                                         scaler=scaler, target_indices=target_indices)

NameError: name 'model' is not defined

In [None]:
# Report the test-set metrics.
print("测试集 MSE: ", mse_list_test)
print("测试集 R²: ", r2_list_test)

# Overlay predictions and targets for one target column. Index 3 is
# presumably 'close', the last entry of target_cols — TODO confirm.
curve_dict = {
    'predicts': preds_test[:, 3],
    'targets': targets[:, 3],
}
plot_multiple_curves(curve_dict, x_label='interval', y_label='price')

In [None]:
# If the test performance is good, save the model weights and the scaler.
# "./xxx/xxx/" is a placeholder output directory — replace it before running.
output_dir = "./xxx/xxx/"

# torch.save and joblib.dump are given file paths here. The two artifacts must
# not share one path, and neither path may be the directory itself, so each
# gets its own file name inside output_dir.
model_out = os.path.join(output_dir, "model_state_dict.pt")
scaler_out = os.path.join(output_dir, "scaler.joblib")

os.makedirs(output_dir, exist_ok=True)

print("=" * 10 + " 保存模型Essentials... " + "=" * 10)
torch.save(model.state_dict(), model_out)
joblib.dump(scaler, scaler_out)