导入所需包，准备数据

In [4]:
import pandas as pd
import numpy as np
import pickle
import model_function
import preprocessing
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

构建GRU样本，使用同一支股票连续10天的被选出的10个特征作为模型输入，预测最后一天的y值，这里使用经过截面zscore处理的y作为标签

In [5]:
with open('./data/GRU_3MAD_RZ_KNN训练样本.pkl', 'rb') as f:
    train_samples = pickle.load(f)
with open('./data/GRU_3MAD_RZ_KNN验证样本.pkl', 'rb') as f:
    valid_samples = pickle.load(f)
with open('./data/GRU_3MAD_RZ_KNN测试样本.pkl', 'rb') as f:
    test_samples = pickle.load(f)

In [6]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 为节省内存，把后面验证需要的数据先生成，以便早点删除占内存极大的样本变量
GRU_valid_df_ori = pd.DataFrame(valid_samples[:, -1, [0, 1, -1]], columns=['date', 'code', 'y'])
GRU_test_result_ori = pd.DataFrame(test_samples[:, -1, [0, 1]], columns=['date', 'code'])

train_targets = train_samples[:, -1, -1].astype(float)
train_features = train_samples[:, :, 2:-2].astype(float)
valid_targets = valid_samples[:, -1, -1].astype(float)
valid_features = valid_samples[:, :, 2:-2].astype(float)
test_features = test_samples[:, :, 2:].astype(float)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-23 23:11:21.290001
结束时间为: 2024-04-23 23:15:45.374344
处理时间为： 0:04:24.084343


In [7]:
del train_samples
del valid_samples
del test_samples

In [8]:
# 定义数据集类
class StockDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float)
        self.targets = torch.tensor(targets, dtype=torch.float)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

In [9]:
# 准备数据集
train_dataset = StockDataset(train_features, train_targets)
valid_dataset = StockDataset(valid_features, valid_targets)

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用的设备:", device)

使用的设备: cuda


In [41]:
# 定义GRU模型
class GRUNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(GRUNet, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, dropout=0.1, batch_first=True) #加入dropout减少过拟合
        self.bn = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  # 默认使用全零张量作为初始隐藏状态
        # 前向传播 GRU
        out, _ = self.gru(x, h0)
        # 取最后一个时间步的输出
        out = out[:, -1, :]
        # 批量归一化
        out = self.bn(out)
        # 全连接层
        out = self.fc(out)
        return out

# 用IC作为损失函数
def pearson_correlation(x, y):
    cov = torch.mean((x - torch.mean(x)) * (y - torch.mean(y)))
    std_x = torch.std(x)
    std_y = torch.std(y)
    return cov / (std_x * std_y)

class PearsonLoss(nn.Module):
    def __init__(self):
        super(PearsonLoss, self).__init__()

    def forward(self, x, y):
        return -pearson_correlation(x, y)

In [57]:
# 定义模型、损失函数和优化器
input_size = 10  # 特征数量
hidden_size = 64  # 隐藏层大小
num_layers = 2  # GRU层数
model = GRUNet(input_size, hidden_size, num_layers).to(device)
criterion = PearsonLoss()
#criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# 模型训练参数
num_epochs = 5
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [58]:
start_time = datetime.now()
print("开始时间为:", start_time)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # 初始化整个 epoch 的损失
    total_batches = 0  # 初始化总批次数
    for features, target in train_loader:
        features, target = features.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs.squeeze(), target)
        #loss.requires_grad_(True)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # 累加当前批次的损失值
        total_batches += 1  # 更新总批次数

    epoch_loss = total_loss / total_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 08:40:56.477146
Epoch [1/5], Loss: -0.0931
Epoch [2/5], Loss: -0.1176
Epoch [3/5], Loss: -0.1264
Epoch [4/5], Loss: -0.1329
Epoch [5/5], Loss: -0.1386
结束时间为: 2024-04-24 09:00:06.327915
处理时间为： 0:19:09.850769


In [59]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 评估模型
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
model.eval()
predictions = []

with torch.no_grad():
    total_loss = 0
    for features, target in valid_loader:
        features, target = features.to(device), target.to(device)
        outputs = model(features)
        predictions.append(outputs)
        total_loss += criterion(outputs.squeeze(), target).item()
        
    print(f'Average Valid Loss: {total_loss / len(valid_loader):.4f}')

predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
GRU_valid_df = GRU_valid_df_ori.copy()
GRU_valid_df['y_pred'] = predictions_series
GRU_rankic = model_function.get_rankic(GRU_valid_df)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 09:00:06.401026
Average Valid Loss: -0.0566
rankic均值为： 0.06182834615746414
结束时间为: 2024-04-24 09:00:22.002136
处理时间为： 0:00:15.601110


In [60]:
GRU_rankic

Unnamed: 0,date,RankIC
0,1541.0,0.077029
1,1542.0,0.028718
2,1543.0,0.033183
3,1544.0,0.080002
4,1545.0,0.106532
...,...,...
156,1697.0,0.012055
157,1698.0,0.047702
158,1699.0,0.078119
159,1700.0,0.064405


In [61]:
class TestDataset(Dataset):
    def __init__(self, features):
        self.features = torch.tensor(features, dtype=torch.float)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

In [62]:
# 准备测试集数据
test_dataset = TestDataset(test_features)

# 创建测试集数据加载器
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [63]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 进行预测
predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        predictions.append(outputs)

# 将所有预测结果连接成一个张量
predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
GRU_test_result = GRU_test_result_ori.copy()
GRU_test_result['y_pred'] = predictions_series

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 09:00:29.556694
结束时间为: 2024-04-24 09:02:29.546943
处理时间为： 0:01:59.990249


In [64]:
GRU_test_result

Unnamed: 0,date,code,y_pred
0,1711,s_0,0.012596
1,1712,s_0,-0.743100
2,1713,s_0,-0.331426
3,1714,s_0,-1.181125
4,1715,s_0,-0.496436
...,...,...,...
3828250,2598,s_999,0.186671
3828251,2599,s_999,-0.022782
3828252,2600,s_999,-0.074305
3828253,2601,s_999,-0.072921


In [66]:
with open('./data/GRU_3MAD_RZ_KNN_截面zscore的y.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('./data/GRU_3MAD_RZ_KNN_截面zscore的y_test_result.pkl', 'wb') as f:
    pickle.dump(GRU_test_result, f)