导入所需包，准备数据

In [1]:
import pandas as pd
import numpy as np
import pickle
import model_function
import preprocessing
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

读取数据，准备训练（给数据用预处理方法改了名）

In [2]:
with open('./data/GRU_百分比_RZ_KNN训练样本.pkl', 'rb') as f:
    train_samples = pickle.load(f)

In [3]:
train_targets = train_samples[:, -1, -1].astype(float)
train_features = train_samples[:, :, 2:-1].astype(float)

In [4]:
del train_samples

In [5]:
with open('./data/GRU_百分比_RZ_KNN验证样本.pkl', 'rb') as f:
    valid_samples = pickle.load(f)

In [6]:
# 为节省内存，把后面验证需要的数据先生成，以便早点删除占内存极大的样本变量
GRU_valid_df_ori = pd.DataFrame(valid_samples[:, -1, [0, 1, -1]], columns=['date', 'code', 'y'])

In [7]:
valid_targets = valid_samples[:, -1, -1].astype(float)
valid_features = valid_samples[:, :, 2:-1].astype(float)

In [8]:
del valid_samples

In [9]:
with open('./data/GRU_百分比_RZ_KNN测试样本.pkl', 'rb') as f:
    test_samples = pickle.load(f)

In [10]:
# 为节省内存，把后面验证需要的数据先生成，以便早点删除占内存极大的样本变量
GRU_test_result_ori = pd.DataFrame(test_samples[:, -1, [0, 1]], columns=['date', 'code'])

In [11]:
test_features = test_samples[:, :, 2:].astype(float)

In [12]:
del test_samples

加载数据

In [13]:
# 定义数据集类
class StockDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float)
        self.targets = torch.tensor(targets, dtype=torch.float)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

In [14]:
# 准备数据集
train_dataset = StockDataset(train_features, train_targets)
valid_dataset = StockDataset(valid_features, valid_targets)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用的设备:", device)

使用的设备: cuda


In [16]:
# 定义GRU模型
class GRUNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(GRUNet, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, dropout=0.1, batch_first=True) #加入dropout减少过拟合
        self.bn = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  # 默认使用全零张量作为初始隐藏状态
        # 前向传播 GRU
        out, _ = self.gru(x, h0)
        # 取最后一个时间步的输出
        out = out[:, -1, :]
        # 批量归一化
        out = self.bn(out)
        # 全连接层
        out = self.fc(out)
        return out

# 用IC作为损失函数
def pearson_correlation(x, y):
    cov = torch.mean((x - torch.mean(x)) * (y - torch.mean(y)))
    std_x = torch.std(x)
    std_y = torch.std(y)
    return cov / (std_x * std_y)

class PearsonLoss(nn.Module):
    def __init__(self):
        super(PearsonLoss, self).__init__()

    def forward(self, x, y):
        return -pearson_correlation(x, y)

定义模型时，为了拟合残差，此处使用MSE损失函数

In [17]:
# 定义模型、损失函数和优化器
input_size = 10  # 特征数量
hidden_size = 64  # 隐藏层大小
num_layers = 2  # GRU层数
model = GRUNet(input_size, hidden_size, num_layers).to(device)
#criterion = PearsonLoss()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# 模型训练参数
num_epochs = 5 #试过迭代10轮，但并不能有效提高验证集上的ic
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [18]:
start_time = datetime.now()
print("开始时间为:", start_time)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # 初始化整个 epoch 的损失
    total_batches = 0  # 初始化总批次数
    for features, target in train_loader:
        features, target = features.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs.squeeze(), target)
        #loss.requires_grad_(True)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # 累加当前批次的损失值
        total_batches += 1  # 更新总批次数

    epoch_loss = total_loss / total_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 18:22:05.964090
Epoch [1/5], Loss: 0.9339
Epoch [2/5], Loss: 0.9278
Epoch [3/5], Loss: 0.9251
Epoch [4/5], Loss: 0.9230
Epoch [5/5], Loss: 0.9214
结束时间为: 2024-04-25 18:29:05.627703
处理时间为： 0:06:59.663613


In [19]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 得到模型在训练集上的预测结果，计算预测结果和真实值的残差
train_loader_2 = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
model.eval()
predictions_t = []

with torch.no_grad():
    total_loss = 0
    for features, target in train_loader_2:
        features, target = features.to(device), target.to(device)
        outputs = model(features)
        predictions_t.append(outputs)
        total_loss += criterion(outputs.squeeze(), target).item()
        
    print(f'Average Train Loss: {total_loss / len(train_loader_2):.4f}')

predictions_t = torch.cat(predictions_t, dim=0)
predictions_t = predictions_t.reshape(-1).cpu().numpy()
res = train_targets - predictions_t

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 18:29:05.653708
Average Train Loss: 0.9199
结束时间为: 2024-04-25 18:30:01.362953
处理时间为： 0:00:55.709245


训练第二个模型拟合第一个模型的残差

In [20]:
res_dataset = StockDataset(train_features, res)
model_res = GRUNet(input_size, hidden_size, num_layers).to(device)
res_loader = DataLoader(res_dataset, batch_size=batch_size, shuffle=True)
optimizer_res = optim.Adam(model_res.parameters(), lr=0.000002) #降低学习率，避免难以拟合残差

In [21]:
start_time = datetime.now()
print("开始时间为:", start_time)

for epoch in range(num_epochs):
    model_res.train()
    total_loss = 0  # 初始化整个 epoch 的损失
    total_batches = 0  # 初始化总批次数
    for features, target in res_loader:
        features, target = features.to(device), target.to(device)
        optimizer_res.zero_grad()
        outputs = model_res(features)
        loss = criterion(outputs.squeeze(), target)
        #loss.requires_grad_(True)
        loss.backward()
        optimizer_res.step()

        total_loss += loss.item()  # 累加当前批次的损失值
        total_batches += 1  # 更新总批次数

    epoch_loss = total_loss / total_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 18:30:03.127954
Epoch [1/5], Loss: 0.9500
Epoch [2/5], Loss: 0.9255
Epoch [3/5], Loss: 0.9224
Epoch [4/5], Loss: 0.9211
Epoch [5/5], Loss: 0.9202
结束时间为: 2024-04-25 18:37:05.584017
处理时间为： 0:07:02.456063


将两个模型结果相加，作为最终的预测值

In [22]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 评估原模型
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
model.eval()
predictions = []

with torch.no_grad():
    total_loss = 0
    for features, target in valid_loader:
        features, target = features.to(device), target.to(device)
        outputs = model(features)
        predictions.append(outputs)
        total_loss += criterion(outputs.squeeze(), target).item()
        
    print(f'Average Valid Loss: {total_loss / len(valid_loader):.4f}')

predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
GRU_valid_df = GRU_valid_df_ori.copy()
GRU_valid_df['y_1'] = predictions_series

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 18:37:05.600022
Average Valid Loss: 0.9769
结束时间为: 2024-04-25 18:37:19.646754
处理时间为： 0:00:14.046732


In [24]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 评估原模型
model_res.eval()
predictions = []

with torch.no_grad():
    total_loss = 0
    for features, target in valid_loader:
        features, target = features.to(device), target.to(device)
        outputs = model_res(features)
        predictions.append(outputs)
        total_loss += criterion(outputs.squeeze(), target).item()
        
    print(f'Average Valid Loss: {total_loss / len(valid_loader):.4f}')

predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
GRU_valid_df['y_2'] = predictions_series
GRU_valid_df['y_pred'] = GRU_valid_df['y_1'] + GRU_valid_df['y_2']
GRU_rankic = model_function.get_rankic(GRU_valid_df)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 18:37:19.706757
Average Valid Loss: 0.9806
rankic均值为： 0.0789603570756313
结束时间为: 2024-04-25 18:37:29.499332
处理时间为： 0:00:09.792575


In [25]:
GRU_valid_df

Unnamed: 0,date,code,y,y_1,y_2,y_pred
0,1541,s_0,1.818162,-0.011825,-0.041888,-0.053712
1,1542,s_0,2.866452,-0.046325,-0.024397,-0.070721
2,1543,s_0,4.746281,0.011428,-0.033160,-0.021732
3,1544,s_0,1.892028,-0.003789,-0.037868,-0.041657
4,1545,s_0,2.342451,-0.070269,-0.098976,-0.169245
...,...,...,...,...,...,...
563647,1697,s_999,-0.37718,0.034448,-0.016839,0.017608
563648,1698,s_999,-0.437914,0.012588,-0.018716,-0.006128
563649,1699,s_999,0.762539,-0.007806,-0.024914,-0.032720
563650,1700,s_999,1.81606,-0.015723,-0.032651,-0.048374


In [26]:
GRU_rankic

Unnamed: 0,date,RankIC
0,1541,-0.002113
1,1542,0.066835
2,1543,0.106764
3,1544,0.142847
4,1545,0.164839
...,...,...
156,1697,0.061699
157,1698,0.027257
158,1699,-0.003242
159,1700,-0.029591


In [27]:
GRU_rankic['RankIC'].mean() / GRU_rankic['RankIC'].std()

1.1062085324720505

In [28]:
class TestDataset(Dataset):
    def __init__(self, features):
        self.features = torch.tensor(features, dtype=torch.float)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

In [29]:
# 准备测试集数据
test_dataset = TestDataset(test_features)

# 创建测试集数据加载器
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [30]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 进行预测
predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        predictions.append(outputs)

# 将所有预测结果连接成一个张量
predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
GRU_test_result = GRU_test_result_ori.copy()
GRU_test_result['y_1'] = predictions_series

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 18:37:33.172163
结束时间为: 2024-04-25 18:38:44.279104
处理时间为： 0:01:11.106941


In [31]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 进行预测
predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model_res(features)
        predictions.append(outputs)

# 将所有预测结果连接成一个张量
predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
GRU_test_result['y_2'] = predictions_series
GRU_test_result['y_pred'] = GRU_test_result['y_1'] + GRU_test_result['y_2']

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 18:38:44.390077
结束时间为: 2024-04-25 18:39:20.713167
处理时间为： 0:00:36.323090


In [32]:
GRU_test_result

Unnamed: 0,date,code,y_1,y_2,y_pred
0,1711,s_0,-0.029407,-0.030717,-0.060124
1,1712,s_0,-0.207942,-0.074331,-0.282274
2,1713,s_0,-0.296059,-0.080541,-0.376600
3,1714,s_0,-0.407733,-0.092050,-0.499783
4,1715,s_0,-0.134697,-0.065973,-0.200670
...,...,...,...,...,...
3828250,2598,s_999,-0.058720,-0.001289,-0.060010
3828251,2599,s_999,-0.074297,-0.001861,-0.076158
3828252,2600,s_999,-0.040567,0.004394,-0.036173
3828253,2601,s_999,-0.027235,0.003200,-0.024034


In [33]:
with open('./data/GRU_res_百分比_RZ_KNN_截面zscore的y_原model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('./data/GRU_res_百分比_RZ_KNN_截面zscore的y_残差model.pkl', 'wb') as f:
    pickle.dump(model_res, f)
with open('./data/GRU_res_百分比_RZ_KNN_截面zscore的y_test_result.pkl', 'wb') as f:
    pickle.dump(GRU_test_result, f)