In [1]:
import pandas as pd
import numpy as np
import pickle
import model_function
from datetime import datetime
from scipy.stats import spearmanr

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
with open('./data/X_train_filled_cleaned.pkl', 'rb') as f:
    X_train_filled_cleaned = pickle.load(f)
with open('./data/X_valid_filled_cleaned.pkl', 'rb') as f:
    X_valid_filled_cleaned = pickle.load(f)
with open('./data/X_test_filled_cleaned.pkl', 'rb') as f:
    X_test_filled_cleaned = pickle.load(f)
with open('./data/X_train_unfilled_cleaned.pkl', 'rb') as f:
    X_train_unfilled_cleaned = pickle.load(f)
with open('./data/X_valid_unfilled_cleaned.pkl', 'rb') as f:
    X_valid_unfilled_cleaned = pickle.load(f)  
with open('./data/X_test_unfilled_cleaned.pkl', 'rb') as f:
    X_test_unfilled_cleaned = pickle.load(f) 
with open('./data/X_train_filled_selected.pkl', 'rb') as f:
    X_train_filled_selected = pickle.load(f)
with open('./data/X_valid_filled_selected.pkl', 'rb') as f:
    X_valid_filled_selected = pickle.load(f)
with open('./data/X_test_filled_selected.pkl', 'rb') as f:
    X_test_filled_selected = pickle.load(f)
with open('./data/y_train.pkl', 'rb') as f:
    y_train = pickle.load(f)
with open('./data/y_valid.pkl', 'rb') as f:
    y_valid = pickle.load(f)   

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用的设备:", device)

使用的设备: cuda


In [4]:
train_features = np.array(X_train_filled_cleaned.iloc[:, 2:]).reshape(-1, 1, 19)
train_targets = np.array(y_train)
valid_features = np.array(X_valid_filled_cleaned.iloc[:, 2:]).reshape(-1, 1, 19)
valid_targets = np.array(y_valid)

In [5]:
# 定义数据集类
class StockDataset(Dataset):
    def __init__(self, features, targets):
        self.features = torch.tensor(features, dtype=torch.float)
        self.targets = torch.tensor(targets, dtype=torch.float)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]
        

In [6]:
# 准备数据集
train_dataset = StockDataset(train_features, train_targets)
valid_dataset = StockDataset(valid_features, valid_targets)

In [7]:
# 定义GRU模型
class GRUNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(GRUNet, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.bn = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x, h0=None):
        if h0 is None:
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)  # 默认使用全零张量作为初始隐藏状态
        # 前向传播 GRU
        out, _ = self.gru(x, h0)
        # 取最后一个时间步的输出
        out = out[:, -1, :]
        # 批量归一化
        out = self.bn(out)
        # 全连接层
        out = self.fc(out)
        return out

In [8]:
'''
斯皮尔曼相关系数直接作为损失函数的话，模型似乎不能有效地优化它
def spearman_rank_correlation(x, y):
    # 获取排名
    rank_x = torch.argsort(x)
    rank_y = torch.argsort(y)
    
    # 计算排名之间的差异
    rank_diff = rank_x - rank_y
    
    # 计算差异的平方和
    sum_rank_diff_sq = torch.sum(torch.pow(rank_diff, 2))
    
    # 计算斯皮尔曼秩相关系数
    n = torch.tensor(len(x))
    rho = torch.tensor(1 - (6 * sum_rank_diff_sq) / ((torch.pow(n, 3) - n)), requires_grad=True)
    
    return rho

class SpearmanLoss(nn.Module):
    def __init__(self):
        super(SpearmanLoss, self).__init__()

    def forward(self, x, y):
        return -spearman_rank_correlation(x, y)
'''
# 用IC作为损失函数
def pearson_correlation(x, y):
    cov = torch.mean((x - torch.mean(x)) * (y - torch.mean(y)))
    std_x = torch.std(x)
    std_y = torch.std(y)
    return cov / (std_x * std_y)

class PearsonLoss(nn.Module):
    def __init__(self):
        super(PearsonLoss, self).__init__()

    def forward(self, x, y):
        return -pearson_correlation(x, y)

In [9]:
# 定义模型、损失函数和优化器
input_size = 19  # 特征数量
hidden_size = 64  # 隐藏层大小
num_layers = 1  # GRU层数
model = GRUNet(input_size, hidden_size, num_layers).to(device)
criterion = PearsonLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# 模型训练参数
num_epochs = 10
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [10]:
start_time = datetime.now()
print("开始时间为:", start_time)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # 初始化整个 epoch 的损失
    total_batches = 0  # 初始化总批次数
    for features, target in train_loader:
        features, target = features.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs.squeeze(), target)
        #loss.requires_grad_(True)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # 累加当前批次的损失值
        total_batches += 1  # 更新总批次数

    epoch_loss = total_loss / total_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 18:43:31.020504
Epoch [1/10], Loss: -0.0916
Epoch [2/10], Loss: -0.1020
Epoch [3/10], Loss: -0.1050
Epoch [4/10], Loss: -0.1073
Epoch [5/10], Loss: -0.1091
Epoch [6/10], Loss: -0.1102
Epoch [7/10], Loss: -0.1110
Epoch [8/10], Loss: -0.1123
Epoch [9/10], Loss: -0.1133
Epoch [10/10], Loss: -0.1139
结束时间为: 2024-04-25 19:56:41.700645
处理时间为： 1:13:10.680141


In [11]:
start_time = datetime.now()
print("开始时间为:", start_time)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # 初始化整个 epoch 的损失
    total_batches = 0  # 初始化总批次数
    for features, target in train_loader:
        features, target = features.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs.squeeze(), target)
        #loss.requires_grad_(True)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # 累加当前批次的损失值
        total_batches += 1  # 更新总批次数

    epoch_loss = total_loss / total_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 19:56:41.753646
Epoch [1/10], Loss: -0.1143
Epoch [2/10], Loss: -0.1152
Epoch [3/10], Loss: -0.1160
Epoch [4/10], Loss: -0.1166
Epoch [5/10], Loss: -0.1166
Epoch [6/10], Loss: -0.1175
Epoch [7/10], Loss: -0.1180
Epoch [8/10], Loss: -0.1180
Epoch [9/10], Loss: -0.1187
Epoch [10/10], Loss: -0.1190
结束时间为: 2024-04-25 21:04:42.330793
处理时间为： 1:08:00.577147


In [13]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 评估模型
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
model.eval()
predictions = []

with torch.no_grad():
    total_loss = 0
    for features, target in valid_loader:
        features, target = features.to(device), target.to(device)
        outputs = model(features)
        predictions.append(outputs)
        total_loss += criterion(outputs.squeeze(), target).item()
        
    print(f'Average Valid Loss: {total_loss / len(valid_loader):.4f}')

predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
GRU_valid_df = pd.concat([X_valid_filled_cleaned, y_valid], axis=1)
GRU_valid_df['y_pred'] = predictions_series
GRU_rankic = model_function.get_rankic(GRU_valid_df)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 21:06:55.393846
Average Valid Loss: -0.0876
rankic均值为： 0.08934850371555376
结束时间为: 2024-04-25 21:07:19.769103
处理时间为： 0:00:24.375257


In [14]:
GRU_rankic

Unnamed: 0,date,RankIC
0,1532,0.161620
1,1533,0.033201
2,1534,0.145431
3,1535,0.153193
4,1536,0.222108
...,...,...
165,1697,0.039359
166,1698,0.038774
167,1699,0.059352
168,1700,-0.050376


In [15]:
GRU_rankic['RankIC'].mean() / GRU_rankic['RankIC'].std()

1.3477991374572058

In [16]:
class TestDataset(Dataset):
    def __init__(self, features):
        self.features = torch.tensor(features, dtype=torch.float)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]

In [19]:
# 准备测试集数据
test_features = np.array(X_test_filled_cleaned.iloc[:, 2:]).reshape(-1, 1, 19) 
test_dataset = TestDataset(test_features)

# 创建测试集数据加载器
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [20]:
start_time = datetime.now()
print("开始时间为:", start_time)

# 进行预测
predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        predictions.append(outputs)

# 将所有预测结果连接成一个张量
predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
GRU_test_result = X_test_filled_cleaned.copy()
GRU_test_result['y_pred'] = predictions_series

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-25 21:17:47.915211
结束时间为: 2024-04-25 21:19:22.572207
处理时间为： 0:01:34.656996


In [21]:
GRU_test_result

Unnamed: 0,code,date,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,...,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,y_pred
0,s_4394,1702,0.068291,-0.432540,0.022628,3.165354,0.907793,0.339218,-1.300476,-0.681366,...,0.007007,-0.001962,-0.088637,5.123355,-0.333052,1.406415,0.730259,1.143091,0.428422,0.144769
1,s_4390,1702,0.422512,-0.440728,0.410388,0.067812,0.818190,0.738554,2.079457,0.064696,...,0.007007,-0.001962,7.090391,0.312029,0.924242,2.686006,0.920996,1.269212,0.789571,0.305586
2,s_2394,1702,0.838988,-0.199078,-0.103651,-0.223763,-0.108430,0.779757,1.270638,0.508430,...,0.007007,-0.001962,-0.022319,-0.051760,-0.309964,-0.664588,0.394401,-0.972088,0.948195,0.871417
3,s_5135,1702,3.305397,1.641047,0.578268,-1.187674,-0.990240,3.947727,-0.211763,-0.102795,...,0.007007,-0.001962,-0.130295,-0.359713,7.704144,0.019878,-1.426578,0.316132,3.116464,0.412492
4,s_4733,1702,1.017056,0.760480,-1.023568,-0.508261,-2.677140,-0.596081,-0.555019,-0.645279,...,0.007007,-0.001962,-0.230536,-0.072508,0.801403,-0.096708,-3.142005,-0.429479,-0.211853,-1.715138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3908946,s_3445,2606,2.775059,-0.106231,-0.772367,-1.675571,2.633371,3.623154,-0.713525,-0.087002,...,0.013767,0.001375,0.111419,-0.440882,0.754534,0.983301,6.052376,-0.268740,3.766029,0.353053
3908947,s_1559,2606,-0.409325,-0.355444,0.231227,-1.047911,0.239075,-0.186017,-1.139380,1.153228,...,0.013767,0.001375,0.223317,-0.500254,-0.422076,0.425886,-0.823689,0.683850,-0.095134,0.679945
3908948,s_3627,2606,0.908406,-0.231084,-0.206256,-0.792178,0.989572,1.452187,2.028437,-0.216644,...,0.013767,0.001375,-0.865135,-0.293487,-0.359932,-0.955121,0.268850,-0.232571,1.335906,0.624889
3908949,s_975,2606,0.583221,-0.313091,-1.038109,0.545349,0.918309,0.301894,-0.630959,-0.773183,...,0.013767,0.001375,-0.171120,-0.264226,-0.601279,-0.448135,-0.075550,-0.792248,0.679176,0.542371


In [22]:
with open('./data/GRU_单个样本横向输入_IC.pkl', 'wb') as f:
    pickle.dump(model.cpu(), f)
with open('./data/GRU_单个样本横向输入_test_result.pkl', 'wb') as f:
    pickle.dump(GRU_test_result, f)