导入所需包，准备数据

In [1]:
import pandas as pd
import numpy as np
import pickle
import model_function
import preprocessing
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
# Load the raw training and test data from local pickle files.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files from a trusted source.
with open('./data/train_data.pkl', 'rb') as f:
    train_data_raw = pickle.load(f)
with open('./data/test_data.pkl', 'rb') as f:
    test_data_raw = pickle.load(f)

In [3]:
# Chronological hold-out: the earliest ~90% of distinct dates stay in the
# training set, the most recent ~10% become the validation set.
# The threshold is looked up in the sorted list of distinct dates, so the split
# does not rely on dates being contiguous integers that start at 0 (for such
# data the result is identical to using the position itself as the threshold).
unique_dates = np.sort(train_data_raw['date'].unique())
# Clamp so that very short histories cannot index past the last date.
split_pos = min(round(unique_dates.shape[0] * 0.9), unique_dates.shape[0] - 1)
split_date = unique_dates[split_pos]
# valid_data_raw must be derived first: the next line overwrites train_data_raw.
valid_data_raw = train_data_raw.loc[train_data_raw['date'] >= split_date]
train_data_raw = train_data_raw.loc[train_data_raw['date'] < split_date]

In [4]:
# Separate features from the target for each split.
# X_* keeps every column except 'y'; y_* keeps 'code', 'date' and 'y'.
# Indices are reset on both so that feature and target rows line up by position.
X_train_raw = train_data_raw.drop(columns=['y']).reset_index(drop=True)
y_train = train_data_raw[['code', 'date', 'y']].reset_index(drop=True)
X_valid_raw = valid_data_raw.drop(columns=['y']).reset_index(drop=True)
y_valid = valid_data_raw[['code', 'date', 'y']].reset_index(drop=True)
# No 'y' column is dropped from the test data here.
X_test_raw = test_data_raw.reset_index(drop=True)

对f_6使用target_encoding编码

In [5]:
# Time the target-encoding step.
start_time = datetime.now()
print("开始时间为:", start_time)

# Target-encode column 'f_6'. The training call receives the training targets
# and returns an encoder object, which is then passed to the validation and
# test calls instead of targets.
# NOTE(review): exact encoding behaviour lives in preprocessing.target_encode — not visible here.
X_train_encoded, target_encoder = preprocessing.target_encode(X_train_raw, 'f_6', y_train['y'])
X_valid_encoded = preprocessing.target_encode(X_valid_raw, 'f_6', encoder=target_encoder)
X_test_encoded = preprocessing.target_encode(X_test_raw, 'f_6', encoder=target_encoder)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 17:48:15.382659
结束时间为: 2024-04-24 17:48:18.968245
处理时间为： 0:00:03.585586


对训练特征集做描述性统计

In [6]:
# Summary statistics of the encoded training features with extra tail
# percentiles: a set of lower-tail levels, their mirrored upper-tail levels,
# and the median.
lower_tail = [0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.15, 0.25]
upper_tail = [1 - a for a in lower_tail]
perct = lower_tail + upper_tail + [0.5]
X_train_encoded.describe(percentiles=perct)

Unnamed: 0,date,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18
count,4071914.0,4071914.0,3660015.0,4042132.0,4071914.0,4071914.0,4071914.0,4071914.0,4066998.0,4071601.0,4071601.0,4070653.0,4070657.0,4054628.0,4071914.0,4070834.0,3610427.0,4071914.0,3581510.0,4071914.0
mean,829.9569,1.020225,146.9144,30602.5,4.276906,0.9800039,1.000806,-0.01495404,6.857498,0.04706855,0.05508865,1.005105,1.003287,1.595803,181635800.0,11.55865,0.02370389,0.9986466,0.01188183,1.00054
std,449.2696,0.02576125,3095.289,5126441.0,1.005392,0.02416416,0.03210823,0.01690996,125.3369,0.5903206,3.068755,3.701825,1.72989,85.83799,440933100.0,576.0775,1.181143,0.01575991,4.260839,0.02412891
min,0.0,0.8335404,1.2258,-701.1802,1.835776,0.710559,0.710559,-0.07084265,0.3881,-5.538682,-605.5728,-895.4691,-5.474717,-6956.416,7075.0,0.0185,-17.57517,0.778882,-5.739492,0.7776594
0.5%,8.0,0.9691748,5.698807,-0.8466107,2.433594,0.8998853,0.8999781,-0.07009279,0.7399,-0.1893255,-0.740993,0.8460711,0.9471765,-33.36842,3892780.0,0.1705,-2.469069,0.9303887,-2.031957,0.9201136
1%,17.0,0.9798793,6.6911,-0.7322914,2.555626,0.9,0.9001835,-0.06795425,0.8169,-0.122344,-0.4012339,0.9384646,0.9751634,-15.68607,4992371.0,0.2331,-2.381345,0.9534247,-1.909802,0.9318534
2%,35.0,0.9876216,8.234,-0.5891068,2.690701,0.9003623,0.9168761,-0.03693693,0.9209,-0.075733,-0.217496,0.9782609,0.9905513,-6.435936,6562512.0,0.3201,-2.128362,0.9676041,-1.744538,0.9460505
3%,52.0,0.9913092,9.5502,-0.4992984,2.786553,0.9089711,0.9342875,-0.03662287,1.0009,-0.05359715,-0.1382802,0.9896036,0.995871,-3.81985,7852149.0,0.3963,-1.893937,0.9738966,-1.636496,0.9547905
4%,70.0,0.9936167,10.6564,-0.4358318,2.864518,0.9207833,0.9443499,-0.0360506,1.0679,-0.03860668,-0.0907811,0.9946416,0.9984885,-2.693682,9023634.0,0.4741,-1.685659,0.9778227,-1.547749,0.9606372
5%,87.0,0.9952194,11.5935,-0.3847478,2.930295,0.9298246,0.9511284,-0.03542451,1.1307,-0.028254,-0.064624,0.9973936,0.9999592,-2.009076,10128790.0,0.5383,-1.54423,0.9804496,-1.477782,0.9648291


首先在整个数据集上按1%和99%分位数缩尾，接下来用robust_zscore做标准化，最后剔除f_10、f_11两列，并在截面内部用KNN填充缺失值。

In [7]:
# Time the feature-cleaning pipeline (winsorize -> robust z-score -> KNN imputation).
start_time = datetime.now()
print("开始时间为:", start_time)

# Winsorize: the training call returns the bounds, which are reused unchanged
# for the validation and test sets.
X_train_win, lower_bound, upper_bound = preprocessing.winsorize_X(X_train_encoded)
X_valid_win = preprocessing.winsorize_X(X_valid_encoded, lower_bound, upper_bound)
X_test_win = preprocessing.winsorize_X(X_test_encoded, lower_bound, upper_bound)

# Robust z-score: MEDIANS and MADS come from the training call and are passed
# to the validation and test calls.
X_train_rzscore, MEDIANS, MADS = preprocessing.robust_zscore(X_train_win)
X_valid_rzscore = preprocessing.robust_zscore(X_valid_win, MEDIANS, MADS)
X_test_rzscore = preprocessing.robust_zscore(X_test_win, MEDIANS, MADS)

# Columns 'f_10' and 'f_11' are dropped before KNN imputation; each split is
# imputed by its own call.
# NOTE(review): imputer_KNN internals (e.g. per-date grouping) are not visible here — confirm.
X_train_filled = preprocessing.imputer_KNN(X_train_rzscore.drop(columns=['f_10', 'f_11']))
X_valid_filled = preprocessing.imputer_KNN(X_valid_rzscore.drop(columns=['f_10', 'f_11']))
X_test_filled = preprocessing.imputer_KNN(X_test_rzscore.drop(columns=['f_10', 'f_11']))

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 17:48:22.276462
结束时间为: 2024-04-24 17:59:49.187512
处理时间为： 0:11:26.911050


选择互信息较高的特征

In [11]:
# Time the mutual-information feature selection step.
start_time = datetime.now()
print("开始时间为:", start_time)

# The training call receives the training targets and returns a selector,
# which is then applied to the validation and test features.
X_train_selected, mi_selector = preprocessing.mutual_info_selection(X_train_filled, y_train['y'])
X_valid_selected = preprocessing.mutual_info_selection(X_valid_filled, selector=mi_selector)
X_test_selected = preprocessing.mutual_info_selection(X_test_filled, selector=mi_selector)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 18:00:51.879084
结束时间为: 2024-04-24 18:29:21.122165
处理时间为： 0:28:29.243081


对y做截面zscore

In [8]:
# Time the target standardization step.
start_time = datetime.now()
print("开始时间为:", start_time)

# Standardize y for the training and validation sets independently.
# NOTE(review): presumably a per-date (cross-sectional) z-score, as the
# notebook text says — confirm in preprocessing.zscore_standardization.
y_train_zscore = preprocessing.zscore_standardization(y_train)
y_valid_zscore = preprocessing.zscore_standardization(y_valid)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 17:59:49.203514
结束时间为: 2024-04-24 17:59:53.756935
处理时间为： 0:00:04.553421


构建GRU样本：以同一只股票连续10天、经筛选得到的10个特征作为模型输入，预测窗口最后一天的y值，并将样本保存到磁盘。

In [9]:
def construct_gru_sample(raw_sample_df: pd.DataFrame, window_size: int = 10):
    """Build fixed-length sliding-window samples per stock code.

    For every ``code`` the rows are sorted by ``date`` and every run of
    ``window_size`` consecutive rows is considered. A run is kept only when
    its last date minus its first date equals ``window_size - 1``, i.e. the
    rows cover consecutive dates (assuming one row per code and date).

    Args:
        raw_sample_df: Frame containing at least the columns ``code`` and
            ``date``; ``date`` must support subtraction and comparison with
            an integer.
        window_size: Number of rows per sample; must be at least 1.

    Returns:
        Array of shape ``(n_samples, window_size, n_columns)``. Along the
        last axis ``date`` comes first, followed by the remaining columns in
        their original order. Samples are ordered by code (groupby order) and
        then by date. When no window qualifies, an empty array with shape
        ``(0, window_size, n_columns)`` is returned so that 3-D indexing by
        callers keeps working.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # Same column layout as set_index('date') followed by reset_index().
    ordered_cols = ['date'] + [c for c in raw_sample_df.columns if c != 'date']
    offsets = np.arange(window_size)
    per_code_samples = []

    for _, group in raw_sample_df.groupby('code'):
        sorted_group = group.sort_values(by='date')
        n_rows = len(sorted_group)
        if n_rows < window_size:
            continue

        # span[i] is the date distance covered by the window starting at row i.
        dates = sorted_group['date'].to_numpy()
        span = dates[window_size - 1:] - dates[:n_rows - window_size + 1]
        starts = np.flatnonzero(span == window_size - 1)
        if starts.size == 0:
            continue

        # Gather all qualifying windows of this code in one indexing operation
        # instead of materializing a DataFrame per window.
        values = sorted_group[ordered_cols].to_numpy()
        per_code_samples.append(values[starts[:, None] + offsets])

    if not per_code_samples:
        # Use the dtype a non-empty result would have had.
        empty_dtype = raw_sample_df[ordered_cols].iloc[:0].to_numpy().dtype
        return np.empty((0, window_size, len(ordered_cols)), dtype=empty_dtype)
    return np.concatenate(per_code_samples, axis=0)

In [12]:
# Time the construction of the training and validation window samples.
start_time = datetime.now()
print("开始时间为:", start_time)

# Append the standardized 'y' as the last column of the selected features
# (column-wise concat, aligned on the index), then build windowed samples.
train_samples = construct_gru_sample(pd.concat([X_train_selected, y_train_zscore['y']], axis=1))
valid_samples = construct_gru_sample(pd.concat([X_valid_selected, y_valid_zscore['y']], axis=1))

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 18:29:21.168773
结束时间为: 2024-04-24 18:50:18.641810
处理时间为： 0:20:57.473037


In [13]:
# Persist the training and validation sample arrays so they can be reloaded
# later without repeating the preprocessing.
with open('./data/GRU_百分比_RZ_KNN训练样本.pkl', 'wb') as f:
    pickle.dump(train_samples, f)
with open('./data/GRU_百分比_RZ_KNN验证样本.pkl', 'wb') as f:
    pickle.dump(valid_samples, f)

In [14]:
# Drop the references to the sample arrays (already written to disk above).
del train_samples
del valid_samples

In [15]:
# Time the construction of the test window samples.
start_time = datetime.now()
print("开始时间为:", start_time)

# Unlike train/valid, no 'y' column is appended for the test set.
test_samples = construct_gru_sample(X_test_selected)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 18:54:17.293122
结束时间为: 2024-04-24 19:11:40.406351
处理时间为： 0:17:23.113229


In [16]:
# Persist the test sample array for later reuse.
with open('./data/GRU_百分比_RZ_KNN测试样本.pkl', 'wb') as f:
    pickle.dump(test_samples, f)

In [17]:
# Drop the reference to the test sample array (already written to disk above).
del test_samples

读取数据，准备训练（数据文件已按所用的预处理方法重新命名）

In [2]:
# Reload the training and validation sample arrays saved earlier.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files produced by this notebook or another trusted source.
with open('./data/GRU_百分比_RZ_KNN训练样本.pkl', 'rb') as f:
    train_samples = pickle.load(f)
with open('./data/GRU_百分比_RZ_KNN验证样本.pkl', 'rb') as f:
    valid_samples = pickle.load(f)

In [3]:
# To save memory, build the frame needed for validation now so the very large
# sample array can be deleted early.
# Takes the last time step of every sample and keeps columns 0, 1 and -1,
# labelled 'date', 'code' and 'y'.
GRU_valid_df_ori = pd.DataFrame(valid_samples[:, -1, [0, 1, -1]], columns=['date', 'code', 'y'])

In [4]:
# Targets: last column of the last time step of each sample.
# Features: every time step, columns 2 .. -2 (skips the first two columns and
# the final target column), cast to float.
train_targets = train_samples[:, -1, -1].astype(float)
train_features = train_samples[:, :, 2:-1].astype(float)
valid_targets = valid_samples[:, -1, -1].astype(float)
valid_features = valid_samples[:, :, 2:-1].astype(float)

In [5]:
# The float feature/target arrays have been extracted; release the raw sample arrays.
del train_samples
del valid_samples

In [6]:
# Reload the test sample array saved earlier.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files produced by this notebook or another trusted source.
with open('./data/GRU_百分比_RZ_KNN测试样本.pkl', 'rb') as f:
    test_samples = pickle.load(f)

In [7]:
# To save memory, build the frame needed for the test results now so the very
# large sample array can be deleted early.
# Takes the last time step of every sample and keeps columns 0 and 1,
# labelled 'date' and 'code'.
GRU_test_result_ori = pd.DataFrame(test_samples[:, -1, [0, 1]], columns=['date', 'code'])

In [8]:
# Test features: every time step, all columns from index 2 onward, cast to float.
test_features = test_samples[:, :, 2:].astype(float)

In [9]:
# The float feature array has been extracted; release the raw test sample array.
del test_samples

加载数据

In [11]:
# Dataset class definition
class StockDataset(Dataset):
    """Dataset pairing feature samples with their targets.

    Both inputs are copied into ``torch.float`` tensors at construction time.
    Indexing returns the ``(features, target)`` pair at that position.
    """

    @staticmethod
    def _to_float_tensor(values):
        # Single place that defines the storage dtype of the dataset.
        return torch.tensor(values, dtype=torch.float)

    def __init__(self, features, targets):
        self.features = self._to_float_tensor(features)
        self.targets = self._to_float_tensor(targets)

    def __len__(self):
        # The number of samples is the length of the feature tensor.
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

In [12]:
# Prepare the datasets: wrap the training and validation arrays.
train_dataset = StockDataset(train_features, train_targets)
valid_dataset = StockDataset(valid_features, valid_targets)

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("使用的设备:", device)

使用的设备: cuda


In [14]:
# GRU model definition
class GRUNet(nn.Module):
    """GRU regressor: GRU -> last time step -> BatchNorm1d -> Linear(hidden, 1).

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout rate between stacked GRU layers (to reduce
            overfitting). Ignored when ``num_layers`` is 1, because
            ``nn.GRU`` only applies dropout between layers and warns
            otherwise.

    Input to ``forward`` is ``(batch, seq_len, input_size)`` (the GRU is built
    with ``batch_first=True``); the output is ``(batch, 1)``.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Only pass a non-zero dropout when there is a layer boundary to apply it to.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            dropout=gru_dropout,
            batch_first=True,
        )
        self.bn = nn.BatchNorm1d(hidden_size)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        # With no initial state given, nn.GRU starts from an all-zero hidden
        # state created on the input's device and with the input's dtype.
        out, _ = self.gru(x)
        # Keep only the output of the last time step.
        out = out[:, -1, :]
        # Batch normalization over the hidden features.
        # NOTE: in training mode BatchNorm1d needs more than one sample per batch.
        out = self.bn(out)
        # Final linear projection to a single prediction per sample.
        return self.fc(out)

# Use the IC (Pearson correlation) as the loss function
def pearson_correlation(x, y, eps=1e-12):
    """Return the Pearson correlation coefficient between two tensors.

    Numerator and denominator are both built from the same centered sums, so
    the normalization is consistent and perfectly correlated inputs give
    exactly +/-1 (mixing ``torch.mean`` for the covariance with the
    Bessel-corrected ``torch.std`` would scale the result by ``(N - 1) / N``).

    Args:
        x: Tensor of values; expected to have the same shape as ``y``.
        y: Tensor of values.
        eps: Small constant added under the square root so that constant
            inputs yield 0 instead of NaN and gradients stay finite.

    Returns:
        A 0-dim tensor holding the correlation.
    """
    x_centered = x - torch.mean(x)
    y_centered = y - torch.mean(y)
    covariance = torch.sum(x_centered * y_centered)
    x_sum_sq = torch.sum(x_centered ** 2)
    y_sum_sq = torch.sum(y_centered ** 2)
    return covariance / torch.sqrt(x_sum_sq * y_sum_sq + eps)

class PearsonLoss(nn.Module):
    """Loss module returning the negated Pearson correlation of its inputs.

    Minimizing this loss maximizes the correlation between ``x`` and ``y``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, y):
        correlation = pearson_correlation(x, y)
        return -correlation

定义模型时，MSE和IC两种损失函数均有尝试

In [38]:
# Define the model, loss function and optimizer
# NOTE(review): input_size presumably has to equal the last dimension of train_features — confirm.
input_size = 10  # number of features
hidden_size = 64  # hidden layer size
num_layers = 2  # number of GRU layers
model = GRUNet(input_size, hidden_size, num_layers).to(device)
criterion = PearsonLoss()
# Alternative loss that was also tried:
#criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training hyper-parameters
num_epochs = 5 # 10 epochs were also tried but did not effectively improve the validation IC
batch_size = 256
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [39]:
# Time the whole training run.
start_time = datetime.now()
print("开始时间为:", start_time)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of the batch losses for this epoch
    total_batches = 0  # number of batches that contributed to total_loss
    for features, target in train_loader:
        # A single-sample batch (possible as the last batch) cannot be used:
        # BatchNorm1d rejects it in training mode and a correlation over one
        # point is undefined.
        if features.size(0) < 2:
            continue
        features, target = features.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        # squeeze(-1) removes only the trailing output dimension, so the
        # prediction always stays a 1-D tensor matching `target`.
        loss = criterion(outputs.squeeze(-1), target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # accumulate the loss of the current batch
        total_batches += 1  # count the batch

    # Mean batch loss of the epoch; max() guards against an epoch with no usable batch.
    epoch_loss = total_loss / max(total_batches, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 22:40:56.773534
Epoch [1/5], Loss: -0.0990
Epoch [2/5], Loss: -0.1234
Epoch [3/5], Loss: -0.1357
Epoch [4/5], Loss: -0.1435
Epoch [5/5], Loss: -0.1496
结束时间为: 2024-04-24 22:48:53.830518
处理时间为： 0:07:57.056984


In [40]:
# Time the validation pass.
start_time = datetime.now()
print("开始时间为:", start_time)

# Evaluate the model
# shuffle=False keeps predictions in dataset order so they can be attached
# to GRU_valid_df_ori by position below.
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
model.eval()
predictions = []

with torch.no_grad():
    total_loss = 0
    for features, target in valid_loader:
        features, target = features.to(device), target.to(device)
        outputs = model(features)
        predictions.append(outputs)
        # NOTE(review): squeeze() with no dim would collapse a one-sample
        # batch to a 0-dim tensor — check the size of the last batch.
        total_loss += criterion(outputs.squeeze(), target).item()

    # Reported value is the mean of the per-batch losses.
    print(f'Average Valid Loss: {total_loss / len(valid_loader):.4f}')

# Join all batch outputs, flatten to 1-D and move to the CPU as a Series.
predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
# Work on a copy so the original frame stays reusable; the assignment aligns
# the Series with the frame on the index.
GRU_valid_df = GRU_valid_df_ori.copy()
GRU_valid_df['y_pred'] = predictions_series
# NOTE(review): presumably returns a per-date RankIC frame — confirm in model_function.get_rankic.
GRU_rankic = model_function.get_rankic(GRU_valid_df)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 22:48:53.872520
Average Valid Loss: -0.0809
rankic均值为： 0.07884547028294218
结束时间为: 2024-04-24 22:49:07.132527
处理时间为： 0:00:13.260007


In [43]:
# Display the RankIC result computed on the validation set.
GRU_rankic

Unnamed: 0,date,RankIC
0,1541,0.013360
1,1542,0.051285
2,1543,0.103983
3,1544,0.140908
4,1545,0.166584
...,...,...
156,1697,0.066038
157,1698,0.049002
158,1699,0.004933
159,1700,-0.007969


In [44]:
# Ratio of the mean RankIC to its standard deviation (stability of the RankIC).
GRU_rankic['RankIC'].mean() / GRU_rankic['RankIC'].std()

1.1043523536805642

In [45]:
class TestDataset(Dataset):
    """Feature-only dataset for inference (no targets).

    The features are copied into a ``torch.float`` tensor at construction
    time; indexing returns the feature tensor at that position.
    """

    def __init__(self, features):
        float_features = torch.tensor(features, dtype=torch.float)
        self.features = float_features

    def __len__(self):
        n_samples = len(self.features)
        return n_samples

    def __getitem__(self, idx):
        sample = self.features[idx]
        return sample

In [46]:
# Prepare the test dataset
test_dataset = TestDataset(test_features)

# Create the test data loader; shuffle=False keeps predictions in dataset order.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [47]:
# Time the test-set inference.
start_time = datetime.now()
print("开始时间为:", start_time)

# Run the predictions
# Set evaluation mode explicitly so dropout is disabled and BatchNorm uses its
# running statistics, even when this cell is run directly after training
# (which leaves the model in training mode).
model.eval()
predictions = []
with torch.no_grad():
    for features in test_loader:
        features = features.to(device)
        outputs = model(features)
        predictions.append(outputs)

# Concatenate all batch predictions into one tensor, flatten it and move it to the CPU.
predictions = torch.cat(predictions, dim=0)
predictions_series = pd.Series(predictions.reshape(-1).cpu().numpy())
# Work on a copy so the original frame stays reusable; the assignment aligns
# the Series with the frame on the index.
GRU_test_result = GRU_test_result_ori.copy()
GRU_test_result['y_pred'] = predictions_series

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-24 22:51:42.014082
结束时间为: 2024-04-24 22:52:54.006391
处理时间为： 0:01:11.992309


In [48]:
# Display the test-set predictions ('date', 'code', 'y_pred').
GRU_test_result

Unnamed: 0,date,code,y_pred
0,1711,s_0,-0.220892
1,1712,s_0,-0.989944
2,1713,s_0,-1.382774
3,1714,s_0,-1.603202
4,1715,s_0,-0.425170
...,...,...,...
3828250,2598,s_999,0.129701
3828251,2599,s_999,0.048975
3828252,2600,s_999,0.105453
3828253,2601,s_999,0.119549


In [49]:
# Persist the trained model object and the test-set prediction frame.
# NOTE(review): pickling the whole nn.Module ties the file to this notebook's
# GRUNet class definition; saving model.state_dict() is the more portable
# option — confirm what downstream loaders expect before changing it.
with open('./data/GRU_百分比_RZ_KNN_截面zscore的y.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('./data/GRU_百分比_RZ_KNN_截面zscore的y_test_result.pkl', 'wb') as f:
    pickle.dump(GRU_test_result, f)