In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time
import glob
import warnings
warnings.filterwarnings('ignore')


#data
path = "./data/data_test"
all_files = glob.glob(path + "/*.csv")   #把所有的数据文件文件名读取在一起

li = []


for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)


frame = pd.concat(li, axis=0, ignore_index=True)
result_df = frame.sort_values(by=['date', 'code'], ascending=True)
result_df['vwap'] = result_df['money'] / result_df['volume']
result_df['target'] = result_df.groupby('code')['1vwap_pct'].shift(-1).values
df = result_df.reset_index(drop=True)
df = df.fillna(0)
# df_train = df[(df['date'] <= '2022-04-30')]
# df_test = df[(df['date'] <= '2022-12-29') & (df['date'] >= '2022-05-01')]
df_train = df[(df['date'] >= '2022-11-31') & (df['date'] <= '2022-12-22')]
df_val = df[(df['date'] <= '2022-12-25') & (df['date'] >= '2022-12-23')]
df_test = df[(df['date'] <= '2022-12-29') & (df['date'] >= '2022-12-26')]

X_train = df_train[['open', 'close', 'low', 'high']].values
y_train = df_train[['target']].values
X_val = df_val[['open', 'close', 'low', 'high']].values
y_val = df_val[['target']].values
X_test = df_test[['open', 'close', 'low', 'high']].values
y_test = df_test[['target']].values

In [2]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn


# 定义基础模型
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# 训练基础模型
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

# 得到基础模型在验证集上的预测结果
rf_pred = rf_model.predict(X_val)
gb_pred = gb_model.predict(X_val)

# 构造元特征
meta_features = np.column_stack((rf_pred, gb_pred))

# 定义元模型
class MetaModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MetaModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.activation(x)
        x = self.fc2(x)
        return x

# 将元特征转换为PyTorch张量
X_meta_tensor = torch.tensor(meta_features, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

# 定义元模型的输入维度和隐藏层维度
input_dim = meta_features.shape[1]
hidden_dim = 8

# 初始化元模型
meta_model = MetaModel(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=1)

# 定义损失函数和优化器
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(meta_model.parameters(), lr=0.001)

# 训练元模型
for epoch in range(100):
    optimizer.zero_grad()
    y_pred = meta_model(X_meta_tensor)
    loss = criterion(y_pred.squeeze(), y_val_tensor)  # 去掉无关的维度
    loss.backward()
    optimizer.step()

# 测试集上的预测
rf_test_pred = rf_model.predict(X_test)
gb_test_pred = gb_model.predict(X_test)
meta_features_test = np.column_stack((rf_test_pred, gb_test_pred))
X_meta_test_tensor = torch.tensor(meta_features_test, dtype=torch.float32)
y_test_pred = meta_model(X_meta_test_tensor).detach().numpy()

# 计算测试集上的均方误差
mse_test = mean_squared_error(y_test, y_test_pred)
print("Test MSE:", mse_test)


Test MSE: 7.71455662725422


In [4]:
import numpy as np
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error


# 初始化三个模型
xgb_model = XGBRegressor()
catboost_model = CatBoostRegressor()
bayesian_ridge_model = BayesianRidge()

# 创建投票回归器
voting_regressor = VotingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('catboost', catboost_model),
        ('bayesian_ridge', bayesian_ridge_model)
    ],
    weights=[1, 1, 1]  # 可以根据模型性能调整权重
)

# 训练投票回归器
voting_regressor.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = voting_regressor.predict(X_test)

# 评估模型性能
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')



Learning rate set to 0.0815
0:	learn: 2.5697972	total: 57.1ms	remaining: 57.1s
1:	learn: 2.5696507	total: 63.8ms	remaining: 31.8s
2:	learn: 2.5695941	total: 68.8ms	remaining: 22.9s
3:	learn: 2.5695437	total: 73.3ms	remaining: 18.3s
4:	learn: 2.5694578	total: 78.5ms	remaining: 15.6s
5:	learn: 2.5693859	total: 83.6ms	remaining: 13.8s
6:	learn: 2.5692315	total: 88.4ms	remaining: 12.5s
7:	learn: 2.5691918	total: 93.2ms	remaining: 11.6s
8:	learn: 2.5690935	total: 97.9ms	remaining: 10.8s
9:	learn: 2.5689560	total: 103ms	remaining: 10.2s
10:	learn: 2.5688614	total: 107ms	remaining: 9.66s
11:	learn: 2.5687732	total: 112ms	remaining: 9.22s
12:	learn: 2.5686027	total: 117ms	remaining: 8.89s
13:	learn: 2.5684551	total: 122ms	remaining: 8.57s
14:	learn: 2.5681949	total: 127ms	remaining: 8.34s
15:	learn: 2.5681026	total: 132ms	remaining: 8.11s
16:	learn: 2.5679960	total: 137ms	remaining: 7.89s
17:	learn: 2.5677970	total: 141ms	remaining: 7.67s
18:	learn: 2.5676401	total: 145ms	remaining: 7.49s
19:	

In [5]:
from sklearn.model_selection import train_test_split
df = df.set_index('date')
insample_data = df['2020-01-01':'2022-06-30']
outsample_data = df['2022-07-01':'2022-11-30']
train_date, test_date = train_test_split(insample_data.index.unique(), test_size=0.1)
train_data = insample_data.loc[train_date]
test_data = insample_data.loc[test_date]

In [7]:
import lightgbm as lgb
from itertools import product

param_grid = {
    'subsample': [0.6, 0.7],
    'max_depth': [3, 4]
}

search_space = list(map(lambda x: dict(zip(param_grid.keys(), x)), list(product(*param_grid.values()))))

train_X = train_data.iloc[:, 1:-1]
train_y = train_data.iloc[:, -1]

test_X = test_data.iloc[:, 1:-1]
test_y = test_data.iloc[:, -1]

test_ic_data = test_data[['code', 'target']]

def cross_ic(y_true, y_pred):
    true_y = test_ic_data.copy()
    true_y['target'] = y_true
    true_y = true_y.reset_index().pivot(index='date', columns='code', values='target')
    pred_y = test_ic_data.copy()
    pred_y['target'] = y_pred
    pred_y = pred_y.reset_index().pivot(index='date', columns='code', values='target')
    return true_y.corrwith(pred_y).mean()

best_params = None
best_estimator = None
best_ic = 0

for param in search_space:
    gbdt_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'regression',
                    'metric': {'l2', 'l1'},
                    'num_leaves': 31,
                    'learning_rate': 0.1,
                    'feature_fraction': 0.9,
                    'bagging_fraction':param['subsample'],
                    'bagging_freq': 5,
                    'max_depth':param['max_depth'],
                    'verbose': 1
                    }   
    lgb_train = lgb.Dataset(train_X, train_y)
    gbm = lgb.train(gbdt_params,
                lgb_train,
                valid_sets=lgb_train,
                num_boost_round=50)

# 预测
    y_pred = gbm.predict(test_X, num_iteration=gbm.best_iteration)
    ic = cross_ic(test_y, y_pred)
    print(f"{param}-ic: {ic}")
    if abs(ic) > best_ic:
        best_ic = ic
        best_params = param
        best_estimator = gbm
        
print("最佳参数组合:", best_params)
print("最佳模型:", best_estimator)
print("最佳IC:", best_ic)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 2291663, number of used features: 9
[LightGBM] [Info] Start training from score 0.014439
{'subsample': 0.6, 'max_depth': 3}-ic: 0.054946482088115874
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057411 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 2291663, number of used features: 9
[LightGBM] [Info] Start training from score 0.014439
{'subsample': 0.6, 'max_depth': 4}-ic: 0.056717482896612964
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012736 seconds.
You can set `force_row_wis

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time
import glob
import warnings
warnings.filterwarnings('ignore')


#data
path = "./data/factor_vp"
all_files = glob.glob(path + "/*.csv")   #把所有的数据文件文件名读取在一起
li = []
#

for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)


frame = pd.concat(li, axis=0, ignore_index=True)
path1 = "./data/data_test"
all_files1 = glob.glob(path1 + "/*.csv")

li1 = []
for filename in tqdm(all_files1):
    df1 = pd.read_csv(filename)  # 读取CSV文件
    # 提取需要的列
    df_subset  = df1[['date', 'code', '1vwap_pct']]
    li1.append(df_subset)

# 合并数据到 factor_vp
frame1 = pd.concat(li1, axis=0, ignore_index=True)
# 将frame1中的1vwap_pct列拼接到frame中
#frame = pd.merge(frame, frame1, on=['date', 'code'], how='left')
frame['1vwap_pct']= frame1['1vwap_pct']
# 删去frame中日期大于2022-12-31的数据
frame = frame[frame['date'] <= '2022-12-31']
path2 = "./data/UTR_final2"
all_files2 = glob.glob(path2 + "/*.csv")
li2 = []
for filename in tqdm(all_files2):
    df2 = pd.read_csv(filename)  # 读取CSV文件
    df_subset2 = df2[['code', 'UTR_resid']]
    li2.append(df_subset2)
frame2 = pd.concat(li2, axis=0, ignore_index=True)

frame['UTR_resid'] = frame2['UTR_resid']
path3 = "./data/data_barra"
all_files3 = glob.glob(path3 + "/*.csv")
li3 = []
for filename in tqdm(all_files3):
    df3 = pd.read_csv(filename)  # 读取CSV文件
    df_subset3 = df3[['code', 'beta','momentum','residual_volatility','non_linear_size','book_to_price_ratio','liquidity','earnings_yield','growth','leverage']]
    li3.append(df_subset3)
frame3 = pd.concat(li3, axis=0, ignore_index=True)
frame['beta'] = frame3['beta']
frame['momentum'] = frame3['momentum']
frame['residual_volatility'] = frame3['residual_volatility']
frame['non_linear_size'] = frame3['non_linear_size']
frame['book_to_price_ratio'] = frame3['book_to_price_ratio']
frame['liquidity'] = frame3['liquidity']
frame['earnings_yield'] = frame3['earnings_yield']
frame['growth'] = frame3['growth']
frame['leverage'] = frame3['leverage']
result_df = frame.sort_values(by=['date', 'code'], ascending=True)
#result_df['vwap'] = result_df['money'] / result_df['volume']
result_df['target'] = result_df.groupby('code')['1vwap_pct'].shift(-1).values
df = result_df.reset_index(drop=True)
df = df.fillna(0)
df_train = df[(df['date'] <= '2022-02-01')]
df_val = df[(df['date'] > '2022-02-01') & (df['date']<='2022-04-30')]
df_test = df[(df['date'] <= '2022-12-29') & (df['date'] >= '2022-05-01')]
#df_train = df[(df['date'] >= '2022-11-31') & (df['date'] <= '2022-12-22')]
#df_val = df[(df['date'] <= '2022-12-25') & (df['date'] >= '2022-12-23')]
#df_test = df[(df['date'] <= '2022-12-29') & (df['date'] >= '2022-12-26')]
X_train = df_train[['boll_down','boll_up','BIAS1M','ROC1M','day_VPT','close_std_1M','vol_std_6M','vol_me_6M','Amihud','UTR_resid','beta','momentum','residual_volatility','non_linear_size','book_to_price_ratio','liquidity','earnings_yield','growth','leverage']].values
y_train = df_train[['target']].values
X_val = df_val[['boll_down','boll_up','BIAS1M','ROC1M','day_VPT','close_std_1M','vol_std_6M','vol_me_6M','Amihud','UTR_resid','beta','momentum','residual_volatility','non_linear_size','book_to_price_ratio','liquidity','earnings_yield','growth','leverage']].values
y_val = df_val[['target']].values
X_test = df_test[['boll_down','boll_up','BIAS1M','ROC1M','day_VPT','close_std_1M','vol_std_6M','vol_me_6M','Amihud','UTR_resid','beta','momentum','residual_volatility','non_linear_size','book_to_price_ratio','liquidity','earnings_yield','growth','leverage']].values
y_test = df_test[['target']].values