In [1]:
import pandas as pd
import numpy as np
import feather
import pickle

模型题：高频交易里，成交量预估是重要的一个环节。已知五分钟样例数据格式如下：

In [2]:
task = pd.read_csv('task.csv')
task.head()

Unnamed: 0,Date,Time,Instrument,close,volume,industry
0,20230421,09:35:00,000001.SZ,12.66,4429000.0,480000
1,20230421,09:35:00,000002.SZ,15.61,11810483.0,430000
2,20230421,09:35:00,000063.SZ,38.450001,25530656.0,730000
3,20230421,09:35:00,000069.SZ,5.09,7344113.0,430000
4,20230421,09:35:00,000100.SZ,4.33,9927899.0,270000


其中 Date 是日期，Time 表示当前分钟末尾，即认为其中的 close 和 volume 是在 Time 那一
瞬间可以获取到的。
Close 表示 Time 时刻的价格，Volume 表示 Time 时刻结尾的前面五分钟
成交量总和。
Instrument 是个股标记，Industry 是某个行业标记，标记同类的股票可能会有
一定关系。

任务目标：预测未来五分钟的 volume，从 Time = 9:40:00 开始预测（你可以用当天第一个
bar 的数据了）

任务提示：

1、不能隐含使用未来数据（如 pandas 直接 sum all 把未来数据加起来了）

2、每个股票 volume 都不一样，而且漂移严重，你需要找合适的替代指标来做模型（比如 current
vol / history n bar volume sum）。

3、你的 benchmark 就是过去一段时间的均值。

4、或许同行业数据可以使用到个票预测上。

任务要求：代码需要有一定结构，包括数据预处理、简单因子/特征的构建、模型 train、valid、
test 划分、最终对比评价。Train 用来训练、valid 验证/早停、test 是模型外验证。根据不同
模型构建评价指标。灵活处理

In [3]:
data = task.copy()
df_mul = data.set_index(['Instrument', 'Date','Time'], drop=True)
data = df_mul.sort_index(level='Instrument')
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,close,volume,industry
Instrument,Date,Time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000001.SZ,20230421,09:35:00,12.66,4429000.0,480000
000001.SZ,20230421,09:40:00,12.67,2707900.0,480000
000001.SZ,20230421,09:45:00,12.71,1866802.0,480000
000001.SZ,20230421,09:50:00,12.76,2235400.0,480000
000001.SZ,20230421,09:55:00,12.79,4146100.0,480000


#### 简单因子特征的构建

In [4]:
# 提取时间相关特征
data['Hour'] = np.array(pd.to_datetime(data.reset_index()['Time']).dt.hour)
data['Minute'] = np.array(pd.to_datetime(data.reset_index()['Time']).dt.minute)

In [5]:
# 计算行业平均成交量
industry_avg_volume = data.groupby(['industry', 'Date', 'Time'])['volume'].mean().reset_index()
industry_avg_volume.rename(columns={'volume': 'IndustryAvgVolume'}, inplace=True)
# 使用 join 方法将计算出的平均成交量数据合并到原始的 DataFrame 中
data = data.join(industry_avg_volume.set_index(['industry','Date', 'Time']), on=['industry','Date', 'Time'])

In [6]:
# 计算行业成交量波动
industry_std_volume = data.groupby(['industry', 'Date', 'Time'])['volume'].std().reset_index()
industry_std_volume.rename(columns={'volume': 'IndustryStdVolume'}, inplace=True)
# 使用 join 方法将计算出的平均成交量数据合并到原始的 DataFrame 中
data = data.join(industry_std_volume.set_index(['industry','Date', 'Time']), on=['industry','Date', 'Time'])

In [7]:
# 计算行业均价
industry_avg_close = data.groupby(['industry', 'Date', 'Time'])['close'].mean().reset_index()
industry_avg_close.rename(columns={'close': 'IndustryAvgClose'}, inplace=True)
data = data.join(industry_avg_close.set_index(['industry','Date', 'Time']), on=['industry','Date', 'Time'])

In [8]:
# 计算行业波动率
industry_volatility = data.groupby(['industry', 'Date', 'Time'])['close'].std().reset_index()
industry_volatility.rename(columns={'close': 'IndustryVolatility'}, inplace=True)
data = data.join(industry_volatility.set_index(['industry','Date', 'Time']), on=['industry','Date', 'Time'])

#### 生成过去30天的时间序列

In [9]:
# 生成新的列
for i in range(29-1,-1,-1):
    data[f'past_close_{i+1}'] = data.groupby('Instrument')['close'].shift(i+1)/data['close']
    data[f'past_volume_{i+1}'] = data.groupby('Instrument')['volume'].shift(i+1)/data['volume']
    data[f'past_Hour_{i+1}'] = data.groupby('Instrument')['Hour'].shift(i+1)
    data[f'past_Minute_{i+1}'] = data.groupby('Instrument')['Minute'].shift(i+1)
    data[f'past_IAV_{i+1}'] = data.groupby('Instrument')['IndustryAvgVolume'].shift(i+1)/data['volume']
    data[f'past_ISV_{i+1}'] = data.groupby('Instrument')['IndustryStdVolume'].shift(i+1)/data['volume']
    data[f'past_IAC_{i+1}'] = data.groupby('Instrument')['IndustryAvgClose'].shift(i+1)/data['close']
    data[f'past_ISC_{i+1}'] = data.groupby('Instrument')['IndustryVolatility'].shift(i+1)/data['close']


  data[f'past_IAV_{i+1}'] = data.groupby('Instrument')['IndustryAvgVolume'].shift(i+1)/data['volume']
  data[f'past_ISV_{i+1}'] = data.groupby('Instrument')['IndustryStdVolume'].shift(i+1)/data['volume']
  data[f'past_IAC_{i+1}'] = data.groupby('Instrument')['IndustryAvgClose'].shift(i+1)/data['close']
  data[f'past_ISC_{i+1}'] = data.groupby('Instrument')['IndustryVolatility'].shift(i+1)/data['close']
  data[f'past_close_{i+1}'] = data.groupby('Instrument')['close'].shift(i+1)/data['close']
  data[f'past_volume_{i+1}'] = data.groupby('Instrument')['volume'].shift(i+1)/data['volume']
  data[f'past_Hour_{i+1}'] = data.groupby('Instrument')['Hour'].shift(i+1)
  data[f'past_Minute_{i+1}'] = data.groupby('Instrument')['Minute'].shift(i+1)
  data[f'past_IAV_{i+1}'] = data.groupby('Instrument')['IndustryAvgVolume'].shift(i+1)/data['volume']
  data[f'past_ISV_{i+1}'] = data.groupby('Instrument')['IndustryStdVolume'].shift(i+1)/data['volume']
  data[f'past_IAC_{i+1}'] = data.groupby('Instrumen

#### 生成label

In [10]:
# 生成label
data[f'volume_{1}'] = data.groupby('Instrument')['volume'].shift(-1)
data['volume_label1'] = data[f'volume_{1}']/data['volume']

  data[f'volume_{1}'] = data.groupby('Instrument')['volume'].shift(-1)
  data['volume_label1'] = data[f'volume_{1}']/data['volume']


In [11]:
data.dropna().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,close,volume,industry,Hour,Minute,IndustryAvgVolume,IndustryStdVolume,IndustryAvgClose,IndustryVolatility,past_close_29,...,past_close_1,past_volume_1,past_Hour_1,past_Minute_1,past_IAV_1,past_ISV_1,past_IAC_1,past_ISC_1,volume_1,volume_label1
Instrument,Date,Time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
000001.SZ,20230421,13:30:00,12.61,1065100.0,480000,13,30,1191376.0,1011870.0,9.558095,8.156538,1.003965,...,1.000793,1.137734,13.0,25.0,1.35318,1.399048,0.757902,0.64703,1953600.0,1.834194
000001.SZ,20230421,13:35:00,12.59,1953600.0,480000,13,35,1584959.0,1605637.0,9.539524,8.134215,1.006354,...,1.001589,0.545199,13.0,30.0,0.609836,0.517952,0.759181,0.647858,1634972.0,0.836902
000001.SZ,20230421,13:40:00,12.59,1634972.0,480000,13,40,1984549.0,1994986.0,9.536667,8.134369,1.009531,...,1.0,1.194883,13.0,35.0,0.96941,0.982058,0.757706,0.646085,1555236.0,0.951231
000001.SZ,20230421,13:45:00,12.58,1555236.0,480000,13,45,2306005.0,2240732.0,9.520476,8.109616,1.014308,...,1.000795,1.051269,13.0,40.0,1.276043,1.282754,0.758082,0.646611,1044700.0,0.671731
000001.SZ,20230421,13:50:00,12.6,1044700.0,480000,13,50,1443919.0,1454847.0,9.525238,8.128511,1.015079,...,0.998413,1.488691,13.0,45.0,2.207337,2.144857,0.755593,0.64362,742200.0,0.710443


归一化原始数值

In [12]:
data['IndustryAvgVolume'] = data['IndustryAvgVolume']/data['volume']
data['IndustryStdVolume'] = data['IndustryStdVolume']/data['volume']
data['IndustryAvgClose'] = data['IndustryAvgClose'] /data['close']
data['IndustryVolatility'] = data['IndustryVolatility']/data['close']
# data['volume'] = 1.
# data['close'] = 1.

In [13]:
data.dropna().reset_index().drop('industry', axis=1).to_feather('task_8_30_IDT.feather') #.reset_index(drop=True)

In [15]:
# df_train_new = feather.read_dataframe('task_8_30_IDT.feather')
# df_train_new

### 计算benchmark

In [16]:
# df_train_new = feather.read_dataframe('task_30.feather')
df_train_new = task.copy()

df_mul = df_train_new.set_index(['Instrument', 'Date','Time'], drop=True)
df_mul

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,close,volume,industry
Instrument,Date,Time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000001.SZ,20230421,09:35:00,12.660000,4429000.0,480000
000002.SZ,20230421,09:35:00,15.610000,11810483.0,430000
000063.SZ,20230421,09:35:00,38.450001,25530656.0,730000
000069.SZ,20230421,09:35:00,5.090000,7344113.0,430000
000100.SZ,20230421,09:35:00,4.330000,9927899.0,270000
...,...,...,...,...,...
688363.SH,20230803,15:00:00,95.790001,39556.0,770000
688396.SH,20230803,15:00:00,55.720001,65426.0,270000
688561.SH,20230803,15:00:00,51.070000,31368.0,710000
688599.SH,20230803,15:00:00,37.259998,123220.0,630000


In [17]:
total_time = df_train_new['Date'].drop_duplicates().to_list() # 70天
total_time_len = len(total_time)

train_end_index = int(total_time_len*0.8) # 56 #
valid_end_index = int(total_time_len*0.9) # 63 #

In [18]:
total_time[0],total_time[train_end_index],total_time[valid_end_index],total_time[-1]

(20230421, 20230717, 20230726, 20230803)

In [19]:
# df_mul = data.set_index(['Instrument', 'Date','Time'], drop=True)
data = df_mul.sort_index(level='Instrument')

In [20]:
# 用过去30天均值预测
data['rolling_volume30_mean']= np.array(data.groupby('Instrument')['volume'].rolling(30).mean())
data['volume30_predict'] = data['rolling_volume30_mean']/data['volume']
# 生成label
data[f'volume_{1}'] = data.groupby('Instrument')['volume'].shift(-1)
data['volume_label1'] = data[f'volume_{1}']/data['volume']

In [21]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,close,volume,industry,rolling_volume30_mean,volume30_predict,volume_1,volume_label1
Instrument,Date,Time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
000001.SZ,20230421,09:35:00,12.660000,4429000.0,480000,,,2707900.0,0.611402
000001.SZ,20230421,09:40:00,12.670000,2707900.0,480000,,,1866802.0,0.689391
000001.SZ,20230421,09:45:00,12.710000,1866802.0,480000,,,2235400.0,1.197449
000001.SZ,20230421,09:50:00,12.760000,2235400.0,480000,,,4146100.0,1.854746
000001.SZ,20230421,09:55:00,12.790000,4146100.0,480000,,,3209570.0,0.774118
...,...,...,...,...,...,...,...,...,...
688981.SH,20230803,14:40:00,50.990002,250869.0,270000,190005.700000,0.757390,211027.0,0.841184
688981.SH,20230803,14:45:00,50.910000,211027.0,270000,193382.933333,0.916390,306030.0,1.450194
688981.SH,20230803,14:50:00,50.990002,306030.0,270000,198559.333333,0.648823,295822.0,0.966644
688981.SH,20230803,14:55:00,50.950001,295822.0,270000,203113.666667,0.686608,196869.0,0.665498


In [22]:
datadrop = data.dropna().reset_index().set_index([ 'Date','Time', 'Instrument'], drop=True)
# datadrop = datadrop.sort_index(level=['Date','Time'])

In [23]:
df_test = datadrop.loc[total_time[valid_end_index:]]

In [24]:
df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,close,volume,industry,rolling_volume30_mean,volume30_predict,volume_1,volume_label1
Date,Time,Instrument,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
20230726,09:35:00,000001.SZ,11.690000,6364800.0,480000,3.024844e+06,0.475246,4063180.0,0.638383
20230726,09:40:00,000001.SZ,11.720000,4063180.0,480000,3.062547e+06,0.753732,3260720.0,0.802504
20230726,09:45:00,000001.SZ,11.660000,3260720.0,480000,3.151011e+06,0.966354,2042600.0,0.626426
20230726,09:50:00,000001.SZ,11.670000,2042600.0,480000,3.161531e+06,1.547797,3007900.0,1.472584
20230726,09:55:00,000001.SZ,11.660000,3007900.0,480000,3.196261e+06,1.062622,1525300.0,0.507098
...,...,...,...,...,...,...,...,...,...
20230803,14:35:00,688981.SH,50.990002,194935.0,270000,1.859326e+05,0.953818,250869.0,1.286937
20230803,14:40:00,688981.SH,50.990002,250869.0,270000,1.900057e+05,0.757390,211027.0,0.841184
20230803,14:45:00,688981.SH,50.910000,211027.0,270000,1.933829e+05,0.916390,306030.0,1.450194
20230803,14:50:00,688981.SH,50.990002,306030.0,270000,1.985593e+05,0.648823,295822.0,0.966644


In [None]:
# dataCSnorm = datadrop.groupby(level=['Date','Time'],group_keys=False).apply(lambda x:(x-x.mean() )/x.std() )
# dataCSnorm

In [25]:
datanorm = df_test.apply(lambda x:(x-x.mean() )/x.std() )   # (df-df.mean())/df.std()

MSE

In [28]:
datanorm.groupby(level=['Date','Time']).apply(lambda x: np.mean(x['volume30_predict']-x['volume_label1'])**2).mean()

0.4228578895900704

RankIC

In [29]:
datanorm.groupby(level=['Date','Time']).apply(lambda x: x['volume30_predict'].corr(x['volume_label1'], method='spearman')).mean()

0.4694772821724271

IC

In [30]:
datanorm.groupby(level=['Date','Time']).apply(lambda x: x['volume30_predict'].corr(x['volume_label1'])).mean()

0.40639092862444837

### GRU训练结果计算

In [16]:
test_pred5=pickle.load(open('output/all_GRU_feat8_new/pred.pkl.test0','rb'))
# test_pred5

RankIC

In [17]:
# test_norm = test_pred5.apply(lambda x:(x-x.mean() )/x.std() )
test_pred5.groupby(level=['Date','Time']).apply(lambda x: x['score'].corr(x['label'], method='spearman')).mean()

0.47352213433465157

IC

In [18]:
test_pred5.groupby(level=['Date','Time']).apply(lambda x: x['score'].corr(x['label'])).mean()

0.41797989090879084

MSE

In [19]:
test_pred5.groupby(level=['Date','Time']).apply(lambda x: np.mean(x['score']-x['label'])**2).mean()

0.02216416806877501