In [1]:
%reload_ext autoreload
%autoreload 2
import os
import shutil
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import torch
import onnx
import onnxruntime
import multiprocessing
from model import mlp

In [2]:
# --- Input locations ---------------------------------------------------------
# Directory of parquet files; later cells read '<data_path>/<date>.parquet'.
data_path = '/data/local_data/shared/102/intern_data_yzhou/zy4_parquet'
# Pickle with the per-feature 'mean' / 'std' columns used for standardisation.
mean_std_path = '/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/data_processed/mean_std_all.pkl'

# Keep only the columns whose name contains 'x_', and drop anything
# containing 'x_1663'.
feas = [
    name
    for name in pd.read_pickle('/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/feas353.pkl')
    if 'x_' in name and 'x_1663' not in name
]
fea_nums = len(feas)
print(fea_nums)

# --- Output / checkpoint locations -------------------------------------------
# Where the exported ONNX models (and their side files) are written.
onnx_save_path = '/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/nn_models/mlp_baseline'
# Root directory holding one sub-directory per trained model.
path = '/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/nn_results/mlp_baseline'
md_list = [f'mlp_{i}' for i in range(5)]
model_list = [os.path.join(path, md_name) for md_name in md_list]
model_path_list = [os.path.join(md_dir, 'best.pth') for md_dir in model_list]
print(model_path_list)

351
['/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/nn_results/mlp_baseline/mlp_0/best.pth', '/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/nn_results/mlp_baseline/mlp_1/best.pth', '/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/nn_results/mlp_baseline/mlp_2/best.pth', '/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/nn_results/mlp_baseline/mlp_3/best.pth', '/mnt/beegfs/strategy_intern/zzdai_intern/for_wpxu/nn_results/mlp_baseline/mlp_4/best.pth']


In [3]:
# Create the export directory. exist_ok=True replaces the check-then-create
# pattern, which can fail if the directory appears between the two calls.
os.makedirs(onnx_save_path, exist_ok=True)

# Store the normalisation statistics and the feature list next to the models
# so the export directory contains everything the inference cells read.
try:
    shutil.copyfile(mean_std_path, os.path.join(onnx_save_path, 'mean_std_all.pkl'))
except shutil.SameFileError:
    # `mean_std_path` already points at the exported copy (this happens when
    # the cell is re-run after a later cell rebinds the name) - nothing to copy.
    pass
pd.to_pickle(feas, os.path.join(onnx_save_path, 'feas.pkl'))

n_features = len(feas)
for md_name, md_path in zip(md_list, model_path_list):
    base_name = md_name.split('/')[-1]
    # Keep a copy of the raw checkpoint alongside its ONNX export.
    shutil.copyfile(md_path, os.path.join(onnx_save_path, base_name + '.pth'))
    onnx_path = os.path.join(onnx_save_path, base_name + '.onnx')

    model_i = mlp(n_features)
    # If the network has batchnorm/dropout layers it MUST be in eval mode
    # before it is exported.
    model_i.eval()
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    ckpt = torch.load(md_path, map_location=torch.device('cpu'))
    model_i.load_state_dict(ckpt, strict=True)

    # Axis 0 is declared dynamic below, so the dummy batch size only serves
    # as an example input for tracing.
    dummy_input = torch.randn((2**14, n_features))
    torch.onnx.export(
        model_i,
        dummy_input,
        onnx_path,
        input_names=['input_0'],
        output_names=['output_0'],
        dynamic_axes={'input_0': {0: 'batch'}, 'output_0': {0: 'batch'}},
    )  # , opset_version = 11)

    # Reload the exported file and run the ONNX structural checker on it.
    onnx_model = onnx.load(onnx_path)
    onnx.checker.check_model(onnx_model)
print('finished!')

finished!


In [4]:
# Inference reads its inputs back from the export directory.
fea_path = os.path.join(onnx_save_path, 'feas.pkl')
# Deliberately NOT rebinding `mean_std_path`: the export cell copies
# `mean_std_path` to this very location, so overwriting the name with the
# destination would make that cell raise shutil.SameFileError when re-run.
exported_mean_std_path = os.path.join(onnx_save_path, 'mean_std_all.pkl')

# Prediction / evaluation window; compared as strings against file-name dates.
start_date = '20200601'
end_date = '20210331'
y_name = 'm1_ts_y_60twap_2n1open_Wgted_fullmarket_ex'

feas = pd.read_pickle(fea_path)
print(len(feas))

mean_std_df = pd.read_pickle(exported_mean_std_path)
means = mean_std_df.loc[:, 'mean'].astype(np.float32)
stds = mean_std_df.loc[:, 'std']
# Floor the std at 1e-10 so the later division never hits zero. Python's
# max(1e-10, x) also returns 1e-10 when x is NaN, which Series.clip would not
# do, so the element-wise apply is kept on purpose.
stds = stds.apply(lambda x: max(1e-10, x)).astype(np.float32)

def get_pred_df_mp2(model_path, feas, means, stds, date_start, date_end, stacking_type):
    """Score one parquet file with every ONNX model found in ``model_path``.

    Reads ``f'{data_path}/{date_start}.parquet'`` (``data_path`` is a
    notebook-level global), standardises the feature columns, runs each model
    and accumulates the mean of the per-model outputs.

    Parameters
    ----------
    model_path : str
        Directory containing the ``*.onnx`` files to ensemble.
    feas : list of str
        Feature column names, in the order fed to the models.
    means, stds : pandas.Series
        Per-feature statistics; they are indexed with ``.loc[feas]``.
    date_start : str
        Stem of the parquet file to score.
    date_end : str
        Unused; kept so existing callers keep working.
    stacking_type : str
        ``'rank'`` converts each model's output to a percentile rank within
        each ``TimeStamp`` before averaging; any other value averages the raw
        outputs.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``TimeStamp``, ``ticker`` and ``model_pred``.

    Raises
    ------
    FileNotFoundError
        If ``model_path`` contains no ``.onnx`` file (previously this silently
        produced all-zero predictions).
    """
    # List the directory that was passed in. The original listed the global
    # `onnx_save_path` but joined the names with `model_path`, which breaks as
    # soon as the two differ. Sorting makes the summation order reproducible.
    onnx_files = sorted(
        fname for fname in os.listdir(model_path) if fname.endswith('.onnx')
    )
    if not onnx_files:
        raise FileNotFoundError(f'no .onnx models found in {model_path}')
    nn_models = [
        onnxruntime.InferenceSession(
            os.path.join(model_path, fname), providers=['CPUExecutionProvider']
        )
        for fname in onnx_files
    ]

    files = f'{data_path}/{date_start}.parquet'
    dataset = pq.ParquetDataset(files, use_legacy_dataset=False)
    df_pred = dataset.read(columns=['date', 'TimeStamp', 'ticker']).to_pandas()
    df_pred['model_pred'] = 0.0
    # Scratch frame holding a single model's output (needed for the groupby).
    single_pred = df_pred.copy()

    test_x = dataset.read(columns=feas).to_pandas()
    # Treat +/-inf as missing, standardise, then zero-fill the missing values.
    test_x = test_x.replace([-np.inf, np.inf], np.nan)
    test_x = (test_x - means.loc[feas]) / stds.loc[feas]
    test_x = test_x.fillna(0).astype(np.float32)
    model_input = test_x.values  # avoid shadowing the builtin `input`

    n_models = len(nn_models)
    for nn_model in nn_models:
        ort_output = nn_model.run(['output_0'], {'input_0': model_input})[0]
        single_pred['model_pred'] = ort_output.reshape(-1)
        if stacking_type == 'rank':
            # Percentile rank within each TimeStamp group.
            single_pred['model_pred'] = (
                single_pred.groupby(['TimeStamp'])['model_pred'].rank(pct=True)
            )
        df_pred['model_pred'] += single_pred['model_pred'] / n_models
    print(date_start, 'finished')
    return df_pred

351


In [5]:
# Collect the dates that have a parquet file and fall inside the window.
# Dates are compared as strings against start_date / end_date.
file_list = sorted(os.listdir(data_path))
date_all = [
    os.path.splitext(fname)[0] for fname in file_list if fname.endswith('.parquet')
]
date_list = [date for date in date_all if start_date <= date <= end_date]
if not date_list:
    raise ValueError(
        f'no parquet files found in {data_path} between {start_date} and {end_date}'
    )

# One task per date. The `with` block guarantees the worker processes are
# torn down even if a task raises; every result is fetched inside the block,
# before the pool is terminated on exit.
with multiprocessing.Pool(32) as pool:
    mul_dfs = [
        pool.apply_async(
            get_pred_df_mp2, (onnx_save_path, feas, means, stds, dt, dt, 'rank')
        )
        for dt in date_list
    ]
    # .get() re-raises any exception that occurred in the worker.
    df_pred_all = [item.get() for item in mul_dfs]

df_pred_all = pd.concat(df_pred_all)
df_pred_all = df_pred_all.sort_values(by=['date', 'TimeStamp', 'ticker'])
pd.to_pickle(df_pred_all, os.path.join(onnx_save_path, 'model_pred.pkl'))

20200629 finished
20200624 finished20200617
 finished
20200605 finished
20200602 finished
20200702 finished
20200616 finished
20200611 20200615 finished20200622 
finished
finished
20200618 finished
2020060820200619 finished
 finished
20200604 finished
20200612 finished
20200603 finished
20200601 
finished20200610 finished
20200706 20200630finished 
finished
20200701 finished20200703
 finished
20200707 finished
2020071020200709  finishedfinished

20200623 finished
20200609 finished
20200708 finished
20200715 finished
20200714 finished
20200716 finished
20200713 finished
20200723 finished
20200722 finished
20200826 20200717finished 
finished
2020081320200814 finished finished
20200810
 finished
20200820 finished20200728 
finished
20200807 20200720finished
 finished
20200803 finished
20200827 finished
20200727 finished
20200724 finished
20200730 
finished20200824 finished
20200821 finished
20200729 finished
20200825 finished
20200721 finished
20200806 finished
20200819 finished
20200817 f

In [6]:
from tools import get_eval_result
from PqiDataSdk import *

# Load the returns used for evaluation.
# Evaluation window: start date / end date.
date_s, date_e = start_date, end_date
ds = PqiDataSdk(user="wpxu", size=1, pool_type="mp", str_map=False)
dates = ds.get_trade_dates(start_date=date_s, end_date=date_e)

data_path = '/data/local_data/shared/102/intern_data_yzhou/zy4_parquet'
# Label column (original note: the return one day later).
y_name = 'm1_ts_y_60twap_2n1open_Wgted_fullmarket_ex'
# All data files, one per trade date.
files = [f'{data_path}/{i}.parquet' for i in dates]

# Row filters: both limit-tag columns must equal 0.0 and the label must
# compare != NaN.
# NOTE(review): confirm the NaN comparison really drops missing labels.
zt_filter = ('m1_ts_z_tag_up_limit', '=', 0.0)
dt_filter = ('m1_ts_z_tag_down_limit', '=', 0.0)
yna_filter = (y_name, '!=', np.nan)
filters = [zt_filter, dt_filter, yna_filter]
cols = ['date', 'TimeStamp', 'ticker', y_name]

dataset = pq.ParquetDataset(files, use_legacy_dataset=False, filters=filters)
df_eval = dataset.read(columns=cols).to_pandas()

# Evaluate model performance; the original author recommends looking at 2020
# and 2021 separately.
param_dict = dict(
    crss_ratio_thr=0.03,
    max_trade_time=1,
    index_code='000905',
    start_date='20200601',
    end_date='20210331',
    y_name=y_name,
)
get_eval_result(param_dict, df_eval, df_pred_all)


{'rtn': 0.45248598,
 'sharpe': 7.818074910237697,
 'rtn_mtt': 0.4437583,
 'sharpe_mtt': 8.953303192129178,
 'turnover': 0.7296788713910266,
 'rtn_fee': 0.22047657424182757,
 'exp_size_mean': -0.9351935264285018,
 'exp_size_min': -1.5661436005472904,
 'exp_size_max': 0.2935770728711605,
 'exp_bp_mean': -0.27353997251075635,
 'exp_bp_min': -0.8705440397548537,
 'exp_bp_max': 0.45323319260747674,
 'exp_rv_mean': 0.03812554507776968,
 'exp_rv_min': -0.8473830994345523,
 'exp_rv_max': 0.8700684433532738,
 'kcb_ratio_mean': 0.06650735278792719,
 'kcb_ratio_max': 0.16065573770491803,
 'cyb_ratio_mean': 0.2683294522776527,
 'cyb_ratio_max': 0.652}

In [7]:
# Same evaluation as the previous cell, restricted to 20210101 - 20210331.
param_dict = dict(
    crss_ratio_thr=0.03,
    max_trade_time=1,
    index_code='000905',
    start_date='20210101',
    end_date='20210331',
    y_name=y_name,
)
get_eval_result(param_dict, df_eval, df_pred_all)

{'rtn': 0.13077165,
 'sharpe': 7.290189315939747,
 'rtn_mtt': 0.12716727,
 'sharpe_mtt': 8.380594501906193,
 'turnover': 0.7209812873266889,
 'rtn_fee': 0.06444189759009485,
 'exp_size_mean': -0.934032994799822,
 'exp_size_min': -1.5661436005472904,
 'exp_size_max': 0.2935770728711605,
 'exp_bp_mean': -0.21991784764812175,
 'exp_bp_min': -0.8705440397548537,
 'exp_bp_max': 0.37908718357451615,
 'exp_rv_mean': 0.034422427591791746,
 'exp_rv_min': -0.5148980499238308,
 'exp_rv_max': 0.628956876913992,
 'kcb_ratio_mean': 0.07662383630371776,
 'kcb_ratio_max': 0.15544041450777202,
 'cyb_ratio_mean': 0.23251235902586645,
 'cyb_ratio_max': 0.42091836734693877}