## 导入必要的安装包

In [1]:
import pandas as pd
import numpy as np
import tushare as ts
import os 
from tqdm import tqdm
import glob
# SECURITY: the Tushare Pro token is a credential and must not live in the
# notebook source. It is read from the TUSHARE_TOKEN environment variable; a
# missing variable fails fast with a KeyError naming it. Any token that was
# previously committed here should be treated as leaked and rotated.
ts.set_token(os.environ['TUSHARE_TOKEN'])
debug = False
from datetime import datetime 
import tensorflow as tf
# Shared Tushare Pro client used by the data-loading and benchmark cells.
pro = ts.pro_api()

In [2]:
# !pip install openpyxl

## 生成Alpha因子的基础函数

In [3]:
def ts_sum(df, window=10):
    """Return the rolling sum of ``df`` over the trailing ``window`` rows."""
    windowed = df.rolling(window=window)
    return windowed.sum()

def sma(df, window=10):
    """Return the simple moving average of ``df`` over the trailing ``window`` rows."""
    windowed = df.rolling(window=window)
    return windowed.mean()

def ts_min(df, window=10):
    """Return the rolling minimum of ``df`` over the trailing ``window`` rows."""
    windowed = df.rolling(window=window)
    return windowed.min()

def ts_median(df, window=10):
    """Return the rolling median of ``df`` over the trailing ``window`` rows."""
    windowed = df.rolling(window=window)
    return windowed.median()

def ts_max(df, window=10):
    """Return the rolling maximum of ``df`` over the trailing ``window`` rows."""
    windowed = df.rolling(window=window)
    return windowed.max()

def delta(df, period=1):
    """Return ``df`` minus its value ``period`` rows earlier."""
    return df.diff(periods=period)

def delay(df, period=1):
    """Return ``df`` lagged by ``period`` rows (row t holds the value from t - period)."""
    return df.shift(periods=period)

def rank(df):
    """Return the percentile rank of each value within its own row (across columns)."""
    return df.rank(pct=True, axis='columns')

def ts_argmax(df, window=10):
    """Return the 1-based position of the maximum inside each trailing window.

    Position 1 is the oldest row of the window and ``window`` is the most
    recent one.

    ``raw=True`` hands each window to ``np.argmax`` as a plain ndarray. This
    avoids building a Series per window and keeps the result positional even
    on pandas versions where calling ``np.argmax`` on a Series resolves to a
    label lookup.
    """
    return df.rolling(window).apply(np.argmax, raw=True) + 1


## 一系列Alpha因子

In [4]:

class AlphaLib(object):
    """Compute a fixed set of price/turnover alpha factors.

    ``daily_info`` is a mapping with the keys ``open``, ``high``, ``low``,
    ``close``, ``vwap``, ``volume`` and ``amount``. In this notebook each
    value is a table pivoted to trade_date (rows) x ts_code (columns).
    """

    def __init__(self, daily_info):
        self.open = daily_info['open']
        self.high = daily_info['high']
        self.low = daily_info['low']
        self.close = daily_info['close']
        self.vwap = daily_info['vwap']
        self.volume = daily_info['volume']
        self.amount = daily_info['amount']

    def calcu_alpha(self):
        """Return a dict mapping factor name to a table shaped like the inputs.

        Insertion order of the dict is the factor order used downstream.
        Ratios may yield inf/NaN (e.g. a zero rolling minimum); callers are
        expected to clean those up.
        """
        open_, high, low, close = self.open, self.high, self.low, self.close
        vwap, amount = self.vwap, self.amount

        # Row-wise (per date) total turnover across all stocks. It normalises
        # several factors, so compute it once instead of once per factor.
        total_amount = amount.sum(axis=1)

        alpha_dict = {}

        # Turnover-weighted price deviations, scaled by market-wide turnover.
        alpha_dict['alpha01'] = ((close / sma(close, 10) - 1) * amount).div(total_amount, axis=0)
        alpha_dict['alpha02'] = sma(amount, 5).div(total_amount, axis=0)
        alpha_dict['alpha06'] = sma((close / sma(close, 5) - 1) * amount, 5).div(total_amount, axis=0)
        alpha_dict['alpha07'] = ts_max((close / sma(close, 15) - 1) * amount, 15).div(total_amount, axis=0)
        alpha_dict['alpha08'] = ts_min((close / ts_max(close, 60) - 1) * amount, 10).div(total_amount, axis=0)
        alpha_dict['alpha09'] = ts_max((close / ts_min(close, 60) - 1) * amount, 50).div(total_amount, axis=0)
        alpha_dict['alpha10'] = ts_max((close / ts_min(close, 20) - 1) * amount, 15).div(total_amount, axis=0)
        alpha_dict['alpha12'] = ts_max((ts_max(high, 30) / ts_min(low, 30) - 1) * amount, 20).div(total_amount, axis=0)

        # Range and turnover-weighted intraday spreads (not normalised).
        alpha_dict['alpha13'] = ts_max(high - low, 30) / ts_min(close + open_, 30)
        alpha_dict['alpha14'] = ts_sum(amount * (close - open_), 5)
        alpha_dict['alpha15'] = ts_max(amount * (high - low), 15)
        alpha_dict['alpha16'] = ts_sum(amount * (close - vwap), 5)
        alpha_dict['alpha17'] = ts_min(amount * (low - vwap), 15)
        alpha_dict['alpha18'] = ts_min(amount * (open_ - vwap), 15)
        alpha_dict['alpha19'] = ts_min(amount * (open_ - low), 10)
        alpha_dict['alpha20'] = ts_max(amount * (close - low), 10)

        # Turnover distribution statistics.
        alpha_dict['alpha21'] = ts_median(amount, 15) / ts_sum(amount, 15)
        alpha_dict['alpha23'] = ts_max(amount, 15) / ts_min(amount, 10)
        alpha_dict['alpha24'] = ts_sum(amount.div(total_amount, axis=0), 5)

        alpha_dict['alpha28'] = (ts_max(close, 5) / delay(close, 5)) * ts_min(close, 5) / close

        # Candle-shape ratios; the +0.01 keeps the denominator away from zero
        # when the two prices coincide.
        body_ratio = abs((close - open_) / (high - low + 0.01))
        alpha_dict['alpha29'] = ts_max(body_ratio, 6) / ts_min(body_ratio, 6)
        alpha_dict['alpha30'] = body_ratio / delay(body_ratio, 4)

        lower_ratio = abs((low - open_) / (close - low + 0.01))
        alpha_dict['alpha31'] = ts_max(lower_ratio, 4) / ts_min(lower_ratio, 4)

        upper_ratio = abs((high - open_) / (close - low + 0.01))
        alpha_dict['alpha32'] = ts_max(upper_ratio, 2) / ts_min(upper_ratio, 2)

        # Cross-sectional rank of (open - 10-day mean vwap), multiplied by the
        # negated rank of (close - vwap).
        alpha_dict['alpha_w_005'] = (rank((open_ - (ts_sum(vwap, 10) / 10))) * (-1 * abs(rank((close - vwap)))))

        return alpha_dict
        

## 生成训练集

In [5]:
import pandas as pd

# Load the daily price table (trade_date kept as a string), drop duplicate
# (stock, date) rows and order by stock then date.
price_PD_total = (
    pd.read_csv('./data/price_PD.csv', dtype={'trade_date': 'str'})
    .drop_duplicates(['ts_code', 'trade_date'])
    .sort_values(['ts_code', 'trade_date'])
    .reset_index(drop=True)
)

# Attach each stock's name and market segment from the Tushare stock list.
all_stock = pro.stock_basic()[['ts_code', 'name', 'market']]
price_PD_total = price_PD_total.merge(all_stock, how='left', on='ts_code')

# Fix the column order used by the rest of the notebook.
price_PD_total = price_PD_total[[
    'ts_code', 'name', 'market', 'trade_date',
    'open', 'high', 'low', 'close', 'pre_close',
    'change', 'pct_chg', 'vol', 'amount',
]]



In [6]:
# Industry classification per stock (GBK-encoded file); keep the three
# Shenwan industry levels and give them short column names.
indus = pd.read_csv('./data/a_stock_industry.csv', encoding='gbk')
indus = indus[['windcode', '申万行业L1', '申万行业L2', '申万行业L3']].rename(
    columns={
        'windcode': 'ts_code',
        '申万行业L1': 'indus1',
        '申万行业L2': 'indus2',
        '申万行业L3': 'indus3',
    }
)
indus.head()

Unnamed: 0,ts_code,indus1,indus2,indus3
0,000001.SZ,银行,银行Ⅱ,银行Ⅲ
1,000002.SZ,房地产,房地产开发Ⅱ,房地产开发Ⅲ
2,000004.SZ,计算机,计算机应用,IT服务
3,000005.SZ,公用事业,环保工程及服务Ⅱ,环保工程及服务Ⅲ
4,000006.SZ,房地产,房地产开发Ⅱ,房地产开发Ⅲ


In [7]:
# Attach industries, keep only main-board / SME-board / ChiNext listings,
# and drop stocks whose name contains "ST".
price_PD_total = price_PD_total.merge(indus, how='left', on='ts_code')
price_PD_total = price_PD_total.loc[price_PD_total['market'].isin(['主板', '中小板', '创业板'])]
price_PD_total = price_PD_total.loc[~price_PD_total['name'].str.contains('ST')].reset_index(drop=True)
price_PD_total.head()

Unnamed: 0,ts_code,name,market,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,amount,indus1,indus2,indus3
0,000001.SZ,平安银行,主板,20150105,10.1514,10.3355,9.9038,10.1704,10.0562,0.1142,1.1356,2860436.43,4565388.0,银行,银行Ⅱ,银行Ⅲ
1,000001.SZ,平安银行,主板,20150106,10.0625,10.4053,9.872,10.0181,10.1704,-0.1523,-1.4975,2166421.4,3453446.0,银行,银行Ⅱ,银行Ⅲ
2,000001.SZ,平安银行,主板,20150107,9.8784,10.0498,9.7133,9.8276,10.0181,-0.1905,-1.9016,1700120.67,2634796.0,银行,银行Ⅱ,银行Ⅲ
3,000001.SZ,平安银行,主板,20150108,9.8403,9.8847,9.4594,9.4975,9.8276,-0.3301,-3.3589,1407714.21,2128003.0,银行,银行Ⅱ,银行Ⅲ
4,000001.SZ,平安银行,主板,20150109,9.4594,10.0752,9.3388,9.5737,9.4975,0.0762,0.8023,2508500.23,3835378.0,银行,银行Ⅱ,银行Ⅲ


In [8]:
# Opening gap of the NEXT session relative to today's close:
#     open_up(t) = open(t+1) / close(t)
# The strategy buys at the t+1 open, and the back-test filters out stocks that
# gap up or down too far at that open. The previous expression used shift(1),
# i.e. open(t-1) / close(t), which is not an opening gap at all.
price_PD_total['open_up'] = price_PD_total.groupby('ts_code')['open'].shift(-1) / price_PD_total['close']

In [9]:
# Forward gross returns (price ratios), computed within each stock.
# ret1: buy at the close of t, sell at the close of t+1.
price_PD_total['ret1'] = price_PD_total.groupby('ts_code')['close'].shift(-1) / price_PD_total['close']
# ret2: buy at the open of t+1, sell at the close of t+2.
price_PD_total['ret2'] = (
    price_PD_total.groupby('ts_code')['close'].shift(-2)
    / price_PD_total.groupby('ts_code')['open'].shift(-1)
)
# Disabled alternatives (labels translated from the original notes):
# buy at the open of t+1, sell at the open of t+2
# price_PD_total['ret3'] = price_PD_total.groupby('ts_code').open.shift(-3)/price_PD_total.groupby('ts_code').open.shift(-1) 
# buy at the open of t+1, sell at the close of t+3
# price_PD_total['ret4'] = price_PD_total.groupby('ts_code').close.shift(-3)/price_PD_total.groupby('ts_code').open.shift(-1)
# buy at the open of t+1, sell at the close of t+4
# price_PD_total['ret5'] = price_PD_total.groupby('ts_code').close.shift(-4)/price_PD_total.groupby('ts_code').open.shift(-1)

# The training target is built from ret2.
price_PD_total['ret'] = price_PD_total['ret2'].copy()



In [10]:
# Load the second price file (the file name marks it as unadjusted prices),
# normalise trade_date to a string, de-duplicate and order by stock then date.
price_PD_wfq = pd.read_csv('./data/price_PD_2015至今_未复权.csv')
price_PD_wfq['trade_date'] = price_PD_wfq['trade_date'].astype('str')
price_PD_wfq = (
    price_PD_wfq
    .drop_duplicates(['ts_code', 'trade_date'])
    .sort_values(['ts_code', 'trade_date'])
    .reset_index(drop=True)
)
price_PD_wfq['vwap'] = price_PD_wfq['amount'] / price_PD_wfq['vol']
# Restrict to the stocks that survived the universe filters on price_PD_total.
price_PD_wfq = price_PD_wfq[price_PD_wfq['ts_code'].isin(price_PD_total['ts_code'].unique())]
price_PD_wfq.tail()

Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,amount,vwap
6220326,605599.SH,20220909,10.09,10.22,9.22,10.02,10.22,-0.2,-1.9569,58236.31,57322.643,0.984311
6220327,605599.SH,20220913,9.79,10.08,9.79,9.98,10.02,-0.04,-0.3992,31553.39,31488.568,0.997946
6220328,605599.SH,20220914,9.9,9.92,9.77,9.85,9.98,-0.13,-1.3026,18402.2,18093.121,0.983204
6220329,605599.SH,20220915,9.89,9.89,9.61,9.69,9.85,-0.16,-1.6244,14409.77,14020.617,0.972994
6220330,605599.SH,20220916,9.64,9.7,9.38,9.41,9.69,-0.28,-2.8896,14882.43,14157.309,0.951277


In [11]:
# Pivot each price field to a trade_date x ts_code table.
# pivot() is called with keyword arguments: pandas >= 2.0 no longer accepts
# index/columns/values positionally.
daily_info = {
    key: price_PD_wfq.pivot(index='trade_date', columns='ts_code', values=column)
    for key, column in [
        ('open', 'open'),
        ('close', 'close'),
        ('high', 'high'),
        ('low', 'low'),
        ('amount', 'amount'),
        ('volume', 'vol'),
    ]
}
# VWAP from turnover and volume. The +1 keeps the denominator non-zero when
# volume is 0. NOTE(review): the 1000 / 100 factors presumably convert Tushare's
# amount (thousand yuan) and vol (lots of 100 shares) to yuan per share; confirm.
daily_info['vwap'] = (daily_info['amount'] * 1000) / (daily_info['volume'] * 100 + 1)



In [12]:
# Release the long-format price table; the pivoted tables in daily_info are used from here on.
del price_PD_wfq

In [13]:
# Compute every alpha factor from the pivoted daily tables;
# alpha_dict maps factor name -> result table.
tmp_class = AlphaLib(daily_info)
alpha_dict = tmp_class.calcu_alpha()

In [14]:
# Reshape every wide alpha table (trade_date x ts_code) into a long Series
# indexed by (ts_code, trade_date) and combine them with ONE aligned concat.
# The previous version merged the long tables pairwise, one multi-million-row
# merge per factor; all factor tables share the same index, so a column-wise
# concat yields the same columns in the same order at a fraction of the cost.
if alpha_dict:
    data = pd.concat(
        {k: v.unstack() for k, v in tqdm(alpha_dict.items())},
        axis=1,
    ).reset_index()
else:
    # Nothing to combine: keep the original "empty frame" result.
    data = pd.DataFrame()

100%|██████████| 25/25 [02:51<00:00,  6.88s/it]


In [15]:
# Attach the stock name, forward returns and opening gap to every
# (trade_date, ts_code) factor row.
data = data.merge(
    price_PD_total[['trade_date', 'ts_code', 'name', 'ret1', 'ret2', 'ret', 'open_up']],
    how='left',
    on=['trade_date', 'ts_code'],
)

In [16]:
# Inspect the columns available on the price table.
price_PD_total.columns

Index(['ts_code', 'name', 'market', 'trade_date', 'open', 'high', 'low',
       'close', 'pre_close', 'change', 'pct_chg', 'vol', 'amount', 'indus1',
       'indus2', 'indus3', 'open_up', 'ret1', 'ret2', 'ret'],
      dtype='object')

In [17]:
# Release the price table; the columns needed later were merged into `data` above.
del price_PD_total

In [18]:
# Training sample: everything strictly before 2020-04-01.
train = data.copy()
train = train.loc[train['trade_date'] < '20200401']

# Target: per-day percentile rank of the forward return; rows without a
# return (and therefore without a rank) are dropped.
train['ret_rank'] = train.groupby('trade_date')['ret'].rank(pct=True)
train = train[train['ret_rank'].notna()].reset_index(drop=True)

from sklearn.model_selection import KFold,GroupKFold,train_test_split

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Shuffled row-level folds: the original author notes this leaks a little
# information between train and validation rows. The disabled alternative
# below trimmed 2.5% off each end of the training indices to reduce that:
#     [trn_ind[int(0.025*len(trn_ind)):int(0.975*len(trn_ind))], val_ind]
enumsplit = [[trn_rows, val_rows] for trn_rows, val_rows in kfold.split(train)]
enumsplit

[[array([      0,       1,       2, ..., 3625332, 3625333, 3625334]),
  array([     12,      13,      19, ..., 3625324, 3625327, 3625330])],
 [array([      3,       4,       6, ..., 3625331, 3625332, 3625333]),
  array([      0,       1,       2, ..., 3625316, 3625328, 3625334])],
 [array([      0,       1,       2, ..., 3625331, 3625333, 3625334]),
  array([      3,      18,      23, ..., 3625326, 3625329, 3625332])],
 [array([      0,       1,       2, ..., 3625331, 3625332, 3625334]),
  array([      6,      10,      14, ..., 3625322, 3625325, 3625333])],
 [array([      0,       1,       2, ..., 3625332, 3625333, 3625334]),
  array([      4,       8,      15, ..., 3625310, 3625313, 3625331])]]

In [19]:
# Inspect the columns of the training table.
train.columns

Index(['ts_code', 'trade_date', 'alpha01', 'alpha02', 'alpha06', 'alpha07',
       'alpha08', 'alpha09', 'alpha10', 'alpha12', 'alpha13', 'alpha14',
       'alpha15', 'alpha16', 'alpha17', 'alpha18', 'alpha19', 'alpha20',
       'alpha21', 'alpha23', 'alpha24', 'alpha28', 'alpha29', 'alpha30',
       'alpha31', 'alpha32', 'alpha_w_005', 'name', 'ret1', 'ret2', 'ret',
       'open_up', 'ret_rank'],
      dtype='object')

In [20]:
# Model inputs: the numbered alpha factors (in factor order) followed by
# 'alpha_w_005'.
features = [
    f'alpha{i:02d}'
    for i in (1, 2, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20,
              21, 23, 24, 28, 29, 30, 31, 32)
] + ['alpha_w_005']


In [29]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.python.ops import math_ops
from tensorflow.python.keras import backend as K
class GCF:
    """Global configuration constants for the network and its training."""

    SEED = 0

    # Training schedule.
    N_EPOCHS = 1000
    BATCH_SIZE = 4096
    EARLY_STOPPING_PATIENCE = 10
    EARLY_STOPPING_MIN_DELTA = 1e-3
    ALL_TRAIN_ADD_EPOCH = 3

    # Transformer parameters.
    EMBED_DIM = 32   # 64 // 2
    N_HEAD = 8
    FF_DIM = 64      # 128 // 2
    DROPOUT = 0.0
    N_BLOCK = 4
    
# Number of input features per sample; used as the last dimension of the model input.
feat_dim = len(features)

In [30]:

# https://www.kaggle.com/pratikskarnik/riiid-keras-transformer-starter
class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: width of the input/output embedding; must be divisible by
            ``num_heads``.
        num_heads: number of attention heads.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).

    Raises:
        ValueError: if ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        # Forward Keras kwargs so the layer can be named and rebuilt by from_config.
        super(MultiHeadSelfAttention, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(output, weights)`` of softmax(Q.K^T / sqrt(d_k)).V."""
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch, seq, embed) to (batch, heads, seq, projection)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        # Each becomes (batch_size, num_heads, seq_len, projection_dim).
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)
        attention, _ = self.attention(query, key, value)
        # Back to (batch_size, seq_len, num_heads, projection_dim) ...
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        # ... and merge the heads: (batch_size, seq_len, embed_dim).
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        return self.combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this layer.

        Only plain hyper-parameters are included. Sub-layer objects are not
        constructor arguments and cannot be serialized, so they are recreated
        in ``__init__`` instead of being stored in the config.
        """
        config = super(MultiHeadSelfAttention, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
        })
        return config


class TransformerBlock(layers.Layer):
    """Post-norm Transformer encoder block: self-attention then a 2-layer MLP.

    Args:
        embed_dim: embedding width of the block input and output.
        feat_dim: accepted for interface compatibility; not used by the block.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        rate: dropout rate applied after attention and after the MLP.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim=GCF.EMBED_DIM, feat_dim=feat_dim, num_heads=GCF.N_HEAD, ff_dim=GCF.FF_DIM, rate=GCF.DROPOUT, **kwargs):
        # Forward Keras kwargs (previously accepted but silently dropped).
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the hyper-parameters so get_config can report them.
        self.embed_dim = embed_dim
        self.feat_dim = feat_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadSelfAttention(num_heads=num_heads, embed_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="gelu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        # Residual connection + layer norm around the attention sub-layer.
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Residual connection + layer norm around the feed-forward sub-layer.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this block.

        Only plain hyper-parameters are included; sub-layers are recreated by
        ``__init__`` and are not serializable config values.
        """
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "feat_dim": self.feat_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

In [31]:
def base_model():
    """Build and compile the regression network.

    Input is ``(1, feat_dim)`` per sample. It passes through a dense
    "embedding" with layer norm, ``GCF.N_BLOCK`` Transformer blocks, average
    pooling over the sequence axis, a 20-unit ReLU layer and a single linear
    output. The model is compiled with Adam(1e-4), MSE loss and an RMSE metric.
    """
    inputs = layers.Input(shape=(1, feat_dim))

    # "Embedding" layer: project the raw features to the model width.
    hidden = layers.Dense(GCF.EMBED_DIM)(inputs)
    hidden = layers.LayerNormalization(epsilon=1e-6)(hidden)

    # Stack of Transformer blocks (a weighted skip connection around each
    # block was tried by the author and left disabled).
    for _ in range(GCF.N_BLOCK):
        block = TransformerBlock(GCF.EMBED_DIM, feat_dim, GCF.N_HEAD, GCF.FF_DIM, GCF.DROPOUT)
        hidden = block(hidden)

    # Pool over the sequence axis, then a small dense layer (dropout around it
    # was tried by the author and left disabled).
    hidden = layers.GlobalAveragePooling1D()(hidden)
    hidden = layers.Dense(20, activation="relu")(hidden)

    # Regression head.
    outputs = layers.Dense(1, activation="linear")(hidden)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=tf.optimizers.Adam(1e-4),
        loss='mse',
        metrics=[keras.metrics.RootMeanSquaredError()],
    )
    return model


In [32]:
# Build one model instance and print its layer/parameter summary.
model = base_model()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1, 25)]           0         
_________________________________________________________________
dense (Dense)                (None, 1, 32)             832       
_________________________________________________________________
layer_normalization (LayerNo (None, 1, 32)             64        
_________________________________________________________________
transformer_block (Transform (None, None, 32)          8544      
_________________________________________________________________
transformer_block_1 (Transfo (None, None, 32)          8544      
_________________________________________________________________
transformer_block_2 (Transfo (None, None, 32)          8544      
_________________________________________________________________
transformer_block_3 (Transfo (None, None, 32)          8544  

## 模型训练

In [33]:
# Turn +/-inf (from divisions in the factor formulas) into NaN, then fill
# missing feature values with the column mean of the training sample.
train.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
train[features] = train[features].fillna(value=train[features].mean())


In [35]:
# Stop when val_loss has not improved for 10 epochs and restore the best weights.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
    verbose=0,
)

# Multiply the learning rate by 0.2 after 7 epochs without val_loss improvement.
plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.2,
    patience=7,
    verbose=0,
)

In [None]:
# from sklearn.preprocessing import QuantileTransformer

# train_nn=train[features].copy()
# test_nn=test[features].copy()
# qt_train = []
# for col in features:
#     qt = QuantileTransformer(random_state=21,n_quantiles=2000, output_distribution='normal')
#     train_nn[col] = qt.fit_transform(train_nn[[col]])
#     test_nn[col] = qt.transform(test_nn[[col]])
#     qt_train.append(qt)

In [None]:
from sklearn.metrics import mean_squared_error

# One model per fold; out-of-fold predictions are collected in oof_predictions.
modellist = []
y = train['ret_rank']
# y = train['ret']

oof_predictions = np.zeros(train.shape[0])

for fold, (trn_ind, val_ind) in enumerate(enumsplit):
    print(f'Training fold {fold + 1}')
    x_train, x_val = train.iloc[trn_ind][features], train.iloc[val_ind][features]
    y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]

    model = base_model()
    # The model input is (1, feat_dim) per sample, so a length-1 sequence axis
    # is added to the 2-D feature tables.
    model.fit(np.expand_dims(x_train, axis=1),
              y_train,
              batch_size=2048,
              epochs=1000,
              validation_data=(np.expand_dims(x_val, axis=1), y_val),
              callbacks=[es, plateau],
              validation_batch_size=len(y_val),
              shuffle=True,
              verbose=1)

    modellist.append(model)
    # Predict with the SAME input shape used for training; passing the 2-D
    # table directly does not match the model's (1, feat_dim) input.
    oof_predictions[val_ind] = model.predict(np.expand_dims(x_val, axis=1)).ravel()

    # sqrt(MSE) is an RMSE (the label previously said RMSPE).
    print(f'fold RMSE is {mean_squared_error(y_val, oof_predictions[val_ind])**0.5}')
    corrr_score = np.corrcoef([y_val, oof_predictions[val_ind]])[0,1]
    print(f'fold CORR is {corrr_score}')

# Variable name kept for compatibility; the value is an RMSE.
rmspe_score = mean_squared_error(y, oof_predictions)**0.5
print(f'Our out of folds RMSE is {rmspe_score}')
corrr_score = np.corrcoef([y, oof_predictions])[0,1]
print(f'Our out of folds CORR is {corrr_score}')

Training fold 1
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000

## 模型预测和收益pnl

In [None]:
# Number of stocks held per day.
num = 30

# Out-of-sample rows: skip stocks whose opening gap is outside (0.95, 1.05),
# then keep dates strictly after 2020-04-01.
test = data.copy()
test = test[test.open_up.gt(0.95) & test.open_up.lt(1.05)]
test = test[test['trade_date'] > '20200401']

# Same cleaning as the training sample; gaps are filled with TRAINING means.
test.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
test[features] = test[features].fillna(value=train[features].mean())


In [None]:
# Sum the fold models' predictions. The sum is not averaged, which does not
# change the per-day ordering used for stock selection.
pred = np.zeros(test.shape[0])
for model in modellist:
    # Add the length-1 sequence axis the model was built and trained with;
    # the 2-D feature table alone does not match the (1, feat_dim) input.
    pred += model.predict(np.expand_dims(test[features], axis=1)).ravel()
test['pred'] = pred
date_calc = sorted(test.trade_date.unique())


In [None]:
## 考虑t+1开盘买 t+2收盘卖的收益情况
tmp = test.copy()
tmp = tmp.groupby('trade_date').apply(lambda x: x.sort_values('pred', ascending = False).head(num)).reset_index(drop=True)
ret_pd1 = pd.DataFrame(tmp.groupby('trade_date').ret2.mean() - 1.0015)
(ret_pd1.loc['20200401':'20230101',] ).cumsum().plot(rot=45, title = '2020-2022 year')


In [None]:
# ## 考虑t收盘买 t+1收盘卖的收益情况
# tmp = test.copy()
# tmp = tmp.groupby('trade_date').apply(lambda x: x.sort_values('pred', ascending = False).head(num)).reset_index(drop=True)
# ret_pd2 = pd.DataFrame(tmp.groupby('trade_date').ret1.mean() - 1.0015)
# (ret_pd2.loc['20200401':'20220501',] ).cumsum().plot(rot=45, title = '2020-2022 year')


## 计算收益相关评价指标

In [None]:
def max_dd(returns):
    """Return ``(mdd, start, end)`` for an additive equity curve.

    ``returns`` is assumed to be a pandas Series of per-period returns. The
    curve is ``1 + cumulative sum`` of the returns. ``mdd`` is the most
    negative drawdown relative to the running peak, ``end`` is the index label
    of that trough, and ``start`` is the label of the curve's maximum up to
    and including ``end``.
    """
    equity = 1 + returns.cumsum()
    drawdown = equity / equity.cummax() - 1
    trough_pos = drawdown.argmin()
    end = returns.index[trough_pos]
    peak_pos = equity.loc[:end].argmax()
    start = returns.index[peak_pos]
    return drawdown.min(), start, end

def cal_sell_turnover(tmp):
    """Return the average daily share of holdings not held the previous day.

    ``tmp`` needs ``trade_date`` and ``ts_code`` columns. For each date after
    the first, the value is ``1 - |today & yesterday| / |today|``; the mean
    over those dates is returned.
    """
    holdings = tmp.groupby('trade_date').apply(lambda day: set(day.ts_code))
    stock = pd.DataFrame(holdings).rename(columns={0: 'stock_list'})
    stock['stock_list_shift1'] = stock.stock_list.shift(1)
    # The first date has no previous holdings to compare against.
    stock = stock.dropna()
    stock['freq'] = [
        1 - len(today & yesterday) / len(today)
        for today, yesterday in zip(stock['stock_list'], stock['stock_list_shift1'])
    ]
    return stock['freq'].mean()

def max_dd_month(ret_PD):
    """Summarise returns by calendar month.

    ``ret_PD`` has a ``ret`` column and an index named ``trade_date`` whose
    string form starts with the six characters identifying the month. Returns
    a list: ``[worst monthly sum, its month label, share of months with a
    positive sum, mean of non-negative months, mean of negative months]``.
    """
    daily = ret_PD.reset_index()
    daily['month'] = daily['trade_date'].map(lambda d: str(d)[:6])
    monthly = daily.groupby('month')['ret'].sum().sort_values()
    worst = monthly.head(1)
    positive_share = (monthly > 0).sum() / len(monthly)
    return [
        worst.values[0],
        worst.index[0],
        positive_share,
        monthly[monthly >= 0].mean(),
        monthly[monthly < 0].mean(),
    ]

def calc_pfmc(ret_PD, positions=None, dates=None, group_name='LGB'):
    """Build a one-row performance summary for a daily return series.

    Args:
        ret_PD: single-column DataFrame of daily returns indexed by
            ``trade_date``. It is not modified (a renamed copy is used).
        positions: per-day holdings table with ``trade_date``, ``ts_code`` and
            ``ret`` columns. Defaults to the notebook-level ``tmp``.
        dates: ordered trading dates used to average the number of holdings.
            Defaults to the notebook-level ``date_calc``.
        group_name: label written to the ``group_name`` column.

    Returns:
        DataFrame with annualised return (mean * 252), Sharpe ratio, daily
        std, win ratio, turnover, average number of holdings and the monthly
        statistics from ``max_dd_month``.
    """
    # Backward-compatible fallback to the globals the function used to read.
    if positions is None:
        positions = tmp
    if dates is None:
        dates = date_calc

    # Work on a copy so the caller's column names are left untouched.
    ret_PD = ret_PD.copy()
    ret_PD.columns = ['ret']
    returnlist = ret_PD['ret']

    std = returnlist.std()
    ret_year = returnlist.mean() * 252
    ret_sharpe = returnlist.mean() / std * (252 ** 0.5)
    month_pfmc = max_dd_month(ret_PD)

    # Share of positive days among the days with a non-zero return.
    winratio = (returnlist > 0).sum() / (returnlist != 0).sum()

    # Average number of holdings per date (dates without holdings count as 0).
    ret2_len = pd.DataFrame(positions.groupby('trade_date').ret.count())
    ret2_len = ret2_len.reindex(dates).fillna(0)
    stock_pct = ret2_len.mean().values[0]

    turnover = cal_sell_turnover(positions)

    row = [group_name, ret_year, ret_sharpe, std, winratio, turnover, stock_pct] + month_pfmc
    performance = pd.DataFrame(
        [row],
        columns=['group_name', 'return', 'sharpe',
                 'std', 'winratio', 'turnover', 'stock_num', 'mdd_month', 'mdd_month_No',
                 'month_winratio', 'month_win_ret', 'month_lose_ret'],
    )
    return performance

In [None]:
# Performance summary of the out-of-sample top-`num` strategy returns.
calc_pfmc(ret_pd1)

In [None]:
## Inspect the stocks held on a given day
tmp[tmp.trade_date=='20220908'][['ts_code','name','trade_date','pred','ret','ret1','ret2']]

In [None]:
## CSI 300 (000300.SH) daily percentage changes over the back-test window, used as the benchmark
df300 = pro.index_daily(
    ts_code='000300.SH',
    start_date=ret_pd1.index[0],
    end_date=ret_pd1.index[-1],
)
df300 = df300.set_index('trade_date')[['pct_chg']].sort_index()
df300

In [None]:
# Align strategy and benchmark returns by date. pct_chg is divided by 100
# (presumably percent -> fraction; confirm) before taking the difference.
ret_pd_concat = ret_pd1.merge(df300 / 100, left_index=True, right_index=True)
ret_pd_concat.columns = ['ret', 'df300']
ret_pd_concat['diff'] = ret_pd_concat['ret'].sub(ret_pd_concat['df300'])
ret_pd_concat

In [None]:
# Correlation matrix of the strategy return, benchmark return and their difference.
ret_pd_concat.corr()

In [None]:
# Plot the cumulative sums of the strategy, benchmark and excess daily returns.
(ret_pd_concat[['ret','df300','diff']].cumsum()).plot(rot=45, title = '2020-2022 year')


## 计算oof的收益情况和PNL；实际参考意义不大，因为模型会过拟合训练集

In [None]:

# In-sample (out-of-fold) back-test over the training period.
train['pred'] = oof_predictions

date_calc = sorted(train.trade_date.unique())
num = 30

# Same opening-gap filter and daily top-`num` selection as the out-of-sample run.
tmp = train.copy()
tmp = tmp[tmp.open_up.gt(0.95) & tmp.open_up.lt(1.05)]
tmp = (
    tmp.groupby('trade_date')
    .apply(lambda day: day.sort_values('pred', ascending=False).head(num))
    .reset_index(drop=True)
)

# ret2 is a gross price ratio; subtracting 1.0015 gives the return net of
# 0.0015 (presumably a transaction-cost allowance; confirm).
ret_pd = pd.DataFrame(tmp.groupby('trade_date').ret2.mean() - 1.0015)
ret_pd.loc['20100101':'20200401',].cumsum().plot(rot=45, title='2010-2020 year')

calc_pfmc(ret_pd)

In [None]:
# Benchmark: index close over the OOF window, normalised to its first value.
df300 = pro.index_daily(ts_code='000300.SH', start_date=ret_pd.index[0], end_date=ret_pd.index[-1])
df300 = df300[['trade_date','close']].set_index('trade_date').sort_index()
df300 = df300/df300.head(1).values
# Compare the strategy's cumulative (summed) return with the index's cumulative change.
ret_pd_concat = pd.merge(ret_pd.cumsum(),df300-1,left_index=True, right_index=True)
ret_pd_concat.columns = ['ret','df300']
ret_pd_concat['diff'] = ret_pd_concat[['ret']]-ret_pd_concat[['df300']].values
# NOTE(review): this plots the OOF (training-period) curves, but the title
# says '2020-2022 year'; it looks copy-pasted from the out-of-sample plot. Confirm.
(ret_pd_concat[['ret','df300','diff']]).plot(rot=45, title = '2020-2022 year')


In [None]:
# Cumulative excess return over the index; .diff() turns it back into a
# per-day series (first row NaN) before computing the performance summary.
sss = ret_pd_concat[['ret']]- ret_pd_concat[['df300']].values
calc_pfmc(sss.diff())
