In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import akshare as ak
import json
import os
import time

from datetime import datetime, timedelta
from tqdm import tqdm

# Remove pandas' display limits so DataFrames are rendered with every row and
# column. NOTE(review): with max_rows=None, displaying a large frame without
# .head() will print the whole frame.
pd.options.display.max_rows=None
pd.options.display.max_columns=None

In [139]:
# stock_code_list = ak.stock_info_a_code_name()['code'] # alternative: codes of all A-share stocks
# NOTE(review): the original comment called this the CSI 50 (中证50) list, but
# index code 000016 is normally the SSE 50 (上证50) — confirm the intended universe.
stock_code_list = ak.index_stock_cons("000016")['品种代码'].tolist() # constituent stock codes of index 000016

## 1. 获取基础原始数据

In [141]:
def get_stock_history_info(stock_code):
    """Download the price history of one stock as an OHLCV table.

    The history is requested from ``ak.stock_zh_a_hist`` with ``adjust='hfq'``
    (presumably akshare's back-adjusted prices — confirm against the akshare
    documentation). The Chinese column headers of the response are renamed to
    English ones and only the date, price and volume columns are kept.

    Args:
        stock_code: Stock symbol passed to akshare as ``symbol``.

    Returns:
        DataFrame with the columns ``stock_code``, ``datetime``, ``open``,
        ``high``, ``low``, ``close`` and ``volume``; ``stock_code`` holds the
        requested symbol on every row.
    """
    # Source header -> English name. Headers that are not kept below are still
    # renamed, exactly as the response is delivered.
    column_names = {
        "日期": "datetime",
        "开盘": "open",
        "最高": "high",
        "最低": "low",
        "收盘": "close",
        "成交量": "volume",
        "成交额": "turnover",
        "振幅": "amplitude",
        "涨跌幅": "change_pct",
        "涨跌额": "change_amount",
        "换手率": "turnover_rate",
    }
    kept_columns = ['datetime', 'open', 'high', 'low', 'close', 'volume']

    raw_history = ak.stock_zh_a_hist(symbol=stock_code, adjust='hfq')
    history = raw_history.rename(columns=column_names)[kept_columns]
    # Put the symbol first so rows stay identifiable after frames are concatenated.
    history.insert(0, 'stock_code', stock_code)
    return history

def get_stock_label(dataframe, expression_excutor):
    """Add the training target to ``dataframe`` as a ``label`` column.

    Three expressions are evaluated in order through
    ``expression_excutor.excute``. Each result overwrites ``label``, so the
    second and third expressions operate on the output of the previous one.

    Args:
        dataframe: Per-stock table; it is modified in place.
        expression_excutor: Object exposing ``excute(dataframe, expression)``.

    Returns:
        The same ``dataframe`` object with the ``label`` column set.
    """
    label_expressions = (
        # Return: the close five days ahead (sell price) divided by the next
        # day's open (buy price), minus one.
        "shift(close,-5)/shift(open,-1)-1",
        # Outlier handling: clip the label to its 1% and 99% quantiles.
        "clip(label,all_quantile(label, 0.01),all_quantile(label,0.99))",
        # Drop one-price limit days (next day's high equals its low) by setting
        # the label to NaN; NaN labels are ignored by later processing/training.
        "where(shift(high,-1)=shift(low,-1), nan, label)",
    )
    for expression in label_expressions:
        dataframe['label'] = expression_excutor.excute(dataframe, expression)
    return dataframe

def get_basic_factor(dataframe, expression_excutor, alpha_path='./alpha_184.json'):
    """Compute every factor defined in a JSON file and add it as a column.

    The JSON file maps a factor name to an expression string. Expressions are
    evaluated one after another through ``expression_excutor.excute`` and each
    result is stored in ``dataframe`` under the factor name.

    Args:
        dataframe: Per-stock table; it is modified in place.
        expression_excutor: Object exposing ``excute(dataframe, expression)``.
        alpha_path: Path of the JSON factor definition file. Defaults to the
            file this notebook has always used.

    Returns:
        The same ``dataframe`` object with one added column per factor.
    """
    # Use a context manager so the file handle is closed deterministically;
    # this function is called once per stock.
    with open(alpha_path, "r") as alpha_file:
        alpha_dict = json.load(alpha_file)
    for alpha_name, alpha_expression in alpha_dict.items():
        dataframe[alpha_name] = expression_excutor.excute(dataframe, alpha_expression)
    return dataframe

In [143]:
from expression_excutor import AlphaExpressionExcutor
expression_excutor = AlphaExpressionExcutor()

# Build one cleaned table per stock: price history -> label -> factor columns.
stock_data_list = []
for stock_code in tqdm(stock_code_list):
    price_history = get_stock_history_info(stock_code)
    labelled = get_stock_label(price_history, expression_excutor)
    with_factors = get_basic_factor(labelled, expression_excutor)
    # Treat +/-inf as missing, then drop every row that has any missing value
    # (this includes rows whose label is NaN).
    stock_data = with_factors.replace([np.inf, -np.inf], np.nan).dropna()
    stock_data_list.append(stock_data)

100%|██████████| 50/50 [01:34<00:00,  1.88s/it]


In [144]:
# Stack the per-stock tables into one frame and list its column names.
df = pd.concat(stock_data_list)
print(list(df.columns))

['stock_code', 'datetime', 'open', 'high', 'low', 'close', 'volume', 'label', 'KMID', 'KLEN', 'KMID2', 'KUP', 'KUP2', 'KLOW', 'KLOW2', 'KSFT', 'KSFT2', 'OPEN0', 'OPEN1', 'OPEN2', 'OPEN3', 'OPEN4', 'HIGH0', 'HIGH1', 'HIGH2', 'HIGH3', 'HIGH4', 'LOW0', 'LOW1', 'LOW2', 'LOW3', 'LOW4', 'CLOSE0', 'CLOSE1', 'CLOSE2', 'CLOSE3', 'CLOSE4', 'VOLUME0', 'VOLUME1', 'VOLUME2', 'VOLUME3', 'VOLUME4', 'ROC5', 'ROC10', 'ROC20', 'ROC30', 'ROC60', 'MAX5', 'MAX10', 'MAX20', 'MAX30', 'MAX60', 'MIN5', 'MIN10', 'MIN20', 'MIN30', 'MIN60', 'MA5', 'MA10', 'MA20', 'MA30', 'MA60', 'STD5', 'STD10', 'STD20', 'STD30', 'STD60', 'BETA5', 'BETA10', 'BETA20', 'BETA30', 'BETA60', 'RSQR5', 'RSQR10', 'RSQR20', 'RSQR30', 'RSQR60', 'RESI5', 'RESI10', 'RESI20', 'RESI30', 'RESI60', 'QTLU5', 'QTLU10', 'QTLU20', 'QTLU30', 'QTLU60', 'QTLD5', 'QTLD10', 'QTLD20', 'QTLD30', 'QTLD60', 'TSRANK5', 'TSRANK10', 'TSRANK20', 'TSRANK30', 'TSRANK60', 'RSV5', 'RSV10', 'RSV20', 'RSV30', 'RSV60', 'IMAX5', 'IMAX10', 'IMAX20', 'IMAX30', 'IMAX60', '

## 4. 模型训练

In [159]:
# 6. Select data from fixed date ranges: 2010-2019 for training, 2020 for validation.
train_start_date = pd.to_datetime('2010-01-01')
train_end_date = pd.to_datetime('2019-12-31')
val_start_date = pd.to_datetime('2020-01-01')
val_end_date = pd.to_datetime('2020-12-31')

# Parse the date column once and reuse it for both (inclusive) range masks.
row_dates = pd.to_datetime(df['datetime'])
in_train_range = (row_dates >= train_start_date) & (row_dates <= train_end_date)
in_validation_range = (row_dates >= val_start_date) & (row_dates <= val_end_date)

train_data = df[in_train_range]
validation_data = df[in_validation_range]

print(f"train_data_size: {train_data.shape}")
print(f"validation_data_size: {validation_data.shape}")

train_data_size: (93655, 182)
validation_data_size: (11027, 182)


## 使用Tensorflow

In [160]:
# 使用tensorflow处理原始数据
import numpy as np
import pandas as pd
import tensorflow as tf
print(tf.__version__)

2.15.0


In [174]:
def get_numeric_boundaries(series, num_bins=20):
    """Return sorted bin boundaries for discretizing a numeric column.

    Missing values are ignored. When the column has fewer than ``num_bins``
    distinct values, those values themselves are the boundaries; otherwise the
    boundaries are the quantile edges computed by ``pd.qcut`` (duplicate edges
    dropped, so fewer than ``num_bins + 1`` edges may be returned).

    Args:
        series: pandas Series of numeric values.
        num_bins: Number of quantile bins requested.

    Returns:
        Ascending list of plain Python numbers; empty if the column has no
        non-missing value.
    """
    # unique() keeps NaN, and sorting a list that contains NaN does not give a
    # well-ordered result, so missing values are removed up front.
    observed = series.dropna()
    if observed.nunique() < num_bins:
        return sorted(observed.unique().tolist())
    _, bin_edges = pd.qcut(observed, num_bins, retbins=True, duplicates='drop')
    return bin_edges.tolist()

In [175]:
# Column(s) used as the regression target.
TARGET_FEATURE_NAME = ["label"]

# Names of the numeric factor columns fed to the model as features.
NUMERIC_FEATURES = ['KMID', 'KLEN', 'KMID2', 'KUP', 'KUP2', 'KLOW', 'KLOW2', 'KSFT', 'KSFT2', 'OPEN0', 'OPEN1', 'OPEN2', 'OPEN3', 'OPEN4', 'HIGH0', 'HIGH1', 'HIGH2', 'HIGH3', 'HIGH4', 'LOW0', 'LOW1', 'LOW2', 'LOW3', 'LOW4', 'CLOSE0', 'CLOSE1', 'CLOSE2', 'CLOSE3', 'CLOSE4', 'VOLUME0', 'VOLUME1', 'VOLUME2', 'VOLUME3', 'VOLUME4', 'ROC5', 'ROC10', 'ROC20', 'ROC30', 'ROC60', 'MAX5', 'MAX10', 'MAX20', 'MAX30', 'MAX60', 'MIN5', 'MIN10', 'MIN20', 'MIN30', 'MIN60', 'MA5', 'MA10', 'MA20', 'MA30', 'MA60', 'STD5', 'STD10', 'STD20', 'STD30', 'STD60', 'BETA5', 'BETA10', 'BETA20', 'BETA30', 'BETA60', 'RSQR5', 'RSQR10', 'RSQR20', 'RSQR30', 'RSQR60', 'RESI5', 'RESI10', 'RESI20', 'RESI30', 'RESI60', 'QTLU5', 'QTLU10', 'QTLU20', 'QTLU30', 'QTLU60', 'QTLD5', 'QTLD10', 'QTLD20', 'QTLD30', 'QTLD60', 'TSRANK5', 'TSRANK10', 'TSRANK20', 'TSRANK30', 'TSRANK60', 'RSV5', 'RSV10', 'RSV20', 'RSV30', 'RSV60', 'IMAX5', 'IMAX10', 'IMAX20', 'IMAX30', 'IMAX60', 'IMIN5', 'IMIN10', 'IMIN20', 'IMIN30', 'IMIN60', 'IMXD5', 'IMXD10', 'IMXD20', 'IMXD30', 'IMXD60', 'CORD5', 'CORD10', 'CORD20', 'CORD30', 'CORD60', 'CNTP5', 'CNTP10', 'CNTP20', 'CNTP30', 'CNTP60', 'CNTN5', 'CNTN10', 'CNTN20', 'CNTN30', 'CNTN60', 'CNTD5', 'CNTD10', 'CNTD20', 'CNTD30', 'CNTD60', 'SUMP5', 'SUMP10', 'SUMP20', 'SUMP30', 'SUMP60', 'SUMN5', 'SUMN10', 'SUMN20', 'SUMN30', 'SUMN60', 'SUMD5', 'SUMD10', 'SUMD20', 'SUMD30', 'SUMD60', 'VMA5', 'VMA10', 'VMA20', 'VMA30', 'VMA60', 'VSTD5', 'VSTD10', 'VSTD20', 'VSTD30', 'VSTD60', 'WVMA5', 'WVMA10', 'WVMA20', 'WVMA30', 'WVMA60', 'VSUMP5', 'VSUMP10', 'VSUMP20', 'VSUMP30', 'VSUMP60', 'VSUMN5', 'VSUMN10', 'VSUMN20', 'VSUMN30', 'VSUMN60', 'VSUMD5', 'VSUMD10', 'VSUMD20', 'VSUMD30', 'VSUMD60']
# Bin boundaries per numeric feature, computed from the training split only.
NUMERIC_FEATURES_WITH_BOUNDARIES = {k: get_numeric_boundaries(train_data[k])  for k in NUMERIC_FEATURES}

# No integer-valued categorical features are configured; the names and their
# vocabularies are left empty.
INTEGER_CATEGORICAL_FEATURES = []
INTEGER_CATEGORICAL_FEATURES_WITH_VOCAB = {}

# No string-valued categorical features are configured either.
STRING_CATEGORICAL_FEATURES = []
STRING_CATEGORICAL_FEATURES_WITH_VOCAB = {}

# All model input columns: numeric first, then integer and string categoricals.
FEATURE_NAMES = NUMERIC_FEATURES + INTEGER_CATEGORICAL_FEATURES + STRING_CATEGORICAL_FEATURES

In [176]:
class Senet(tf.keras.layers.Layer):
    """SENET-style re-weighting of a list of per-feature embeddings.

    The layer takes a list of N tensors (one per feature field), averages each
    field over its embedding dimension, passes the N averages through two
    ReLU dense layers (N -> max(1, N // reduction_ratio) -> N) and multiplies
    every field embedding by its resulting weight. The un-weighted embeddings
    are added back as a skip connection.

    Args:
        reduction_ratio: Divisor applied to the number of fields to size the
            first dense layer.
        seed: Stored on the layer; nothing in this class reads it.
    """

    def __init__(self, reduction_ratio=3, seed=1024, **kwargs):
        super().__init__(**kwargs)
        self.reduction_ratio = reduction_ratio
        self.seed = seed

    def build(self, input_shape):
        # ``input_shape`` has one entry per incoming field tensor.
        num_fields = len(input_shape)
        self.field_size = num_fields
        self.reduction_size = max(1, num_fields // self.reduction_ratio)
        self.scale_layer = tf.keras.layers.Dense(units=self.reduction_size, activation='relu')
        self.expand_layer = tf.keras.layers.Dense(units=num_fields, activation='relu')
        super().build(input_shape)

    def call(self, inputs, training=None):
        stacked = tf.stack(inputs, axis=1)  # [B, N, dim]
        field_means = tf.reduce_mean(stacked, axis=-1)  # [B, N]
        squeezed = self.scale_layer(field_means)  # [B, X]
        field_weights = self.expand_layer(squeezed)  # [B, N]
        reweighted = tf.multiply(stacked, tf.expand_dims(field_weights, axis=-1))
        return reweighted + stacked  # skip-connection, [B, N, dim]


class Dnn(tf.keras.layers.Layer):
    """Stack of fully connected blocks.

    Each block applies a ``Dense`` layer (with the configured activation),
    then ``BatchNormalization`` when ``use_bn`` is set, then ``Dropout``.

    Args:
        hidden_units: Iterable with the width of every dense layer, in order.
        activation: Activation passed to each ``Dense`` layer.
        dropout_rate: Rate passed to each ``Dropout`` layer.
        use_bn: Whether a ``BatchNormalization`` layer follows each dense layer.
        seed: Seed passed to each ``Dropout`` layer.
    """

    def __init__(self, hidden_units, activation="relu", dropout_rate=0.2, use_bn=False, seed=1024, **kwargs):
        super().__init__(**kwargs)
        self.hidden_units = hidden_units
        self.activation = activation
        self.dropout_rate = dropout_rate
        self.use_bn = use_bn
        self.seed = seed
        # Filled in ``build``; ``bn_layers`` stays empty when ``use_bn`` is False.
        self.dense_layers = []
        self.dropout_layers = []
        self.bn_layers = []

    def build(self, input_shape):
        for width in self.hidden_units:
            self.dense_layers.append(tf.keras.layers.Dense(units=width, activation=self.activation))
            self.dropout_layers.append(tf.keras.layers.Dropout(rate=self.dropout_rate, seed=self.seed))
            if self.use_bn:
                self.bn_layers.append(tf.keras.layers.BatchNormalization())
        # The parent build must run last.
        super().build(input_shape)

    def call(self, inputs, training=False):
        hidden = inputs
        for depth, _ in enumerate(self.hidden_units):
            hidden = self.dense_layers[depth](hidden)
            if self.use_bn:
                hidden = self.bn_layers[depth](hidden, training=training)
            hidden = self.dropout_layers[depth](hidden, training=training)
        return hidden

In [177]:
class QuantModel(tf.keras.Model):
    """Regression model: per-feature embeddings -> Senet -> Dnn -> one output.

    Every input feature is first mapped to an integer id (``Discretization``
    for numeric features, ``IntegerLookup`` / ``StringLookup`` for categorical
    ones) and then embedded. The embeddings are re-weighted by ``Senet``,
    flattened, passed through ``Dnn`` and projected to a single linear output.

    Args:
        config: Dict of settings. Keys read here:
            ``numeric_features_with_boundaries`` (name -> bin boundaries),
            ``integer_categorical_features_with_vocab`` (name -> vocabulary),
            ``string_categorical_features_with_vocab`` (name -> vocabulary),
            ``feature_embedding_dims`` (default 6), ``reduction_ratio``
            (default 3), ``seed`` (default 1024), ``dnn_hidden_units``
            (default [64, 32]), ``dnn_activation`` (default 'relu'),
            ``dnn_dropout`` (default 0.2), ``dnn_use_bn`` (default True).
            A missing or ``None`` feature group is treated as empty.
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        embedding_dim = self.config.get("feature_embedding_dims", 6)
        seed = self.config.get('seed', 1024)

        # Pre-built layers, keyed by feature name.
        self.lookup_layers = {}
        self.embedding_layers = {}

        # Numeric features: discretization layer + embedding layer.
        # len(boundaries) boundaries produce len(boundaries) + 1 bins.
        for feature_name, boundaries in (self.config.get("numeric_features_with_boundaries") or {}).items():
            self.lookup_layers[feature_name] = tf.keras.layers.Discretization(bin_boundaries=boundaries, output_mode='int', name=f'{feature_name}_lookup')
            self.embedding_layers[feature_name] = tf.keras.layers.Embedding(input_dim=len(boundaries) + 1, output_dim=embedding_dim, name=f'{feature_name}_embedding')
        # Integer categorical features: lookup layer + embedding layer.
        for feature_name, vocab in (self.config.get("integer_categorical_features_with_vocab") or {}).items():
            self.lookup_layers[feature_name] = tf.keras.layers.IntegerLookup(vocabulary=vocab, name=f'{feature_name}_lookup')
            self.embedding_layers[feature_name] = tf.keras.layers.Embedding(input_dim=len(vocab) + 1, output_dim=embedding_dim, name=f'{feature_name}_embedding')
        # String categorical features: lookup layer + embedding layer.
        for feature_name, vocab in (self.config.get("string_categorical_features_with_vocab") or {}).items():
            self.lookup_layers[feature_name] = tf.keras.layers.StringLookup(vocabulary=vocab, name=f'{feature_name}_lookup')
            self.embedding_layers[feature_name] = tf.keras.layers.Embedding(input_dim=len(vocab) + 1, output_dim=embedding_dim, name=f'{feature_name}_embedding')

        self.senet_layer = Senet(
            reduction_ratio=self.config.get('reduction_ratio', 3),
            seed=seed,
        )
        # Created once here instead of on every ``call``.
        self.flatten_layer = tf.keras.layers.Flatten()
        self.dnn_layer = Dnn(
            hidden_units=self.config.get('dnn_hidden_units', [64, 32]),
            activation=self.config.get('dnn_activation', 'relu'),
            dropout_rate=self.config.get('dnn_dropout', 0.2),
            use_bn=self.config.get('dnn_use_bn', True),
            # Forward the configured seed so dropout honours it as well.
            seed=seed,
        )
        self.output_layer = tf.keras.layers.Dense(1, activation=None)

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Dict mapping feature name to a batch of raw feature values.
                Every key must be one of the configured features.
            training: Forwarded to the sub-layers.

        Returns:
            Tensor with one prediction per example.

        Raises:
            ValueError: If ``inputs`` is not a dict.
        """
        if not isinstance(inputs, dict):
            raise ValueError('The inputs to the model should be a dictionary where keys are feature names.')

        # Encode each feature with its pre-built lookup and embedding layers.
        encoded_features = []
        for feature_name in inputs:
            lookup_layer = self.lookup_layers[feature_name]
            embedding_layer = self.embedding_layers[feature_name]
            encoded_features.append(embedding_layer(lookup_layer(inputs[feature_name])))

        senet_output = self.senet_layer(encoded_features, training=training)
        senet_output = self.flatten_layer(senet_output)  # [B, N * dim]
        dnn_output = self.dnn_layer(senet_output, training=training)
        output = self.output_layer(dnn_output, training=training)
        return output

In [178]:
def df_to_dataset(dataframe, feature_cols, label_cols, shuffle=True, batch_size=32):
  """Convert a DataFrame into a batched ``tf.data.Dataset``.

  Each dataset element is a ``(features, labels)`` pair, where ``features`` is
  a dict keyed by feature column name and ``labels`` holds the label columns.

  Args:
    dataframe: Source table.
    feature_cols: List of feature column names.
    label_cols: List of label column names.
    shuffle: When true, shuffle with a buffer as large as the whole table.
    batch_size: Batch size; the same number is passed to ``prefetch``.

  Returns:
    The batched and prefetched dataset.
  """
  feature_frame = dataframe[feature_cols]
  label_frame = dataframe[label_cols]
  dataset = tf.data.Dataset.from_tensor_slices((dict(feature_frame), label_frame))
  if shuffle:
    dataset = dataset.shuffle(buffer_size=len(feature_frame))
  return dataset.batch(batch_size).prefetch(batch_size)

# Training batches are shuffled; validation batches keep the row order of
# validation_data.
train_ds = df_to_dataset(train_data, FEATURE_NAMES, TARGET_FEATURE_NAME, shuffle=True)
val_ds = df_to_dataset(validation_data, FEATURE_NAMES, TARGET_FEATURE_NAME, shuffle=False)

In [179]:
# Hyper-parameters and feature definitions consumed by QuantModel.
model_config = {
    "seed": 1024,
    "reduction_ratio": 3,
    "dnn_hidden_units": [64,32],
    "dnn_activation": 'relu',
    "dnn_dropout": 0.2,
    "dnn_use_bn": True,
    "numeric_features_with_boundaries": NUMERIC_FEATURES_WITH_BOUNDARIES,
    "integer_categorical_features_with_vocab": INTEGER_CATEGORICAL_FEATURES_WITH_VOCAB,
    "string_categorical_features_with_vocab": STRING_CATEGORICAL_FEATURES_WITH_VOCAB,
    "feature_embedding_dims": 6
}

model = QuantModel(model_config)

optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-3)
loss = tf.keras.losses.MeanSquaredError()
# The monitored quantity is a loss, so lower is better: mode must be 'min'.
# With 'max' the callback would count a rising val_loss as an improvement and
# restore_best_weights would select the worst epoch.
# NOTE(review): patience equals the number of epochs below, so the callback
# cannot stop this run early; lower patience or raise epochs if early stopping
# is wanted.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    verbose=1,
    patience=10,
    mode='min',
    restore_best_weights=True,
)

model.compile(optimizer, loss=loss)
model.fit(
        train_ds, 
        validation_data=val_ds, 
        epochs=10,
        verbose=2,
        callbacks=[early_stopping])

Epoch 1/10
2927/2927 - 21s - loss: 0.0867 - val_loss: 0.0029 - 21s/epoch - 7ms/step
Epoch 2/10
2927/2927 - 15s - loss: 0.0023 - val_loss: 0.0032 - 15s/epoch - 5ms/step
Epoch 3/10
2927/2927 - 15s - loss: 0.0023 - val_loss: 0.0030 - 15s/epoch - 5ms/step
Epoch 4/10
2927/2927 - 15s - loss: 0.0023 - val_loss: 0.0030 - 15s/epoch - 5ms/step
Epoch 5/10
2927/2927 - 15s - loss: 0.0023 - val_loss: 0.0030 - 15s/epoch - 5ms/step
Epoch 6/10
2927/2927 - 15s - loss: 0.0022 - val_loss: 0.0030 - 15s/epoch - 5ms/step
Epoch 7/10
2927/2927 - 15s - loss: 0.0022 - val_loss: 0.0030 - 15s/epoch - 5ms/step
Epoch 8/10
2927/2927 - 16s - loss: 0.0021 - val_loss: 0.0031 - 16s/epoch - 5ms/step
Epoch 9/10
2927/2927 - 17s - loss: 0.0021 - val_loss: 0.0031 - 17s/epoch - 6ms/step
Epoch 10/10
2927/2927 - 15s - loss: 0.0020 - val_loss: 0.0031 - 15s/epoch - 5ms/step


<keras.src.callbacks.History at 0x3867e3160>

In [180]:
# baseline_model.save('./stock_selection_base_model')
# reloaded_model = tf.keras.models.load_model('./stock_selection_base_model')

In [181]:
# Collect the validation labels batch by batch, in dataset order.
test_labels = []
for _, labels in val_ds:
    # reshape(-1) rather than squeeze(): squeeze() turns a one-row batch into a
    # 0-d array, which list.extend cannot iterate over.
    test_labels.extend(labels.numpy().reshape(-1))

# Flatten the predictions to a plain list with one value per row. reshape(-1)
# keeps this a list even when there is a single row, where
# squeeze().tolist() would return a bare float.
test_predictions = model.predict(val_ds).reshape(-1).tolist()



In [188]:
# Preview the first rows of the validation split.
validation_data.head()

Unnamed: 0,stock_code,datetime,open,high,low,close,volume,label,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,OPEN1,OPEN2,OPEN3,OPEN4,HIGH0,HIGH1,HIGH2,HIGH3,HIGH4,LOW0,LOW1,LOW2,LOW3,LOW4,CLOSE0,CLOSE1,CLOSE2,CLOSE3,CLOSE4,VOLUME0,VOLUME1,VOLUME2,VOLUME3,VOLUME4,ROC5,ROC10,ROC20,ROC30,ROC60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,TSRANK5,TSRANK10,TSRANK20,TSRANK30,TSRANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX10,IMAX20,IMAX30,IMAX60,IMIN5,IMIN10,IMIN20,IMIN30,IMIN60,IMXD5,IMXD10,IMXD20,IMXD30,IMXD60,CORD5,CORD10,CORD20,CORD30,CORD60,CNTP5,CNTP10,CNTP20,CNTP30,CNTP60,CNTN5,CNTN10,CNTN20,CNTN30,CNTN60,CNTD5,CNTD10,CNTD20,CNTD30,CNTD60,SUMP5,SUMP10,SUMP20,SUMP30,SUMP60,SUMN5,SUMN10,SUMN20,SUMN30,SUMN60,SUMD5,SUMD10,SUMD20,SUMD30,SUMD60,VMA5,VMA10,VMA20,VMA30,VMA60,VSTD5,VSTD10,VSTD20,VSTD30,VSTD60,WVMA5,WVMA10,WVMA20,WVMA30,WVMA60,VSUMP5,VSUMP10,VSUMP20,VSUMP30,VSUMP60,VSUMN5,VSUMN10,VSUMN20,VSUMN30,VSUMN60,VSUMD5,VSUMD10,VSUMD20,VSUMD30,VSUMD60
60,688981,2020-10-16,59.5,59.86,57.7,58.21,380308,-0.021713,-0.021681,0.036303,-0.597222,0.00605,0.166667,0.008571,0.236111,-0.01916,-0.527778,1.022161,0.96547,0.984367,0.972342,0.88301,1.028346,1.028346,0.986944,0.991926,0.999313,0.991239,0.96547,0.958083,0.960831,0.87923,1.0,0.984367,0.961347,0.984195,0.986257,1.0,1.487063,0.759642,1.172229,2.199359,0.875107,0.936265,0.906717,1.154956,1.424498,1.028346,1.028346,1.028346,1.028346,1.030751,0.085896,0.171792,0.343584,0.515375,0.991239,0.983233,0.928517,0.946676,0.984768,1.144491,0.013888,0.061417,0.047926,0.084887,0.180822,0.002766,0.015706,-0.000789,-0.006342,-0.009381,0.099158,0.599459,0.009497,0.432588,0.820992,0.011235,0.000806,0.060824,0.107192,0.132262,0.989005,0.984745,0.984745,1.028586,1.314757,0.979625,0.85999,0.913039,0.928157,0.96492,5.0,10.0,20.0,23.0,23.0,0.969923,0.966907,0.958605,0.944742,0.221739,0.2,0.1,0.05,0.966667,0.8,0.8,0.7,0.35,0.233333,0.116667,-0.6,-0.6,-0.3,0.733333,0.683333,0.855801,0.283967,0.312768,0.044704,0.087857,0.6,0.5,0.5,0.4,0.433333,0.4,0.5,0.5,0.6,0.566667,0.2,0.0,0.0,-0.2,-0.133333,0.857424,0.61168,0.603824,0.399375,0.382165,0.142576,0.38832,0.396176,0.600625,0.617835,0.714848,0.223359,0.207648,-0.201249,-0.23567,1.323659,1.076814,1.006073,1.140756,1.782748,0.556631,0.516227,0.419912,0.569329,1.234033,1.674423,1.762494,1.902575,1.932545,1.560068,0.525113,0.531506,0.498254,0.476893,0.326698,0.474887,0.468494,0.501746,0.523107,0.673302,0.050227,0.063012,-0.003491,-0.046214,-0.346603
61,688981,2020-10-19,58.49,61.1,57.76,60.28,590069,-0.020938,0.030604,0.057104,0.535928,0.014019,0.245509,0.012481,0.218563,0.029065,0.508982,0.970305,0.98706,0.932316,0.950564,0.938952,1.013603,0.993033,0.993033,0.953052,0.957863,0.958195,0.9572,0.932316,0.925182,0.927837,1.0,0.96566,0.950564,0.928334,0.950398,1.0,0.644514,0.958434,0.4896,0.755518,0.952389,0.890843,0.916224,1.10783,1.278368,1.013603,1.013603,1.013603,1.013603,1.013603,0.082946,0.165893,0.331785,0.497678,0.958195,0.958991,0.907548,0.918356,0.947357,1.10055,0.026509,0.067591,0.050109,0.077074,0.173629,0.013653,0.020419,0.000496,-0.004726,-0.008938,0.663146,0.836558,0.003433,0.291375,0.808278,0.013703,0.000567,0.076929,0.121169,0.163128,0.972528,0.955043,0.953318,0.980358,1.262376,0.945985,0.830458,0.881685,0.896284,0.931785,5.0,10.0,20.0,25.0,25.0,0.985383,0.983953,0.980049,0.973633,0.754491,0.0,0.0,0.0,0.966667,0.816667,0.6,0.8,0.4,0.266667,0.133333,-0.6,-0.8,-0.4,0.7,0.683333,0.739526,0.283248,0.340614,0.069698,0.059791,0.6,0.6,0.5,0.433333,0.45,0.4,0.4,0.5,0.566667,0.55,0.2,0.2,0.0,-0.133333,-0.1,0.7487,0.684004,0.597982,0.430017,0.41698,0.2513,0.315996,0.402018,0.569983,0.58302,0.4974,0.368009,0.195964,-0.139966,-0.16604,0.769613,0.760381,0.661644,0.745208,1.103648,0.213885,0.319375,0.281365,0.370025,0.720208,0.805623,1.583658,1.897444,1.857509,1.514124,0.398962,0.57812,0.525345,0.516388,0.431484,0.601038,0.42188,0.474655,0.483612,0.568516,-0.202075,0.15624,0.050691,0.032775,-0.137032
62,688981,2020-10-20,59.7,59.98,58.4,59.5,350670,-0.016586,-0.00335,0.026466,-0.126582,0.00469,0.177215,0.018425,0.696203,0.010385,0.392405,1.003361,0.983025,1.0,0.944538,0.963025,1.008067,1.026891,1.00605,1.00605,0.965546,0.981513,0.970756,0.969748,0.944538,0.937311,1.0,1.013109,0.978319,0.963025,0.940504,1.0,1.68269,1.084518,1.612747,0.823846,0.962857,0.839328,0.922353,1.120672,1.330588,1.008067,1.008067,1.008067,1.008067,1.008403,0.084034,0.168067,0.336134,0.504202,0.981513,0.978992,0.935513,0.934277,0.955754,1.109468,0.028905,0.066408,0.053037,0.072412,0.174206,0.016908,0.019739,0.00137,-0.003419,-0.008878,0.855367,0.809891,0.02336,0.172742,0.792224,-0.012807,-0.024339,0.052706,0.093818,0.152447,1.002622,0.982655,0.969613,0.990588,1.27321,0.958521,0.853277,0.893244,0.908034,0.944,4.0,9.0,19.0,25.0,25.0,0.99127,0.990396,0.987994,0.983989,0.6875,0.2,0.1,0.05,0.866667,0.833333,0.8,0.9,0.45,0.3,0.15,-0.6,-0.8,-0.4,0.566667,0.683333,0.775923,0.856863,0.349286,0.07798,0.063255,0.6,0.6,0.5,0.433333,0.433333,0.4,0.4,0.5,0.566667,0.566667,0.2,0.2,0.0,-0.133333,-0.133333,0.671851,0.820805,0.588168,0.423812,0.401384,0.328149,0.179195,0.411832,0.576188,0.598616,0.343701,0.641611,0.176336,-0.152377,-0.197233,1.24076,1.220876,1.113888,1.256331,1.765095,0.384015,0.532189,0.473306,0.621489,1.050702,0.663189,1.883316,1.876164,1.839003,1.566983,0.455458,0.456964,0.500593,0.502257,0.418448,0.544542,0.543036,0.499407,0.497743,0.581552,-0.089085,-0.086071,0.001185,0.004514,-0.163105
63,688981,2020-10-21,59.69,59.8,56.92,57.65,351384,0.037238,-0.034177,0.048249,-0.708333,0.001843,0.038194,0.01223,0.253472,-0.02379,-0.493056,1.035386,1.035559,1.014571,1.03209,0.974848,1.037294,1.040416,1.059844,1.038335,1.038335,0.987337,1.01301,1.001908,1.000867,0.974848,1.0,1.03209,1.04562,1.009714,0.993929,1.0,0.997968,1.679271,1.082315,1.60947,0.970685,0.868864,0.978144,1.133565,1.36392,1.037294,1.037294,1.037294,1.037294,1.040763,0.08673,0.173461,0.346921,0.520382,0.987337,1.016271,0.978647,0.965351,0.981972,1.139005,0.021901,0.060002,0.055247,0.069461,0.178421,0.003452,0.015808,0.002144,-0.002426,-0.00903,0.062107,0.636262,0.052711,0.094542,0.781277,-0.023174,-0.049783,0.014281,0.053206,0.127387,1.034796,1.014189,1.000729,1.011761,1.31098,0.998786,0.95327,0.921908,0.937173,0.974293,2.0,7.0,15.0,21.0,21.0,0.960766,0.956827,0.94598,0.927852,0.237013,0.4,0.2,0.1,0.9,0.85,0.8,0.9,0.5,0.333333,0.166667,-0.4,-0.7,-0.4,0.566667,0.683333,0.566542,0.828119,0.332559,0.078429,0.062726,0.6,0.5,0.45,0.433333,0.433333,0.4,0.5,0.55,0.566667,0.566667,0.2,0.0,-0.1,-0.133333,-0.133333,0.621583,0.727711,0.523738,0.419186,0.39618,0.378417,0.272289,0.476262,0.580814,0.60382,0.243165,0.455422,0.047476,-0.161629,-0.207641,1.273805,1.236137,1.09191,1.256358,1.701375,0.34088,0.519247,0.468135,0.61896,0.984027,0.585686,1.755317,1.895445,1.820685,1.567168,0.534267,0.514694,0.477556,0.502456,0.44341,0.465733,0.485306,0.522444,0.497544,0.55659,0.068534,0.029387,-0.044887,0.004912,-0.113181
64,688981,2020-10-22,57.2,58.8,56.38,57.67,288265,0.056944,0.008217,0.042308,0.194215,0.019755,0.466942,0.014336,0.338843,0.002797,0.066116,0.99185,1.035027,1.0352,1.014219,1.031732,1.019594,1.036934,1.040055,1.059476,1.037975,0.977631,0.986995,1.012658,1.001561,1.00052,1.0,0.999653,1.031732,1.045257,1.009364,1.0,1.218962,1.216485,2.046967,1.3193,0.993584,0.860933,0.98318,1.11271,1.379747,1.019594,1.019594,1.019594,1.019594,1.040402,0.0867,0.1734,0.346801,0.520201,0.977631,1.017201,0.992214,0.965857,0.977874,1.132281,0.020393,0.043639,0.055645,0.065008,0.176385,-0.006433,0.009161,0.00293,-0.001403,-0.008846,0.248778,0.403946,0.097027,0.036084,0.767075,-0.004335,-0.033438,0.006309,0.042465,0.128665,1.034437,1.013837,1.000381,1.002289,1.305601,0.999931,0.988798,0.921588,0.936848,0.973955,2.0,7.0,15.0,22.0,22.0,0.978996,0.976844,0.970876,0.960764,0.356354,0.6,0.3,0.15,0.933333,0.866667,0.0,0.9,0.55,0.366667,0.183333,0.6,-0.6,-0.4,0.566667,0.683333,0.47516,0.825326,0.332091,0.079005,0.065186,0.6,0.6,0.45,0.466667,0.433333,0.4,0.4,0.55,0.533333,0.566667,0.2,0.2,-0.1,-0.066667,-0.133333,0.53286,0.747837,0.518476,0.430077,0.390631,0.46714,0.252163,0.481524,0.569923,0.609369,0.065719,0.495674,0.036952,-0.139845,-0.218738,1.360343,1.526437,1.294749,1.522058,2.013113,0.401145,0.611386,0.567312,0.759443,1.158566,0.935652,1.780842,1.920728,1.850042,1.593504,0.301442,0.513303,0.466749,0.492688,0.452138,0.698558,0.486697,0.533251,0.507312,0.547862,-0.397116,0.026606,-0.066503,-0.014624,-0.095723


In [189]:
# Build the back-test table with ``assign``, which returns a new DataFrame.
# Writing the column onto a frame derived from a slice of ``df`` triggers
# pandas' SettingWithCopyWarning (hidden here by the global warning filter).
# Predictions are attached positionally, in the row order of validation_data.
backtest_df = validation_data[
    ['stock_code', 'datetime', 'open', 'high', 'low', 'close', 'volume', 'label']
].assign(prediction=test_predictions)
# Show the 20 rows with the highest predicted return.
backtest_df.sort_values(by=['prediction'], ascending=False).head(20)

Unnamed: 0,stock_code,datetime,open,high,low,close,volume,label,prediction
112,688599,2020-11-25,22.08,23.2,21.66,21.8,470519,-0.000456,0.162605
533,603501,2020-03-24,140.2,140.23,132.24,137.57,62488,0.075053,0.150638
1372,603799,2020-11-10,85.64,86.24,81.75,84.42,495770,0.141865,0.140389
3068,601919,2020-11-11,8.66,8.66,8.01,8.02,2919605,0.212792,0.137548
5321,600089,2020-03-11,345.91,366.4,338.03,343.81,1580960,0.087536,0.133135
3767,600438,2020-06-29,145.89,153.73,145.89,147.49,1095344,0.09556,0.128173
6326,600690,2020-08-06,777.42,798.26,757.87,770.25,1132858,0.020526,0.118324
3716,600438,2020-04-10,104.89,104.89,100.49,101.05,1207549,0.102409,0.116239
5449,600104,2020-11-10,166.3,166.3,158.93,160.97,1075551,-0.015941,0.114778
1290,603799,2020-07-09,81.09,83.08,81.04,82.49,579147,-0.092581,0.11428


In [193]:
# Keep the identifier, date and prediction columns and rename them to
# instrument / date / pred for export.
output_column_names = {
    'stock_code': 'instrument',
    'datetime': 'date',
    'prediction': 'pred',
}
output = backtest_df[list(output_column_names)].rename(columns=output_column_names)
output.head()

Unnamed: 0,instrument,date,pred
60,688981,2020-10-16,-0.000408
61,688981,2020-10-19,0.000736
62,688981,2020-10-20,-0.041374
63,688981,2020-10-21,-0.004999
64,688981,2020-10-22,0.029366


In [194]:
# Write the predictions to CSV; the DataFrame index is not written.
output.to_csv('./stock_selection_result.csv', index=False)