In [1]:
import sys
sys.path.append('../')

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import sqlite3
import akshare as ak
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import datetime
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

from database_auto.db_data_downloader.downloader_base import DownloaderBase
import database_auto.database_config as db_config

pd.options.display.max_rows=None
pd.options.display.max_columns=None

!python --version

Python 3.8.10


In [2]:
import tensorflow as tf

# 只使用CPU进行训练
tf.config.set_visible_devices([], 'GPU')

# 打印Tensorflow版本
print(f"Tensorflow Version: {tf.__version__}")

# 检查是否有可用的GPU设备
if tf.test.is_built_with_cuda():
    print("TensorFlow GPU version is installed")
else:
    print("TensorFlow CPU version is installed")

# 检查TensorFlow是否能够访问GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print("GPU devices available:", gpus)
else:
    print("No GPU devices found. Running on CPU.")

# !nvidia-smi

2024-05-28 08:06:04.684345: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Tensorflow Version: 2.13.1
TensorFlow GPU version is installed
GPU devices available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# 绘图相关函数
def plot_series_dist(series):
    data = series
    plt.figure(figsize=(5,5))
    # 使用matplotlib画直方图
    plt.hist(data, bins=60, edgecolor='k', alpha=0.7)
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.title('Histogram of Data')
    plt.show()

def plot_metrics(history):
  metrics = ['loss', 'mean_squared_error']
  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(2,2,n+1)
    plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
    plt.plot(history.epoch, history.history['val_'+metric],
             color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    plt.ylim([0, plt.ylim()[1]])
    plt.legend()

def plot_cm(true_labels, pred_labels):
    cm = confusion_matrix(true_labels, pred_labels)
    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="g", cmap='Blues')
    plt.title('Confusion matrix')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels)
    recall = recall_score(true_labels, pred_labels)
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

In [4]:
class PreProcessing:
    def __init__(self, db_downloader:DownloaderBase) -> None:
        self.db_downloader = db_downloader

    def _build_reg_label(self, stock_dataframe, N=15):
        stock_df = stock_dataframe.copy()
        # 计算未来N日内的最高收益率
        stock_df['max_return'] = stock_df['close'].rolling(window=N).max().shift(-N) / stock_df['close'] - 1
        # 计算未来N日内的最低收益率
        stock_df['min_return'] = stock_df['close'].rolling(window=N).min().shift(-N) / stock_df['close'] - 1
        # 计算未来N日内的收益率和（期望最高收益率越高越好，最低收益率也越高越好，由于最低收益率是负数，因此使用最高+最低来作为综合收益指标）
        stock_df['return_sum'] = stock_df['max_return'] + stock_df['min_return']
        stock_df['label'] = stock_df['return_sum']
        # 过滤第二天涨停股票
        stock_df = stock_df[stock_df['high'].shift(-1) != stock_df['low'].shift(-1)]
        return stock_df[['datetime', 'label']]

    def _process_one_stock(self, stock_code, start_date, end_date):
        stock_history = self.db_downloader._download_history_base_info(stock_code, start_date, end_date)
        stock_history = stock_history.replace("", np.NaN).ffill()
        stock_profile = self.db_downloader._download_all_stock_info(stock_code)
        stock_indicator = self.db_downloader._download_history_indicator_info(stock_code, start_date, end_date)
        stock_indicator = stock_indicator.replace("", np.NaN).ffill()
        stock_factor_date = self.db_downloader._download_history_date_factor_info(start_date, end_date)
        stock_factor_alpha158 = self.db_downloader._download_history_alpha158_factor_info(stock_code, start_date, end_date)
        stock_label = self._build_reg_label(stock_history)

        stock_df = stock_history.merge(stock_profile, on=['code'], how='left') \
                .merge(stock_indicator, on=['code', 'datetime'], how='left') \
                .merge(stock_factor_alpha158, on=['code', 'datetime'], how='left') \
                .merge(stock_factor_date, on=['datetime'], how='left') \
                .merge(stock_label, on=['datetime'], how='left')
        stock_df = stock_df.dropna()
        return stock_df

    def _get_index_hist_cons(self, benchmark, start_date, end_date):
        import baostock as bs
        bs.login()
        index_stock_cons_set = set()
        for cur_date in tqdm(pd.date_range(start=start_date, end=end_date, freq='B')):
            cur_date = datetime.strftime(cur_date, '%Y-%m-%d')
            if benchmark == '000016':
                # print("开始处理上证50...")
                dataframe = bs.query_sz50_stocks(date=cur_date).get_data()
            elif benchmark == '000300':
                # print("开始处理沪深300...")
                dataframe = bs.query_hs300_stocks(date=cur_date).get_data()
            elif benchmark == '000905':
                # print("开始处理中证500...")
                dataframe = bs.query_zz500_stocks(date=cur_date).get_data()
            if not dataframe.empty:
                index_stock_cons_set.update(
                    dataframe['code'].unique()
                )
        bs.logout()
        return index_stock_cons_set

    def _process_all_stock(self, benchmark, start_date, end_date):
        # 获取区间内benchmark的所有成份股
        # stock_code_list = self._get_index_hist_cons(benchmark, start_date, end_date)
        stock_code_list = list(ak.index_stock_cons(benchmark)['品种代码'].map(lambda x: 'sh.'+str(x)).unique()) # 获取指数成分股列表
        stock_df_list = []
        for stock_code in tqdm(stock_code_list, desc=f'Process: {benchmark} ...'):
            stock_df = self._process_one_stock(stock_code, start_date, end_date)
            if not stock_df.empty:
                stock_df_list.append(stock_df)
        return pd.concat(stock_df_list)

In [15]:
# import baostock as bs
# bs.login()
# bs.query_sz50_stocks(date='2023-01-01').get_data()

In [5]:
def get_rolling_data_period(backtest_start_date, backtest_duration=5, train_period=6, val_period=0.5, test_period=0.5):
    """
    Args:
        backtest_start_date (_type_): _description_
        backtest_duration (int, optional): _description_. Defaults to 5.
        train_period (int, optional): _description_. Defaults to 6.
        val_period (float, optional): _description_. Defaults to 0.5.
        test_period (float, optional): _description_. Defaults to 0.5.
    Returns:
        result: _description_
    """
    backtest_start_date = datetime.strptime(backtest_start_date, '%Y%m%d')
    backtest_end_date = backtest_start_date + relativedelta(years=backtest_duration) # 回测5年数据
    train_period = relativedelta(years=train_period) # 使用6年的训练数据
    val_period = relativedelta(months=(12 * val_period)) # 使用半年的验证数据
    test_period = relativedelta(months=(12 * test_period)) # 使用半年的测试数据(半年模型一更新)

    result = []
    rolling_flag = True
    bench_date = backtest_start_date
    while rolling_flag:
        if bench_date < backtest_end_date:
            test_start, test_end = bench_date, (bench_date + test_period - relativedelta(days=1))
            val_start, val_end = (test_start - relativedelta(days=1) - val_period), (test_start - relativedelta(days=1))
            train_start, train_end =(val_start - relativedelta(days=1) - train_period), (val_start - relativedelta(days=1))
            result.append({
                "train": [train_start.strftime("%Y%m%d"), train_end.strftime("%Y%m%d")],
                "val": [val_start.strftime("%Y%m%d"), val_end.strftime("%Y%m%d")],
                "test": [test_start.strftime("%Y%m%d"), test_end.strftime("%Y%m%d")]
            })
            bench_date += test_period
        else:
            rolling_flag = False 
    return result

In [6]:
def extract_train_val_data(df, train_start_date, train_end_date, val_start_date, val_end_date, test_start_date, test_end_date):
    train_start_date = pd.to_datetime(train_start_date)
    train_end_date = pd.to_datetime(train_end_date)
    val_start_date = pd.to_datetime(val_start_date)
    val_end_date = pd.to_datetime(val_end_date)
    test_start_date = pd.to_datetime(test_start_date)
    test_end_date = pd.to_datetime(test_end_date)

    train_data = df[(pd.to_datetime(df['datetime']) >= train_start_date) & (pd.to_datetime(df['datetime']) <= train_end_date)]
    val_data = df[(pd.to_datetime(df['datetime']) >= val_start_date) & (pd.to_datetime(df['datetime']) <= val_end_date)]
    test_data = df[(pd.to_datetime(df['datetime']) >= test_start_date) & (pd.to_datetime(df['datetime']) <= test_end_date)]

    print(f"train_data_size: {train_data.shape}")
    print(f"validation_data_size: {val_data.shape}")
    print(f"test_data_size: {test_data.shape}")
    return train_data, val_data, test_data

def df_to_dataset(dataframe, feature_cols, label_cols, shuffle=False, batch_size=32):
    features = dataframe[feature_cols]
    labels = dataframe[label_cols]
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=min(len(features), 10000), seed=1024)
    ds = ds.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler


class QuantileClipTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, lower_quantile=0.01, upper_quantile=0.99):
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile
        
    def fit(self, X, y=None):
        # 计算给定分位数的分界值
        self.lower_bound_ = np.nanquantile(X, self.lower_quantile, axis=0)
        self.upper_bound_ = np.nanquantile(X, self.upper_quantile, axis=0)
        return self
    
    def transform(self, X, y=None):
        # 对整个数组应用剪辑操作
        return np.clip(X, self.lower_bound_, self.upper_bound_)

    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X, y).transform(X)

In [8]:
# 数据库初始化
db_conn = sqlite3.connect('../database_auto/hh_quant_auto.db')
db_downloader = DownloaderBase(db_conn, db_config)
proprocessor = PreProcessing(db_downloader=db_downloader)

# 相关配置
rolling_flag = False
# benchmark = '000905'
benchmark = '000016'  # 上证50
feature_config = {
    "target_features": ['label'],
    "numeric_features": ['turnover_rate', 'pe_ttm', 'ps_ttm', 'pcf_ncf_ttm', 'pb_mrq', 'KMID', 'KLEN', 'KMID2', 'KUP', 'KUP2', 'KLOW', 'KLOW2', 'KSFT', 'KSFT2', 'OPEN0', 'OPEN1', 'OPEN2', 'OPEN3', 'OPEN4', 'HIGH0', 'HIGH1', 'HIGH2', 'HIGH3', 'HIGH4', 'LOW0', 'LOW1', 'LOW2', 'LOW3', 'LOW4', 'CLOSE1', 'CLOSE2', 'CLOSE3', 'CLOSE4', 'VOLUME1', 'VOLUME2', 'VOLUME3', 'VOLUME4', 'ROC5', 'ROC10', 'ROC20', 'ROC30', 'ROC60', 'MAX5', 'MAX10', 'MAX20', 'MAX30', 'MAX60', 'MIN5', 'MIN10', 'MIN20', 'MIN30', 'MIN60', 'MA5', 'MA10', 'MA20', 'MA30', 'MA60', 'STD5', 'STD10', 'STD20', 'STD30', 'STD60', 'BETA5', 'BETA10', 'BETA20', 'BETA30', 'BETA60', 'RSQR5', 'RSQR10', 'RSQR20', 'RSQR30', 'RSQR60', 'RESI5', 'RESI10', 'RESI20', 'RESI30', 'RESI60', 'QTLU5', 'QTLU10', 'QTLU20', 'QTLU30', 'QTLU60', 'QTLD5', 'QTLD10', 'QTLD20', 'QTLD30', 'QTLD60', 'TSRANK5', 'TSRANK10', 'TSRANK20', 'TSRANK30', 'TSRANK60', 'RSV5', 'RSV10', 'RSV20', 'RSV30', 'RSV60', 'IMAX5', 'IMAX10', 'IMAX20', 'IMAX30', 'IMAX60', 'IMIN5', 'IMIN10', 'IMIN20', 'IMIN30', 'IMIN60', 'IMXD5', 'IMXD10', 'IMXD20', 'IMXD30', 'IMXD60', 'CORR5', 'CORR10', 'CORR20', 'CORR30', 'CORR60', 'CORD5', 'CORD10', 'CORD20', 'CORD30', 'CORD60', 'CNTP5', 'CNTP10', 'CNTP20', 'CNTP30', 'CNTP60', 'CNTN5', 'CNTN10', 'CNTN20', 'CNTN30', 'CNTN60', 'CNTD5', 'CNTD10', 'CNTD20', 'CNTD30', 'CNTD60', 'SUMP5', 'SUMP10', 'SUMP20', 'SUMP30', 'SUMP60', 'SUMN5', 'SUMN10', 'SUMN20', 'SUMN30', 'SUMN60', 'SUMD5', 'SUMD10', 'SUMD20', 'SUMD30', 'SUMD60', 'VMA5', 'VMA10', 'VMA20', 'VMA30', 'VMA60', 'VSTD5', 'VSTD10', 'VSTD20', 'VSTD30', 'VSTD60', 'WVMA5', 'WVMA10', 'WVMA20', 'WVMA30', 'WVMA60', 'VSUMP5', 'VSUMP10', 'VSUMP20', 'VSUMP30', 'VSUMP60', 'VSUMN5', 'VSUMN10', 'VSUMN20', 'VSUMN30', 'VSUMN60', 'VSUMD5', 'VSUMD10', 'VSUMD20', 'VSUMD30', 'VSUMD60'],
    "integer_categorical_features": ['month'],
    "string_categorical_features": ['industry', 'season'],
}
batch_size = 1024

# 是否开启滚动训练&回测
if rolling_flag:
    print("开启滚动回测...")
    backtest_period = get_rolling_data_period(
        backtest_start_date='20200101', # 回测开始日期
        backtest_duration=4, # 一共回测多久的数据（单位：年）
        train_period=6, # 使用过去多久的时间进行训练（单位：年）
        val_period=1, # 验证数据周期（单位：年）
        test_period=1, # 测试数据周期（单位：年）
    )
else:
    print("关闭滚动回测...")
    backtest_period = [
        {
            'train': ['2009-01-01', '2016-12-31'], # 8年训练
            'val': ['2017-01-01', '2018-12-31'], # 2年验证
            'test': ['2019-01-01', '2024-12-31'] # 5年预测
        }
    ]

backtest_period

关闭滚动回测...


[{'train': ['2009-01-01', '2016-12-31'],
  'val': ['2017-01-01', '2018-12-31'],
  'test': ['2019-01-01', '2024-12-31']}]

In [9]:
# df = proprocessor._process_all_stock('000016', '2024-01-01', '2024-12-31')
# df.tail()

In [10]:
date_period_params = backtest_period[0]
print(date_period_params)
train_start_date, train_end_date = date_period_params['train']
val_start_date, val_end_date = date_period_params['val']
test_start_date, test_end_date = date_period_params['test']
# 获取全区间数据
print("开始加载原始数据...")
df = proprocessor._process_all_stock(benchmark=benchmark, start_date=train_start_date, end_date=test_end_date)
# 抽取训练验证数据
print("开始拆分训练、验证、测试集合...")
train_data, val_data, test_data = extract_train_val_data(df, *[train_start_date, train_end_date, val_start_date, val_end_date, test_start_date, test_end_date])
# 从data中抽取相关特征数据
print("开始抽取特征数据...")
feature_columns = feature_config.get('numeric_features', []) + feature_config.get('integer_categorical_features', []) + feature_config.get('string_categorical_features', [])
label_columns = feature_config.get('target_features', [])
full_feature_columns = feature_columns + label_columns
train_df, val_df, test_df = train_data[full_feature_columns], val_data[full_feature_columns], test_data[full_feature_columns]
# 对相关特征进行特征工程
print("开始对特征进行预处理...")
feature_preprocess_pipeline = Pipeline(steps=[
    ('robust_scaler', RobustScaler()),
    ('minmax_scaler', MinMaxScaler()),
])
preprocess_feature_columns = feature_config.get('numeric_features', [])
train_df[preprocess_feature_columns] = feature_preprocess_pipeline.fit_transform(train_df[preprocess_feature_columns])
val_df[preprocess_feature_columns] = feature_preprocess_pipeline.transform(val_df[preprocess_feature_columns])
test_df[preprocess_feature_columns] = feature_preprocess_pipeline.transform(test_df[preprocess_feature_columns])

print("开始对标签进行预处理...")
label_preprocess_pipeline = Pipeline(steps=[
    ('quantile_clipper', QuantileClipTransformer()),
    ('robust_scaler', RobustScaler()),
    ('minmax_scaler', MinMaxScaler()),
])
preprocess_target_columns = feature_config.get('target_features', [])
train_df[preprocess_target_columns] = label_preprocess_pipeline.fit_transform(train_df[preprocess_target_columns])
val_df[preprocess_target_columns] = label_preprocess_pipeline.transform(val_df[preprocess_target_columns])
test_df[preprocess_target_columns] = label_preprocess_pipeline.transform(test_df[preprocess_target_columns])

{'train': ['2009-01-01', '2016-12-31'], 'val': ['2017-01-01', '2018-12-31'], 'test': ['2019-01-01', '2024-12-31']}
开始加载原始数据...
login success!


 17%|█▋        | 714/4174 [07:12<34:57,  1.65it/s]  


KeyboardInterrupt: 

In [None]:
# 转换为tensorflow所使用的dataset
print("开始将DataFrame转换为DataSet...")
train_ds = df_to_dataset(train_df, feature_columns, label_columns, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val_df, feature_columns, label_columns, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test_df, feature_columns, label_columns, shuffle=False, batch_size=batch_size)  

开始将DataFrame转换为DataSet...


In [None]:
# 准备模型训练
print("开始模型初始化 & 训练...")
from models.single_task.model_moe import QuantModel
model_config = {
        "seed": 1024,
        "feature_use_embedding": False,
        "feature_embedding_dims": 4,
        "numeric_features_with_boundaries": {k: pd.qcut(train_df[k], q=20, retbins=True, duplicates='drop')[1].tolist() for k in feature_config.get('numeric_features', [])},
        "integer_categorical_features_with_vocab": {k: list(train_df[k].unique()) for k in feature_config.get('integer_categorical_features', [])},
        "string_categorical_features_with_vocab": {k: list(train_df[k].unique()) for k in feature_config.get('string_categorical_features', [])},
    }
model = QuantModel(config=model_config)

initial_learning_rate = 5e-4

# lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
#     initial_learning_rate,
#     decay_steps=5 * (len(train_df) // batch_size), # 每5个batch进行一次调整
#     decay_rate=0.9,
#     staircase=True)
model.compile(
    # optimizer=tf.keras.optimizers.Adam(lr_schedule),
    optimizer=tf.keras.optimizers.Adam(initial_learning_rate),
    # loss=tf.keras.losses.MeanSquaredError(),
    # loss=tf.keras.losses.MeanAbsoluteError(),
    loss=tf.keras.losses.Huber(),
    # metrics=[
    #     # tf.keras.metrics.MeanSquaredError(),
    #     # tf.keras.metrics.MeanAbsoluteError(),
    # ],
    )

EPOCHS = 500

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=1,
    patience=10,
    mode='min',
    restore_best_weights=True)

baseline_history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks = [early_stopping],
)

开始模型初始化 & 训练...
Epoch 1/500


2024-05-28 07:20:21.332814: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x150f002d29d0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2024-05-28 07:20:21.332964: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2024-05-28 07:20:21.358243: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-28 07:20:21.665276: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 70: early stopping


In [None]:
baseline_history.history.keys()

dict_keys(['loss', 'val_loss'])

In [None]:
# plot_metrics(baseline_history)

In [None]:
# model.save(f'./tf_models/{benchmark}/')

In [None]:
test_data.head()

Unnamed: 0,code,datetime,open,high,low,close,volume,amount,turnover_rate,code_name,in_date,out_date,type,status,industry,pe_ttm,ps_ttm,pcf_ncf_ttm,pb_mrq,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,OPEN1,OPEN2,OPEN3,OPEN4,HIGH0,HIGH1,HIGH2,HIGH3,HIGH4,LOW0,LOW1,LOW2,LOW3,LOW4,CLOSE0,CLOSE1,CLOSE2,CLOSE3,CLOSE4,VOLUME0,VOLUME1,VOLUME2,VOLUME3,VOLUME4,ROC5,ROC10,ROC20,ROC30,ROC60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,TSRANK5,TSRANK10,TSRANK20,TSRANK30,TSRANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX10,IMAX20,IMAX30,IMAX60,IMIN5,IMIN10,IMIN20,IMIN30,IMIN60,IMXD5,IMXD10,IMXD20,IMXD30,IMXD60,CORR5,CORR10,CORR20,CORR30,CORR60,CORD5,CORD10,CORD20,CORD30,CORD60,CNTP5,CNTP10,CNTP20,CNTP30,CNTP60,CNTN5,CNTN10,CNTN20,CNTN30,CNTN60,CNTD5,CNTD10,CNTD20,CNTD30,CNTD60,SUMP5,SUMP10,SUMP20,SUMP30,SUMP60,SUMN5,SUMN10,SUMN20,SUMN30,SUMN60,SUMD5,SUMD10,SUMD20,SUMD30,SUMD60,VMA5,VMA10,VMA20,VMA30,VMA60,VSTD5,VSTD10,VSTD20,VSTD30,VSTD60,WVMA5,WVMA10,WVMA20,WVMA30,WVMA60,VSUMP5,VSUMP10,VSUMP20,VSUMP30,VSUMP60,VSUMN5,VSUMN10,VSUMN20,VSUMN30,VSUMN60,VSUMD5,VSUMD10,VSUMD20,VSUMD30,VSUMD60,weekday,day_of_week,day_of_month,month,season,label
60,sh.688981,2020-10-16,59.5,59.86,57.7,58.21,38030820.0,2224178000.0,3.656,中芯国际,2020-07-16,,股票,上市,电子,156.818263,17.920095,38.664818,9.457008,-0.021681,0.036303,-0.597222,0.00605,0.166667,0.008571,0.236111,-0.01916,-0.527778,1.022161,0.96547,0.984367,0.972342,0.88301,1.028346,1.028346,0.986944,0.991926,0.999313,0.991239,0.96547,0.958083,0.960831,0.87923,1.0,0.984367,0.961347,0.984195,0.986257,1.0,1.486745,0.759498,1.171918,2.198876,0.875107,0.936265,0.906717,1.154956,1.424498,1.028346,1.028346,1.028346,1.028346,1.030751,0.085896,0.171792,0.343584,0.515375,0.991239,0.983233,0.928517,0.946676,0.984768,1.144491,0.013888,0.061417,0.047926,0.084887,0.180822,0.002766,0.015706,-0.000789,-0.006342,-0.009381,0.099158,0.599459,0.009497,0.432588,0.820992,0.011235,0.000806,0.060824,0.107192,0.132262,0.989005,0.984745,0.984745,1.028586,1.314757,0.979625,0.85999,0.913039,0.928157,0.96492,5.0,10.0,20.0,23.0,23.0,0.969923,0.966907,0.958605,0.944742,0.221739,0.2,0.1,0.05,0.966667,0.8,0.8,0.7,0.35,0.233333,0.116667,-0.6,-0.6,-0.3,0.733333,0.683333,0.401893,0.454797,0.328602,0.226705,0.650816,0.855797,0.284017,0.312852,0.044759,0.087912,0.6,0.5,0.5,0.4,0.433333,0.4,0.5,0.5,0.6,0.566667,0.2,0.0,0.0,-0.2,-0.133333,0.857424,0.61168,0.603824,0.399375,0.382165,0.142576,0.38832,0.396176,0.600625,0.617835,0.714848,0.223359,0.207648,-0.201249,-0.23567,1.323407,1.076539,1.005835,1.140506,1.782309,0.556475,0.516126,0.419837,0.569235,1.233619,1.674397,1.762504,1.90259,1.932585,1.559979,0.525134,0.531553,0.498265,0.476898,0.326704,0.474866,0.468447,0.501735,0.523102,0.673296,0.050269,0.063107,-0.00347,-0.046203,-0.346592,4,Friday,16,10,Autumn,0.09878
61,sh.688981,2020-10-19,58.49,61.1,57.76,60.28,59006912.0,3529811000.0,5.6725,中芯国际,2020-07-16,,股票,上市,电子,162.394861,18.557349,40.039773,9.793308,0.030604,0.057104,0.535928,0.014019,0.245509,0.012481,0.218563,0.029065,0.508982,0.970305,0.98706,0.932316,0.950564,0.938952,1.013603,0.993033,0.993033,0.953052,0.957863,0.958195,0.9572,0.932316,0.925182,0.927837,1.0,0.96566,0.950564,0.928334,0.950398,1.0,0.644515,0.958229,0.489507,0.755318,0.952389,0.890843,0.916224,1.10783,1.278368,1.013603,1.013603,1.013603,1.013603,1.013603,0.082946,0.165893,0.331785,0.497678,0.958195,0.958991,0.907548,0.918356,0.947357,1.10055,0.026509,0.067591,0.050109,0.077074,0.173629,0.013653,0.020419,0.000496,-0.004726,-0.008938,0.663146,0.836558,0.003433,0.291375,0.808278,0.013703,0.000567,0.076929,0.121169,0.163128,0.972528,0.955043,0.953318,0.980358,1.262376,0.945985,0.830458,0.881685,0.896284,0.931785,5.0,10.0,20.0,25.0,25.0,0.985383,0.983953,0.980049,0.973633,0.754491,0.0,0.0,0.0,0.966667,0.816667,0.6,0.8,0.4,0.266667,0.133333,-0.6,-0.8,-0.4,0.7,0.683333,0.695579,0.565611,0.401474,0.252112,0.642997,0.739556,0.283326,0.340701,0.069755,0.059834,0.6,0.6,0.5,0.433333,0.45,0.4,0.4,0.5,0.566667,0.55,0.2,0.2,0.0,-0.133333,-0.1,0.7487,0.684004,0.597982,0.430017,0.41698,0.2513,0.315996,0.402018,0.569983,0.58302,0.4974,0.368009,0.195964,-0.139966,-0.16604,0.769514,0.760208,0.661498,0.745052,1.1034,0.213873,0.319342,0.281329,0.36997,0.720027,0.805679,1.583632,1.897436,1.857532,1.514107,0.399012,0.578143,0.525368,0.516402,0.431517,0.600988,0.421857,0.474632,0.483598,0.568483,-0.201977,0.156287,0.050736,0.032803,-0.136966,0,Monday,19,10,Autumn,0.074818
62,sh.688981,2020-10-20,59.7,59.98,58.4,59.5,35066961.0,2070347000.0,3.3711,中芯国际,2020-07-16,,股票,上市,电子,160.293534,18.317224,39.521674,9.666586,-0.00335,0.026466,-0.126582,0.00469,0.177215,0.018425,0.696203,0.010385,0.392405,1.003361,0.983025,1.0,0.944538,0.963025,1.008067,1.026891,1.00605,1.00605,0.965546,0.981513,0.970756,0.969748,0.944538,0.937311,1.0,1.013109,0.978319,0.963025,0.940504,1.0,1.682692,1.08452,1.612405,0.82369,0.962857,0.839328,0.922353,1.120672,1.330588,1.008067,1.008067,1.008067,1.008067,1.008403,0.084034,0.168067,0.336134,0.504202,0.981513,0.978992,0.935513,0.934277,0.955754,1.109468,0.028905,0.066408,0.053037,0.072412,0.174206,0.016908,0.019739,0.00137,-0.003419,-0.008878,0.855367,0.809891,0.02336,0.172742,0.792224,-0.012807,-0.024339,0.052706,0.093818,0.152447,1.002622,0.982655,0.969613,0.990588,1.27321,0.958521,0.853277,0.893244,0.908034,0.944,4.0,9.0,19.0,25.0,25.0,0.99127,0.990396,0.987994,0.983989,0.6875,0.2,0.1,0.05,0.866667,0.833333,0.8,0.9,0.45,0.3,0.15,-0.6,-0.8,-0.4,0.566667,0.683333,0.487975,0.670093,0.378855,0.30387,0.635473,0.775972,0.85682,0.349374,0.078036,0.063287,0.6,0.6,0.5,0.433333,0.433333,0.4,0.4,0.5,0.566667,0.566667,0.2,0.2,0.0,-0.133333,-0.133333,0.671851,0.820805,0.588168,0.423812,0.401384,0.328149,0.179195,0.411832,0.576188,0.598616,0.343701,0.641611,0.176336,-0.152377,-0.197233,1.240662,1.220639,1.113645,1.256074,1.764709,0.383975,0.532139,0.473245,0.621395,1.05044,0.663226,1.88324,1.876151,1.839022,1.566963,0.455502,0.456995,0.500595,0.502261,0.418443,0.544498,0.543005,0.499405,0.497739,0.581557,-0.088996,-0.086009,0.00119,0.004522,-0.163113,1,Tuesday,20,10,Autumn,0.102017
63,sh.688981,2020-10-21,59.69,59.8,56.92,57.65,35138418.0,2041441000.0,3.3779,中芯国际,2020-07-16,,股票,上市,电子,155.309617,17.747697,38.292849,9.366029,-0.034177,0.048249,-0.708333,0.001843,0.038194,0.01223,0.253472,-0.02379,-0.493056,1.035386,1.035559,1.014571,1.03209,0.974848,1.037294,1.040416,1.059844,1.038335,1.038335,0.987337,1.01301,1.001908,1.000867,0.974848,1.0,1.03209,1.04562,1.009714,0.993929,1.0,0.997966,1.679271,1.082315,1.609126,0.970685,0.868864,0.978144,1.133565,1.36392,1.037294,1.037294,1.037294,1.037294,1.040763,0.08673,0.173461,0.346921,0.520382,0.987337,1.016271,0.978647,0.965351,0.981972,1.139005,0.021901,0.060002,0.055247,0.069461,0.178421,0.003452,0.015808,0.002144,-0.002426,-0.00903,0.062107,0.636262,0.052711,0.094542,0.781277,-0.023174,-0.049783,0.014281,0.053206,0.127387,1.034796,1.014189,1.000729,1.011761,1.31098,0.998786,0.95327,0.921908,0.937173,0.974293,2.0,7.0,15.0,21.0,21.0,0.960766,0.956827,0.94598,0.927852,0.237013,0.4,0.2,0.1,0.9,0.85,0.8,0.9,0.5,0.333333,0.166667,-0.4,-0.7,-0.4,0.566667,0.683333,0.176705,0.611021,0.36997,0.362697,0.626665,0.566609,0.828113,0.332648,0.07848,0.062758,0.6,0.5,0.45,0.433333,0.433333,0.4,0.5,0.55,0.566667,0.566667,0.2,0.0,-0.1,-0.133333,-0.133333,0.621583,0.727711,0.523738,0.419186,0.39618,0.378417,0.272289,0.476262,0.580814,0.60382,0.243165,0.455422,0.047476,-0.161629,-0.207641,1.273735,1.235954,1.091673,1.256108,1.701008,0.340796,0.519155,0.468067,0.618862,0.983789,0.585693,1.755218,1.895424,1.820693,1.567143,0.534304,0.514743,0.477557,0.502465,0.44341,0.465696,0.485257,0.522443,0.497535,0.55659,0.068608,0.029485,-0.044887,0.00493,-0.113181,2,Wednesday,21,10,Autumn,0.169471
64,sh.688981,2020-10-22,57.2,58.8,56.38,57.67,28826455.0,1666799000.0,2.7712,中芯国际,2020-07-16,,股票,上市,电子,155.363498,17.753854,38.306134,9.369278,0.008217,0.042308,0.194215,0.019755,0.466942,0.014336,0.338843,0.002797,0.066116,0.99185,1.035027,1.0352,1.014219,1.031732,1.019594,1.036934,1.040055,1.059476,1.037975,0.977631,0.986995,1.012658,1.001561,1.00052,1.0,0.999653,1.031732,1.045257,1.009364,1.0,1.218964,1.216485,2.046971,1.319303,0.993584,0.860933,0.98318,1.11271,1.379747,1.019594,1.019594,1.019594,1.019594,1.040402,0.0867,0.1734,0.346801,0.520201,0.977631,1.017201,0.992214,0.965857,0.977874,1.132281,0.020393,0.043639,0.055645,0.065008,0.176385,-0.006433,0.009161,0.00293,-0.001403,-0.008846,0.248778,0.403946,0.097027,0.036084,0.767075,-0.004335,-0.033438,0.006309,0.042465,0.128665,1.034437,1.013837,1.000381,1.002289,1.305601,0.999931,0.988798,0.921588,0.936848,0.973955,2.0,7.0,15.0,22.0,22.0,0.978996,0.976844,0.970876,0.960764,0.356354,0.6,0.3,0.15,0.933333,0.866667,0.0,0.9,0.55,0.366667,0.183333,0.6,-0.6,-0.4,0.566667,0.683333,0.797904,0.368093,0.341492,0.388736,0.616411,0.475261,0.825321,0.33218,0.079056,0.065218,0.6,0.6,0.45,0.466667,0.433333,0.4,0.4,0.55,0.533333,0.566667,0.2,0.2,-0.1,-0.066667,-0.133333,0.53286,0.747837,0.518476,0.430077,0.390631,0.46714,0.252163,0.481524,0.569923,0.609369,0.065719,0.495674,0.036952,-0.139845,-0.218738,1.360345,1.52626,1.294479,1.52177,2.012693,0.401146,0.611224,0.567223,0.759317,1.158296,0.935652,1.780731,1.920707,1.850047,1.593477,0.301494,0.513334,0.466753,0.492696,0.452139,0.698506,0.486666,0.533247,0.507304,0.547861,-0.397012,0.026668,-0.066495,-0.014608,-0.095723,3,Thursday,22,10,Autumn,0.168719


In [None]:
# 输出回测预测
print("开始保存回测预测结果...")
model_pred_result = model.predict(test_ds)
output_df = test_data[['code', 'code_name', 'datetime']]
output_df['label'] = test_df['label']
output_df['label_pred'] = model_pred_result
output_df = output_df.rename(columns={
    'code': 'stock_code',
    'code_name': 'stock_name'
})
output_df.to_pickle(f'../../Offline/backtest/backtest_data/test/{benchmark}_{test_start_date}_回归任务_v4.pkl') 

开始保存回测预测结果...


In [None]:
# output_df.head()