In [1]:
import sys
sys.path.append('../')

import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
import numpy as np
import akshare as ak
import sqlite3
import matplotlib.pyplot as plt
%matplotlib inline

from datetime import datetime
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
from database.downloader.downloader_base import DownloaderBase
import database.database_config as db_config

pd.options.display.max_rows=None
pd.options.display.max_columns=None

!python --version

Python 3.8.10


In [2]:
import tensorflow as tf

# 只使用CPU进行训练
tf.config.set_visible_devices([], 'GPU')

# 打印Tensorflow版本
print(f"Tensorflow Version: {tf.__version__}")

# 检查是否有可用的GPU设备
if tf.test.is_built_with_cuda():
    print("TensorFlow GPU version is installed")
else:
    print("TensorFlow CPU version is installed")

# 检查TensorFlow是否能够访问GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print("GPU devices available:", gpus)
else:
    print("No GPU devices found. Running on CPU.")

# !nvidia-smi

2024-04-25 05:50:06.604091: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Tensorflow Version: 2.13.1
TensorFlow GPU version is installed
GPU devices available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer, StandardScaler

In [4]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# 绘图相关函数
def plot_series_dist(series):
    data = series
    plt.figure(figsize=(5,5))
    # 使用matplotlib画直方图
    plt.hist(data, bins=60, edgecolor='k', alpha=0.7)
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.title('Histogram of Data')
    plt.show()

def plot_metrics(history):
  metrics = ['loss', 'mean_absolute_error', 'mean_squared_error']
  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(2,2,n+1)
    plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
    plt.plot(history.epoch, history.history['val_'+metric],
             color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    plt.ylim([0, plt.ylim()[1]])
    plt.legend()

def plot_cm(true_labels, pred_labels):
    cm = confusion_matrix(true_labels, pred_labels)
    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="g", cmap='Blues')
    plt.title('Confusion matrix')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels)
    recall = recall_score(true_labels, pred_labels)
    print(f"Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")

def plot_close_label(df, label_name='label'):
    # 设置图像的大小
    plt.figure(figsize=(14, 7))
    # 绘制收盘价曲线
    plt.plot(df['datetime'], df['close'], label='Close Price', color='blue')
    # 提取买入和卖出点
    buy_points = df[df[label_name] == 1]
    sell_points = df[df[label_name] == 2]
    # 在买入点绘制上升三角形标记
    plt.scatter(buy_points['datetime'], buy_points['close'], label='Buy', color='green', marker='^', alpha=1)
    # 在卖出点绘制下降三角形标记
    plt.scatter(sell_points['datetime'], sell_points['close'], label='Sell', color='red', marker='v', alpha=1)
    # 增加标题和标签
    plt.title('Stock Price with Buy and Sell Signals')
    plt.xlabel('Date')
    plt.ylabel('Price')
    # 显示图例
    plt.legend()
    # 展示图形
    plt.show()

In [5]:
class PreProcessing:
    def __init__(self, db_downloader:DownloaderBase) -> None:
        self.db_downloader = db_downloader

    def _build_cls_label(self, stock_dataframe):
        """
        明日开始未来N天内优先触发止盈 = 1, 触发止损=2, 其他=0
        """
        def calculate_atr(df, period=14):
            df['high-low'] = df['high'] - df['low']
            df['high-close_prev'] = abs(df['high'] - df['close'].shift(1))
            df['low-close_prev'] = abs(df['low'] - df['close'].shift(1))
            df['tr'] = df[['high-low', 'high-close_prev', 'low-close_prev']].max(axis=1)
            atr = df['tr'].rolling(window=period, min_periods=1).mean()
            return atr

        # 初始化标签参数
        N = 15 # 时间周期
        ATR_period = 14 # ATR计算周期
        ATR_take_profit_factor = 2.5 # 止盈参数
        ATR_stop_loss_factor = 1.5
        # 开始构建标签
        df = stock_dataframe.copy()
        # 计算标签构建所需要的指标
        df['atr'] = calculate_atr(df, period=ATR_period) # 计算每一天的ATR
        df['label'] = 0  # 初始化标签列
        df['return'] = np.NaN # 初始化收益率列
        # 轮询判断先止盈还是先止损
        for index in range(len(df)-N-1):
            buy_price = df.at[index + 1, 'open'] # 第二天的开盘价作为买入价
            buy_atr = df.at[index, 'atr'] # 获取目前的ATR
            take_profit_price = buy_price + ATR_take_profit_factor * buy_atr # 提前确定止盈价格
            stop_loss_price = buy_price - ATR_stop_loss_factor * buy_atr # 提前确定止损价格
            for day in range(2, N+2):
                future_day_close = df.at[index+day, 'close'] # 买入后每天的收盘价
                # 检查价格是否触发止盈或止损条件
                if future_day_close > take_profit_price:
                    df.at[index, 'label'] = 1  # 未来N日走势上升 + 突破止盈
                    df.at[index, 'return'] = (future_day_close / buy_price) - 1
                    break  # 退出内循环
                elif future_day_close < stop_loss_price:
                    df.at[index, 'label'] = 2  # 未来N日走势下降 + 突破止损
                    df.at[index, 'return'] = (future_day_close / buy_price) - 1
                    break  # 退出内循环
            else:
                df.at[index, 'return'] = (future_day_close / buy_price) - 1
        # 过滤第二天一字涨停情况
        # df = df[df['high'].shift(-1) != df['low'].shift(-1)]
        return df[['datetime', 'label', 'return']]

    def _process_one_stock(self, stock_code, start_date, end_date):
        stock_base = self.db_downloader._download_stock_base_info(stock_code) # 获取基础代码
        stock_individual = self.db_downloader._download_stock_individual_info(stock_code) # 获取profile信息
        stock_history = self.db_downloader._download_stock_history_info(stock_code, start_date, end_date) # 获取历史行情
        stock_indicator = self.db_downloader._download_stock_indicator_info(stock_code, start_date, end_date) # 获取指标数据
        stock_factor_date = self.db_downloader._download_stock_factor_date_info() # 获取日期特征
        stock_factor_qlib = self.db_downloader._download_stock_factor_qlib_info(stock_code, start_date, end_date) # 获取量价特征
        stock_label = self._build_cls_label(stock_history, ) # 构建Label
        stock_df = stock_base.merge(stock_individual, on=['stock_code']).merge(stock_history, on=['stock_code']).merge(stock_indicator, on=['stock_code', 'datetime']).merge(stock_label, on=['datetime']).merge(stock_factor_date, on=['datetime']).merge(stock_factor_qlib, on=['stock_code', 'datetime']) # 整合数据
        stock_df = stock_base \
            .merge(stock_individual, on=['stock_code', 'stock_name']) \
            .merge(stock_history, on=['stock_code']) \
            .merge(stock_indicator, on=['stock_code', 'datetime']) \
            .merge(stock_label, on=['datetime']) \
            .merge(stock_factor_date, on=['datetime']) \
            .merge(stock_factor_qlib, on=['stock_code', 'datetime']) # 整合数据
        stock_df = stock_df.dropna()
        return stock_df
    
    def _process_all_stock(self, code_type, start_date, end_date):
        # stock_code_list = list(ak.stock_info_a_code_name()['code'].unique()) # 获取A股所有股票列表
        stock_code_list = list(ak.index_stock_cons(code_type)['品种代码'].unique()) # 获取沪深300的股票代码列表
        stock_df_list = []
        for stock_code in tqdm(stock_code_list, desc=f'Process: {code_type} ...'):
            stock_df = self._process_one_stock(stock_code, start_date, end_date)
            if not stock_df.empty:
                stock_df_list.append(stock_df)
        return pd.concat(stock_df_list)

In [6]:
# 数据库初始化
db_conn = sqlite3.connect('../database/hh_quant.db')
db_downloader = DownloaderBase(db_conn, db_config)
proprocessor = PreProcessing(db_downloader=db_downloader)
# df = proprocessor._process_all_stock('000016', '20120101', '20171231')

In [7]:
# print(df.stock_code.unique())
# sample_df = df[df['stock_code'] == '601601']
# plot_close_label(sample_df)

In [8]:
def get_rolling_data_period(backtest_start_date, backtest_duration=5, train_period=6, val_period=0.5, test_period=0.5):
    """
    Args:
        backtest_start_date (_type_): _description_
        backtest_duration (int, optional): _description_. Defaults to 5.
        train_period (int, optional): _description_. Defaults to 6.
        val_period (float, optional): _description_. Defaults to 0.5.
        test_period (float, optional): _description_. Defaults to 0.5.
    Returns:
        result: _description_
    """
    backtest_start_date = datetime.strptime(backtest_start_date, '%Y%m%d')
    backtest_end_date = backtest_start_date + relativedelta(years=backtest_duration) # 回测5年数据
    train_period = relativedelta(years=train_period) # 使用6年的训练数据
    val_period = relativedelta(months=(12 * val_period)) # 使用半年的验证数据
    test_period = relativedelta(months=(12 * test_period)) # 使用半年的测试数据(半年模型一更新)

    result = []
    rolling_flag = True
    bench_date = backtest_start_date
    while rolling_flag:
        if bench_date < backtest_end_date:
            test_start, test_end = bench_date, (bench_date + test_period - relativedelta(days=1))
            val_start, val_end = (test_start - relativedelta(days=1) - val_period), (test_start - relativedelta(days=1))
            train_start, train_end =(val_start - relativedelta(days=1) - train_period), (val_start - relativedelta(days=1))
            result.append({
                "train": [train_start.strftime("%Y%m%d"), train_end.strftime("%Y%m%d")],
                "val": [val_start.strftime("%Y%m%d"), val_end.strftime("%Y%m%d")],
                "test": [test_start.strftime("%Y%m%d"), test_end.strftime("%Y%m%d")]
            })
            bench_date += test_period
        else:
            rolling_flag = False 
    return result

def extract_train_val_data(df, train_start_date, train_end_date, val_start_date, val_end_date, test_start_date, test_end_date):
    train_start_date = pd.to_datetime(train_start_date)
    train_end_date = pd.to_datetime(train_end_date)
    val_start_date = pd.to_datetime(val_start_date)
    val_end_date = pd.to_datetime(val_end_date)
    test_start_date = pd.to_datetime(test_start_date)
    test_end_date = pd.to_datetime(test_end_date)

    train_data = df[(pd.to_datetime(df['datetime']) >= train_start_date) & (pd.to_datetime(df['datetime']) <= train_end_date)]
    val_data = df[(pd.to_datetime(df['datetime']) >= val_start_date) & (pd.to_datetime(df['datetime']) <= val_end_date)]
    test_data = df[(pd.to_datetime(df['datetime']) >= test_start_date) & (pd.to_datetime(df['datetime']) <= test_end_date)]

    print(f"train_data_size: {train_data.shape}")
    print(f"validation_data_size: {val_data.shape}")
    print(f"test_data_size: {test_data.shape}")
    return train_data, val_data, test_data

def df_to_dataset(dataframe, feature_cols, label_cols, shuffle=True, batch_size=32):
    features = dataframe[feature_cols]
    labels = tuple([dataframe[col] for col in label_cols])
    # labels = dataframe[label_cols]
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=min(len(features), 10000))
    ds = ds.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

In [9]:
# 相关配置
rolling_flag = False
benchmark = '000905'
feature_config = {
    "target_features": ['label', 'return'],
    "numeric_features": ['turnover_rate', 'pe_ttm', 'ps_ttm', 'pcf_ncf_ttm', 'pb_mrq', 'KMID', 'KLEN', 'KMID2', 'KUP', 'KUP2', 'KLOW', 'KLOW2', 'KSFT', 'KSFT2', 'OPEN0', 'OPEN1', 'OPEN2', 'OPEN3', 'OPEN4', 'HIGH0', 'HIGH1', 'HIGH2', 'HIGH3', 'HIGH4', 'LOW0', 'LOW1', 'LOW2', 'LOW3', 'LOW4', 'CLOSE1', 'CLOSE2', 'CLOSE3', 'CLOSE4', 'VOLUME1', 'VOLUME2', 'VOLUME3', 'VOLUME4', 'ROC5', 'ROC10', 'ROC20', 'ROC30', 'ROC60', 'MAX5', 'MAX10', 'MAX20', 'MAX30', 'MAX60', 'MIN5', 'MIN10', 'MIN20', 'MIN30', 'MIN60', 'MA5', 'MA10', 'MA20', 'MA30', 'MA60', 'STD5', 'STD10', 'STD20', 'STD30', 'STD60', 'BETA5', 'BETA10', 'BETA20', 'BETA30', 'BETA60', 'RSQR5', 'RSQR10', 'RSQR20', 'RSQR30', 'RSQR60', 'RESI5', 'RESI10', 'RESI20', 'RESI30', 'RESI60', 'QTLU5', 'QTLU10', 'QTLU20', 'QTLU30', 'QTLU60', 'QTLD5', 'QTLD10', 'QTLD20', 'QTLD30', 'QTLD60', 'TSRANK5', 'TSRANK10', 'TSRANK20', 'TSRANK30', 'TSRANK60', 'RSV5', 'RSV10', 'RSV20', 'RSV30', 'RSV60', 'IMAX5', 'IMAX10', 'IMAX20', 'IMAX30', 'IMAX60', 'IMIN5', 'IMIN10', 'IMIN20', 'IMIN30', 'IMIN60', 'IMXD5', 'IMXD10', 'IMXD20', 'IMXD30', 'IMXD60', 'CORR5', 'CORR10', 'CORR20', 'CORR30', 'CORR60', 'CORD5', 'CORD10', 'CORD20', 'CORD30', 'CORD60', 'CNTP5', 'CNTP10', 'CNTP20', 'CNTP30', 'CNTP60', 'CNTN5', 'CNTN10', 'CNTN20', 'CNTN30', 'CNTN60', 'CNTD5', 'CNTD10', 'CNTD20', 'CNTD30', 'CNTD60', 'SUMP5', 'SUMP10', 'SUMP20', 'SUMP30', 'SUMP60', 'SUMN5', 'SUMN10', 'SUMN20', 'SUMN30', 'SUMN60', 'SUMD5', 'SUMD10', 'SUMD20', 'SUMD30', 'SUMD60', 'VMA5', 'VMA10', 'VMA20', 'VMA30', 'VMA60', 'VSTD5', 'VSTD10', 'VSTD20', 'VSTD30', 'VSTD60', 'WVMA5', 'WVMA10', 'WVMA20', 'WVMA30', 'WVMA60', 'VSUMP5', 'VSUMP10', 'VSUMP20', 'VSUMP30', 'VSUMP60', 'VSUMN5', 'VSUMN10', 'VSUMN20', 'VSUMN30', 'VSUMN60', 'VSUMD5', 'VSUMD10', 'VSUMD20', 'VSUMD30', 'VSUMD60'],
    "integer_categorical_features": ['month'],
    "string_categorical_features": ['industry', 'season'],
}
batch_size = 1024

# 是否开启滚动训练&回测
if rolling_flag:
    print("开启滚动回测...")
    backtest_period = get_rolling_data_period(
        backtest_start_date='20200101', # 回测开始日期
        backtest_duration=4, # 一共回测多久的数据（单位：年）
        train_period=6, # 使用过去多久的时间进行训练（单位：年）
        val_period=1, # 验证数据周期（单位：年）
        test_period=1, # 测试数据周期（单位：年）
    )
else:
    print("关闭滚动回测...")
    backtest_period = [
        {
            'train': ['20120101', '20171231'],
            'val': ['20180101', '20181231'],
            'test': ['20190101', '20231231']
        }
    ]

backtest_period

关闭滚动回测...


[{'train': ['20120101', '20171231'],
  'val': ['20180101', '20181231'],
  'test': ['20190101', '20231231']}]

In [10]:
# feature_columns = feature_config.get('numeric_features', []) + feature_config.get('integer_categorical_features', []) + feature_config.get('string_categorical_features', [])
# label_columns = feature_config.get('target_features', [])
# ds = df_to_dataset(df, feature_columns, label_columns, shuffle=True, batch_size=batch_size)

In [11]:
date_period_params = backtest_period[0]
print(date_period_params)
train_start_date, train_end_date = date_period_params['train']
val_start_date, val_end_date = date_period_params['val']
test_start_date, test_end_date = date_period_params['test']
# 获取全区间数据
print("开始加载原始数据...")
df = proprocessor._process_all_stock(code_type=benchmark, start_date=train_start_date, end_date=test_end_date)
# 抽取训练验证数据
print("开始拆分训练、验证、测试集合...")
train_data, val_data, test_data = extract_train_val_data(df, *[train_start_date, train_end_date, val_start_date, val_end_date, test_start_date, test_end_date])
# 计算类别权重
print("开始计算类别权重...")
value_count = train_data['label'].value_counts()
print(value_count)
total_count = train_data['label'].count()
class_weights = ((1 / value_count) * (total_count / 2.0)).to_dict()
class_weights

{'train': ['20120101', '20171231'], 'val': ['20180101', '20181231'], 'test': ['20190101', '20231231']}
开始加载原始数据...


Process: 000905 ...: 100%|██████████| 430/430 [03:40<00:00,  1.95it/s]


开始拆分训练、验证、测试集合...
train_data_size: (317172, 208)
validation_data_size: (76518, 208)
test_data_size: (465480, 208)
开始计算类别权重...
label
2    121646
0    103074
1     92452
Name: count, dtype: int64


{2: 1.3036680203212598, 0: 1.5385645264567203, 1: 1.7153333621771296}

In [12]:
# 从data中抽取相关特征数据
print("开始抽取特征数据...")
feature_columns = feature_config.get('numeric_features', []) + feature_config.get('integer_categorical_features', []) + feature_config.get('string_categorical_features', [])
label_columns = feature_config.get('target_features', [])
full_feature_columns = feature_columns + label_columns
train_df, val_df, test_df = train_data[full_feature_columns], val_data[full_feature_columns], test_data[full_feature_columns]
# 对相关特征进行特征工程
print("开始特征工程处理...")
preprocessing_pipeline = Pipeline([
        ('quantile_transformer', QuantileTransformer(output_distribution='uniform', n_quantiles=1000)),
    ])
numeric_feature_columns = feature_config.get('numeric_features', [])
train_df[numeric_feature_columns] = preprocessing_pipeline.fit_transform(train_df[numeric_feature_columns])
val_df[numeric_feature_columns] = preprocessing_pipeline.transform(val_df[numeric_feature_columns])
test_df[numeric_feature_columns] = preprocessing_pipeline.transform(test_df[numeric_feature_columns])
# 转换为tensorflow所使用的dataset
print("开始将DataFrame转换为DataSet...")
train_ds = df_to_dataset(train_df, feature_columns, label_columns, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val_df, feature_columns, label_columns, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test_df, feature_columns, label_columns, shuffle=False, batch_size=batch_size) 

开始抽取特征数据...
开始特征工程处理...
开始将DataFrame转换为DataSet...


In [13]:
# 准备模型训练
print("开始模型初始化 & 训练...")
# 自定义模型
from models.multi_task.model_mmoe import QuantModel

model_config = {
    "seed": 1024,
    "feature_embedding_dims": 4,
    "integer_categorical_features_with_vocab": {k: list(train_data[k].unique()) for k in feature_config.get("integer_categorical_features", [])},
    "string_categorical_features_with_vocab": {k: list(train_data[k].unique()) for k in feature_config.get("string_categorical_features", [])},
}
model = QuantModel(config=model_config)

# 自定义优化器
initial_learning_rate = 5e-4
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate, decay_steps=(len(train_data) // batch_size) * 10, decay_rate=1, staircase=False
)
optimizer = tf.keras.optimizers.Adam(lr_schedule)

# 自定义损失函数
cls_loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) # 分类损失
reg_loss_object = tf.keras.losses.MeanSquaredError() # 回归损失

# 自定义指标Metrics
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_cls_metric = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")
train_reg_metric = tf.keras.metrics.MeanSquaredError(name="train_mse")
val_loss = tf.keras.metrics.Mean(name="val_loss")
val_cls_metric = tf.keras.metrics.SparseCategoricalAccuracy(name="val_accuracy")
val_reg_metric = tf.keras.metrics.MeanSquaredError(name="val_mse")

# 自定义训练步骤
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        cls_label, reg_label = labels
        cls_pred, reg_pred = predictions
        cls_sample_weights = tf.gather(
            tf.constant([class_weights[ind] for ind in sorted(class_weights.keys())], dtype=tf.float32),
            tf.cast(cls_label, dtype=tf.int32)
        )
        cls_loss = cls_loss_object(cls_label, cls_pred, sample_weight=cls_sample_weights)
        reg_loss = reg_loss_object(reg_label, reg_pred)
        loss = cls_loss + reg_loss
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_cls_metric(cls_label, cls_pred)
    train_reg_metric(reg_label, reg_pred)


# 自定义验证步骤
@tf.function
def val_step(inputs, labels):
    predictions = model(inputs, training=False)
    cls_label, reg_label = labels
    cls_pred, reg_pred = predictions
    cls_loss = cls_loss_object(cls_label, cls_pred)
    reg_loss = reg_loss_object(reg_label, reg_pred)
    loss = cls_loss + reg_loss
    val_loss(loss)
    val_cls_metric(cls_label, cls_pred)
    val_reg_metric(reg_label, reg_pred)

# 设定早停参数
patience = 10
best_val_loss = float('inf')
patience_counter = 0
best_weights=None
restore_best_weights=True

EPOCHS = 500
for epoch in range(EPOCHS):
    # 重新初始化Epoch内的参数
    train_loss.reset_states()
    val_loss.reset_states()
    train_cls_metric.reset_states()
    train_reg_metric.reset_states()
    val_cls_metric.reset_states()
    val_reg_metric.reset_states()

    # 训练逻辑
    for train_inputs, train_labels in tqdm(train_ds, desc="Training..."):
        train_step(train_inputs, train_labels)

    # 验证逻辑
    for val_inputs, val_labels in tqdm(val_ds, desc='Validatioin...'):
        val_step(val_inputs, val_labels)

    # EarlyStoping逻辑
    current_val_loss = val_loss.result()
    if current_val_loss <= best_val_loss:
        best_val_loss = current_val_loss
        patience_counter = 0
        # 可以选择在这里保存模型
        best_weights = model.get_weights()
        # model.save('path_to_my_model.h5')
    else:  # 如果不是，则耐心计数器加1
        patience_counter += 1
    # 如果耐心计数器超出设定的耐心值，则停止训练
    if patience_counter > patience:
        print(f'Early stopping at epoch {epoch + 1}')
        if restore_best_weights and best_weights is not None:
            # 恢复最佳权重
            print('Restoring model weights from the end of the best epoch.')
            model.set_weights(best_weights)
        break

    print(
        f"Epoch {epoch + 1}, "
        f"loss: {train_loss.result():.4f}, "
        f"accuracy: {train_cls_metric.result() * 100:.4f}, "
        f"mse: {train_reg_metric.result():.4f}, "
        f"val_loss: {val_loss.result():.4f}, "
        f"val_accuracy: {val_cls_metric.result() * 100:.4f}, "
        f"val_mse: {val_reg_metric.result():.4f}, "
    )


开始模型初始化 & 训练...


Training...:   0%|          | 0/310 [00:00<?, ?it/s]2024-04-25 06:22:27.221869: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x147e4c0c0850 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2024-04-25 06:22:27.221988: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2024-04-25 06:22:27.316125: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-25 06:22:27.744146: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
Training...:  33%|███▎      | 103/310 [00:23<00:32,  6.38it/s]

Training...: 100%|██████████| 310/310 [00:54<00:00,  5.71it/s]
Validatioin...: 100%|██████████| 75/75 [00:09<00:00,  7.68it/s]


Epoch 1, loss: 2.6586, accuracy: 34.9432, mse: 0.7496, val_loss: 1.1964, val_accuracy: 29.1435, val_mse: 0.0509, 


Training...: 100%|██████████| 310/310 [00:22<00:00, 13.61it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 17.26it/s]


Epoch 2, loss: 1.8817, accuracy: 37.7978, mse: 0.1852, val_loss: 1.1956, val_accuracy: 30.3746, val_mse: 0.0562, 


Training...: 100%|██████████| 310/310 [00:21<00:00, 14.12it/s]
Validatioin...: 100%|██████████| 75/75 [00:03<00:00, 18.92it/s]


Epoch 3, loss: 1.7477, accuracy: 39.5240, mse: 0.0978, val_loss: 1.1647, val_accuracy: 30.7483, val_mse: 0.0243, 


Training...: 100%|██████████| 310/310 [00:20<00:00, 14.97it/s]
Validatioin...: 100%|██████████| 75/75 [00:05<00:00, 12.52it/s]


Epoch 4, loss: 1.6900, accuracy: 40.5610, mse: 0.0607, val_loss: 1.1612, val_accuracy: 30.7941, val_mse: 0.0194, 


Training...: 100%|██████████| 310/310 [00:21<00:00, 14.61it/s]
Validatioin...: 100%|██████████| 75/75 [00:05<00:00, 12.93it/s]


Epoch 5, loss: 1.6570, accuracy: 41.2782, mse: 0.0414, val_loss: 1.1590, val_accuracy: 30.7457, val_mse: 0.0145, 


Training...: 100%|██████████| 310/310 [00:21<00:00, 14.35it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 17.34it/s]


Epoch 6, loss: 1.6388, accuracy: 41.8628, mse: 0.0313, val_loss: 1.1612, val_accuracy: 30.7000, val_mse: 0.0132, 


Training...: 100%|██████████| 310/310 [00:21<00:00, 14.16it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 17.71it/s]


Epoch 7, loss: 1.6249, accuracy: 42.4372, mse: 0.0246, val_loss: 1.1608, val_accuracy: 30.7601, val_mse: 0.0110, 


Training...: 100%|██████████| 310/310 [00:21<00:00, 14.51it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 16.82it/s]


Epoch 8, loss: 1.6154, accuracy: 42.9363, mse: 0.0208, val_loss: 1.1697, val_accuracy: 30.5000, val_mse: 0.0104, 


Training...: 100%|██████████| 310/310 [00:22<00:00, 13.79it/s]
Validatioin...: 100%|██████████| 75/75 [00:03<00:00, 19.75it/s]


Epoch 9, loss: 1.6060, accuracy: 43.4638, mse: 0.0179, val_loss: 1.1729, val_accuracy: 30.7862, val_mse: 0.0089, 


Training...: 100%|██████████| 310/310 [00:21<00:00, 14.69it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 16.67it/s]


Epoch 10, loss: 1.5995, accuracy: 43.8362, mse: 0.0159, val_loss: 1.1798, val_accuracy: 30.6882, val_mse: 0.0086, 


Training...: 100%|██████████| 310/310 [00:21<00:00, 14.41it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 18.56it/s]


Epoch 11, loss: 1.5934, accuracy: 44.3933, mse: 0.0142, val_loss: 1.1821, val_accuracy: 30.9692, val_mse: 0.0081, 


Training...: 100%|██████████| 310/310 [00:22<00:00, 13.57it/s]
Validatioin...: 100%|██████████| 75/75 [00:05<00:00, 14.75it/s]


Epoch 12, loss: 1.5871, accuracy: 44.8961, mse: 0.0127, val_loss: 1.1892, val_accuracy: 31.0489, val_mse: 0.0085, 


Training...: 100%|██████████| 310/310 [00:22<00:00, 13.96it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 18.01it/s]


Epoch 13, loss: 1.5820, accuracy: 45.3196, mse: 0.0121, val_loss: 1.1959, val_accuracy: 31.3011, val_mse: 0.0077, 


Training...: 100%|██████████| 310/310 [00:20<00:00, 14.80it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 17.47it/s]


Epoch 14, loss: 1.5755, accuracy: 45.6951, mse: 0.0112, val_loss: 1.1965, val_accuracy: 31.6997, val_mse: 0.0078, 


Training...: 100%|██████████| 310/310 [00:25<00:00, 12.29it/s]
Validatioin...: 100%|██████████| 75/75 [00:04<00:00, 18.67it/s]


Epoch 15, loss: 1.5692, accuracy: 46.1094, mse: 0.0106, val_loss: 1.2057, val_accuracy: 31.2136, val_mse: 0.0073, 


Training...: 100%|██████████| 310/310 [00:23<00:00, 13.23it/s]
Validatioin...: 100%|██████████| 75/75 [00:03<00:00, 20.06it/s]


Early stopping at epoch 16
Restoring model weights from the end of the best epoch.


In [14]:
# test数据处理
test_cls_result, test_reg_result = model.predict(test_ds)





In [15]:
from sklearn.metrics import classification_report
test_true = test_data['label']
test_pred = np.argmax(tf.nn.softmax(test_cls_result), axis=-1)
print(classification_report(test_true, test_pred))

              precision    recall  f1-score   support

           0       0.39      0.47      0.43    140712
           1       0.30      0.49      0.37    124668
           2       0.50      0.22      0.31    200100

    accuracy                           0.37    465480
   macro avg       0.39      0.39      0.37    465480
weighted avg       0.41      0.37      0.36    465480



In [16]:
output_df = test_data[['stock_code', 'stock_name', 'datetime', 'open', 'high', 'low', 'close', 'volume']]
output_df['label_cls'] = test_data['label']
output_df['label_reg'] = test_data['return']
# 处理分类结果
output_df['label_cls_pred'] = test_pred
# 处理回归结果
output_df['label_reg_pred'] = test_reg_result

In [17]:
output_df.to_pickle(f'../../Offline/backtest/backtest_data/test/{benchmark}_stock_selection_results_{test_start_date}_cls.pkl') 

In [18]:
sample_df = output_df[output_df['stock_code'] == '688981']
sample_df = sample_df[pd.to_datetime(sample_df['datetime']).dt.year == 2021]

In [19]:
# plot_close_label(sample_df, 'label_cls')

In [20]:
# plot_close_label(sample_df, 'label_cls_pred')