In [1]:
import sys
sys.path.append('../')

import json
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib
import baostock as bs
import json

from datetime import datetime
from dateutil.relativedelta import relativedelta
from tqdm import tqdm

from Share.database_auto.db_factor_prebuilder.utils.expression_excutor import AlphaExpressionExcutor

pd.options.display.max_rows=None
pd.options.display.max_columns=None

2024-06-14 05:40:55.851077: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def download_stock_history_info(code, start_date, end_date):
    df = bs.query_history_k_data_plus(
            code=code,
            fields="code,date,open,high,low,close,volume,amount,turn",
            start_date=start_date,
            end_date=end_date,
            frequency="d",
            adjustflag="1",
        ).get_data()
    if not df.empty:
        df = df.astype({
                "code": "str",
                "date": "str",
                "open": "float",
                "high": "float",
                "low": "float",
                "close": "float",
                "volume": "float",
                "amount": "float",
                "turn": "float",
            })
        df = df.rename(
            columns={
                "code": "code",
                "date": "datetime",
                "open": "open",
                "high": "high",
                "low": "low",
                "close": "close",
                "volume": "volume",
                "amount": "amount",
                "turn": "turnover_rate",
            }
        )
        
        df = df.replace("", np.NaN)
    return df

def download_stock_profile_info(code):
    df = bs.query_stock_basic(code=code).get_data()
    dd = bs.query_stock_industry(code=code).get_data()
    if not df.empty and not dd.empty:
        dataframe = df.merge(dd[["code", "industry"]], on=["code"], how="left")
        dataframe = dataframe.rename(
            columns={
                "code": "code",
                "code_name": "code_name",
                "industry": "industry",
                "ipoDate": "in_date",
                "outDate": "out_date",
                "type": "type",
                "status": "status",
            }
        )
        dataframe["industry"] = dataframe["industry"].replace("", "其他")
        dataframe["type"] = dataframe["type"].map(lambda x: {"1": "股票", "2": "指数", "3": "其它", "4": "可转债", "5": "ETF"}.get(x, "其他"))
        dataframe["status"] = dataframe["status"].map(lambda x: {"1": "上市", "2": "退市"}.get(x, "其他"))
    return dataframe

def download_stock_indicator_info(code, start_date, end_date):
    df = bs.query_history_k_data_plus(
            code=code,
            fields="code,date,peTTM,psTTM,pcfNcfTTM,pbMRQ",
            start_date=start_date,
            end_date=end_date,
            frequency="d",
            adjustflag="1",
        ).get_data()
    if not df.empty:
        df = df.astype({
                "code": "str",
                "date": "str",
                "peTTM": "float",
                "psTTM": "float",
                "pcfNcfTTM": "float",
                "pbMRQ": "float",
            }
        )
        df = df.rename(
            columns={
                "code": "code",
                "date": "datetime",
                "peTTM": "pe_ttm",
                "psTTM": "ps_ttm",
                "pcfNcfTTM": "pcf_ncf_ttm",
                "pbMRQ": "pb_mrq",
            }
        )
        df = df.replace("", np.NaN)
    return df

def build_alpha_factor_sync(stock_history, alpha_factor_path, exp_excutor):
    stock_history['vwap'] = stock_history[['open', 'high', 'low', 'close']].mean(axis=1)
    stock_history['returns'] = stock_history['close'].pct_change()
    alpha_factor_dict = json.loads(open(alpha_factor_path, "r").read())
    dataframe = stock_history[["code", "datetime"]]
    for alpha_name, alpha_expression in alpha_factor_dict.items():
        try:
            dataframe[alpha_name] = exp_excutor.excute(stock_history, alpha_expression)
        except Exception as e:
            dataframe[alpha_name] = np.NaN
    return dataframe

def build_date_factor_sync(start_date, end_date):
    dataframe = pd.DataFrame()
    dataframe['datetime'] = [datetime.strftime(i, '%Y-%m-%d') for i in pd.date_range(start=start_date, end=end_date)]
    datetime_series = pd.to_datetime(dataframe["datetime"])
    dataframe["weekday"] = datetime_series.dt.weekday  # 星期几（0=星期一，6=星期日）",
    dataframe["day_of_week"] = datetime_series.dt.day_name()  # 星期几的名称",
    dataframe["day_of_month"] = datetime_series.dt.day  # 一个月中的第几天",
    dataframe["month"] = datetime_series.dt.month  # 月份",
    dataframe["season"] = datetime_series.dt.month.map(
        lambda x: {
            1: "Winter",
            2: "Winter",
            3: "Spring",
            4: "Spring",
            5: "Spring",
            6: "Summer",
            7: "Summer",
            8: "Summer",
            9: "Autumn",
            10: "Autumn",
            11: "Autumn",
            12: "Winter",
        }.get(x)
    )
    dataframe = dataframe.replace([np.inf, -np.inf], np.nan).dropna()
    return dataframe

def df_to_dataset(dataframe, feature_cols, shuffle=False, batch_size=32):
    features = dataframe[feature_cols]
    ds = tf.data.Dataset.from_tensor_slices((dict(features)))
    if shuffle:
        ds = ds.shuffle(buffer_size=min(len(features), 10000), seed=1024)
    ds = ds.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

In [3]:
# 0. 初始化相关配置
exp_excutor = AlphaExpressionExcutor()
benchmark = '000016'
factor_lookback_window = 180 # 回看天数-180
# end_date = datetime.strftime(datetime.now(), '%Y-%m-%d')
end_date = '2024-06-13'
start_date = datetime.strftime(datetime.now() - relativedelta(days=factor_lookback_window), '%Y-%m-%d')
print(f"CurrentDate: {end_date}")

CurrentDate: 2024-06-13


In [4]:
# 1. 获取指数成分股信息
bs.login()
stock_pool = bs.query_sz50_stocks().get_data()['code'].unique()
print(stock_pool)

login success!
['sh.600028' 'sh.600030' 'sh.600031' 'sh.600036' 'sh.600048' 'sh.600050'
 'sh.600089' 'sh.600104' 'sh.600111' 'sh.600150' 'sh.600276' 'sh.600309'
 'sh.600406' 'sh.600436' 'sh.600438' 'sh.600519' 'sh.600690' 'sh.600809'
 'sh.600887' 'sh.600893' 'sh.600900' 'sh.600905' 'sh.601012' 'sh.601088'
 'sh.601166' 'sh.601225' 'sh.601288' 'sh.601318' 'sh.601390' 'sh.601398'
 'sh.601601' 'sh.601628' 'sh.601633' 'sh.601668' 'sh.601669' 'sh.601728'
 'sh.601857' 'sh.601888' 'sh.601899' 'sh.601919' 'sh.601988' 'sh.603259'
 'sh.603288' 'sh.603501' 'sh.603799' 'sh.603986' 'sh.688041' 'sh.688111'
 'sh.688599' 'sh.688981']


In [5]:
# 2. 获取基础数据
stock_df_list = []
for stock_code in tqdm(stock_pool, desc='LoadingStockData'):
    # 1. 获取基础信息
    stock_history = download_stock_history_info(stock_code, start_date, end_date)
    stock_profile = download_stock_profile_info(stock_code)
    stock_indicator = download_stock_indicator_info(stock_code, start_date, end_date)
    stock_date_factor = build_date_factor_sync(start_date, end_date)
    stock_alpha_184_factor = build_alpha_factor_sync(
        stock_history, 
        '../Share/database_auto/db_factor_prebuilder/factor_lib/alpha_184.json',
        exp_excutor)
    stock_df = stock_history.merge(stock_profile, on=['code']) \
            .merge(stock_indicator, on=['code', 'datetime']) \
            .merge(stock_date_factor, on=['datetime']) \
            .merge(stock_alpha_184_factor, on=['code', 'datetime'])
    stock_df_list.append(stock_df)

bs.logout()

LoadingStockData: 100%|██████████| 50/50 [00:46<00:00,  1.07it/s]

logout success!





<baostock.data.resultset.ResultData at 0x148b94d086d0>

In [6]:
stock_df_list[0]['datetime'].max()

'2024-06-13'

In [7]:
# 3. 整合数据 & 过滤今日的特征数据
whole_stock_df = pd.concat(stock_df_list)
whole_stock_df = whole_stock_df[whole_stock_df['datetime'] == end_date] # 只过滤今日预测明日的结果
whole_stock_df.shape

(50, 210)

In [8]:
# 4. 加载预处理相关内容
loaded_feature_preprocess_pipeline = joblib.load('./enhance_sz50/feature_preprocess_pipeline.joblib')
loaded_feature_config = json.load(open('./enhance_sz50/feature_config.json'))
loaded_model = tf.keras.models.load_model('./enhance_sz50/tf_models/')

2024-06-14 05:41:45.397665: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22270 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:25:00.0, compute capability: 8.6


In [9]:
# 5. 进行基础的数据处理 & 特征处理
def base_dataframe_cleaner(dataframe):
    stock_df = dataframe.copy()
    stock_df = stock_df.replace("", np.NaN)
    stock_df = stock_df.replace([np.inf, -np.inf], np.nan)
    stock_df = stock_df.dropna(axis=1, how='all') # 过滤特征列全为空的column
    stock_df = stock_df.dropna() # 剔除包含空值的行
    return stock_df

loaded_full_feature_columns = ['code', 'code_name', 'datetime'] + [i for feas in loaded_feature_config.values() for i in feas]
test_df = whole_stock_df[loaded_full_feature_columns]
test_df = base_dataframe_cleaner(test_df)

loaded_preprocess_feature_columns = loaded_feature_config.get('numeric_features', [])
test_df[loaded_preprocess_feature_columns] = loaded_feature_preprocess_pipeline.transform(test_df[loaded_preprocess_feature_columns])
test_df.shape

(50, 195)

In [10]:
loaded_feature_columns = loaded_feature_config.get('numeric_features', []) + loaded_feature_config.get('integer_categorical_features', []) + loaded_feature_config.get('string_categorical_features', [])
test_ds = df_to_dataset(test_df, loaded_feature_columns, shuffle=False, batch_size=512)

model_pred_result = loaded_model.predict(test_ds)
output_df = test_df[['code', 'code_name', 'datetime']]
output_df['label_pred'] = model_pred_result
output_df = output_df.rename(columns={
    'code': 'stock_code',
    'code_name': 'stock_name'
})



2024-06-14 05:41:49.711477: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [11]:
output_df.sort_values('label_pred', ascending=False).head()

Unnamed: 0,stock_code,stock_name,datetime,label_pred
115,sh.601988,中国银行,2024-06-13,0.527806
115,sh.601398,工商银行,2024-06-13,0.525711
115,sh.688981,中芯国际,2024-06-13,0.485161
115,sh.601288,农业银行,2024-06-13,0.483139
115,sh.600030,中信证券,2024-06-13,0.461266


## 对比验证下与回测数据打分的正确性

In [12]:
# model_pred_df = pd.read_pickle(f'../Offline/backtest/backtest_data/test/000016_2019-01-01_回归任务_v6.pkl')
# sample_df = model_pred_df[model_pred_df['datetime'] == '2024-05-13']
# sample_df.sort_values('label_pred', ascending=False).head()