In [54]:
import warnings
warnings.filterwarnings("ignore")

import math

import pandas as pd
import akshare as ak

from datetime import datetime, timedelta
from tqdm import tqdm

pd.options.display.max_rows=None
pd.options.display.max_columns=None

In [2]:
# 1. 获取中证50（000016）的股票列表
stock_code_list = ak.index_stock_cons('000016')['品种代码'].to_list()
stock_code_list[:5]

['688981', '688041', '601988', '601601', '600150']

In [3]:
# 2. 获取这些股票的个股信息
all_stock_individual_info = pd.DataFrame([ak.stock_individual_info_em(symbol=stock_code).set_index('item').to_dict()['value'] for stock_code in tqdm(stock_code_list)]).rename(columns={
                    "总市值": "total_market_cap",
                    "流通市值": "circulating_market_cap",
                    "行业": "industry",
                    "上市时间": "listing_date",
                    "股票代码": "stock_code",
                    "股票简称": "stock_name",
                    "总股本": "total_shares",
                    "流通股": "circulating_shares",
                })
all_stock_individual_info.head()

100%|██████████| 50/50 [00:04<00:00, 10.11it/s]


Unnamed: 0,stock_code,stock_name,total_shares,circulating_shares,total_market_cap,circulating_market_cap,industry,listing_date
0,688981,中芯国际,7946658000.0,1973609000.0,349652900000.0,86838800000.0,半导体,20200716
1,688041,海光信息,2324338000.0,880557200.0,178323200000.0,67556350000.0,半导体,20220812
2,601988,中国银行,294387800000.0,210765500000.0,1268811000000.0,908399400000.0,银行,20060705
3,601601,中国太保,9620341000.0,6845041000.0,250321300000.0,178108000000.0,保险,20071225
4,600150,中国船舶,4472429000.0,4472429000.0,140434300000.0,140434300000.0,船舶制造,19980520


In [4]:
# 3. 获取这些股票的历史数据
all_stock_history_info = []
for stock_code in tqdm(stock_code_list):
    stock_history_info = ak.stock_zh_a_hist(symbol=stock_code, adjust='hfq').rename(
            columns={
                "日期": "datetime",
                "开盘": "open",
                "最高": "high",
                "最低": "low",
                "收盘": "close",
                "成交量": "volume",
                "成交额": "turnover",
                "振幅": "amplitude",
                "涨跌幅": "change_pct",
                "涨跌额": "change_amount",
                "换手率": "turnover_rate",
            }
        )
    stock_history_info.insert(0, "stock_code", stock_code)
    all_stock_history_info.append(stock_history_info)
all_stock_history_info = pd.concat(all_stock_history_info)
all_stock_history_info.head()

100%|██████████| 50/50 [00:12<00:00,  4.02it/s]


Unnamed: 0,stock_code,datetime,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,688981,2020-07-16,95.0,82.92,95.0,80.0,5522480,47979120000.0,54.62,201.97,55.46,53.09
1,688981,2020-07-17,79.0,77.06,84.9,75.0,2195971,17397820000.0,11.94,-7.07,-5.86,21.11
2,688981,2020-07-20,77.19,79.17,80.51,70.02,2286412,17009810000.0,13.61,2.74,2.11,21.98
3,688981,2020-07-21,78.3,78.63,82.89,77.77,1619190,12981260000.0,6.47,-0.68,-0.54,15.57
4,688981,2020-07-22,77.8,79.57,81.78,77.2,1339817,10685590000.0,5.82,1.2,0.94,12.88


In [5]:
# 4. 构建label表（10天后的收益率）
all_stock_label_info = all_stock_history_info[['stock_code', 'datetime', 'close']]
all_stock_label_info = all_stock_label_info.sort_values(['stock_code', 'datetime'])
all_stock_label_info['close_in_10_days'] = all_stock_label_info.groupby('stock_code')['close'].shift(-10)
all_stock_label_info['return_10_days'] = all_stock_label_info['close_in_10_days'] / all_stock_label_info['close'] - 1
all_stock_label_info = all_stock_label_info[['stock_code', 'datetime', 'return_10_days']]
all_stock_label_info.head()

Unnamed: 0,stock_code,datetime,return_10_days
0,600028,2001-08-08,-0.050459
1,600028,2001-08-09,-0.037471
2,600028,2001-08-10,-0.087558
3,600028,2001-08-13,-0.069767
4,600028,2001-08-14,-0.07243


In [6]:
# 5. label表join特征表
wide_table_info = all_stock_label_info.merge(all_stock_individual_info[['stock_code', 'industry']], how='left').merge(all_stock_history_info, on=["stock_code", "datetime"], how="left")
wide_table_info.head()

Unnamed: 0,stock_code,datetime,return_10_days,industry,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,600028,2001-08-08,-0.050459,石油行业,4.6,4.36,4.7,4.31,6775533,3015767000.0,9.24,3.32,0.14,44.0
1,600028,2001-08-09,-0.037471,石油行业,4.34,4.27,4.35,4.23,1568098,671175000.0,2.75,-2.06,-0.09,10.18
2,600028,2001-08-10,-0.087558,石油行业,4.27,4.34,4.4,4.25,808393,349733000.0,3.51,1.64,0.07,5.25
3,600028,2001-08-13,-0.069767,石油行业,4.33,4.3,4.36,4.27,380127,163571000.0,2.07,-0.92,-0.04,2.47
4,600028,2001-08-14,-0.07243,石油行业,4.3,4.28,4.35,4.27,282869,121521000.0,1.86,-0.47,-0.02,1.84


In [9]:
# 使用tensorflow处理原始数据
import numpy as np
import pandas as pd

import tensorflow as tf

print(tf.__version__)

2.15.0


In [130]:
# 6. 选择固定时间区间的数据
wide_table_info['target'] = np.where(wide_table_info['return_10_days']>=0.1, 1, 0)
# dataframe = wide_table_info

train_start_date = pd.to_datetime('2000-01-01')
train_end_date = pd.to_datetime('2020-12-31')
val_start_date = pd.to_datetime('2021-01-01')
val_end_date = pd.to_datetime('2021-12-31')
test_start_date = pd.to_datetime('2022-01-01')
test_end_date = pd.to_datetime('2022-12-31')

train_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= train_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= train_end_date)]
val_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= val_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= val_end_date)]
test_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= test_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= test_end_date)]

train_data = train_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]
validation_data = val_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]
test_data = test_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]

In [131]:
print(f"train_data ::: {train_data['target'].value_counts()}")
print(f"validation_data ::: {validation_data['target'].value_counts()}")
print(f"test_data ::: {test_data['target'].value_counts()}")

train_data ::: target
0    136624
1     15631
Name: count, dtype: int64
validation_data ::: target
0    10505
1     1134
Name: count, dtype: int64
test_data ::: target
0    11237
1      709
Name: count, dtype: int64


In [132]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('target')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

train_ds = df_to_dataset(train_data)
val_ds = df_to_dataset(test_data)
test_ds = df_to_dataset(test_data)

In [135]:
def get_numeric_boundaries(series, num_bins=20):
    return pd.qcut(series, num_bins, retbins=True)[1].tolist()

In [136]:
TARGET_FEATURE_NAME = "target"
TARGET_FEATURE_LABELS = ["0", "1"]

# 连续特征分桶
NUMERIC_FEATURES_WITH_BOUNDARIES = {
    'open': get_numeric_boundaries(train_data['open']),
    'close': get_numeric_boundaries(train_data['close']),
    'high': get_numeric_boundaries(train_data['high']),
    'low': get_numeric_boundaries(train_data['low']),
    'volume': get_numeric_boundaries(train_data['volume']),
    'turnover': get_numeric_boundaries(train_data['turnover']),
    'amplitude': get_numeric_boundaries(train_data['amplitude']),
    'change_pct': get_numeric_boundaries(train_data['change_pct']),
    'change_amount': get_numeric_boundaries(train_data['change_amount']),
    'turnover_rate': get_numeric_boundaries(train_data['turnover_rate'])
}
NUMERIC_FEATURE_NAMES = list(NUMERIC_FEATURES_WITH_BOUNDARIES.keys())

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "industry": sorted(list(train_data["industry"].unique())),
}
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

NUM_CLASSES = len(TARGET_FEATURE_LABELS)

In [139]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = tf.keras.layers.Input(
                name=feature_name, shape=(), dtype="float32"
            )
        else:
            inputs[feature_name] = tf.keras.layers.Input(
                name=feature_name, shape=(), dtype="string"
            )
    return inputs

def encode_inputs(inputs):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in NUMERIC_FEATURE_NAMES: # 处理连续特征
            discret = tf.keras.layers.Discretization(bin_boundaries=NUMERIC_FEATURES_WITH_BOUNDARIES[feature_name],output_mode='one_hot')
            encoded_feature = discret(inputs[feature_name])
        elif feature_name in CATEGORICAL_FEATURE_NAMES: # 处理类别特征
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            lookup = tf.keras.layers.StringLookup(
                vocabulary=vocabulary,
                mask_token=None,
                num_oov_indices=0,
                output_mode="int"
            )
            embedding_dims = int(math.sqrt(len(vocabulary)))
            embedding = tf.keras.layers.Embedding(
                input_dim=len(vocabulary), output_dim=embedding_dims
            )
            encoded_feature = embedding(lookup(inputs[feature_name]))

        # print(encoded_feature)
        encoded_features.append(encoded_feature)

    all_features = tf.keras.layers.concatenate(encoded_features)
    return all_features

In [140]:
LEARNING_RATE = 0.001

def run_experiment(model, train_ds, val_ds, test_ds):
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE)
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.AUC(from_logits=True)]
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics,
    )

    print("Start training the model...")
    history = model.fit(train_ds, epochs=num_epochs, validation_data=val_ds, verbose=2)
    print("Model training finished")

    loss, auc = model.evaluate(test_ds, verbose=0)
    print(f"Test AUC::{round(auc * 100, 2)}%")


dropout_rate = 0.2
batch_size = 32
num_epochs = 20
hidden_units = [64, 32]

In [141]:
def create_baseline_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)

    for units in hidden_units:
        features = tf.keras.layers.Dense(units)(features)
        features = tf.keras.layers.BatchNormalization()(features)
        features = tf.keras.layers.ReLU()(features)
        features = tf.keras.layers.Dropout(dropout_rate)(features)

    outputs = tf.keras.layers.Dense(units=1)(features)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model


baseline_model = create_baseline_model()
# tf.keras.utils.plot_model(baseline_model, show_shapes=True, rankdir="LR")
run_experiment(baseline_model, train_ds, val_ds, test_ds)

Start training the model...
Epoch 1/20


4758/4758 - 4s - loss: 0.3270 - auc_10: 0.6451 - val_loss: 0.2178 - val_auc_10: 0.7103 - 4s/epoch - 862us/step
Epoch 2/20
4758/4758 - 4s - loss: 0.3091 - auc_10: 0.6915 - val_loss: 0.2172 - val_auc_10: 0.7166 - 4s/epoch - 739us/step
Epoch 3/20
4758/4758 - 3s - loss: 0.3055 - auc_10: 0.7060 - val_loss: 0.2236 - val_auc_10: 0.7170 - 3s/epoch - 733us/step
Epoch 4/20
4758/4758 - 3s - loss: 0.3036 - auc_10: 0.7131 - val_loss: 0.2228 - val_auc_10: 0.7189 - 3s/epoch - 729us/step
Epoch 5/20
4758/4758 - 3s - loss: 0.3026 - auc_10: 0.7176 - val_loss: 0.2228 - val_auc_10: 0.7164 - 3s/epoch - 730us/step
Epoch 6/20
4758/4758 - 4s - loss: 0.3013 - auc_10: 0.7212 - val_loss: 0.2221 - val_auc_10: 0.7086 - 4s/epoch - 739us/step
Epoch 7/20
4758/4758 - 3s - loss: 0.2995 - auc_10: 0.7291 - val_loss: 0.2188 - val_auc_10: 0.7185 - 3s/epoch - 734us/step
Epoch 8/20
4758/4758 - 3s - loss: 0.2985 - auc_10: 0.7320 - val_loss: 0.2260 - val_auc_10: 0.7038 - 3s/epoch - 732us/step
Epoch 9/20
4758/4758 - 3s - loss: 0

In [142]:
def create_wide_and_deep_model():
    inputs = create_model_inputs()
    wide = encode_inputs(inputs)
    wide = tf.keras.layers.BatchNormalization()(wide)

    deep = encode_inputs(inputs)
    for units in hidden_units:
        deep = tf.keras.layers.Dense(units)(deep)
        deep = tf.keras.layers.BatchNormalization()(deep)
        deep = tf.keras.layers.ReLU()(deep)
        deep = tf.keras.layers.Dropout(dropout_rate)(deep)

    merged = tf.keras.layers.concatenate([wide, deep])
    outputs = tf.keras.layers.Dense(units=1)(merged)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model


wide_and_deep_model = create_wide_and_deep_model()
# keras.utils.plot_model(wide_and_deep_model, show_shapes=True, rankdir="LR")

run_experiment(wide_and_deep_model,train_ds, val_ds, test_ds)

Start training the model...
Epoch 1/20
4758/4758 - 16s - loss: 0.3175 - auc_11: 0.6705 - val_loss: 0.2269 - val_auc_11: 0.6972 - 16s/epoch - 3ms/step
Epoch 2/20
4758/4758 - 4s - loss: 0.3086 - auc_11: 0.6935 - val_loss: 0.2255 - val_auc_11: 0.7179 - 4s/epoch - 893us/step
Epoch 3/20
4758/4758 - 4s - loss: 0.3069 - auc_11: 0.7001 - val_loss: 0.2235 - val_auc_11: 0.7093 - 4s/epoch - 901us/step
Epoch 4/20
4758/4758 - 4s - loss: 0.3050 - auc_11: 0.7070 - val_loss: 0.2315 - val_auc_11: 0.7202 - 4s/epoch - 898us/step
Epoch 5/20
4758/4758 - 4s - loss: 0.3040 - auc_11: 0.7112 - val_loss: 0.2221 - val_auc_11: 0.7159 - 4s/epoch - 901us/step
Epoch 6/20
4758/4758 - 4s - loss: 0.3023 - auc_11: 0.7174 - val_loss: 0.2199 - val_auc_11: 0.7128 - 4s/epoch - 911us/step
Epoch 7/20
4758/4758 - 4s - loss: 0.3009 - auc_11: 0.7222 - val_loss: 0.2241 - val_auc_11: 0.7187 - 4s/epoch - 913us/step
Epoch 8/20
4758/4758 - 4s - loss: 0.2996 - auc_11: 0.7267 - val_loss: 0.2211 - val_auc_11: 0.7142 - 4s/epoch - 910us/s

In [143]:
wide_and_deep_model.save('./hh_quant_tf_wdl_model')
reloaded_model = tf.keras.models.load_model('./hh_quant_tf_wdl_model')

INFO:tensorflow:Assets written to: ./hh_quant_tf_wdl_model/assets


INFO:tensorflow:Assets written to: ./hh_quant_tf_wdl_model/assets


In [157]:
samples = df_to_dataset(test_data.iloc[:100, :], shuffle=False, batch_size=10)

In [179]:
predictions = reloaded_model.predict(samples)
prob = tf.nn.sigmoid(tf.squeeze(predictions))

# print(
#     "This particular pet had a %.4f percent probability "
#     "of getting adopted." % (100 * prob)
# )

 1/10 [==>...........................] - ETA: 0s



In [183]:
prob.numpy()

array([0.00348943, 0.00317763, 0.00133001, 0.01444007, 0.00292474,
       0.00119652, 0.00156069, 0.05393143, 0.01038108, 0.01554941,
       0.00767644, 0.00408939, 0.00223689, 0.00453854, 0.00095397,
       0.00954288, 0.00606288, 0.0027912 , 0.00702181, 0.01829319,
       0.01770207, 0.0032179 , 0.01009125, 0.07406902, 0.02187479,
       0.0148585 , 0.02194769, 0.00408939, 0.00220401, 0.00091424,
       0.03934461, 0.004161  , 0.01706548, 0.01176338, 0.01534522,
       0.00326095, 0.03925095, 0.03460063, 0.00453671, 0.03560941,
       0.02112727, 0.05165057, 0.00223689, 0.01535452, 0.00936906,
       0.05116212, 0.00188385, 0.00317763, 0.00320557, 0.00073819,
       0.00220401, 0.00091424, 0.00172525, 0.00472252, 0.02044583,
       0.01364129, 0.00300095, 0.00408939, 0.006973  , 0.006973  ,
       0.00453854, 0.00172525, 0.00304535, 0.0040098 , 0.00479647,
       0.00348943, 0.00164652, 0.0015653 , 0.01353635, 0.0015653 ,
       0.00870013, 0.00606334, 0.01058601, 0.00702181, 0.01402

In [174]:
# tf.nn.sigmoid(predictions)