In [1]:
import warnings
warnings.filterwarnings("ignore")

import math

import pandas as pd
import akshare as ak

from datetime import datetime, timedelta
from tqdm import tqdm

pd.options.display.max_rows=None
pd.options.display.max_columns=None

In [2]:
# 1. 获取中证50（000016）的股票列表
stock_code_list = ak.index_stock_cons('000016')['品种代码'].to_list()
stock_code_list[:5]

['688981', '688041', '601988', '601601', '600150']

In [3]:
# 2. 获取这些股票的个股信息
all_stock_individual_info = pd.DataFrame([ak.stock_individual_info_em(symbol=stock_code).set_index('item').to_dict()['value'] for stock_code in tqdm(stock_code_list)]).rename(columns={
                    "总市值": "total_market_cap",
                    "流通市值": "circulating_market_cap",
                    "行业": "industry",
                    "上市时间": "listing_date",
                    "股票代码": "stock_code",
                    "股票简称": "stock_name",
                    "总股本": "total_shares",
                    "流通股": "circulating_shares",
                })
all_stock_individual_info.head()

100%|██████████| 50/50 [00:04<00:00, 10.29it/s]


Unnamed: 0,stock_code,stock_name,total_shares,circulating_shares,total_market_cap,circulating_market_cap,industry,listing_date
0,688981,中芯国际,7946658000.0,1973609000.0,349652900000.0,86838800000.0,半导体,20200716
1,688041,海光信息,2324338000.0,880557200.0,178323200000.0,67556350000.0,半导体,20220812
2,601988,中国银行,294387800000.0,210765500000.0,1268811000000.0,908399400000.0,银行,20060705
3,601601,中国太保,9620341000.0,6845041000.0,250321300000.0,178108000000.0,保险,20071225
4,600150,中国船舶,4472429000.0,4472429000.0,140434300000.0,140434300000.0,船舶制造,19980520


In [4]:
# 3. 获取这些股票的历史数据
all_stock_history_info = []
for stock_code in tqdm(stock_code_list):
    stock_history_info = ak.stock_zh_a_hist(symbol=stock_code, adjust='hfq').rename(
            columns={
                "日期": "datetime",
                "开盘": "open",
                "最高": "high",
                "最低": "low",
                "收盘": "close",
                "成交量": "volume",
                "成交额": "turnover",
                "振幅": "amplitude",
                "涨跌幅": "change_pct",
                "涨跌额": "change_amount",
                "换手率": "turnover_rate",
            }
        )
    stock_history_info.insert(0, "stock_code", stock_code)
    all_stock_history_info.append(stock_history_info)
all_stock_history_info = pd.concat(all_stock_history_info)
all_stock_history_info.head()

100%|██████████| 50/50 [00:14<00:00,  3.55it/s]


Unnamed: 0,stock_code,datetime,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,688981,2020-07-16,95.0,82.92,95.0,80.0,5522480,47979120000.0,54.62,201.97,55.46,53.09
1,688981,2020-07-17,79.0,77.06,84.9,75.0,2195971,17397820000.0,11.94,-7.07,-5.86,21.11
2,688981,2020-07-20,77.19,79.17,80.51,70.02,2286412,17009810000.0,13.61,2.74,2.11,21.98
3,688981,2020-07-21,78.3,78.63,82.89,77.77,1619190,12981260000.0,6.47,-0.68,-0.54,15.57
4,688981,2020-07-22,77.8,79.57,81.78,77.2,1339817,10685590000.0,5.82,1.2,0.94,12.88


In [112]:
# 4. 构建label
all_stock_label_info = all_stock_history_info[['stock_code', 'datetime', 'close']]
all_stock_label_info = all_stock_label_info.sort_values(by=["stock_code", "datetime"])
# 计算日收益率 & 历史窗口期（10天）内的平均收益率&标准差
all_stock_label_info["daily_return"] = all_stock_label_info.groupby("stock_code")["close"].pct_change()
all_stock_label_info["mean_return"] = all_stock_label_info.groupby("stock_code")["daily_return"].transform(lambda x: x.rolling(10).mean())
all_stock_label_info["std_return"] = all_stock_label_info.groupby("stock_code")["daily_return"].transform(lambda x: x.rolling(10).std())
# 计算未来5天的收益率
all_stock_label_info["close_in_5_days"] = all_stock_label_info.groupby("stock_code")["close"].shift(-5)
all_stock_label_info["return_5_days"] = all_stock_label_info["close_in_5_days"] / all_stock_label_info["close"] - 1
# 构建label列
all_stock_label_info["target"] = 0  # 默认设置为0
all_stock_label_info.loc[all_stock_label_info["return_5_days"] > all_stock_label_info["mean_return"] + 2 * all_stock_label_info["std_return"], "target"] = 1
all_stock_label_info.loc[all_stock_label_info["return_5_days"] < all_stock_label_info["mean_return"] - 2 * all_stock_label_info["std_return"], "target"] = 2
# 删除有NaN值的行，因为历史统计和未来数据可能不完整
all_stock_label_info.dropna(subset=["mean_return", "std_return", "close_in_5_days"], inplace=True)
# 构建Label表
all_stock_label_info = all_stock_label_info[["stock_code", "datetime", "target"]]
all_stock_label_info.head()

Unnamed: 0,stock_code,datetime,target
10,600028,2001-08-22,0
11,600028,2001-08-23,2
12,600028,2001-08-27,0
13,600028,2001-08-28,0
14,600028,2001-08-29,0


In [113]:
# 5. label表join特征表
wide_table_info = all_stock_label_info.merge(all_stock_individual_info[['stock_code', 'industry']], how='left').merge(all_stock_history_info, on=["stock_code", "datetime"], how="left")
wide_table_info.head()

Unnamed: 0,stock_code,datetime,target,industry,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,600028,2001-08-22,0,石油行业,4.24,4.14,4.25,4.13,781415,324714000.0,2.83,-2.36,-0.1,5.07
1,600028,2001-08-23,2,石油行业,4.12,4.11,4.13,4.02,353094,143967000.0,2.66,-0.72,-0.03,2.29
2,600028,2001-08-27,0,石油行业,4.05,3.96,4.06,3.91,338673,134824000.0,3.65,-3.65,-0.15,2.2
3,600028,2001-08-28,0,石油行业,3.93,4.0,4.01,3.91,261038,103591000.0,2.53,1.01,0.04,1.7
4,600028,2001-08-29,0,石油行业,4.01,3.97,4.03,3.97,127005,50647000.0,1.5,-0.75,-0.03,0.82


In [114]:
# 使用tensorflow处理原始数据
import numpy as np
import pandas as pd

import tensorflow as tf

print(tf.__version__)

2.15.0


In [115]:
# 6. 选择固定时间区间的数据
train_start_date = pd.to_datetime('2000-01-01')
train_end_date = pd.to_datetime('2020-12-31')
val_start_date = pd.to_datetime('2021-01-01')
val_end_date = pd.to_datetime('2021-12-31')
test_start_date = pd.to_datetime('2022-01-01')
test_end_date = pd.to_datetime('2022-12-31')

train_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= train_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= train_end_date)]
val_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= val_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= val_end_date)]
test_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= test_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= test_end_date)]

train_data = train_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]
validation_data = val_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]
test_data = test_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]

In [119]:
train_0, train_1, train_2 = np.bincount(train_data['target'])
train_total = train_0 + train_1 + train_2
print('Train:\nTotal: {}, Normal: {},Positive: {}, Negative:{} \n'.format(train_total, train_0, train_1, train_2))

val_0, val_1, val_2 = np.bincount(validation_data['target'])
val_total = val_0 + val_1 + val_2
print('Validation:\nTotal: {}, Normal: {},Positive: {}, Negative:{} \n'.format(val_total, val_0, val_1, val_2))

test_0, test_1, test_2 = np.bincount(test_data['target'])
test_total = test_0 + test_1 + test_2
print('Test:\nTotal: {}, Normal: {},Positive: {}, Negative:{} \n'.format(test_total, test_0, test_1, test_2))


# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0 = (1 / train_0) * (train_total / 2.0)
weight_for_1 = (1 / train_1) * (train_total / 2.0)
weight_for_2 = (1 / train_2) * (train_total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
print('Weight for class 2: {:.2f}'.format(weight_for_2))

Train:
Total: 151865, Normal: 92334,Positive: 32386, Negative:27145 

Validation:
Total: 11619, Normal: 7303,Positive: 2278, Negative:2038 

Test:
Total: 11936, Normal: 7484,Positive: 2191, Negative:2261 

Weight for class 0: 0.82
Weight for class 1: 2.34
Weight for class 2: 2.80


In [125]:
def get_numeric_boundaries(series, num_bins=20):
    return pd.qcut(series, num_bins, retbins=True)[1].tolist()

TARGET_FEATURE_NAME = "target"
TARGET_FEATURE_LABELS = ["0", "1", "2"]
TARGET_FEATURE_LENGTH = len(TARGET_FEATURE_LABELS)

# 连续特征分桶
NUMERIC_FEATURES_WITH_BOUNDARIES = {
    'open': get_numeric_boundaries(train_data['open']),
    'close': get_numeric_boundaries(train_data['close']),
    'high': get_numeric_boundaries(train_data['high']),
    'low': get_numeric_boundaries(train_data['low']),
    'volume': get_numeric_boundaries(train_data['volume']),
    'turnover': get_numeric_boundaries(train_data['turnover']),
    'amplitude': get_numeric_boundaries(train_data['amplitude']),
    'change_pct': get_numeric_boundaries(train_data['change_pct']),
    'change_amount': get_numeric_boundaries(train_data['change_amount']),
    'turnover_rate': get_numeric_boundaries(train_data['turnover_rate'])
}
NUMERIC_FEATURE_NAMES = list(NUMERIC_FEATURES_WITH_BOUNDARIES.keys())

# 离散特征embedding
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "industry": sorted(list(train_data["industry"].unique())),
}
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

In [121]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop(TARGET_FEATURE_NAME)
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

train_ds = df_to_dataset(train_data)
val_ds = df_to_dataset(test_data)
test_ds = df_to_dataset(test_data)

In [122]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = tf.keras.layers.Input(
                name=feature_name, shape=(), dtype="float32"
            )
        else:
            inputs[feature_name] = tf.keras.layers.Input(
                name=feature_name, shape=(), dtype="string"
            )
    return inputs

def encode_inputs(inputs):
    encoded_features = []
    embedding_dim = 4
    for feature_name in inputs:
        if feature_name in NUMERIC_FEATURE_NAMES: # 处理连续特征
            embedding_size = len(NUMERIC_FEATURES_WITH_BOUNDARIES[feature_name]) * 2
            embedding = tf.keras.layers.Embedding(
                input_dim=embedding_size, output_dim=embedding_dim
            )
            lookup_layer = tf.keras.layers.Discretization(bin_boundaries=NUMERIC_FEATURES_WITH_BOUNDARIES[feature_name],output_mode='int')
            encoded_feature = embedding(lookup_layer(inputs[feature_name]))
        elif feature_name in CATEGORICAL_FEATURE_NAMES: # 处理类别特征
            embedding_size = len(CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]) * 2
            embedding = tf.keras.layers.Embedding(
                input_dim=embedding_size, output_dim=embedding_dim
            )
            lookup_layer = tf.keras.layers.Hashing(num_bins=embedding_size)
            encoded_feature = embedding(lookup_layer(inputs[feature_name]))
        
        # print(encoded_feature)
        encoded_features.append(encoded_feature)
    
    all_features = tf.keras.layers.concatenate(encoded_features)
    return all_features

In [130]:
LEARNING_RATE = 5e-3
NUM_EPOCH = 50

def run_experiment(model, train_ds, val_ds, test_ds):
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE)
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics,
    )

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_sparse_categorical_accuracy',
        verbose=1,
        patience=10,
        mode='max',
        restore_best_weights=True
    )
    print("Start training the model...")
    history = model.fit(
        train_ds, 
        epochs=NUM_EPOCH, 
        validation_data=val_ds, 
        verbose=2,
        callbacks=[early_stopping],
        class_weight=class_weight
    )
    print("Model training finished")

    # loss, auc = model.evaluate(test_ds, verbose=0)
    # print(f"Test AUC::{round(auc * 100, 2)}%")

In [131]:
dropout_rate = 0.2
hidden_units = [64, 32]

In [132]:
def create_baseline_model(output_bias=None):
    inputs = create_model_inputs()
    features = encode_inputs(inputs)

    for units in hidden_units:
        features = tf.keras.layers.Dense(units)(features)
        features = tf.keras.layers.BatchNormalization()(features)
        features = tf.keras.layers.ReLU()(features)
        features = tf.keras.layers.Dropout(dropout_rate)(features)

    # outputs = tf.keras.layers.Dense(units=1, activation="sigmoid")(features)
    outputs = tf.keras.layers.Dense(units=TARGET_FEATURE_LENGTH, activation="softmax")(features)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

baseline_model = create_baseline_model()
# tf.keras.utils.plot_model(baseline_model, show_shapes=True, rankdir="LR")
run_experiment(baseline_model, train_ds, val_ds, test_ds)

Start training the model...
Epoch 1/20
4746/4746 - 4s - loss: 1.6707 - sparse_categorical_accuracy: 0.3124 - val_loss: 1.0877 - val_sparse_categorical_accuracy: 0.3660 - 4s/epoch - 939us/step
Epoch 2/20
4746/4746 - 4s - loss: 1.6441 - sparse_categorical_accuracy: 0.3202 - val_loss: 1.1016 - val_sparse_categorical_accuracy: 0.2666 - 4s/epoch - 794us/step
Epoch 3/20
4746/4746 - 4s - loss: 1.6419 - sparse_categorical_accuracy: 0.3321 - val_loss: 1.0973 - val_sparse_categorical_accuracy: 0.3288 - 4s/epoch - 797us/step
Epoch 4/20
4746/4746 - 4s - loss: 1.6400 - sparse_categorical_accuracy: 0.3355 - val_loss: 1.1016 - val_sparse_categorical_accuracy: 0.2805 - 4s/epoch - 796us/step
Epoch 5/20
4746/4746 - 4s - loss: 1.6392 - sparse_categorical_accuracy: 0.3363 - val_loss: 1.0998 - val_sparse_categorical_accuracy: 0.2874 - 4s/epoch - 793us/step
Epoch 6/20
4746/4746 - 4s - loss: 1.6374 - sparse_categorical_accuracy: 0.3379 - val_loss: 1.1015 - val_sparse_categorical_accuracy: 0.2721 - 4s/epoch -

In [133]:
def create_wide_and_deep_model():
    inputs = create_model_inputs()
    wide = encode_inputs(inputs)
    wide = tf.keras.layers.BatchNormalization()(wide)

    deep = encode_inputs(inputs)
    for units in hidden_units:
        deep = tf.keras.layers.Dense(units)(deep)
        deep = tf.keras.layers.BatchNormalization()(deep)
        deep = tf.keras.layers.ReLU()(deep)
        deep = tf.keras.layers.Dropout(dropout_rate)(deep)

    merged = tf.keras.layers.concatenate([wide, deep])
    # outputs = tf.keras.layers.Dense(units=1)(merged)
    outputs = tf.keras.layers.Dense(units=TARGET_FEATURE_LENGTH, activation="softmax")(merged)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model


wide_and_deep_model = create_wide_and_deep_model()
# keras.utils.plot_model(wide_and_deep_model, show_shapes=True, rankdir="LR")

run_experiment(wide_and_deep_model,train_ds, val_ds, test_ds)

Start training the model...
Epoch 1/20
4746/4746 - 6s - loss: 1.6681 - sparse_categorical_accuracy: 0.3398 - val_loss: 1.1178 - val_sparse_categorical_accuracy: 0.2932 - 6s/epoch - 1ms/step
Epoch 2/20
4746/4746 - 5s - loss: 1.6443 - sparse_categorical_accuracy: 0.3438 - val_loss: 1.1131 - val_sparse_categorical_accuracy: 0.2723 - 5s/epoch - 955us/step
Epoch 3/20
4746/4746 - 5s - loss: 1.6425 - sparse_categorical_accuracy: 0.3458 - val_loss: 1.0958 - val_sparse_categorical_accuracy: 0.3335 - 5s/epoch - 965us/step
Epoch 4/20
4746/4746 - 5s - loss: 1.6409 - sparse_categorical_accuracy: 0.3446 - val_loss: 1.0939 - val_sparse_categorical_accuracy: 0.3311 - 5s/epoch - 958us/step
Epoch 5/20
4746/4746 - 5s - loss: 1.6398 - sparse_categorical_accuracy: 0.3436 - val_loss: 1.1007 - val_sparse_categorical_accuracy: 0.3138 - 5s/epoch - 968us/step
Epoch 6/20
4746/4746 - 5s - loss: 1.6381 - sparse_categorical_accuracy: 0.3452 - val_loss: 1.0999 - val_sparse_categorical_accuracy: 0.2998 - 5s/epoch - 9

In [134]:
wide_and_deep_model.save('./hh_quant_tf_wdl_model')
reloaded_model = tf.keras.models.load_model('./hh_quant_tf_wdl_model')

INFO:tensorflow:Assets written to: ./hh_quant_tf_wdl_model/assets


INFO:tensorflow:Assets written to: ./hh_quant_tf_wdl_model/assets


In [135]:
samples = df_to_dataset(test_data.iloc[:100, :], shuffle=False, batch_size=10)

In [143]:
predictions = reloaded_model.predict(samples)
prob = tf.nn.softmax(tf.squeeze(predictions))

# print(
#     "This particular pet had a %.4f percent probability "
#     "of getting adopted." % (100 * prob)
# )

 1/10 [==>...........................] - ETA: 0s



In [146]:
prob.numpy().argmax(axis=-1)

array([1, 1, 0, 0, 2, 1, 1, 2, 0, 0, 0, 1, 1, 1, 2, 0, 2, 1, 0, 0, 0, 1,
       2, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 2, 1, 2, 1, 1, 1, 1,
       1, 2, 1, 2, 1, 1, 2, 1, 0, 1, 1, 2])