In [54]:
import warnings
warnings.filterwarnings("ignore")

import os

# Only the TensorFlow backend supports string inputs.
os.environ["KERAS_BACKEND"] = "tensorflow"

import math

import pandas as pd
import akshare as ak

from datetime import datetime, timedelta
from tqdm import tqdm

pd.options.display.max_rows=None
pd.options.display.max_columns=None

In [2]:
# 1. 获取中证50（000016）的股票列表
stock_code_list = ak.index_stock_cons('000016')['品种代码'].to_list()
stock_code_list[:5]

['688981', '688041', '601988', '601601', '600150']

In [3]:
# 2. 获取这些股票的个股信息
all_stock_individual_info = pd.DataFrame([ak.stock_individual_info_em(symbol=stock_code).set_index('item').to_dict()['value'] for stock_code in tqdm(stock_code_list)]).rename(columns={
                    "总市值": "total_market_cap",
                    "流通市值": "circulating_market_cap",
                    "行业": "industry",
                    "上市时间": "listing_date",
                    "股票代码": "stock_code",
                    "股票简称": "stock_name",
                    "总股本": "total_shares",
                    "流通股": "circulating_shares",
                })
all_stock_individual_info.head()

100%|██████████| 50/50 [00:04<00:00, 10.11it/s]


Unnamed: 0,stock_code,stock_name,total_shares,circulating_shares,total_market_cap,circulating_market_cap,industry,listing_date
0,688981,中芯国际,7946658000.0,1973609000.0,349652900000.0,86838800000.0,半导体,20200716
1,688041,海光信息,2324338000.0,880557200.0,178323200000.0,67556350000.0,半导体,20220812
2,601988,中国银行,294387800000.0,210765500000.0,1268811000000.0,908399400000.0,银行,20060705
3,601601,中国太保,9620341000.0,6845041000.0,250321300000.0,178108000000.0,保险,20071225
4,600150,中国船舶,4472429000.0,4472429000.0,140434300000.0,140434300000.0,船舶制造,19980520


In [4]:
# 3. 获取这些股票的历史数据
all_stock_history_info = []
for stock_code in tqdm(stock_code_list):
    stock_history_info = ak.stock_zh_a_hist(symbol=stock_code, adjust='hfq').rename(
            columns={
                "日期": "datetime",
                "开盘": "open",
                "最高": "high",
                "最低": "low",
                "收盘": "close",
                "成交量": "volume",
                "成交额": "turnover",
                "振幅": "amplitude",
                "涨跌幅": "change_pct",
                "涨跌额": "change_amount",
                "换手率": "turnover_rate",
            }
        )
    stock_history_info.insert(0, "stock_code", stock_code)
    all_stock_history_info.append(stock_history_info)
all_stock_history_info = pd.concat(all_stock_history_info)
all_stock_history_info.head()

100%|██████████| 50/50 [00:12<00:00,  4.02it/s]


Unnamed: 0,stock_code,datetime,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,688981,2020-07-16,95.0,82.92,95.0,80.0,5522480,47979120000.0,54.62,201.97,55.46,53.09
1,688981,2020-07-17,79.0,77.06,84.9,75.0,2195971,17397820000.0,11.94,-7.07,-5.86,21.11
2,688981,2020-07-20,77.19,79.17,80.51,70.02,2286412,17009810000.0,13.61,2.74,2.11,21.98
3,688981,2020-07-21,78.3,78.63,82.89,77.77,1619190,12981260000.0,6.47,-0.68,-0.54,15.57
4,688981,2020-07-22,77.8,79.57,81.78,77.2,1339817,10685590000.0,5.82,1.2,0.94,12.88


In [5]:
# 4. 构建label表（10天后的收益率 > 10%）
all_stock_label_info = all_stock_history_info[['stock_code', 'datetime', 'close']]
all_stock_label_info = all_stock_label_info.sort_values(['stock_code', 'datetime'])
all_stock_label_info['close_in_10_days'] = all_stock_label_info.groupby('stock_code')['close'].shift(-10)
all_stock_label_info['return_10_days'] = all_stock_label_info['close_in_10_days'] / all_stock_label_info['close'] - 1
# all_stock_label_info['label'] = 0
# all_stock_label_info.loc[all_stock_label_info['return_10_days'] >= 0.1, 'label'] = 1
all_stock_label_info = all_stock_label_info[['stock_code', 'datetime', 'return_10_days']]
all_stock_label_info.head()

Unnamed: 0,stock_code,datetime,return_10_days
0,600028,2001-08-08,-0.050459
1,600028,2001-08-09,-0.037471
2,600028,2001-08-10,-0.087558
3,600028,2001-08-13,-0.069767
4,600028,2001-08-14,-0.07243


In [6]:
# 5. label表join特征表
wide_table_info = all_stock_label_info.merge(all_stock_individual_info[['stock_code', 'industry']], how='left').merge(all_stock_history_info, on=["stock_code", "datetime"], how="left")
wide_table_info.head()

Unnamed: 0,stock_code,datetime,return_10_days,industry,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,600028,2001-08-08,-0.050459,石油行业,4.6,4.36,4.7,4.31,6775533,3015767000.0,9.24,3.32,0.14,44.0
1,600028,2001-08-09,-0.037471,石油行业,4.34,4.27,4.35,4.23,1568098,671175000.0,2.75,-2.06,-0.09,10.18
2,600028,2001-08-10,-0.087558,石油行业,4.27,4.34,4.4,4.25,808393,349733000.0,3.51,1.64,0.07,5.25
3,600028,2001-08-13,-0.069767,石油行业,4.33,4.3,4.36,4.27,380127,163571000.0,2.07,-0.92,-0.04,2.47
4,600028,2001-08-14,-0.07243,石油行业,4.3,4.28,4.35,4.27,282869,121521000.0,1.86,-0.47,-0.02,1.84


In [9]:
# 使用tensorflow处理原始数据
import numpy as np
import pandas as pd

import tensorflow as tf

print(tf.__version__)

2.15.0


In [47]:
# 6. 选择固定时间区间的数据
wide_table_info['target'] = np.where(wide_table_info['return_10_days']>=0.1, 1, 0)
# dataframe = wide_table_info

train_start_date = pd.to_datetime('2000-01-01')
train_end_date = pd.to_datetime('2009-12-31')
val_start_date = pd.to_datetime('2010-01-01')
val_end_date = pd.to_datetime('2011-12-31')
test_start_date = pd.to_datetime('2012-01-01')
test_end_date = pd.to_datetime('2013-12-31')

train_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= train_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= train_end_date)]
val_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= val_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= val_end_date)]
test_data = wide_table_info[(pd.to_datetime(wide_table_info['datetime']) >= test_start_date) & (pd.to_datetime(wide_table_info['datetime']) <= test_end_date)]

train_data = train_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]
val_data = val_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]
test_data = test_data[['target', 'industry', 'open', 'close', 'high', 'low', 'volume', 'turnover', 'amplitude','change_pct', 'change_amount', 'turnover_rate']]

In [48]:
print(train_data['target'].value_counts())
print(val_data['target'].value_counts())
print(test_data['target'].value_counts())

target
0    39558
1     7138
Name: count, dtype: int64
target
0    15560
1     1108
Name: count, dtype: int64
target
0    16860
1     1114
Name: count, dtype: int64


In [56]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('target')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

train_ds = df_to_dataset(train_data)
val_ds = df_to_dataset(test_data)
test_ds = df_to_dataset(test_data)

In [57]:
TARGET_FEATURE_NAME = "target"
TARGET_FEATURE_LABELS = ["0", "1"]

NUMERIC_FEATURE_NAMES = [
    'open',
    'close',
    'high',
    'low',
    'volume',
    'turnover',
    'amplitude',
    'change_pct',
    'change_amount',
    'turnover_rate'
]

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "industry": list(train_data["industry"].unique()),
}

CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

NUM_CLASSES = len(TARGET_FEATURE_LABELS)

In [60]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = tf.keras.layers.Input(
                name=feature_name, shape=(), dtype="float32"
            )
        else:
            inputs[feature_name] = tf.keras.layers.Input(
                name=feature_name, shape=(), dtype="string"
            )
    return inputs

def encode_inputs(inputs, use_embedding=False):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            lookup = tf.keras.layers.StringLookup(
                vocabulary=vocabulary,
                mask_token=None,
                num_oov_indices=0,
                output_mode="int" if use_embedding else "binary",
            )
            if use_embedding:
                # Convert the string input values into integer indices.
                encoded_feature = lookup(inputs[feature_name])
                embedding_dims = int(math.sqrt(len(vocabulary)))
                # Create an embedding layer with the specified dimensions.
                embedding = tf.keras.layers.Embedding(
                    input_dim=len(vocabulary), output_dim=embedding_dims
                )
                # Convert the index values to embedding representations.
                encoded_feature = embedding(encoded_feature)
            else:
                # Convert the string input values into a one hot encoding.
                encoded_feature = lookup(
                    tf.expand_dims(inputs[feature_name], -1)
                )
        else:
            # Use the numerical features as-is.
            encoded_feature = tf.expand_dims(inputs[feature_name], -1)

        encoded_features.append(encoded_feature)

    all_features = tf.keras.layers.concatenate(encoded_features)
    return all_features

In [64]:
def run_experiment(model, train_ds, val_ds, test_ds):
    model.compile(
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )

    print("Start training the model...")
    history = model.fit(train_ds, epochs=num_epochs, validation_data=val_ds, verbose=2)
    print("Model training finished")

    _, accuracy = model.evaluate(test_ds, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

learning_rate = 0.001
dropout_rate = 0.2
batch_size = 26564
num_epochs = 20
hidden_units = [64, 32]

def create_baseline_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)

    for units in hidden_units:
        features = tf.keras.layers.Dense(units)(features)
        features = tf.keras.layers.BatchNormalization()(features)
        features = tf.keras.layers.ReLU()(features)
        features = tf.keras.layers.Dropout(dropout_rate)(features)

    outputs = tf.keras.layers.Dense(units=1)(features)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model


baseline_model = create_baseline_model()
# tf.keras.utils.plot_model(baseline_model, show_shapes=True, rankdir="LR")

In [66]:
run_experiment(baseline_model, train_ds, val_ds, test_ds)



Start training the model...
Epoch 1/20
1460/1460 - 2s - loss: 0.4460 - accuracy: 0.8456 - val_loss: 0.2959 - val_accuracy: 0.9380 - 2s/epoch - 1ms/step
Epoch 2/20
1460/1460 - 1s - loss: 0.4339 - accuracy: 0.8471 - val_loss: 0.2998 - val_accuracy: 0.9380 - 1s/epoch - 712us/step
Epoch 3/20
1460/1460 - 1s - loss: 0.4308 - accuracy: 0.8471 - val_loss: 0.2929 - val_accuracy: 0.9380 - 1s/epoch - 721us/step
Epoch 4/20
1460/1460 - 1s - loss: 0.4296 - accuracy: 0.8471 - val_loss: 0.2947 - val_accuracy: 0.9380 - 1s/epoch - 720us/step
Epoch 5/20
1460/1460 - 1s - loss: 0.4289 - accuracy: 0.8471 - val_loss: 0.2924 - val_accuracy: 0.9380 - 1s/epoch - 717us/step
Epoch 6/20
1460/1460 - 1s - loss: 0.4283 - accuracy: 0.8471 - val_loss: 0.2921 - val_accuracy: 0.9380 - 1s/epoch - 714us/step
Epoch 7/20
1460/1460 - 1s - loss: 0.4275 - accuracy: 0.8471 - val_loss: 0.2833 - val_accuracy: 0.9380 - 1s/epoch - 719us/step
Epoch 8/20
1460/1460 - 1s - loss: 0.4275 - accuracy: 0.8471 - val_loss: 0.2773 - val_accurac

In [70]:
def create_wide_and_deep_model():
    inputs = create_model_inputs()
    wide = encode_inputs(inputs)
    wide = tf.keras.layers.BatchNormalization()(wide)

    deep = encode_inputs(inputs, use_embedding=True)
    for units in hidden_units:
        deep = tf.keras.layers.Dense(units)(deep)
        deep = tf.keras.layers.BatchNormalization()(deep)
        deep = tf.keras.layers.ReLU()(deep)
        deep = tf.keras.layers.Dropout(dropout_rate)(deep)

    merged = tf.keras.layers.concatenate([wide, deep])
    outputs = tf.keras.layers.Dense(units=1)(merged)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model


wide_and_deep_model = create_wide_and_deep_model()
# keras.utils.plot_model(wide_and_deep_model, show_shapes=True, rankdir="LR")

In [71]:
run_experiment(wide_and_deep_model,train_ds, val_ds, test_ds)



Start training the model...
Epoch 1/20
1460/1460 - 2s - loss: 0.4453 - accuracy: 0.8439 - val_loss: 0.3324 - val_accuracy: 0.9143 - 2s/epoch - 1ms/step
Epoch 2/20
1460/1460 - 1s - loss: 0.4211 - accuracy: 0.8470 - val_loss: 0.3186 - val_accuracy: 0.9143 - 1s/epoch - 789us/step
Epoch 3/20
1460/1460 - 1s - loss: 0.4175 - accuracy: 0.8471 - val_loss: 0.3490 - val_accuracy: 0.9143 - 1s/epoch - 795us/step
Epoch 4/20
1460/1460 - 1s - loss: 0.4176 - accuracy: 0.8471 - val_loss: 0.3166 - val_accuracy: 0.9143 - 1s/epoch - 789us/step
Epoch 5/20
1460/1460 - 1s - loss: 0.4163 - accuracy: 0.8471 - val_loss: 0.3040 - val_accuracy: 0.9143 - 1s/epoch - 792us/step
Epoch 6/20
1460/1460 - 1s - loss: 0.4163 - accuracy: 0.8471 - val_loss: 0.2772 - val_accuracy: 0.9143 - 1s/epoch - 798us/step
Epoch 7/20
1460/1460 - 1s - loss: 0.4160 - accuracy: 0.8471 - val_loss: 0.2877 - val_accuracy: 0.9143 - 1s/epoch - 791us/step
Epoch 8/20
1460/1460 - 1s - loss: 0.4161 - accuracy: 0.8471 - val_loss: 0.2918 - val_accurac