In [2]:
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import pandas_ta as ta
import akshare as ak

from datetime import datetime, timedelta
from tqdm import tqdm

pd.options.display.max_rows=None
pd.options.display.max_columns=None

In [7]:
# 1. 获取中证50（000016）的股票列表
stock_code_list = ak.index_stock_cons('000016')['品种代码'].to_list()
stock_code_list[:5]

['688981', '688041', '601988', '601601', '600150']

In [16]:
# 2. 获取这些股票的个股信息
all_stock_individual_info = pd.DataFrame([ak.stock_individual_info_em(symbol=stock_code).set_index('item').to_dict()['value'] for stock_code in tqdm(stock_code_list)]).rename(columns={
                    "总市值": "total_market_cap",
                    "流通市值": "circulating_market_cap",
                    "行业": "industry",
                    "上市时间": "listing_date",
                    "股票代码": "stock_code",
                    "股票简称": "stock_name",
                    "总股本": "total_shares",
                    "流通股": "circulating_shares",
                })
all_stock_individual_info.head()

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:04<00:00, 10.95it/s]


Unnamed: 0,total_market_cap,circulating_market_cap,industry,listing_date,stock_code,stock_name,total_shares,circulating_shares
0,340911600000.0,84667830000.0,半导体,20200716,688981,中芯国际,7946658000.0,1973609000.0
1,176068600000.0,66702210000.0,半导体,20220812,688041,海光信息,2324338000.0,880557200.0
2,1280587000000.0,916830000000.0,银行,20060705,601988,中国银行,294387800000.0,210765500000.0
3,247531400000.0,176122900000.0,保险,20071225,601601,中国太保,9620341000.0,6845041000.0
4,141641800000.0,141641800000.0,船舶制造,19980520,600150,中国船舶,4472429000.0,4472429000.0


In [35]:
# 3. 获取这些股票的历史数据
all_stock_history_info = []
for stock_code in tqdm(stock_code_list):
    stock_history_info = ak.stock_zh_a_hist(symbol=stock_code, adjust='hfq').rename(
            columns={
                "日期": "datetime",
                "开盘": "open",
                "最高": "high",
                "最低": "low",
                "收盘": "close",
                "成交量": "volume",
                "成交额": "turnover",
                "振幅": "amplitude",
                "涨跌幅": "change_pct",
                "涨跌额": "change_amount",
                "换手率": "turnover_rate",
            }
        )
    stock_history_info.insert(0, "stock_code", stock_code)
    all_stock_history_info.append(stock_history_info)
all_stock_history_info = pd.concat(all_stock_history_info)
all_stock_history_info.head()

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:12<00:00,  3.86it/s]


Unnamed: 0,stock_code,datetime,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,688981,2020-07-16,95.0,82.92,95.0,80.0,5522480,47979120000.0,54.62,201.97,55.46,53.09
1,688981,2020-07-17,79.0,77.06,84.9,75.0,2195971,17397820000.0,11.94,-7.07,-5.86,21.11
2,688981,2020-07-20,77.19,79.17,80.51,70.02,2286412,17009810000.0,13.61,2.74,2.11,21.98
3,688981,2020-07-21,78.3,78.63,82.89,77.77,1619190,12981260000.0,6.47,-0.68,-0.54,15.57
4,688981,2020-07-22,77.8,79.57,81.78,77.2,1339817,10685590000.0,5.82,1.2,0.94,12.88


In [61]:
# 4. 构建label表（10天后的收益率 > 10%）
all_stock_label_info = all_stock_history_info[['stock_code', 'datetime', 'close']]
all_stock_label_info = all_stock_label_info.sort_values(['stock_code', 'datetime'])
all_stock_label_info['close_in_10_days'] = all_stock_label_info.groupby('stock_code')['close'].shift(-10)
all_stock_label_info['return_10_days'] = all_stock_label_info['close_in_10_days'] / all_stock_label_info['close'] - 1
all_stock_label_info['label'] = 0
all_stock_label_info.loc[all_stock_label_info['return_10_days'] >= 0.1, 'label'] = 1
all_stock_label_info = all_stock_label_info[['stock_code', 'datetime', 'label']]
all_stock_label_info.head()

Unnamed: 0,stock_code,datetime,label
0,600028,2001-08-08,0
1,600028,2001-08-09,0
2,600028,2001-08-10,0
3,600028,2001-08-13,0
4,600028,2001-08-14,0


In [64]:
# 5. label表join特征表
wide_table_info = all_stock_label_info.merge(all_stock_individual_info[['stock_code', 'industry']], how='left').merge(all_stock_history_info, on=["stock_code", "datetime"], how="left")
wide_table_info.head()

Unnamed: 0,stock_code,datetime,label,industry,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,600028,2001-08-08,0,石油行业,4.6,4.36,4.7,4.31,6775533,3015767000.0,9.24,3.32,0.14,44.0
1,600028,2001-08-09,0,石油行业,4.34,4.27,4.35,4.23,1568098,671175000.0,2.75,-2.06,-0.09,10.18
2,600028,2001-08-10,0,石油行业,4.27,4.34,4.4,4.25,808393,349733000.0,3.51,1.64,0.07,5.25
3,600028,2001-08-13,0,石油行业,4.33,4.3,4.36,4.27,380127,163571000.0,2.07,-0.92,-0.04,2.47
4,600028,2001-08-14,0,石油行业,4.3,4.28,4.35,4.27,282869,121521000.0,1.86,-0.47,-0.02,1.84


In [80]:
wide_table_info.head()

Unnamed: 0,stock_code,datetime,label,industry,open,close,high,low,volume,turnover,amplitude,change_pct,change_amount,turnover_rate
0,600028,2001-08-08,0,石油行业,4.6,4.36,4.7,4.31,6775533,3015767000.0,9.24,3.32,0.14,44.0
1,600028,2001-08-09,0,石油行业,4.34,4.27,4.35,4.23,1568098,671175000.0,2.75,-2.06,-0.09,10.18
2,600028,2001-08-10,0,石油行业,4.27,4.34,4.4,4.25,808393,349733000.0,3.51,1.64,0.07,5.25
3,600028,2001-08-13,0,石油行业,4.33,4.3,4.36,4.27,380127,163571000.0,2.07,-0.92,-0.04,2.47
4,600028,2001-08-14,0,石油行业,4.3,4.28,4.35,4.27,282869,121521000.0,1.86,-0.47,-0.02,1.84


In [83]:
# 6. 选择固定时间区间的数据
train_start_date = pd.to_datetime('2000-01-01')
train_end_date = pd.to_datetime('2009-12-31')
test_start_date = pd.to_datetime('2010-01-01')
test_end_date = pd.to_datetime('2012-12-31')

# train_data = wide_table_info[(wide_table_info[['datetime']] >= train_start_date) & (wide_table_info[['datetime']] <= train_end_date)]
test_data = wide_table_info[(wide_table_info[['datetime']] >= test_start_date) & (wide_table_info[['datetime']] <= test_end_date)]

In [86]:
wide_table_info[['datetime']].head()

Unnamed: 0,datetime
0,2001-08-08
1,2001-08-09
2,2001-08-10
3,2001-08-13
4,2001-08-14
