In [3]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


from get_stock_data import Downloader, mkdir
import os
import argparse
from tqdm import tqdm
import pandas as pd
import numpy as np
import catboost as cb
import datetime
import functools
from collections import defaultdict, Counter

from utils import reduce_mem


# 使用该股对应的板块的open close等，作为特征
def concept_feature(data):
    print('Begin concept feature')
    data.code = data.code.apply(lambda x:int(x[3:]))
    data.date = data.date.apply(lambda x: int(x.replace('-', '')))
    concept_df = pd.read_csv(params.concept_path)[['代码', '板块名称']]
    concept_hist_df = pd.read_csv(params.concept_hist_path)
    concept_df['代码'] = concept_df['代码'].apply(lambda x:str(x).zfill(6))
    
    concept_hist_df['日期'] = concept_hist_df['日期'].apply(lambda x:int(x.replace('-', '')))
    
    concept_counter = Counter([c for c in concept_df['板块名称'].values if '昨日' not in c])

    concept_dic = defaultdict(list)
    for code, concept in concept_df.values:
        if '昨日' in concept:continue
        concept_dic[int(code)].append(concept)

    def compare_concept(x, y):
        x, y = concept_counter[x], concept_counter[y]
        if x < y:return -1
        if x > y: return 1
        return 0
    for k, v in concept_dic.items():
        concept_dic[k] = sorted(v, key=functools.cmp_to_key(compare_concept))

    data['concept_0'] = data.code.apply(lambda x:concept_dic[int(x)][0] if len(concept_dic[x])>0 else np.nan)
    data['concept_1'] = data.code.apply(lambda x:concept_dic[int(x)][1] if len(concept_dic[x])>1 else np.nan)
    data['concept_2'] = data.code.apply(lambda x:concept_dic[int(x)][2] if len(concept_dic[x])>2 else np.nan)

    data['concept_-3'] = data.code.apply(lambda x:concept_dic[int(x)][-3] if len(concept_dic[x])>3 else np.nan)
    data['concept_-2'] = data.code.apply(lambda x:concept_dic[int(x)][-2] if len(concept_dic[x])>4 else np.nan)
    data['concept_-1'] = data.code.apply(lambda x:concept_dic[int(x)][-1] if len(concept_dic[x])>5 else np.nan)
    
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_0'], right_on=['日期', '板块名称'], how='left')
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_1'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_1'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_2'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_2'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-3'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-3'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-2'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-2'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-1'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-1'))
    
    
    # label encoder
    concept_labelencoder = {c:i for i, c in enumerate(np.unique(concept_df['板块名称'].values))}
    concept_labelencoder.update({np.nan:np.nan})
    data['concept_0'] = data['concept_0'].apply(lambda x:concept_labelencoder[x])
    data['concept_1'] = data['concept_1'].apply(lambda x:concept_labelencoder[x])
    data['concept_2'] = data['concept_2'].apply(lambda x:concept_labelencoder[x])
    
    data['concept_-3'] = data['concept_-3'].apply(lambda x:concept_labelencoder[x])
    data['concept_-2'] = data['concept_-2'].apply(lambda x:concept_labelencoder[x])
    data['concept_-1'] = data['concept_-1'].apply(lambda x:concept_labelencoder[x])
    return data

def feature_engineer(data, window_size):
    data = data.sort_values(by=['code', 'date'])
    alpha_list = ['open', 'high', 'low', 'close', 'volume', 'amount', 'adjustflag', 'turn', 'pctChg', 'peTTM', 'psTTM', 'pcfNcfTTM', 'pbMRQ']
    
    stock_industry = pd.read_csv(params.industry_path, encoding="gbk")
#     stock_industry.code = stock_industry.code.apply(lambda x:int(x[3:]))s
    
    from sklearn.preprocessing import LabelEncoder
    lbe = LabelEncoder()
    stock_industry['industry'] = lbe.fit_transform(stock_industry['industry'])
    data = pd.merge(data, stock_industry[['code', 'industry']], how='left', on='code')
    
    # concept feature
    data = concept_feature(data)
    alpha_list += [f'{x}{i}' for x in ['收盘', '换手率', '成交额'] for i in ['', '_1', '_2', '_-3', '_-2', '_-1']]
    data = reduce_mem(data, list(data))

    # alpha net
    length = window_size
    
    
    for name in tqdm(alpha_list):
#     for name in tqdm(['open']):
        roll_feature = []
        for i, group in data.groupby('code', sort=False)[name]:
            values = group.tolist()
            values = [0]*(length - 1) + values
            roll_feature = roll_feature + [values[i:i+length] for i in range(len(group))]
        roll_columns = [f'{name}_dt{i}' for i in range(length)]
        data = pd.concat([data, pd.DataFrame(roll_feature, columns=roll_columns)], axis=1).reset_index(drop=True)
        data = reduce_mem(data, roll_columns)
    
    # generate label
    data['label'] = data.groupby('code', sort=False).close.transform(lambda x:(x.shift(-14) - x) / (x + 1e-7) )
#     data = data.dropna(subset = ['label'], inplace=False)
#     data = data.replace(np.nan, 0)
    train, test = data[data['date'] <= params.date_split].reset_index(drop=True), data[data['date'] == params.date_split].reset_index(drop=True)
    train = train.dropna(subset = ['label'], inplace=False)
    return train, test

def update_dataset():
#     downloader = Downloader(params.raw_train_path, 
#                             date_start=params.date_start, # start和 end都包含
#                             date_end=params.date_end, 
#                             frequency='d',
#                             header=False,
#                             mode='all')
#     downloader.run()

    def df_concat(csv_path):
        df_arr = []
        for file_path in tqdm(os.listdir(csv_path)):
            try:
                if file_path.split('.') != 'csv':pass
                temp = pd.read_csv(csv_path + '/' + file_path, engine='python')
                temp['intDate'] = temp.date.apply(lambda x: int(x.replace('-', '')))
                temp = temp[temp['intDate'] >= params.date_abandon].reset_index(drop=True)
                if len(temp)==0:continue
                df_arr.append(temp)
            except:
                pass
        return pd.concat(df_arr)


    train = df_concat(params.raw_train_path)
    train = train.drop('intDate', axis=1)
    assert params.date_end in train.date.unique()
    train.to_csv(params.train_path, index=False)
    return

def preprocess():
    mode = params.mode
    if mode == 'debug':
        data = pd.read_csv(params.train_path, nrows=100000)
    else:
        data = pd.read_csv(params.train_path)

#     data.date = data.date.apply(lambda x: int(x.replace('-', '')))
#     data.code = data.code.apply(lambda x:x[3:])
#     data = data[data['date'] >= params.date_abandon].reset_index(drop=True)
    return data

def infer_model(train, test):
    ycol = 'label'
    feature_names = list(
        filter(lambda x: x not in [ycol, 'code', 'date', ''] and '日期' not in x and '板块名称' not in x, train.columns))

    # print(feature_names)

    quantile_30, quantile_70 = train.label.quantile([0.3, 0.7]).values
    print('quantile_30:', quantile_30)
    print('quantile_70:', quantile_70)
    def label_quantile(x):
        if x<quantile_30:return 0
        elif x<quantile_70:return 1
        else:return 2

    train.label = train.label.apply(label_quantile)
    test.label = test.label.apply(label_quantile)
    X_train = train[feature_names];Y_train = train[ycol]
    X_val = test[feature_names];Y_val = test[ycol]

    model = cb.CatBoostClassifier(eval_metric="AUC", task_type='GPU', **params.model_params)
    cat_model = model.fit(X_train,
                          Y_train,
                          eval_set=(X_val, Y_val),
                          verbose=500)

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': cat_model.feature_importances_,
    })

    # cat_model.save_model(f'cb_{frequency}.model')

    pd.set_option('display.max_rows', None)
    print(df_importance)
    cat_model.save_model(f'model/concept_{params.window_size}_debug.model')
    
    # prediction and save
    prediction = cat_model.predict_proba(X_val)
    test['label_-1'] = prediction[:,0]
    test['label_0'] = prediction[:,1]
    test['label_1'] = prediction[:,2]
    test.to_csv(params.submit_path, index=False)

def train_model(train, test):
    ycol = 'label'
    feature_names = list(
        filter(lambda x: x not in [ycol, 'code', 'date', ''] and '日期' not in x and '板块名称' not in x, train.columns))

    # print(feature_names)

    quantile_30, quantile_70 = train.label.quantile([0.3, 0.7]).values
    print('quantile_30:', quantile_30)
    print('quantile_70:', quantile_70)
    def label_quantile(x):
        if x<quantile_30:return 0
        elif x<quantile_70:return 1
        else:return 2

    train.label = train.label.apply(label_quantile)
    test.label = test.label.apply(label_quantile)
    X_train = train[feature_names];Y_train = train[ycol]
    X_val = test[feature_names];Y_val = test[ycol]

    model = cb.CatBoostClassifier(eval_metric="AUC", task_type='GPU', **params.model_params)
    cat_model = model.fit(X_train,
                          Y_train,
                          eval_set=(X_val, Y_val),
                          verbose=500)

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': cat_model.feature_importances_,
    })

    # cat_model.save_model(f'cb_{frequency}.model')

    pd.set_option('display.max_rows', None)
    print(df_importance)
    cat_model.save_model(f'model/concept_{params.window_size}_debug.model')
    
    # prediction and save
    prediction = cat_model.predict_proba(X_val)
    test['label_-1'] = prediction[:,0]
    test['label_0'] = prediction[:,1]
    test['label_1'] = prediction[:,2]
    test.to_csv(params.submit_path, index=False)

def main():
    # 获取全部股票的日K线数据
    mkdir('stockdata/d_data')
    
    # update dataset to today
#     update_dataset()

    # preprocess 
    data = preprocess()

    # feature engineer
    train, test = feature_engineer(data, params.window_size)
    
    # train model
    train_model(train, test)

class params:
    mode = 'debug'
    window_size = 14
    
    date_abandon = 20210101
    date_split = 20221111
    date_start = '2022-06-02'
    date_end = '2022-11-11' # 不能选择今天的日期，周末的日期
    
    raw_train_path = 'stockdata/d_train'
    raw_test_path = 'stockdata/d_test'
    train_path = 'stockdata/d_data/train.csv'
    test_path = 'stockdata/d_data/test.csv'
    industry_path = 'stockdata/stock_industry.csv'
    concept_path = 'stockdata/concept_df.csv'
    concept_hist_path = 'stockdata/concept_hist_df.csv'
    submit_path = f'submit/{date_end}.csv'    
    
    model_params = {'n_estimators':5000,
          'learning_rate': 0.05,
          'max_depth': 7,
          'early_stopping_rounds':1000,
          'loss_function':'MultiClass',
           'classes_count':3,
          'max_bin':512,
          'subsample':0.8,
          'bootstrap_type':'Poisson',
          'random_seed':np.random.randint(0,2021)}
    
    
if __name__ == '__main__':
    main()

Begin concept feature


  0%|          | 0/31 [00:00<?, ?it/s]

74.77 Mb, 26.89 Mb (64.03 %)


  3%|▎         | 1/31 [00:00<00:23,  1.29it/s]

36.81 Mb, 28.80 Mb (21.76 %)


  6%|▋         | 2/31 [00:01<00:20,  1.43it/s]

39.48 Mb, 31.47 Mb (20.29 %)


 10%|▉         | 3/31 [00:02<00:18,  1.48it/s]

42.15 Mb, 34.14 Mb (19.00 %)


 13%|█▎        | 4/31 [00:02<00:18,  1.50it/s]

44.82 Mb, 36.81 Mb (17.87 %)


 16%|█▌        | 5/31 [00:03<00:17,  1.53it/s]

47.49 Mb, 42.15 Mb (11.24 %)


 19%|█▉        | 6/31 [00:04<00:16,  1.53it/s]

52.83 Mb, 47.49 Mb (10.11 %)


 23%|██▎       | 7/31 [00:04<00:17,  1.41it/s]

58.17 Mb, 48.83 Mb (16.07 %)


 26%|██▌       | 8/31 [00:05<00:16,  1.38it/s]

59.51 Mb, 51.50 Mb (13.46 %)


 29%|██▉       | 9/31 [00:06<00:16,  1.35it/s]

62.18 Mb, 54.17 Mb (12.88 %)


 32%|███▏      | 10/31 [00:07<00:15,  1.39it/s]

64.85 Mb, 56.84 Mb (12.35 %)


 35%|███▌      | 11/31 [00:07<00:14,  1.41it/s]

67.52 Mb, 59.51 Mb (11.86 %)


 39%|███▊      | 12/31 [00:08<00:13,  1.43it/s]

70.19 Mb, 64.85 Mb (7.61 %)


 42%|████▏     | 13/31 [00:09<00:12,  1.43it/s]

75.53 Mb, 67.52 Mb (10.61 %)


 45%|████▌     | 14/31 [00:09<00:11,  1.43it/s]

78.20 Mb, 70.19 Mb (10.24 %)


 48%|████▊     | 15/31 [00:10<00:11,  1.43it/s]

80.87 Mb, 72.86 Mb (9.91 %)


 52%|█████▏    | 16/31 [00:11<00:10,  1.42it/s]

83.54 Mb, 75.53 Mb (9.59 %)


 55%|█████▍    | 17/31 [00:11<00:09,  1.41it/s]

86.21 Mb, 78.20 Mb (9.29 %)


 58%|█████▊    | 18/31 [00:12<00:09,  1.37it/s]

88.88 Mb, 80.87 Mb (9.01 %)


 61%|██████▏   | 19/31 [00:13<00:08,  1.37it/s]

91.55 Mb, 83.54 Mb (8.75 %)


 65%|██████▍   | 20/31 [00:14<00:08,  1.34it/s]

94.22 Mb, 86.21 Mb (8.50 %)


 68%|██████▊   | 21/31 [00:14<00:07,  1.34it/s]

96.89 Mb, 88.88 Mb (8.27 %)


 71%|███████   | 22/31 [00:15<00:06,  1.35it/s]

99.56 Mb, 91.55 Mb (8.05 %)


 74%|███████▍  | 23/31 [00:16<00:05,  1.35it/s]

102.23 Mb, 94.22 Mb (7.84 %)


 77%|███████▋  | 24/31 [00:17<00:05,  1.33it/s]

104.90 Mb, 96.89 Mb (7.64 %)


 81%|████████  | 25/31 [00:18<00:04,  1.28it/s]

107.57 Mb, 99.56 Mb (7.45 %)


 84%|████████▍ | 26/31 [00:18<00:03,  1.28it/s]

110.24 Mb, 104.90 Mb (4.84 %)


 87%|████████▋ | 27/31 [00:19<00:03,  1.29it/s]

115.59 Mb, 110.24 Mb (4.62 %)


 90%|█████████ | 28/31 [00:20<00:02,  1.28it/s]

120.93 Mb, 115.59 Mb (4.42 %)


 94%|█████████▎| 29/31 [00:21<00:01,  1.26it/s]

126.27 Mb, 120.93 Mb (4.23 %)


 97%|█████████▋| 30/31 [00:22<00:00,  1.25it/s]

131.61 Mb, 126.27 Mb (4.06 %)


100%|██████████| 31/31 [00:22<00:00,  1.36it/s]

136.95 Mb, 131.61 Mb (3.90 %)





quantile_30: -0.04883117675781257
quantile_70: 0.037139892578125


AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0000000	best: 0.0000000 (0)	total: 192ms	remaining: 16m
500:	test: 0.0000000	best: 0.0000000 (0)	total: 40.2s	remaining: 6m
1000:	test: 0.0000000	best: 0.0000000 (0)	total: 1m 19s	remaining: 5m 17s
bestTest = 0
bestIteration = 0
              column  importance
0               open    0.172851
1               high    0.407215
2                low    0.191159
3              close    0.471232
4           preclose    0.182935
5             volume    0.179168
6             amount    1.175502
7         adjustflag    0.000000
8               turn    1.064915
9        tradestatus    0.000000
10            pctChg    0.611603
11             peTTM    0.590218
12             psTTM    0.773675
13         pcfNcfTTM    1.241107
14             pbMRQ    1.223850
15              isST    0.048746
16          industry    0.681678
17         concept_0    0.993355
18         concept_1    0.821528
19         concept_2    0.308214
20        concept_-3    0.375083
21        concept_-2    0.244870
2

In [None]:
## 需要预测未来的股价涨跌百分比排行