In [None]:
from get_stock_data import Downloader, mkdir
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import catboost as cb
import datetime
import talib as ta
import functools
from collections import defaultdict, Counter

from utils import reduce_mem

# 获取全部股票的日K线数据
mkdir('stockdata/d_data')
raw_train_path = 'stockdata/d_train'
raw_test_path = 'stockdata/d_test'
train_path = 'stockdata/d_data/train.csv'
test_path = 'stockdata/d_data/test.csv'
industry_path = 'stockdata/stock_industry.csv'
mode = 'debug'


concept_path = 'stockdata/concept_df.csv'
concept_hist_path = 'stockdata/concept_hist_df.csv'

In [3]:
if mode == 'debug':
    train = pd.read_csv(train_path, nrows=100000)
    test = pd.read_csv(test_path, nrows=100000)
else:
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

In [None]:
train.date = train.date.apply(lambda x: int(x.replace('-', '')))
test.date = test.date.apply(lambda x: int(x.replace('-', '')))

train.code = train.code.apply(lambda x:x[3:])
test.code = test.code.apply(lambda x:x[3:])

In [None]:
train = train[train['date'] >= 20210101].reset_index(drop=True)

In [None]:
def concept_feature(data):
    print('Begin concept feature')
    concept_df = pd.read_csv(concept_path)[['代码', '板块名称']]
    concept_hist_df = pd.read_csv(concept_hist_path)
    concept_df['代码'] = concept_df['代码'].apply(lambda x:str(x).zfill(6))
    
    concept_hist_df['日期'] = concept_hist_df['日期'].apply(lambda x:int(x.replace('-', '')))
    
    concept_counter = Counter([c for c in concept_df['板块名称'].values if '昨日' not in c])

    concept_dic = defaultdict(list)
    for code, concept in concept_df.values:
        if '昨日' in concept:continue
        concept_dic[code].append(concept)

    def compare_concept(x, y):
        x, y = concept_counter[x], concept_counter[y]
        if x < y:return -1
        if x > y: return 1
        return 0
    for k, v in concept_dic.items():
        concept_dic[k] = sorted(v, key=functools.cmp_to_key(compare_concept))

    data['concept_0'] = data.code.apply(lambda x:concept_dic[x][0] if len(concept_dic[x])>0 else np.nan)
    data['concept_1'] = data.code.apply(lambda x:concept_dic[x][1] if len(concept_dic[x])>1 else np.nan)
    data['concept_2'] = data.code.apply(lambda x:concept_dic[x][2] if len(concept_dic[x])>2 else np.nan)

    data['concept_-3'] = data.code.apply(lambda x:concept_dic[x][-3] if len(concept_dic[x])>3 else np.nan)
    data['concept_-2'] = data.code.apply(lambda x:concept_dic[x][-2] if len(concept_dic[x])>4 else np.nan)
    data['concept_-1'] = data.code.apply(lambda x:concept_dic[x][-1] if len(concept_dic[x])>5 else np.nan)
    
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_0'], right_on=['日期', '板块名称'], how='left')
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_1'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_1'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_2'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_2'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-3'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-3'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-2'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-2'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-1'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-1'))
    
    
    # label encoder
    concept_labelencoder = {c:i for i, c in enumerate(np.unique(concept_df['板块名称'].values))}
    concept_labelencoder.update({np.nan:np.nan})
    data['concept_0'] = data['concept_0'].apply(lambda x:concept_labelencoder[x])
    data['concept_1'] = data['concept_1'].apply(lambda x:concept_labelencoder[x])
    data['concept_2'] = data['concept_2'].apply(lambda x:concept_labelencoder[x])
    
    data['concept_-3'] = data['concept_-3'].apply(lambda x:concept_labelencoder[x])
    data['concept_-2'] = data['concept_-2'].apply(lambda x:concept_labelencoder[x])
    data['concept_-1'] = data['concept_-1'].apply(lambda x:concept_labelencoder[x])
    return data

def feature_engineer(train, test, split=20220501):
    train_len = len(train)
    data = pd.concat((train, test), sort=False).reset_index(drop=True)
    data = data.sort_values(by=['code', 'date'])
    
    stock_industry = pd.read_csv(industry_path, encoding="gbk")
    from sklearn.preprocessing import LabelEncoder
    lbe = LabelEncoder()
    stock_industry['industry'] = lbe.fit_transform(stock_industry['industry'])
    data = pd.merge(data, stock_industry[['code', 'industry']], how='left', on='code')
    
    # concept feature
    data = concept_feature(data)
    data = reduce_mem(data, list(data))

    # alpha net 
    length = 30
    alpha_list = ['open', 'high', 'low', 'close', 'volume', 'amount', 'adjustflag', 'turn', 'pctChg', 'peTTM', 'psTTM', 'pcfNcfTTM', 'pbMRQ']
    alpha_list += [f'{x}{i}' for x in ['收盘', '换手率', '成交额'] for i in ['', '_1', '_2', '_-3', '_-2', '_-1']]
    for name in tqdm(alpha_list):
#     for name in tqdm(['open']):
        roll_feature = []
        for i, group in data.groupby('code', sort=False)[name]:
            values = group.tolist()
            values = [0]*(length - 1) + values
            roll_feature = roll_feature + [values[i:i+length] for i in range(len(group))]
        roll_columns = [f'{name}_dt{i}' for i in range(length)]
        data = pd.concat([data, pd.DataFrame(roll_feature, columns=roll_columns)], axis=1).reset_index(drop=True)
        data = reduce_mem(data, roll_columns)
    
    # generate label
    data['label'] = data.groupby('code', sort=False).close.transform(lambda x:(x.shift(-14) - x) / (x + 1e-7) )
    data = data.dropna(subset = ['label'], inplace=False)
#     data = data.replace(np.nan, 0)
    return data[data['date'] <= split].reset_index(drop=True), data[data['date'] > split].reset_index(drop=True)

In [None]:
train, test = feature_engineer(train, test)

# f_train_path = 'stockdata/d_data/f_train_debug.csv'
# f_test_path = 'stockdata/d_data/f_test_debug.csv'
# train.to_csv(f_train_path, index=False)
# test.to_csv(f_test_path, index=False)

Begin concept feature
83.58 Mb, 31.55 Mb (62.24 %)


  3%|███▍                                                                                                      | 1/31 [00:03<01:55,  3.84s/it]

56.29 Mb, 37.10 Mb (34.09 %)


  6%|██████▊                                                                                                   | 2/31 [00:07<01:48,  3.73s/it]

62.68 Mb, 43.49 Mb (30.61 %)


 10%|██████████▎                                                                                               | 3/31 [00:11<01:43,  3.69s/it]

69.08 Mb, 49.89 Mb (27.78 %)


 13%|█████████████▋                                                                                            | 4/31 [00:15<01:42,  3.80s/it]

75.47 Mb, 56.29 Mb (25.42 %)


 16%|█████████████████                                                                                         | 5/31 [00:19<01:39,  3.84s/it]

81.87 Mb, 69.08 Mb (15.62 %)


 19%|████████████████████▌                                                                                     | 6/31 [00:22<01:35,  3.84s/it]

94.66 Mb, 81.87 Mb (13.51 %)


 23%|███████████████████████▉                                                                                  | 7/31 [00:26<01:33,  3.89s/it]

107.46 Mb, 85.07 Mb (20.83 %)


 26%|███████████████████████████▎                                                                              | 8/31 [00:30<01:30,  3.91s/it]

110.65 Mb, 91.47 Mb (17.34 %)


 29%|██████████████████████████████▊                                                                           | 9/31 [00:34<01:25,  3.89s/it]

117.05 Mb, 97.86 Mb (16.39 %)


 32%|█████████████████████████████████▊                                                                       | 10/31 [00:38<01:20,  3.85s/it]

123.45 Mb, 108.95 Mb (11.74 %)


 35%|█████████████████████████████████████▎                                                                   | 11/31 [00:42<01:16,  3.83s/it]

134.53 Mb, 120.03 Mb (10.78 %)


 39%|████████████████████████████████████████▋                                                                | 12/31 [00:46<01:13,  3.84s/it]

145.62 Mb, 131.12 Mb (9.96 %)


 42%|████████████████████████████████████████████                                                             | 13/31 [00:49<01:09,  3.87s/it]

156.71 Mb, 137.52 Mb (12.24 %)


 45%|███████████████████████████████████████████████▍                                                         | 14/31 [00:53<01:06,  3.90s/it]

163.10 Mb, 143.91 Mb (11.76 %)


 48%|██████████████████████████████████████████████████▊                                                      | 15/31 [00:57<01:02,  3.92s/it]

169.50 Mb, 150.31 Mb (11.32 %)


 52%|██████████████████████████████████████████████████████▏                                                  | 16/31 [01:01<00:59,  3.96s/it]

175.89 Mb, 156.71 Mb (10.91 %)


 55%|█████████████████████████████████████████████████████████▌                                               | 17/31 [01:06<00:57,  4.08s/it]

182.29 Mb, 163.10 Mb (10.53 %)


 58%|████████████████████████████████████████████████████████████▉                                            | 18/31 [01:10<00:52,  4.07s/it]

188.69 Mb, 169.50 Mb (10.17 %)


 61%|████████████████████████████████████████████████████████████████▎                                        | 19/31 [01:14<00:48,  4.05s/it]

195.08 Mb, 175.89 Mb (9.84 %)


 65%|███████████████████████████████████████████████████████████████████▋                                     | 20/31 [01:18<00:45,  4.11s/it]

201.48 Mb, 182.29 Mb (9.52 %)


 68%|███████████████████████████████████████████████████████████████████████▏                                 | 21/31 [01:22<00:41,  4.13s/it]

207.88 Mb, 188.69 Mb (9.23 %)


 71%|██████████████████████████████████████████████████████████████████████████▌                              | 22/31 [01:26<00:36,  4.08s/it]

214.27 Mb, 195.08 Mb (8.96 %)


 74%|█████████████████████████████████████████████████████████████████████████████▉                           | 23/31 [01:30<00:32,  4.02s/it]

220.67 Mb, 201.48 Mb (8.70 %)


 77%|█████████████████████████████████████████████████████████████████████████████████▎                       | 24/31 [01:34<00:27,  3.94s/it]

227.06 Mb, 207.88 Mb (8.45 %)


 81%|████████████████████████████████████████████████████████████████████████████████████▋                    | 25/31 [01:38<00:23,  3.89s/it]

233.46 Mb, 214.27 Mb (8.22 %)


 84%|████████████████████████████████████████████████████████████████████████████████████████                 | 26/31 [01:41<00:19,  3.84s/it]

239.86 Mb, 227.06 Mb (5.33 %)


 87%|███████████████████████████████████████████████████████████████████████████████████████████▍             | 27/31 [01:45<00:15,  3.88s/it]

252.65 Mb, 239.86 Mb (5.06 %)


 90%|██████████████████████████████████████████████████████████████████████████████████████████████▊          | 28/31 [01:50<00:11,  3.97s/it]

265.44 Mb, 252.65 Mb (4.82 %)


 94%|██████████████████████████████████████████████████████████████████████████████████████████████████▏      | 29/31 [01:54<00:08,  4.09s/it]

278.23 Mb, 265.44 Mb (4.60 %)


 97%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 30/31 [01:58<00:04,  4.15s/it]

291.03 Mb, 278.23 Mb (4.40 %)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [02:03<00:00,  3.97s/it]

303.82 Mb, 291.03 Mb (4.21 %)





In [None]:
ycol = 'label'
feature_names = list(
    filter(lambda x: x not in [ycol, 'code', 'date', ''] and '日期' not in x and '板块名称' not in x, train.columns))

# print(feature_names)

In [None]:
quantile_30, quantile_70 = train.label.quantile([0.3, 0.7]).values
print('quantile_30:', quantile_30)
print('quantile_70:', quantile_70)

quantile_30: -0.048828125
quantile_70: 0.03344726562499997


In [None]:
def label_quantile(x):
    if x<quantile_30:
        return 0
    elif x<quantile_70:
        return 1
    else:
        return 2

In [None]:
train.label = train.label.apply(label_quantile)
test.label = test.label.apply(label_quantile)

In [None]:
params = {'n_estimators':5000,
      'learning_rate': 0.05,
      'max_depth': 7,
      'early_stopping_rounds':1000,
      'loss_function':'MultiClass',
       'classes_count':3,
      'max_bin':512,
#       'subsample':0.8,
#       'bootstrap_type':'Poisson',
      'random_seed':np.random.randint(0,2021)}

model = cb.CatBoostClassifier(eval_metric="AUC", task_type='GPU', **params)

X_train = train[feature_names]
Y_train = train[ycol]

X_val = test[feature_names]
Y_val = test[ycol]


cat_model = model.fit(X_train,
                      Y_train,
                      eval_set=(X_val, Y_val),
                      verbose=500)


df_importance = pd.DataFrame({
    'column': feature_names,
    'importance': cat_model.feature_importances_,
})

# cat_model.save_model(f'cb_{frequency}.model')

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5507103	best: 0.5507103 (0)	total: 222ms	remaining: 18m 31s
500:	test: 0.5743410	best: 0.5936335 (111)	total: 1m 16s	remaining: 11m 27s
1000:	test: 0.5661202	best: 0.5936335 (111)	total: 2m 19s	remaining: 9m 18s
bestTest = 0.5936334871
bestIteration = 111
Shrink model to first 112 iterations.


In [None]:
pd.set_option('display.max_rows', None)
print(df_importance)

               column  importance
0                open    0.372966
1                high    1.100948
2                 low    0.054398
3               close    2.413380
4            preclose    0.000000
5              volume    0.148314
6              amount    0.952151
7          adjustflag    0.000000
8                turn    2.864082
9         tradestatus    0.000000
10             pctChg    0.212371
11              peTTM    0.321489
12              psTTM    0.323412
13          pcfNcfTTM    1.706374
14              pbMRQ    2.119947
15               isST    0.000000
16           industry    0.000000
17          concept_0    0.087004
18          concept_1    0.000000
19          concept_2    0.198053
20         concept_-3    0.000000
21         concept_-2    0.000000
22         concept_-1    0.000000
23                 开盘    1.034097
24                 收盘    0.106624
25                 最高    0.871178
26                 最低    0.000000
27                涨跌幅    0.085364
28            

In [14]:
cat_model.save_model(f'model/concept.model')