In [1]:
from get_stock_data import Downloader, mkdir
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import catboost as cb
import datetime

# Daily K-line (candlestick) data for all stocks: working directory and file paths.
# NOTE(review): `mkdir` is imported from the project-local get_stock_data module;
# presumably it creates the directory when it is missing — confirm.
mkdir('stockdata/d_data')
raw_train_path = 'stockdata/d_train'  # not referenced by the cells shown in this notebook
raw_test_path = 'stockdata/d_test'  # not referenced by the cells shown in this notebook
train_path = 'stockdata/d_data/train.csv'  # CSV read into `train` by the loading cell
test_path = 'stockdata/d_data/test.csv'  # CSV read into `test` by the loading cell
mode = 'train'  # 'debug' makes the loading cell read only the first 100000 rows per CSV

In [2]:
# Read both CSVs with the same row limit: the first 100000 rows in debug mode,
# the whole file otherwise (nrows=None is pandas' "read everything" default).
train, test = (
    pd.read_csv(csv_path, nrows=100000 if mode == 'debug' else None)
    for csv_path in (train_path, test_path)
)

In [3]:
# Turn dash-separated date strings into integers by stripping the dashes
# (the later cells compare `date` against values such as 20220301).
# Going through `astype(str)` keeps this cell re-runnable: once the column is
# already integer, the conversion is a no-op instead of raising on int.replace.
train['date'] = train['date'].astype(str).str.replace('-', '', regex=False).astype('int64')
test['date'] = test['date'].astype(str).str.replace('-', '', regex=False).astype('int64')

In [4]:
# Keep only training rows whose integer date is 20220301 or later, renumbering the index.
train = train.loc[train['date'].ge(20220301)].reset_index(drop=True)

In [6]:
# Display the distinct dates that remain in the training frame.
train['date'].unique()

array([20220301, 20220302, 20220303, 20220304, 20220307, 20220308,
       20220309, 20220310, 20220311, 20220314, 20220315, 20220316,
       20220317, 20220318, 20220321, 20220322, 20220323, 20220324,
       20220325, 20220328, 20220329, 20220330, 20220331, 20220401,
       20220406, 20220407, 20220408, 20220411, 20220412, 20220413,
       20220414, 20220415, 20220418, 20220419, 20220420, 20220421,
       20220422, 20220425, 20220426, 20220427, 20220428, 20220429,
       20220505, 20220506, 20220509, 20220510, 20220511, 20220512,
       20220513, 20220516, 20220517, 20220518, 20220519, 20220520,
       20220523, 20220524, 20220525, 20220526, 20220527, 20220530,
       20220531, 20220601])

In [5]:
def feature_engineer(train, test, split=20220501, length=30, horizon=14):
    """Build per-stock rolling-window features and a forward label, then split by date.

    The two input frames are stacked and ordered by ``code`` and ``date``.
    An ``industry`` column is added by left-merging a label-encoded copy of
    ``stock_industry.csv`` (read from the working directory, gbk-encoded) on
    ``code``. For each source column listed below, ``length`` columns named
    ``{name}_0`` .. ``{name}_{length-1}`` are appended; within one stock,
    ``{name}_{length-1}`` is the current row's value and lower suffixes are
    earlier rows, zero-padded where the stock has no earlier rows.

    The label is computed per stock as
    ``(close - close shifted back by `horizon` rows) / (close + 1e-7)``, i.e.
    ``(close_t - close_{t+horizon}) / close_t``. It is therefore positive when
    the close is lower ``horizon`` rows later. Rows without a value ``horizon``
    rows ahead are dropped; every remaining NaN in the frame is replaced by 0.

    Args:
        train: DataFrame with at least ``code``, ``date`` and the source columns.
        test: DataFrame with the same columns; it is stacked under ``train``.
        split: Date boundary. Rows with ``date <= split`` form the first
            returned frame, rows with ``date > split`` the second. The split
            ignores which input frame a row came from.
        length: Number of rows in each rolling window (default 30).
        horizon: Number of rows the label looks ahead (default 14).

    Returns:
        Tuple ``(train_part, test_part)`` of DataFrames with fresh integer indexes.

    Raises:
        ValueError: If ``length`` or ``horizon`` is smaller than 1.

    Note:
        Rows dated within ``horizon`` rows before ``split`` carry labels computed
        from closes dated after ``split``, so the first frame is not fully
        isolated from the second.
    """
    from sklearn.preprocessing import LabelEncoder

    if length < 1:
        raise ValueError(f'length must be >= 1, got {length}')
    if horizon < 1:
        raise ValueError(f'horizon must be >= 1, got {horizon}')

    data = (
        pd.concat((train, test), sort=False)
        .sort_values(by=['code', 'date'])
        .reset_index(drop=True)
    )

    stock_industry = pd.read_csv("stock_industry.csv", encoding="gbk")
    stock_industry['industry'] = LabelEncoder().fit_transform(stock_industry['industry'])
    data = pd.merge(data, stock_industry[['code', 'industry']], how='left', on='code')
    # The window rows below are produced positionally (group by group, row by
    # row) and joined column-wise, so `data` must carry a plain 0..n-1 index in
    # exactly this order. Make that explicit instead of relying on the merge.
    data = data.reset_index(drop=True)

    # alpha net: fixed-length look-back window for every source column
    window_sources = ['open', 'high', 'low', 'close', 'volume', 'amount', 'adjustflag',
                      'turn', 'pctChg', 'peTTM', 'psTTM', 'pcfNcfTTM', 'pbMRQ']
    padding = [0] * (length - 1)
    window_frames = []
    for name in tqdm(window_sources):
        roll_feature = []
        # sort=False keeps groups in order of first appearance, which matches
        # the row order of `data` because it is already sorted by code.
        for _, group in data.groupby('code', sort=False)[name]:
            padded = padding + group.tolist()
            # extend() appends in place; `list = list + [...]` would copy the
            # whole accumulated list once per stock.
            roll_feature.extend(padded[start:start + length] for start in range(len(group)))
        window_frames.append(
            pd.DataFrame(roll_feature, columns=[f'{name}_{i}' for i in range(length)])
        )
    # One column-wise concat instead of re-copying the growing frame per feature.
    data = pd.concat([data, *window_frames], axis=1).reset_index(drop=True)

    # generate label
    data['label'] = data.groupby('code')['close'].transform(
        lambda x: (x - x.shift(-horizon)) / (x + 1e-7)
    )
    data = data.dropna(subset=['label'])
    data = data.replace(np.nan, 0)

    on_or_before_split = data['date'] <= split
    return (
        data[on_or_before_split].reset_index(drop=True),
        data[~on_or_before_split].reset_index(drop=True),
    )

In [6]:
# Replace the raw frames with the engineered frames returned by feature_engineer
# (rows up to the split date in `train`, later rows in `test`).
# NOTE: this cell overwrites its own inputs; re-running it without re-running the
# loading cells first feeds the already-engineered frames back into the function.
train, test = feature_engineer(train, test)

# Optional, currently disabled: write the engineered frames to CSV.
# f_train_path = 'stockdata/d_data/f_train_debug.csv'
# f_test_path = 'stockdata/d_data/f_test_debug.csv'
# train.to_csv(f_train_path, index=False)
# test.to_csv(f_test_path, index=False)

100%|██████████| 12/12 [00:40<00:00,  3.37s/it]


In [7]:


# Target column, and every other column except the identifiers as model inputs.
ycol = 'label'
feature_names = [
    column for column in train.columns
    if column not in (ycol, 'code', 'date', '')
]

# print(feature_names)

In [8]:
# 30th and 70th percentiles of the training label; label_quantile uses them as class cut-offs.
quantile_30, quantile_70 = train['label'].quantile([0.3, 0.7]).to_numpy()

In [9]:
def label_quantile(x):
    """Map a continuous label to a class index using the module-level cut-offs.

    Returns 0 when ``x`` is below ``quantile_30``, 1 when it is below
    ``quantile_70``, and 2 otherwise. A value that compares False against both
    cut-offs (such as NaN) therefore lands in class 2.
    """
    if not x < quantile_30:
        return 1 if x < quantile_70 else 2
    return 0

In [10]:
# Turn the continuous labels into class indices 0/1/2. Both frames use the
# cut-offs derived from the training labels.
# NOTE: re-running this cell would push the class indices through the cut-offs again.
train['label'] = train['label'].map(label_quantile)
test['label'] = test['label'].map(label_quantile)

In [11]:
# Fixed seed so that a re-run reproduces the same model. The seed used to be
# drawn from np.random.randint on every execution, which made the training log
# and the saved model impossible to reproduce.
RANDOM_SEED = 42

params = {
    'n_estimators': 5000,
    'learning_rate': 0.05,
    'max_depth': 7,
    # Stop once the eval metric has not improved for this many iterations.
    'early_stopping_rounds': 1000,
    # Three classes, matching the 0/1/2 output of label_quantile.
    'loss_function': 'MultiClass',
    'classes_count': 3,
    'max_bin': 512,
    # 'subsample': 0.8,
    # 'bootstrap_type': 'Poisson',
    'random_seed': RANDOM_SEED,
}

model = cb.CatBoostClassifier(eval_metric="AUC", task_type='CPU', **params)

X_train = train[feature_names]
Y_train = train[ycol]

# The post-split frame is the evaluation set that drives early stopping, so the
# reported best score is not an independent hold-out estimate.
X_val = test[feature_names]
Y_val = test[ycol]

cat_model = model.fit(X_train,
                      Y_train,
                      eval_set=(X_val, Y_val),
                      plot=False,
                      verbose=500)

# One row per input column with the importance reported by the fitted model.
df_importance = pd.DataFrame({
    'column': feature_names,
    'importance': cat_model.feature_importances_,
})

# cat_model.save_model(f'cb_{frequency}.model')

0:	test: 0.5061205	best: 0.5061205 (0)	total: 282ms	remaining: 23m 29s
500:	test: 0.5279494	best: 0.5474646 (18)	total: 1m 48s	remaining: 16m 14s
1000:	test: 0.5234501	best: 0.5474646 (18)	total: 3m 36s	remaining: 14m 26s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.5474646431
bestIteration = 18

Shrink model to first 19 iterations.


In [12]:
# Print the feature-importance table built in the training cell (pandas truncates the middle rows).
print(df_importance)

       column  importance
0        open    0.000000
1        high    0.000000
2         low    0.973224
3       close    0.685092
4    preclose    0.000000
..        ...         ...
372  pbMRQ_25    1.467207
373  pbMRQ_26    0.000000
374  pbMRQ_27    1.753499
375  pbMRQ_28    1.217345
376  pbMRQ_29    3.572795

[377 rows x 2 columns]


In [None]:
# Write the fitted classifier to the working directory. The name is a plain
# literal: the previous f-string had no placeholders.
cat_model.save_model('next_2week_alphanet30.model')