In [8]:
from get_stock_data import Downloader, mkdir
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.preprocessing import LabelEncoder
import time

stockdata_path = './stockdata'
mode = "all"
train_path = './stockdata/m_train'
test_path = './stockdata/m_test'
is_download = False

def feature_engineer(train, test):
    train_len = len(train)
    data = pd.concat((train, test), sort=False).reset_index(drop=True)
    
    stock_industry = pd.read_csv("stock_industry.csv", encoding="gbk")
    from sklearn.preprocessing import LabelEncoder
    lbe = LabelEncoder()
    stock_industry['industry'] = lbe.fit_transform(stock_industry['industry'])
    data = pd.merge(data, stock_industry[['code', 'industry']], how='left', on='code')
    
    for name in tqdm(['close', 'volume', 'amount', 'turn', 'pctChg']):
        for day in ['7', '30']:
            rolling = data.groupby('code')[f'{name}'].rolling(window=int(day), center=False)
            data[f'{name}_rolling_{day}_mean'] = rolling.mean().reset_index(drop=True)
            data[f'{name}_rolling_{day}_max'] = rolling.max().reset_index(drop=True)
            data[f'{name}_rolling_{day}_min'] = rolling.min().reset_index(drop=True)
            data[f'{name}_rolling_{day}_sum'] = rolling.sum().reset_index(drop=True)
            data[f'{name}_rolling_{day}_median'] = rolling.median().reset_index(drop=True)
            data[f'{name}_rolling_{day}_skew'] = rolling.skew().reset_index(drop=True)
            data[f'{name}_rolling_{day}_kurt'] = rolling.kurt().reset_index(drop=True)
            data[f'{name}_rolling_{day}_std'] = rolling.std().reset_index(drop=True)
#             data[f'{name}_rolling_{day}_mad'] = rolling.mad()
#             data[f'{name}_rolling_{day}_autocorr1'] = rolling.autocorr(1)
#             data[f'{name}_rolling_{day}_autocorr2'] = rolling.autocorr(2)
    
            
    
    return data.iloc[:train_len].reset_index(drop=True), data.iloc[train_len:].reset_index(drop=True)

def download(is_download=False):
    if is_download:
        mkdir(train_path)
        downloader = Downloader(train_path, 
                                date_start='1990-12-19', 
                                date_end='2022-05-23', 
                                frequency='m',
                                mode=mode)
        downloader.run()

        mkdir(test_path)
        downloader = Downloader(test_path, 
                                date_start='2022-05-23', 
                                date_end='2022-07-23', 
                                frequency='m',
                                mode=mode)
        downloader.run()
    return
        
def df_concat(csv_path):
    df_arr = []
    for path in tqdm(os.listdir(csv_path)):
        try:
            temp = pd.read_csv(csv_path + '/' + path, engine='python')
            df_arr.append(temp)
        except:
            pass
    return pd.concat(df_arr)

In [7]:
download(is_download)

train = df_concat(train_path)
test = df_concat(test_path)

train, test = feature_engineer(train, test)

lbe = LabelEncoder()
train['code'] = lbe.fit_transform(train['code'])

lbe = LabelEncoder()
test['code'] = lbe.fit_transform(test['code'])

train['label'] = [0 if x>0 else 1 for x in (train.close - train.shift(-1).close)]
test['label'] = [0 if x>0 else 1 for x in (test.close - test.shift(-1).close)]
train.to_csv(stockdata_path + '/' + 'm_train.csv', index=False)
test.to_csv(stockdata_path + '/' + 'm_test.csv', index=False)

In [3]:
# 如果数据已经存在

In [5]:
train = pd.read_csv(stockdata_path + '/' + 'm_train.csv', engine='python', nrows=10000)

In [9]:
train["timestamp"] = train["date"].apply(lambda x: time.mktime(time.strptime(x,"%Y-%m-%d")))

In [10]:
date_threshold = time.mktime(time.strptime('2022-01-01',"%Y-%m-%d"))
train, test = train[train['timestamp']<=date_threshold], train[train['timestamp']>date_threshold]

In [12]:
ycol = 'label'
# feature_names = list(
#     filter(lambda x: x not in [ycol, 'date', '', 'Unnamed: 0', 'timestamp'], train.columns))
feature_names = ['code','open','high','low','close','volume','amount', 'pctChg', 'industry', 'turn']

In [26]:
params = {'n_estimators':5000,
      'learning_rate': 0.05,
      'max_depth': 7,
      'early_stopping_rounds':1000,
      'loss_function':'Logloss',
      'max_bin':512,
      'subsample':0.8,
      'random_seed':np.random.randint(0,2021),
      'eval_metric':'AUC',}


train_pool = cb.Pool(data=train[feature_names], label=train[ycol])
val_pool = cb.Pool(data=test[feature_names], label=test[ycol])
lgb_model = cb.train(params=params,
                     pool=train_pool,
                     eval_set=val_pool,
                     init_model='cb_month.model', # 如果init_model不为None，那么就是在init_model基础上接着训练
                     verbose=200)


df_importance = pd.DataFrame({
    'column': feature_names,
    'importance': lgb_model.feature_importances_,
})

# lgb_model.save_model(f'cb_{frequency}.model')

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

CatBoostError: catboost/libs/data/model_dataset_compatibility.cpp:81: At position 8 should be feature with name turn (found industry).

In [10]:
lgb_model.save_model(f'cb_month.model')

In [56]:
df_importance = df_importance.sort_values(by='importance')

In [57]:
df_importance.to_csv('df_importance.csv', index=False)