In [1]:
from get_stock_data import Downloader, mkdir
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import catboost as cb
import datetime
import functools
from collections import defaultdict, Counter
from sklearn import preprocessing

from utils import reduce_mem, StockDNN

import torch
from torchmetrics import AUROC, Recall, Accuracy, F1Score

# 获取全部股票的日K线数据
mkdir('stockdata/d_data')
raw_train_path = 'stockdata/d_train'
raw_test_path = 'stockdata/d_test'
train_path = 'stockdata/d_data/train.csv'
test_path = 'stockdata/d_data/test.csv'
industry_path = 'stockdata/stock_industry.csv'
mode = 'debug'


concept_path = 'stockdata/concept_df.csv'
concept_hist_path = 'stockdata/concept_hist_df.csv'

In [2]:
if mode == 'debug':
    train = pd.read_csv(train_path, nrows=100000)
    test = pd.read_csv(test_path, nrows=100000)
else:
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

In [3]:
train.date = train.date.apply(lambda x: int(x.replace('-', '')))
test.date = test.date.apply(lambda x: int(x.replace('-', '')))

train.code = train.code.apply(lambda x:x[3:])
test.code = test.code.apply(lambda x:x[3:])

In [4]:
train = train[train['date'] >= 20210101].reset_index(drop=True)

In [5]:
def concept_feature(data):
    print('Begin concept feature')
    concept_df = pd.read_csv(concept_path)[['代码', '板块名称']]
    concept_hist_df = pd.read_csv(concept_hist_path)
    concept_df['代码'] = concept_df['代码'].apply(lambda x:str(x).zfill(6))
    
    concept_hist_df['日期'] = concept_hist_df['日期'].apply(lambda x:int(x.replace('-', '')))
    
    concept_counter = Counter([c for c in concept_df['板块名称'].values if '昨日' not in c])

    concept_dic = defaultdict(list)
    for code, concept in concept_df.values:
        if '昨日' in concept:continue
        concept_dic[code].append(concept)

    def compare_concept(x, y):
        x, y = concept_counter[x], concept_counter[y]
        if x < y:return -1
        if x > y: return 1
        return 0
    for k, v in concept_dic.items():
        concept_dic[k] = sorted(v, key=functools.cmp_to_key(compare_concept))

    data['concept_0'] = data.code.apply(lambda x:concept_dic[x][0] if len(concept_dic[x])>0 else np.nan)
    data['concept_1'] = data.code.apply(lambda x:concept_dic[x][1] if len(concept_dic[x])>1 else np.nan)
    data['concept_2'] = data.code.apply(lambda x:concept_dic[x][2] if len(concept_dic[x])>2 else np.nan)

    data['concept_-3'] = data.code.apply(lambda x:concept_dic[x][-3] if len(concept_dic[x])>3 else np.nan)
    data['concept_-2'] = data.code.apply(lambda x:concept_dic[x][-2] if len(concept_dic[x])>4 else np.nan)
    data['concept_-1'] = data.code.apply(lambda x:concept_dic[x][-1] if len(concept_dic[x])>5 else np.nan)
    
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_0'], right_on=['日期', '板块名称'], how='left')
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_1'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_1'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_2'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_2'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-3'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-3'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-2'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-2'))
    data = pd.merge(data, concept_hist_df, left_on=['date', 'concept_-1'], right_on=['日期', '板块名称'], how='left', suffixes=(None, '_-1'))
    
    
    # label encoder
    concept_labelencoder = {c:i for i, c in enumerate(np.unique(concept_df['板块名称'].values))}
    concept_labelencoder.update({np.nan:np.nan})
    data['concept_0'] = data['concept_0'].apply(lambda x:concept_labelencoder[x])
    data['concept_1'] = data['concept_1'].apply(lambda x:concept_labelencoder[x])
    data['concept_2'] = data['concept_2'].apply(lambda x:concept_labelencoder[x])
    
    data['concept_-3'] = data['concept_-3'].apply(lambda x:concept_labelencoder[x])
    data['concept_-2'] = data['concept_-2'].apply(lambda x:concept_labelencoder[x])
    data['concept_-1'] = data['concept_-1'].apply(lambda x:concept_labelencoder[x])
    return data

def feature_engineer(train, test, split=20220501):
    train_len = len(train)
    data = pd.concat((train, test), sort=False).reset_index(drop=True)
    data = data.sort_values(by=['code', 'date'])
    
    stock_industry = pd.read_csv(industry_path, encoding="gbk")
    from sklearn.preprocessing import LabelEncoder
    lbe = LabelEncoder()
    stock_industry['industry'] = lbe.fit_transform(stock_industry['industry'])
    data = pd.merge(data, stock_industry[['code', 'industry']], how='left', on='code')
    
    # concept feature
    data = concept_feature(data)
    data = reduce_mem(data, list(data))

    # alpha net 
    length = 30
    alpha_list = ['open', 'high', 'low', 'close', 'volume', 'amount', 'adjustflag', 'turn', 'pctChg', 'peTTM', 'psTTM', 'pcfNcfTTM', 'pbMRQ']
    alpha_list += [f'{x}{i}' for x in ['收盘', '换手率', '成交额'] for i in ['', '_1', '_2', '_-3', '_-2', '_-1']]
    for name in tqdm(alpha_list):
#     for name in tqdm(['open']):
        roll_feature = []
        for i, group in data.groupby('code', sort=False)[name]:
            values = group.tolist()
            values = [0]*(length - 1) + values
            roll_feature = roll_feature + [values[i:i+length] for i in range(len(group))]
        roll_columns = [f'{name}_dt{i}' for i in range(length)]
        data = pd.concat([data, pd.DataFrame(roll_feature, columns=roll_columns)], axis=1).reset_index(drop=True)
        data = reduce_mem(data, roll_columns)
    
    # generate label
    data['label'] = data.groupby('code', sort=False).close.transform(lambda x:(x.shift(-14) - x) / (x + 1e-7) )
    data = data.dropna(subset = ['label'], inplace=False)
    data = data.replace(np.nan, 0)
    return data[data['date'] <= split].reset_index(drop=True), data[data['date'] > split].reset_index(drop=True)

def minmax_scaler(train, test, feature_names):
    for name in feature_names:
#         if 'concept' in name:continue
        max_value = train[name].max()
        min_value = train[name].min()
        train[name] = (train[name] - min_value)/(1e-7 + max_value - min_value)
        test[name] = (test[name] - min_value)/(1e-7 + max_value - min_value)
    return train, test

In [6]:
train, test = feature_engineer(train, test)

ycol = 'label'
feature_names = list(
    filter(lambda x: x not in [ycol, 'code', 'date', ''] and '日期' not in x and '板块名称' not in x, train.columns))

# print(feature_names)

train, test = minmax_scaler(train, test, feature_names)

# f_train_path = 'stockdata/d_data/f_train_debug.csv'
# f_test_path = 'stockdata/d_data/f_test_debug.csv'
# train.to_csv(f_train_path, index=False)
# test.to_csv(f_test_path, index=False)

Begin concept feature


  0%|          | 0/31 [00:00<?, ?it/s]

83.58 Mb, 31.55 Mb (62.24 %)


  3%|▎         | 1/31 [00:03<01:39,  3.31s/it]

56.29 Mb, 37.10 Mb (34.09 %)


  6%|▋         | 2/31 [00:06<01:36,  3.31s/it]

62.68 Mb, 43.49 Mb (30.61 %)


 10%|▉         | 3/31 [00:10<01:34,  3.38s/it]

69.08 Mb, 49.89 Mb (27.78 %)


 13%|█▎        | 4/31 [00:13<01:31,  3.39s/it]

75.47 Mb, 56.29 Mb (25.42 %)


 16%|█▌        | 5/31 [00:16<01:28,  3.40s/it]

81.87 Mb, 69.08 Mb (15.62 %)


 19%|█▉        | 6/31 [00:20<01:24,  3.38s/it]

94.66 Mb, 81.87 Mb (13.51 %)


 23%|██▎       | 7/31 [00:23<01:22,  3.44s/it]

107.46 Mb, 85.07 Mb (20.83 %)


 26%|██▌       | 8/31 [00:27<01:19,  3.44s/it]

110.65 Mb, 91.47 Mb (17.34 %)


 29%|██▉       | 9/31 [00:30<01:15,  3.44s/it]

117.05 Mb, 97.86 Mb (16.39 %)


 32%|███▏      | 10/31 [00:34<01:12,  3.45s/it]

123.45 Mb, 108.95 Mb (11.74 %)


 35%|███▌      | 11/31 [00:37<01:08,  3.45s/it]

134.53 Mb, 120.03 Mb (10.78 %)


 39%|███▊      | 12/31 [00:41<01:05,  3.46s/it]

145.62 Mb, 131.12 Mb (9.96 %)


 42%|████▏     | 13/31 [00:44<01:02,  3.49s/it]

156.71 Mb, 137.52 Mb (12.24 %)


 45%|████▌     | 14/31 [00:48<00:59,  3.51s/it]

163.10 Mb, 143.91 Mb (11.76 %)


 48%|████▊     | 15/31 [00:51<00:56,  3.52s/it]

169.50 Mb, 150.31 Mb (11.32 %)


 52%|█████▏    | 16/31 [00:55<00:52,  3.53s/it]

175.89 Mb, 156.71 Mb (10.91 %)


 55%|█████▍    | 17/31 [00:58<00:49,  3.55s/it]

182.29 Mb, 163.10 Mb (10.53 %)


 58%|█████▊    | 18/31 [01:02<00:46,  3.57s/it]

188.69 Mb, 169.50 Mb (10.17 %)


 61%|██████▏   | 19/31 [01:06<00:43,  3.59s/it]

195.08 Mb, 175.89 Mb (9.84 %)


 65%|██████▍   | 20/31 [01:09<00:39,  3.63s/it]

201.48 Mb, 182.29 Mb (9.52 %)


 68%|██████▊   | 21/31 [01:13<00:36,  3.65s/it]

207.88 Mb, 188.69 Mb (9.23 %)


 71%|███████   | 22/31 [01:17<00:33,  3.71s/it]

214.27 Mb, 195.08 Mb (8.96 %)


 74%|███████▍  | 23/31 [01:21<00:29,  3.71s/it]

220.67 Mb, 201.48 Mb (8.70 %)


 77%|███████▋  | 24/31 [01:25<00:26,  3.76s/it]

227.06 Mb, 207.88 Mb (8.45 %)


 81%|████████  | 25/31 [01:28<00:22,  3.75s/it]

233.46 Mb, 214.27 Mb (8.22 %)


 84%|████████▍ | 26/31 [01:32<00:18,  3.77s/it]

239.86 Mb, 227.06 Mb (5.33 %)


 87%|████████▋ | 27/31 [01:36<00:15,  3.76s/it]

252.65 Mb, 239.86 Mb (5.06 %)


 90%|█████████ | 28/31 [01:40<00:11,  3.78s/it]

265.44 Mb, 252.65 Mb (4.82 %)


 94%|█████████▎| 29/31 [01:43<00:07,  3.79s/it]

278.23 Mb, 265.44 Mb (4.60 %)


 97%|█████████▋| 30/31 [01:47<00:03,  3.83s/it]

291.03 Mb, 278.23 Mb (4.40 %)


100%|██████████| 31/31 [01:51<00:00,  3.61s/it]

303.82 Mb, 291.03 Mb (4.21 %)





In [7]:
quantile_30, quantile_70 = train.label.quantile([0.3, 0.7]).values
print('quantile_30:', quantile_30)
print('quantile_70:', quantile_70)

quantile_30: -0.048828125
quantile_70: 0.03344726562499997


In [8]:
def label_quantile(x):
    if x<quantile_30:
        return 0
    elif x<quantile_70:
        return 1
    else:
        return 2

In [9]:
train.label = train.label.apply(label_quantile)
test.label = test.label.apply(label_quantile)

In [10]:
def mlp_train(train, test, feature_names, ycol):
    epochs = 20
    batch_size = 512
    input_dim = len(feature_names)
    
#     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    device = torch.device('cpu')
    model = StockDNN(input_dim=input_dim, output_dim=3).to(device)
    optimizer = torch.optim.Adam(filter(lambda p:p.requires_grad, model.parameters()), lr=3e-4)
    print(model)
    
    X_train = torch.Tensor(train[feature_names].to_numpy())
    Y_train = torch.Tensor(train[ycol].to_numpy()).long()
    X_val = torch.Tensor(test[feature_names].to_numpy())
    Y_val = torch.Tensor(test[ycol].to_numpy()).long()
    n_samples = len(X_train)
    iterations = n_samples // batch_size
    idx = np.arange(n_samples)
    
    
    for _ in range(epochs):
        for i in range(0, n_samples, batch_size):
            batch_idx = idx[i:i+batch_size]
            batch_data = X_train[batch_idx]
            batch_target = Y_train[batch_idx]

            optimizer.zero_grad()
            loss = model.compute_loss(batch_data.to(device), batch_target.to(device))
            loss.backward()
            optimizer.step()
                

        np.random.shuffle(idx)
        torch.save(model, 'model/concept_mlp.pt')
        
        # metrics
        preds = model(X_val)
        auroc = AUROC(num_classes=3)
        accuracy = Accuracy(nums_classes=3)
        f1 = F1Score(num_classes=3, threshold=0.5)
        recall = Recall(num_classes=3, threshold=0.5)
        print('AUROC:', auroc(preds, Y_val), ' | ', 'Accuracy:', accuracy(preds, Y_val), '|', 'Recall:', recall(preds, Y_val), '|', 'F1Score:', f1(preds, Y_val))

In [11]:
mlp_train(train, test, feature_names, ycol)

StockDNN(
  (loss_func): CrossEntropyLoss()
  (mlp): Sequential(
    (0): Linear(in_features=1013, out_features=1024, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): LeakyReLU(negative_slope=0.2)
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): Dropout(p=0.5, inplace=False)
    (5): LeakyReLU(negative_slope=0.2)
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): Dropout(p=0.5, inplace=False)
    (8): LeakyReLU(negative_slope=0.2)
    (9): Linear(in_features=256, out_features=3, bias=True)
  )
)




AUROC: tensor(0.5066)  |  Accuracy: tensor(0.4138) | Recall: tensor(0.4138) | F1Score: tensor(0.4138)
AUROC: tensor(0.5217)  |  Accuracy: tensor(0.4097) | Recall: tensor(0.4097) | F1Score: tensor(0.4097)
AUROC: tensor(0.5300)  |  Accuracy: tensor(0.3924) | Recall: tensor(0.3924) | F1Score: tensor(0.3924)
AUROC: tensor(0.5246)  |  Accuracy: tensor(0.2956) | Recall: tensor(0.2956) | F1Score: tensor(0.2956)
AUROC: tensor(0.5208)  |  Accuracy: tensor(0.2269) | Recall: tensor(0.2269) | F1Score: tensor(0.2269)
AUROC: tensor(0.5216)  |  Accuracy: tensor(0.2351) | Recall: tensor(0.2351) | F1Score: tensor(0.2351)
AUROC: tensor(0.5221)  |  Accuracy: tensor(0.2228) | Recall: tensor(0.2228) | F1Score: tensor(0.2228)
AUROC: tensor(0.5254)  |  Accuracy: tensor(0.2264) | Recall: tensor(0.2264) | F1Score: tensor(0.2264)
AUROC: tensor(0.5257)  |  Accuracy: tensor(0.2478) | Recall: tensor(0.2478) | F1Score: tensor(0.2478)
AUROC: tensor(0.5261)  |  Accuracy: tensor(0.2582) | Recall: tensor(0.2582) | F1Sc