In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
sbn.set()

In [2]:
# Global hyper-parameters shared by the data-preparation and model cells.
timesteps = 32  # sequence length: data_reform reshapes rows into windows of this many steps
batch_size = 8  # fixed batch size; the stateful LSTM input layer is built with batch_input_shape=(batch_size, ...)
train_to_val_ratio = 7  # NOTE(review): not referenced by any visible cell; presumably meant for split_train_validate1 -- confirm
units = 32  # number of units in every LSTM layer
data_dim = 8  # features per time step (data_transform yields 8 feature columns plus the 'class' target)
div_class = [-0.5, 0.5]  # ascending thresholds, in percent change, used by range_to_class to bucket the next close
#div_class = [-0.5, -0.3, -0.1, 0.1, 0.3, 0.5]
num_classes = len(div_class) + 1  # k thresholds split the value range into k + 1 classes

In [3]:
def data_load():
    """Read the two daily index CSV files and return them as a single frame.

    Both files are read with GBK encoding and their ``date`` column is parsed
    as datetimes. The concatenated frame is indexed by ``(code, date)``.
    """
    base_dir = '../data/'
    csv_paths = [
        base_dir + '1day/index/2000_2009.csv',
        base_dir + '1day/index/2010_2016.csv',
    ]
    frames = [pd.read_csv(path, encoding='gbk', parse_dates=['date']) for path in csv_paths]
    return pd.concat(frames).set_index(['code', 'date'])

In [4]:
def split_data(data, timesteps, batch_size, query_str):
    """Filter ``data`` with ``query_str`` and trim it to whole batches.

    The rows matching ``query_str`` are truncated from the end so that the
    number of rows kept is the largest multiple of ``batch_size * timesteps``.

    Returns a copy of the kept rows, or ``None`` when fewer than
    ``batch_size * timesteps`` rows match.
    """
    selected = data.query(query_str)
    chunk = int(batch_size) * int(timesteps)
    whole_chunks = int(len(selected) / chunk)
    if whole_chunks < 1:
        return None

    keep = chunk * whole_chunks
    print("data set: {0}:{1}".format(0, keep))
    return selected.iloc[:keep, :].copy()

In [5]:
def split_train_validate1(data, timesteps, batch_size, train_to_val_ratio):
    """Split ``data`` by position into a leading train part and a trailing test part.

    One "unit" is ``batch_size * timesteps`` rows. The train part takes
    ``train_to_val_ratio`` units for every complete group of
    ``train_to_val_ratio + 1`` units in ``data``; the test part runs from the
    end of the train part to the last whole unit of ``data``.

    Returns ``(train, test)``, or ``(None, None)`` when ``data`` holds less
    than one complete group.
    """
    test_unit = batch_size * timesteps
    train_unit = test_unit * train_to_val_ratio
    group = train_unit + test_unit

    total = len(data)
    if total < group:
        return None, None

    train_stop = train_unit * int(total / group)
    print("train set: {0}:{1}".format(0, train_stop))
    train = data.iloc[:train_stop, :]

    # The test slice ends at the last full unit, dropping any remainder rows.
    test_stop = int(total / test_unit) * test_unit
    print("test set: {0}:{1}".format(train_stop, test_stop))
    test = data.iloc[train_stop:test_stop, :]
    return train, test

In [6]:
def data_clean(data):
    """Return a new frame holding only the OHLC columns, each divided by 10000.

    The input frame is not modified; every other column is dropped and the
    row index is carried over unchanged.
    """
    price_columns = ['open', 'high', 'low', 'close']
    scaled = data[price_columns] / 10000.0
    return scaled

In [7]:
# Candlestick direction (bullish / bearish)
def stick_type(x):
    """Return 1 when ``close`` is at or above ``open``, otherwise -1."""
    return 1 if x['close'] >= x['open'] else -1

# Upper shadow
def up_line(x):
    """Return ``high`` minus the larger of ``open`` and ``close``."""
    body_top = x[['open', 'close']].max()
    return x['high'] - body_top

# Lower shadow
def down_line(x):
    """Return the smaller of ``open`` and ``close`` minus ``low``."""
    body_bottom = x[['open', 'close']].min()
    return body_bottom - x['low']

# Body length (signed)
def body_size(x):
    """Return ``close`` minus ``open``; negative when the close is below the open."""
    open_value, close_value = x['open'], x['close']
    return close_value - open_value

def range_to_class(x, bounds=None):
    """One-hot encode ``x`` according to the interval of ``bounds`` it falls in.

    ``bounds`` is an ascending list of ``k`` thresholds that splits the value
    range into ``k + 1`` classes:

    * class 0:      ``x < bounds[0]``
    * class i:      ``bounds[i-1] <= x < bounds[i]``
    * class k:      ``x >= bounds[-1]``

    Parameters
    ----------
    x : number
        The value to classify.
    bounds : sequence of numbers, optional
        Ascending thresholds. When omitted, the module-level ``div_class``
        is used, so existing single-argument calls behave as before.

    Returns
    -------
    list of float
        A list of length ``len(bounds) + 1`` containing a single ``1.0`` at
        the class position and ``0.0`` elsewhere. For a NaN ``x`` no interval
        matches: the value and an error message are printed and an empty list
        is returned (same fallback as before).
    """
    from bisect import bisect_right

    if bounds is None:
        bounds = div_class

    # NaN is the only number that differs from itself; it belongs to no interval.
    if x != x:
        print(x)
        print("Error: n less than 0")
        return []

    # bisect_right counts the thresholds that are <= x, which is exactly the
    # zero-based class position for ascending bounds.
    hot = bisect_right(bounds, x)
    return [1.0 if position == hot else 0.0 for position in range(len(bounds) + 1)]

In [8]:
def data_transform(data):
    """Turn scaled OHLC prices into model features plus a one-hot target.

    Output columns, in order:

    * ``open``, ``high``, ``low``, ``close`` -- percentage change of each
      price relative to the previous row's close;
    * ``stick_type`` -- 1 when close >= open, otherwise -1;
    * ``up_line`` -- high minus max(open, close);
    * ``down_line`` -- min(open, close) minus low;
    * ``close_open`` -- close minus open;
    * ``class`` -- ``range_to_class`` applied to the NEXT row's ``close``.

    The derived columns are computed with whole-column operations; the values
    are the same as applying ``stick_type`` / ``up_line`` / ``down_line`` /
    ``body_size`` row by row, without the per-row Python overhead.

    No rows are dropped, so the row count (which callers have already trimmed
    to a multiple of ``batch_size * timesteps``) is preserved. Instead, missing
    values are filled with 0:

    * the first row has no previous close, so its features are all 0
      (with ``stick_type`` -1);
    * the last row has no next close, so its target is the class containing 0.
    """
    prev_close = data['close'].shift(1)

    new_data = pd.DataFrame()
    for column in ['open', 'high', 'low', 'close']:
        new_data[column] = (data[column] - prev_close) / prev_close

    # express the changes in percent
    new_data = new_data * 100.0

    # candlestick features, computed column-wise; NaN rows compare as False
    # (-> -1) and the row-wise max/min skip NaN like the Series versions do
    body = new_data[['open', 'close']]
    bullish = new_data['close'] >= new_data['open']
    upper_shadow = new_data['high'] - body.max(axis=1)
    lower_shadow = body.min(axis=1) - new_data['low']
    signed_body = new_data['close'] - new_data['open']

    new_data['stick_type'] = np.where(bullish, 1, -1)
    new_data['up_line'] = upper_shadow
    new_data['down_line'] = lower_shadow
    new_data['close_open'] = signed_body

    # target: the next row's percentage close, bucketed into a one-hot list
    new_data['class'] = new_data['close'].shift(-1)
    new_data = new_data.fillna(0)
    new_data['class'] = new_data['class'].map(range_to_class)

    return new_data

In [9]:
def data_reform(data, batch_size, timesteps):
    """Reshape a 2-D value array into ``(samples, timesteps, ...)`` tensors.

    ``data`` is a 2-D array whose last column holds one sequence (the class
    vector) per row and whose other columns are the features.

    Returns ``(X, Y)`` where ``X`` has shape ``(-1, timesteps, n_features)``
    and ``Y`` has shape ``(-1, timesteps, class_vector_length)``. Returns
    ``(None, None)`` when the row count is not a multiple of
    ``batch_size * timesteps``.
    """
    print("shape: {0}".format(data.shape))
    n_rows = len(data)
    block = int(batch_size) * int(timesteps)
    if n_rows % block != 0:
        print("data size not match, size: {0}, batch_size: {1}, timesteps: {2}".format(n_rows, batch_size, timesteps))
        return None, None

    features = data[:, :-1]
    targets = data[:, -1]

    X = features.reshape((-1, timesteps, features.shape[1]))

    # Each target cell holds a sequence; stack them into a 2-D array first.
    stacked = np.array(list(map(np.array, targets)))
    Y = stacked.reshape((-1, timesteps, stacked.shape[1]))

    print("X.shape: {0} Y.shape: {1}".format(X.shape, Y.shape))

    return X, Y

In [10]:
# data_all.query('date>"20150101"')

In [11]:
# data_all.query('code==990905').tail(600).head()

In [35]:
# Load the complete daily index history (both CSV files) indexed by (code, date).
data_all_raw = data_load()

In [36]:
# List every distinct index name present in the raw data.
data_all_raw.name.unique()

array(['沪深300', '上证380', '新综指', '180金融', '治理指数', '中型综指', '180治理', '沪公司债',
       '沪分离债', '180基建', '180资源', '180运输', '180成长', '180价值', '180R成长',
       '180R价值', '上证能源', '上证材料', '上证工业', '上证可选', '上证消费', '上证医药', '上证金融',
       '上证信息', '上证电信', '上证公用', '上证央企', '超大盘', '上证中盘', '上证小盘', '上证中小',
       '上证全指', '责任指数', '上证民企', '50等权', '180等权', '50基本', '180基本', '上证海外',
       '上证地企', '上证国企', '全指成长', '全指价值', '全R成长', '全R价值', '沪企债30', '上证沪企',
       '上证周期', '非周期', '上证龙头', '上证商品', '上证新兴', '上证资源', '消费80', '沪财中小',
       '资源50', '180分层', '上证上游', '上证中游', '上证下游', '高端装备', '上证F200', '上证F300',
       '上证F500', '沪投资品', '沪消费品', '380能源', '380材料', '380工业', '380可选',
       '380消费', '380医药', '380金融', '380信息', '380电信', '380公用', '380成长',
       '380价值', '380R成长', '380R价值', '医药主题', '消费50', '380基本', '180波动',
       '380波动', '上证100', '上证150', '上证银行', '180高贝', '180低贝', '380高贝',
       '380低贝', '上证转债', '优势资源', '优势制造', '优势消费', '消费领先', '180红利', '380红利',
       '上国红利', '上央红利', '上民红利', '市值百强', '资源80', '500沪市', '300波动', '500波

In [47]:
# All rows whose index name is '中证500' (CSI 500), regardless of code.
zz500 = data_all_raw[data_all_raw['name']=='中证500']

In [68]:
# Rows per calendar year ('A' = year-end frequency) after moving 'code' out of the index.
# NOTE(review): the displayed 2007-2009 counts are roughly double the per-code counts shown for
# code==990905 further down, so this name seemingly maps to more than one code in those years -- confirm.
zz500.reset_index(level=0)['name'].resample('A').count()

date
2007-12-31    452
2008-12-31    492
2009-12-31    441
2010-12-31    247
2011-12-31    244
2012-12-31    243
2013-12-31    238
2014-12-31    245
2015-12-31    244
2016-12-31    244
Freq: A-DEC, Name: name, dtype: int64

In [29]:
# Keep only the rows of five selected indices (CSI 300, CSI 500, SSE 50, SME index, ChiNext index).
names = ['沪深300', '中证500', '上证50', '中小板指', '创业板指']
data_all = data_all_raw[data_all_raw.name.isin(names)]

In [49]:
# Select every row of index code 990905 (copied so later edits do not touch data_all).
query_str = 'code=={0}'.format(990905)
data_slice = data_all.query(query_str).copy()

In [69]:
# Rows per calendar year for the single code selected above.
data_slice.reset_index(level=0)['name'].resample('A').count()

date
2007-12-31    226
2008-12-31    246
2009-12-31    244
2010-12-31    242
2011-12-31    244
2012-12-31    243
2013-12-31    238
2014-12-31    245
2015-12-31    244
2016-12-31    244
Freq: A-DEC, Name: name, dtype: int64

In [30]:
# test set
# Build the evaluation tensors (test_X, test_Y) from index code 990905.
idx=990905
query_str = 'code=={0}'.format(idx)
data_slice = data_all.query(query_str).copy()

# Rows dated after 2014-11-01, trimmed by split_data to a whole number of
# batch_size * timesteps rows.
# NOTE(review): the training cell filters date<"20150101", so the two date filters nominally
# overlap for Nov-Dec 2014; whether rows really overlap depends on split_data's trimming -- confirm.
query_str = 'date>"20141101"'
raw_test = split_data(data_slice, timesteps, batch_size, query_str)
# NOTE(review): split_data returns None when too few rows match; that case is not handled here.
test_c = data_clean(raw_test)
test_t = data_transform(test_c)
# Show the first and last transformed rows as a sanity check.
print(test_t.iloc[0,:])
print(test_t.iloc[-1,:])
test_X, test_Y = data_reform(test_t.values, batch_size, timesteps)
print(test_X.shape, test_Y.shape)

data set: 0:512
open                        0
high                        0
low                         0
close                       0
stick_type                 -1
up_line                     0
down_line                   0
close_open                  0
class         [1.0, 0.0, 0.0]
Name: (990905, 2014-11-03 00:00:00), dtype: object
open                -0.900462
high                0.0248797
low                  -1.07178
close               -0.536073
stick_type                  1
up_line              0.560953
down_line            0.171313
close_open           0.364389
class         [0.0, 1.0, 0.0]
Name: (990905, 2016-12-05 00:00:00), dtype: object
shape: (512, 9)
X.shape: (16, 32, 8) Y.shape: (16, 32, 3)
(16, 32, 8) (16, 32, 3)


In [31]:
# train sets
# Build one (X, Y) training pair per index code and collect them in train_dict,
# keyed by code. Codes without enough rows are skipped.
idxes = data_all.index.get_level_values('code').unique()
train_dict = {}
size = len(idxes)
for i in range(size):
    print("========================")
    print("dict length: {0}".format(len(train_dict)))
    idx = idxes[i]
    print('{0}/{1}, Idx: {2}'.format(i, size, idx))
    query_str = 'code=={0}'.format(idxes[i])
    data_slice = data_all.query(query_str).copy()
    # Only rows dated before 2015-01-01 are used for training.
    query_str = 'date<"20150101"'
    raw  = split_data(data_slice, timesteps, batch_size, query_str)
    if raw is None:
        # fewer than batch_size * timesteps matching rows for this code
        continue
    c = data_clean(raw)
    t = data_transform(c)
    # Show the first and last transformed rows as a sanity check.
    print(t.iloc[0,:])
    print(t.iloc[-1,:])
    X, Y = data_reform(t.values, batch_size, timesteps)
    if X is None or Y is None:
        # data_reform rejected the row count
        continue
    print(X.shape, Y.shape)
    train_dict[idx] = (X, Y)

dict length: 0
0/7, Idx: 300
data set: 0:2304
open                        0
high                        0
low                         0
close                       0
stick_type                 -1
up_line                     0
down_line                   0
close_open                  0
class         [1.0, 0.0, 0.0]
Name: (300, 2005-04-08 00:00:00), dtype: object
open                -0.156908
high                 0.527832
low                 -0.758061
close               -0.341515
stick_type                 -1
up_line               0.68474
down_line            0.416547
close_open          -0.184607
class         [0.0, 1.0, 0.0]
Name: (300, 2014-10-14 00:00:00), dtype: object
shape: (2304, 9)
X.shape: (72, 32, 8) Y.shape: (72, 32, 3)
(72, 32, 8) (72, 32, 3)
dict length: 1
1/7, Idx: 990905
data set: 0:1792
open                        0
high                        0
low                         0
close                       0
stick_type                 -1
up_line                     0
down_l

In [None]:
# Manual training loop: in every outer epoch the model is fitted for one Keras
# epoch on each index's training set in turn, then evaluated once on the test set.
# NOTE(review): `model` is built and compiled in cells that appear further down in the
# notebook, so this cell fails on a top-to-bottom run -- confirm the intended cell order.
all_loss_and_metrics = []
nb_epochs = 1000
for e in range(nb_epochs):
    print("epochs: {0}".format(e))
    for idx in train_dict:
        # print("epochs: {0}, idx: {1}".format(e, idx))
        train_X, train_Y = train_dict[idx]
        # print(train_X.shape, train_Y.shape, test_X.shape, test_Y.shape)

        # The LSTM layers are stateful: clear the carried state before starting a new index's series.
        model.reset_states()
        # history = model.fit(train_X, train_Y, epochs=100, batch_size=batch_size, validation_data=(test_X, test_Y), verbose=1, shuffle=False)
        # shuffle=False keeps the batches in their original order.
        model.fit(train_X, train_Y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
    # NOTE(review): states are not reset before evaluation, so the evaluation starts from the
    # state left by the last training set -- confirm this is intended.
    loss_and_metrics = model.evaluate(test_X, test_Y, batch_size=batch_size)
    all_loss_and_metrics.append(loss_and_metrics)
    print("loss_and_metrics: {0}".format(loss_and_metrics))


In [None]:
# Print the index codes for which a training set was built.
for k in train_dict:
    print(k)

### 分类问题

In [12]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

Using TensorFlow backend.


In [13]:
# Start an empty sequential model; the layers are added in the following cells.
model = Sequential()

In [14]:
# input layer
# Stateful LSTM returning the full sequence; a stateful layer needs the complete fixed
# batch shape, hence batch_input_shape=(batch_size, timesteps, data_dim).
# Options tried or considered (not active):
# activation='relu' 
# dropout=0.5
# kernel_initializer="uniform"
model.add(LSTM(units, stateful=True, return_sequences=True, batch_input_shape=(batch_size, timesteps, data_dim), kernel_initializer="uniform"))

In [15]:
# hidden layer
# Three more stateful LSTM layers, each with dropout=0.5 and each returning the full
# sequence so the next layer (and the final Dense) gets one vector per time step.
model.add(LSTM(units, return_sequences=True, stateful=True, dropout=0.5, kernel_initializer="uniform"))
model.add(LSTM(units, return_sequences=True, stateful=True, dropout=0.5, kernel_initializer="uniform"))
model.add(LSTM(units, return_sequences=True, stateful=True, dropout=0.5, kernel_initializer="uniform"))

In [16]:
#model.add(LSTM(units, activation='relu', dropout=0.5, return_sequences=True))
#model.add(LSTM(units, activation='relu', dropout=0.5, return_sequences=True))
#model.add(LSTM(units, activation='relu', dropout=0.5, return_sequences=True))
#model.add(LSTM(units, activation='relu', dropout=0.5, return_sequences=True))
#model.add(LSTM(units, activation='relu', dropout=0.5, return_sequences=True))

### 回归问题

In [17]:
# # output layer
# # model.add(Dense(no_classes, activation='softmax'))
# model.add(Dense(1, activation='sigmoid')) # or sigmoid?

In [18]:
# # model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer='RMSProp', loss='mse') #mse

### 分类问题

In [19]:
from keras.layers import LSTM, Dense, Dropout, BatchNormalization, Activation

In [20]:
# output layer
# Softmax over num_classes, applied at every time step (the summary shows output shape (8, 32, 3)).
model.add(Dense(num_classes, activation='softmax'))
# Alternative kept for reference: linear Dense -> BatchNormalization -> softmax.
# model.add(Dense(num_classes, kernel_initializer="uniform"))
# model.add(BatchNormalization())
# model.add(Activation('softmax'))

In [21]:
from keras.optimizers import SGD, Adam, RMSprop
# Multi-class setup: categorical cross-entropy against the one-hot targets, RMSprop(0.001), accuracy metric.
model.compile(optimizer=RMSprop(0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [22]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (8, 32, 32)               5248      
_________________________________________________________________
lstm_2 (LSTM)                (8, 32, 32)               8320      
_________________________________________________________________
lstm_3 (LSTM)                (8, 32, 32)               8320      
_________________________________________________________________
lstm_4 (LSTM)                (8, 32, 32)               8320      
_________________________________________________________________
dense_1 (Dense)              (8, 32, 3)                99        
Total params: 30,307
Trainable params: 30,307
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Single-call training with per-epoch validation on the test tensors; `history` feeds the plots below.
# NOTE(review): train_X / train_Y are only assigned inside the manual training-loop cell, so this
# trains on whichever index that loop handled last -- confirm which training set is intended.
history = model.fit(train_X, train_Y, epochs=100, batch_size=batch_size, validation_data=(test_X, test_Y), verbose=1, shuffle=False)

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
# Older Keras releases record the 'accuracy' metric under 'acc' / 'val_acc'; newer ones
# use the name passed to compile(), i.e. 'accuracy' / 'val_accuracy'. Pick whichever exists.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key], label='train')
plt.plot(history.history['val_' + acc_key], label='test')
plt.legend()
plt.show()

In [None]:
# Class probabilities for every time step of the test tensors.
predict_Y=model.predict(x=test_X,batch_size=batch_size)

In [None]:
# Inspect the prediction shape (the class axis is reduced with axis=2 in the next cell).
predict_Y.shape

In [None]:
# Most probable class id per time step (argmax over the last, class axis).
predict_value = np.argmax(predict_Y,axis=2)

In [None]:
# Inspect the shape of the predicted class ids.
predict_value.shape

In [None]:
# Take rows 1800-2399 of idx300n as the frame to compare predictions against.
# NOTE(review): idx300n is not defined in any visible cell -- presumably a transformed frame
# left over from an earlier version of the notebook; confirm or rebuild it here.
test_set = idx300n.iloc[1800:2400, :].copy()

In [None]:
# Preview the selected rows.
test_set.head()

In [None]:
def class_argmax(x):
    """Return the position of the largest entry of ``x`` (the hot index of a one-hot vector)."""
    hot_index = np.argmax(x)
    return hot_index

In [None]:
# Replace each one-hot class vector by its integer class id.
# NOTE(review): this overwrites the column in place, so running the cell a second time would
# apply argmax to the already-converted integers -- run it once per fresh test_set.
test_set['class']= test_set['class'].map(class_argmax)

In [None]:
# Preview the rows again, now with integer class ids.
test_set.head()

In [None]:
# Comparison frame: starts with the true class ids (and test_set's index).
cmp_predict = pd.DataFrame()
cmp_predict['class'] = test_set['class']

In [None]:
# Flatten the (samples, timesteps) predictions into one class id per row.
# NOTE(review): the flattened length must equal len(cmp_predict); the test tensors built above
# give 16 * 32 = 512 predictions while test_set is a slice of up to 600 rows -- confirm they match.
cmp_predict['predict_class'] = predict_value.reshape((-1, ))

In [None]:
# Attach the close value from idx300 for the same index labels.
# NOTE(review): idx300 is not defined in any visible cell -- presumably the untransformed
# price frame matching idx300n; confirm.
cmp_predict['close'] = idx300.loc[cmp_predict.index, 'close']

In [None]:
# Signed class error: 0 = correct, > 0 = predicted class too high, < 0 = too low.
cmp_predict['diff'] = cmp_predict['predict_class'] - cmp_predict['class']

In [None]:
# Preview the comparison frame.
cmp_predict.head()

In [None]:
# First reset: move the existing index labels into ordinary columns, leaving a 0..n-1 index.
cmp_predict=cmp_predict.reset_index()

In [None]:
# Second reset: not a duplicate -- it turns the fresh 0..n-1 index into a column, which the
# scatter plot below reads as x='index'.
# NOTE(review): that column is named 'index' only if the first reset did not already create an
# 'index' column (i.e. the original index levels were named) -- confirm.
cmp_predict = cmp_predict.reset_index()

In [None]:
# Preview the frame after both index resets.
cmp_predict.head()

In [None]:
def color(x):
    """Map a signed class error to a matplotlib colour code.

    Negative -> ``'g'``, positive -> ``'r'``, anything else (zero, or a value
    that is neither below nor above zero such as NaN) -> ``'b'``.
    """
    if x < 0:
        return 'g'
    if x > 0:
        return 'r'
    return 'b'
# Colour per row from the signed class error (green: too low, red: too high, blue: correct).
cmp_predict['color'] = cmp_predict['diff'].map(color)

In [None]:
# Close value by row position, coloured by prediction error.
cmp_predict.plot(kind='scatter', x='index', y='close', s=60, c=cmp_predict.color, figsize=(21, 7))

In [None]:
# Distribution of the first 600 'close' values of idx300n.
# NOTE(review): idx300n is not defined in any visible cell -- confirm its origin.
idx300n['close'].head(600).plot.hist( bins=30, figsize=(21, 7))

In [None]:
# Preview the last rows of the comparison frame.
cmp_predict.tail()

In [None]:
# Distribution of predicted class ids.
cmp_predict['predict_class'].plot.hist()

In [None]:
# Distribution of true class ids, for comparison with the predicted distribution.
cmp_predict['class'].plot.hist()

In [None]:
# Close of the following row (NaN for the last row).
cmp_predict['next_close'] = cmp_predict['close'].shift(-1)

In [None]:
# Change in close from this row to the next.
cmp_predict['profit'] = cmp_predict['next_close'] - cmp_predict['close']

In [None]:
# Preview the frame with the next_close / profit columns.
cmp_predict.head()

In [None]:
def predict_profit(x, up_class=2, down_class=0):
    """Return the profit earned by acting on the predicted class of row ``x``.

    Parameters
    ----------
    x : mapping
        A row providing ``'predict_class'`` and ``'profit'`` (the change in
        close from this row to the next).
    up_class : int, optional
        Class id treated as "price goes up" -- the position is long, so the
        row's profit is taken as is. Defaults to 2, the top class of the
        3-class setup.
    down_class : int, optional
        Class id treated as "price goes down" -- the position is short, so the
        sign of the row's profit is flipped. Defaults to 0.

    Returns
    -------
    number
        ``profit`` for ``up_class``, ``-profit`` for ``down_class`` and 0 for
        every other class (no position taken).
    """
    predicted = x['predict_class']
    if predicted == up_class:
        return x['profit']
    if predicted == down_class:
        return -x['profit']
    return 0
# Per-row profit of following the predictions (row-wise because it reads two columns).
cmp_predict['predict_profit'] = cmp_predict.apply(predict_profit, axis=1)

In [None]:
# Cumulative profit of following every prediction.
cmp_predict['predict_profit'].cumsum().plot(figsize=(21, 7))

In [None]:
# Inspect rows 395-399 (the last five of the first 400).
cmp_predict.head(400).tail()

In [None]:
# Preview the frame including predict_profit.
cmp_predict.head()

In [None]:
# Cumulative profit of the rows predicted as class 2 only (the long side in predict_profit).
cmp_predict[cmp_predict['predict_class']==2]['predict_profit'].cumsum().plot(figsize=(21, 7))

In [None]:
# Cumulative profit of the rows predicted as class 0 only (the short side in predict_profit).
cmp_predict[cmp_predict['predict_class']==0]['predict_profit'].cumsum().plot(figsize=(21, 7))