In [24]:
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np # linear algebra
import pandas as pd
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
# tqdm.pandas()

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, TimeDistributed
from tensorflow.keras.models import Model, Sequential
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import layers, optimizers
from tensorflow.keras.callbacks import EarlyStopping



%matplotlib inline
import matplotlib.pyplot as plt

#multiprocess use
import workers_agg
from multiprocessing import Pool, TimeoutError, Lock, Value, current_process, cpu_count
import pickle
import math

In [2]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))


In [193]:
print(workers_agg.__file__)

D:\Strategic_Trading\VolumeForecast\workers_agg.py


In [60]:
x = pd.read_csv("D:\\Strategic_Trading\\VolumeForecast\\data\\2330\\LSTM_Train_new.csv", converters = {"ts": str})
x['ts'] = pd.to_datetime(x['ts'])

In [61]:
index = pd.read_csv("D:\\Strategic_Trading\\index\\min_agg_all.csv", converters={"ts":str})
index["ts"] = pd.to_datetime(index["ts"])

In [62]:
x = pd.merge(x, index, on="ts", how="left")

In [43]:
x[["open_index", "close_index", "high_index", "low_index", "vol_index", "return_index"]] = x[["open_index", "close_index", "high_index", "low_index", "vol_index", "return_index"]].astype(np.float32)

In [35]:
x.to_csv("D:\\Strategic_Trading\\VolumeForecast\\data\\2330\\LSTM_Train_float32.csv", index=False)

In [4]:
first_price = x.groupby(x['ts'].dt.date, as_index = False).head(1)
for _, v in tqdm(first_price.iterrows(), total=first_price.shape[0]):
    x.loc[x['ts'].dt.date==(v['ts'].date()),'first_price'] = v['open']

HBox(children=(IntProgress(value=0, max=1229), HTML(value='')))




In [7]:
train_date_idx = int(len(x['ts'].dt.date.unique())*0.6)
val_date_idx = int(len(x['ts'].dt.date.unique())*0.8)
d1 = x['ts'].dt.date.unique()[train_date_idx]
d2 = x['ts'].dt.date.unique()[val_date_idx]

In [8]:
split_train_val_idx = x.loc[(x['ts'].dt.date==d1)].index[-1]
split_val_test_idx = x.loc[(x['ts'].dt.date==d2)].index[-1]

train_df = x.iloc[:split_train_val_idx+1]
val_df = x.iloc[split_train_val_idx+1:split_val_test_idx+1]
test_df = x.iloc[split_val_test_idx+1:]

In [9]:
test_df.head()

Unnamed: 0,ts,open,high,low,close,vol,VWAP,close_lag,high_lag,low_lag,...,date,time,time_index,sin_time,open_ETF,high_ETF,low_ETF,close_ETF,vol_ETF,return_ETF
261724,2017-07-04 09:00:00,208.0,208.5,208.0,208.0,296000.0,208.01859,209.0,209.0,209.0,...,2017-07-04,09:00:00,1,0.0,10412.79,10412.79,10397.93,10402.39,90298,-10.4
261725,2017-07-04 09:01:00,208.0,208.5,208.0,208.0,12000.0,208.25,208.0,208.5,208.0,...,2017-07-04,09:01:00,2,0.011855,10402.36,10409.63,10400.75,10409.63,49254,7.24
261726,2017-07-04 09:02:00,208.0,208.5,208.0,208.0,42000.0,208.16667,208.0,208.5,208.0,...,2017-07-04,09:02:00,3,0.023708,10402.08,10411.44,10402.08,10405.02,54233,-4.61
261727,2017-07-04 09:03:00,208.0,208.0,207.5,207.5,249000.0,207.94377,208.0,208.5,208.0,...,2017-07-04,09:03:00,4,0.035558,10407.82,10413.92,10400.68,10400.68,58483,-4.34
261728,2017-07-04 09:04:00,207.5,208.0,207.5,208.0,13000.0,207.84616,207.5,208.0,207.5,...,2017-07-04,09:04:00,5,0.047402,10404.97,10410.39,10401.97,10408.11,72304,7.43


In [10]:
val_df.head()

Unnamed: 0,ts,open,high,low,close,vol,VWAP,close_lag,high_lag,low_lag,...,date,time,time_index,sin_time,open_ETF,high_ETF,low_ETF,close_ETF,vol_ETF,return_ETF
196288,2016-06-30 09:00:00,161.0,161.0,160.5,160.5,1482000.0,160.9919,159.0,159.0,159.0,...,2016-06-30,09:00:00,1,0.0,8586.56,8634.57,8586.56,8631.2,121199,44.64
196289,2016-06-30 09:01:00,160.5,161.0,160.5,160.5,143000.0,160.56644,160.5,161.0,160.5,...,2016-06-30,09:01:00,2,0.011855,8630.61,8633.74,8625.2,8627.29,55423,-3.91
196290,2016-06-30 09:02:00,160.5,160.5,160.5,160.5,25000.0,160.5,160.5,161.0,160.5,...,2016-06-30,09:02:00,3,0.023708,8624.6,8629.29,8622.83,8622.9,63108,-4.39
196291,2016-06-30 09:03:00,161.0,161.0,160.5,160.5,132000.0,160.55682,160.5,160.5,160.5,...,2016-06-30,09:03:00,4,0.035558,8622.46,8626.76,8619.18,8619.45,70485,-3.45
196292,2016-06-30 09:04:00,160.5,160.5,160.5,160.5,220000.0,160.5,160.5,161.0,160.5,...,2016-06-30,09:04:00,5,0.047402,8618.21,8618.21,8614.0,8614.0,77708,-5.45


In [11]:
train_df_list = [group[1] for group in train_df.groupby(train_df['ts'].dt.date)]
val_df_list = [group[1] for group in val_df.groupby(val_df['ts'].dt.date)]
test_df_list = [group[1] for group in test_df.groupby(test_df['ts'].dt.date)]

In [11]:
train_list = []
if __name__ == '__main__':
    with Pool(processes=12) as pool:
        for i, x in enumerate(tqdm(pool.imap_unordered(workers_agg.create_dataset, train_df_list), total=len(train_df_list)), 1):
            for xx in x:
                train_list.append(xx.tolist())


HBox(children=(IntProgress(value=0, max=738), HTML(value='')))




In [15]:
val_list = []
if __name__ == '__main__':
    with Pool(processes=12) as pool:
        for i, x in enumerate(tqdm(pool.imap_unordered(workers_agg.create_dataset, val_df_list), total=len(val_df_list)), 1):
            for xx in x:
                val_list.append(xx.tolist())


HBox(children=(IntProgress(value=0, max=246), HTML(value='')))




In [16]:
test_list = []
if __name__ == '__main__':
    with Pool(processes=12) as pool:
        for i, x in enumerate(tqdm(pool.imap_unordered(workers_agg.create_dataset, test_df_list), total=len(test_df_list)), 1):
            for xx in x:
                test_list.append(xx.tolist())


HBox(children=(IntProgress(value=0, max=245), HTML(value='')))




In [13]:
look_back = 30
shift = 1

In [18]:
save_list = [train_list,val_list,test_list]

with open(f'agg_shift_{shift}_lookback_{look_back}_withETF_fp32', 'wb') as fp:
    pickle.dump(save_list, fp)

In [14]:
with open (f'agg_shift_{shift}_lookback_{look_back}_withETF_fp32', 'rb') as fp:
    load_list = pickle.load(fp)

In [15]:
X_train = load_list[0]
X_val = load_list[1]
X_test = load_list[2]

In [16]:
np.array(X_train).shape

(174886, 30, 28)

In [17]:
len(X_train)+len(X_val)+len(X_test)

291253

In [18]:
y_train = []
for x in train_df_list:
    y_train.append(x['y'].values[look_back-1::shift])
y_train = [str(item) for sublist in y_train for item in sublist]

y_val = []
for x in val_df_list:
    y_val.append(x['y'].values[look_back-1::shift])
y_val = [str(item) for sublist in y_val for item in sublist]

y_test = []
for x in test_df_list:
    y_test.append(x['y'].values[look_back-1::shift])
y_test = [str(item) for sublist in y_test for item in sublist]

In [19]:
list(y_train).count("up")/len(y_train)

0.1803860800750203

In [20]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)
y_train_int, y_val_int, y_test_int = label_encoder.fit_transform(y_train), label_encoder.fit_transform(y_val), label_encoder.fit_transform(y_test)
y_train_int, y_val_int, y_test_int = y_train_int.reshape(len(y_train_int), 1), y_val_int.reshape(len(y_val_int), 1), y_test_int.reshape(len(y_test_int), 1)
y_train_final, y_val_final, y_test_final = onehot_encoder.fit_transform(y_train_int), onehot_encoder.fit_transform(y_val_int), onehot_encoder.fit_transform(y_test_int)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [21]:
(len(X_train)==len(y_train_final)) and (len(X_val)==len(y_val_final)) and (len(X_test)==len(y_test_final))

True

In [100]:
print(tf.__version__)

1.14.0


In [26]:
import random

In [39]:
param_grid = {
    'l1_drop': list(np.linspace(0, 0.4)),
    'l2_drop': list(np.linspace(0, 0.4)),
    'l1_out': [75, 100, 125],
    'l2_out': [75, 100, 125],
    'batch_size': [128, 256, 512],
    "epochs": [10, 20, 30],
    "patience": [2,3,4,5]
}

In [40]:
# function to optimize mnist model
def Objective(X_train, Y_train, X_val, Y_val, X_test, Y_test, hyperparameters, iteration):
    model = Sequential()
    model.add(LSTM(
                name='lstm_0',
                units=hyperparameters["l1_out"],
                return_sequences=True,
                stateful=False,
                dropout=hyperparameters["l1_drop"],
                recurrent_dropout=hyperparameters["l1_drop"],
                activation='relu'))
    model.add(LSTM(
                name='lstm_3',
                units=hyperparameters["l2_out"],
                return_sequences=False,
                stateful=False,
                dropout=hyperparameters["l2_drop"],
                recurrent_dropout=hyperparameters["l2_drop"],
                activation='relu'))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                    optimizer=optimizers.Nadam(lr=0.001),
                    metrics=['accuracy'])
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=hyperparameters["patience"], verbose=1)
    model.fit(X_train, Y_train, validation_data=(X_val, Y_val), batch_size = hyperparameters["batch_size"], epochs=hyperparameters["epochs"], verbose=2, callbacks=[early_stopping])
    
    score = model.evaluate(X_test, Y_test, batch_size=4096)
    print("Loss:\t{0} \t Accuracy:\t{1}".format(score[0], score[1]))
    
    return [score, hyperparameters, iteration]


In [22]:
score, params, iteration = Objective(np.array(X_train), y_train_final, np.array(X_val), y_val_final, random_params, 1)

W0911 13:31:14.860780 10268 deprecation.py:506] From C:\Users\011553\AppData\Local\Continuum\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Train on 169219 samples, validate on 56309 samples


W0911 13:31:15.641968 10268 deprecation.py:323] From C:\Users\011553\AppData\Local\Continuum\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 00008: early stopping
LOSS:	0.979281980170073 	 ACCURACY:	0.6811522245407104


In [41]:
def random_search_optimal(data, param_grid, max_evals):
    """Random search for hyperparameter optimization"""
    
    optimal = [0,0]
    
    # Keep searching until reach max evaluations
    for i in tqdm(range(max_evals), total=max_evals):
        
        # Choose random hyperparameters
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}

        # Evaluate randomly selected hyperparameters
        eval_results = Objective(np.array(data[0]), data[1], np.array(data[2]), data[3], np.array(data[4]), data[5], hyperparameters)
        
        if eval_results[0] > optimal[0]:
            optimal[0] = eval_results[0]
            optimal[1] = eval_results[1]
        else:
            continue
    
    return optimal 

In [None]:
def random_search_all(data, param_grid, max_evals):
    """Random search for hyperparameter optimization"""
    
    results = pd.DataFrame(columns = ['score', 'params', 'iteration'], index = list(range(max_evals)))
    
    # Keep searching until reach max evaluations
    for i in tqdm(range(max_evals), total=max_evals):
        
        # Choose random hyperparameters
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}

        # Evaluate randomly selected hyperparameters
        eval_results = Objective(np.array(data[0]), data[1], np.array(data[2]), data[3], np.array(data[4]), data[5], hyperparameters)
        
        results.loc[i, :] = eval_results
        
    # Sort with best score on top
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    
    return optimal 

In [42]:
data = [X_train, y_train_final, X_val, y_val_final, X_test, y_test_final]

In [43]:
result = random_search(data, param_grid, 20)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Train on 169219 samples, validate on 56309 samples
Epoch 1/10
169219/169219 - 101s - loss: 221.1593 - acc: 0.6268 - val_loss: 0.9198 - val_acc: 0.6280
Epoch 2/10
169219/169219 - 100s - loss: 2.6456 - acc: 0.6301 - val_loss: 0.9200 - val_acc: 0.6280
Epoch 3/10
169219/169219 - 99s - loss: 1.5067 - acc: 0.6303 - val_loss: 0.9181 - val_acc: 0.6280
Epoch 4/10
169219/169219 - 99s - loss: 2.3098 - acc: 0.6303 - val_loss: 0.9186 - val_acc: 0.6280
Epoch 5/10
169219/169219 - 99s - loss: 0.9890 - acc: 0.6302 - val_loss: 0.9182 - val_acc: 0.6280
Epoch 6/10
169219/169219 - 100s - loss: 0.9615 - acc: 0.6304 - val_loss: 0.9187 - val_acc: 0.6280
Epoch 7/10
169219/169219 - 100s - loss: 117.6788 - acc: 0.6272 - val_loss: 0.9194 - val_acc: 0.6280
Epoch 00007: early stopping
LOSS:	0.9193597874827465 	 ACCURACY:	0.6279635429382324
Train on 169219 samples, validate on 56309 samples
Epoch 1/20
169219/169219 - 100s - loss: 0.8022 - acc: 0.6759 - val_loss: 0.8064 - val_acc: 0.6635
Epoch 2/20
169219/169219 - 99

In [44]:
display(result)

Unnamed: 0,index,score,params,iteration
0,15,"[1.3204682296690333, 0.67335594]","{'l1_drop': 0.05714285714285715, 'l2_drop': 0....",15
1,18,"[1.0803821782505707, 0.6859472]","{'l1_drop': 0.0326530612244898, 'l2_drop': 0.3...",18
2,0,"[0.9193597874827465, 0.62796354]","{'l1_drop': 0.2122448979591837, 'l2_drop': 0.0...",0
3,10,"[0.9177259302981446, 0.62796354]","{'l1_drop': 0.08979591836734695, 'l2_drop': 0....",10
4,12,"[0.8334968795047507, 0.6860715]","{'l1_drop': 0.1469387755102041, 'l2_drop': 0.0...",12
5,5,"[0.8014551081970988, 0.6794118]","{'l1_drop': 0.04081632653061225, 'l2_drop': 0....",5
6,17,"[0.799786336652641, 0.68719035]","{'l1_drop': 0.1469387755102041, 'l2_drop': 0.1...",17
7,8,"[0.7970577607591552, 0.68596494]","{'l1_drop': 0.1959183673469388, 'l2_drop': 0.0...",8
8,14,"[0.7951639830059472, 0.6866931]","{'l1_drop': 0.13877551020408163, 'l2_drop': 0....",14
9,4,"[0.7895905229043909, 0.6826085]","{'l1_drop': 0.0, 'l2_drop': 0.4, 'l1_out': 75,...",4


In [66]:
model = Sequential()
model.add(LSTM(
            name='lstm_0',
            units=125,
            return_sequences=True,
            stateful=False,
            dropout=0.3673469387755102,
            recurrent_dropout=0.3673469387755102,
            activation='relu'))
model.add(LSTM(
            name='lstm_3',
            units=100,
            return_sequences=False,
            stateful=False,
            dropout=0.1469387755102041,
            recurrent_dropout=0.1469387755102041,
            activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',
                optimizer=optimizers.Nadam(lr=0.001),
                metrics=['accuracy'])
    
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
model.fit(np.array(X_train), y_train_final, validation_data=(np.array(X_val), y_val_final), batch_size = 256, epochs=10, verbose=2, callbacks=[early_stopping])
    


Train on 174886 samples, validate on 58302 samples
Epoch 1/10
174886/174886 - 58s - loss: 0.7566 - acc: 0.7088 - val_loss: 0.8165 - val_acc: 0.6671
Epoch 2/10
174886/174886 - 56s - loss: 0.6676 - acc: 0.7603 - val_loss: 0.8398 - val_acc: 0.6691
Epoch 3/10
174886/174886 - 56s - loss: 0.6606 - acc: 0.7654 - val_loss: 0.8158 - val_acc: 0.6687
Epoch 4/10
174886/174886 - 56s - loss: 0.6582 - acc: 0.7667 - val_loss: 0.8326 - val_acc: 0.6689
Epoch 5/10
174886/174886 - 56s - loss: 0.6558 - acc: 0.7677 - val_loss: 0.8227 - val_acc: 0.6691
Epoch 6/10
174886/174886 - 56s - loss: 0.6547 - acc: 0.7682 - val_loss: 0.8171 - val_acc: 0.6690
Epoch 7/10
174886/174886 - 58s - loss: 0.6535 - acc: 0.7689 - val_loss: 0.8046 - val_acc: 0.6687
Epoch 8/10
174886/174886 - 57s - loss: 33118.3527 - acc: 0.6612 - val_loss: 7681.5383 - val_acc: 0.4596
Epoch 9/10
174886/174886 - 57s - loss: 95466.6958 - acc: 0.4649 - val_loss: 233.2163 - val_acc: 0.3534
Epoch 10/10
174886/174886 - 57s - loss: 20313.4214 - acc: 0.497

<tensorflow.python.keras.callbacks.History at 0x1c4633f9da0>

In [31]:
import time

In [100]:
score = model.evaluate(np.array(X_test),np.array(y_test_final),batch_size=4096)



In [101]:
score

[0.765579505864116, 0.71244293]

In [67]:
tStart = time.time()
prediction = new_model.predict(np.array(X_test),batch_size=4096)
tEnd = time.time()
print("It cost %f sec" % (tEnd - tStart))

NameError: name 'time' is not defined

In [33]:
r = prediction.tolist()
y_pred = []
for i, item in tqdm(enumerate(r), total=len(r)):
    y_pred.append(item.index(max(item)))

y_true = [item for sublist in y_test_int for item in sublist]

HBox(children=(IntProgress(value=0, max=58065), HTML(value='')))




In [34]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.58      0.57      0.58     10626
     class 1       0.81      0.82      0.81     36772
     class 2       0.58      0.58      0.58     10667

    accuracy                           0.73     58065
   macro avg       0.66      0.66      0.66     58065
weighted avg       0.73      0.73      0.73     58065



In [108]:
model.save('LSTM_optimized.h5')

In [27]:
new_model = tf.keras.models.load_model('LSTM_optimized.h5')

W0917 10:46:38.319078 11160 deprecation.py:506] From C:\Users\011553\AppData\Local\Continuum\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0917 10:46:38.321050 11160 deprecation.py:506] From C:\Users\011553\AppData\Local\Continuum\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0917 10:46:38.323070 11160 deprecation.py:506] From C:\Users\011553\AppData\Local\Continuum\anaconda3\envs\tf-gpu\lib\site-packages\tensorflow\pyth

In [28]:
new_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_0 (LSTM)                multiple                  77000     
_________________________________________________________________
lstm_3 (LSTM)                multiple                  90400     
_________________________________________________________________
dense_2 (Dense)              multiple                  303       
Total params: 167,703
Trainable params: 167,703
Non-trainable params: 0
_________________________________________________________________


In [29]:
score = new_model.evaluate(np.array(X_test),np.array(y_test_final),batch_size=4096)



In [30]:
score

[0.7256923373135795, 0.7279256]