In [1]:
import warnings
# Hide DeprecationWarning messages for everything that runs after this cell.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
import pandas as pd
from collections import deque
import random
import numpy as np

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint, ModelCheckpoint
import time
from sklearn import preprocessing
import h5py

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# From https://www.tensorflow.org/guide/gpu
# Print how many GPU devices TensorFlow reports on this machine.

import tensorflow as tf
print(
    "Num GPUs Available: ",
    len(tf.config.experimental.list_physical_devices('GPU')),
)

# Disabled: switch on memory growth for every detected GPU.
# gpu_devices = tf.config.experimental.list_physical_devices("GPU")
# for device in gpu_devices:
#     tf.config.experimental.set_memory_growth(device, True)

Num GPUs Available:  0


In [4]:
# Run configuration: everything a reader might want to tune lives here.
SEQ_LEN = 180  # number of preceding rows collected into each input sequence for the RNN
FUTURE_PERIOD_PREDICT = 5  # how many rows ahead the "future" close price is taken from
RATIO_TO_PREDICT = "ada_rnn"  # dataset whose close price the model is trained to predict
EPOCHS = 10  # how many passes through our data
BATCH_SIZE = 128  # samples per batch (not the number of batches). Try a smaller batch if you're getting OOM (out of memory) errors.
# Run identifier (settings + start timestamp); used for the TensorBoard log directory and the saved model path.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [13]:
def classify(current, future):
    """Label a price move.

    Both arguments are converted with ``float()``. Returns 1 when ``future``
    is strictly greater than ``current`` and 0 otherwise (including when the
    comparison is False because ``future`` is NaN).
    """
    went_up = float(future) > float(current)
    return 1 if went_up else 0


def preprocess_df(df):
    """Turn a labelled price/volume frame into balanced, shuffled RNN samples.

    Steps:
      1. Drop the helper "future" column.
      2. Replace every column except "target" with its percent change and
         standardize it with ``preprocessing.scale``.
      3. Slide a window of ``SEQ_LEN`` rows over the frame; every full window
         becomes one sample labelled with the "target" of its last row.
      4. Keep an equal number of buy (target == 1) and sell (target == 0)
         samples by truncating the larger group, then shuffle.

    Args:
        df: DataFrame with the feature columns plus "future" and "target".
            "target" must be the last column once "future" is removed,
            because the last value of each row is used as the label.
            The frame passed in is not modified (``drop`` returns a new one).

    Returns:
        ``(X, y)`` where ``X`` is a numpy array of the sequences and ``y`` is
        a plain list with the matching targets.
    """
    # Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
    df = df.drop(columns="future")

    for col in df.columns:
        if col == "target":
            continue
        # Percent change puts differently priced assets on a comparable footing.
        # A previous value of 0 produces +/-inf, which preprocessing.scale rejects,
        # so those entries are turned into NaN and dropped with the others.
        df[col] = df[col].pct_change().replace([np.inf, -np.inf], np.nan)
        df.dropna(inplace=True)  # remove the NaN row(s) created by pct_change
        df[col] = preprocessing.scale(df[col].values)  # standardize: zero mean, unit variance

    df.dropna(inplace=True)  # final cleanup in case anything is still missing

    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)  # sliding window; the oldest row falls out automatically
    for row in df.values:
        prev_days.append(list(row[:-1]))  # features only; the last value is the target
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), row[-1]])
    random.shuffle(sequential_data)

    # Split by class so both classes can be cut to the same size.
    buys = [[seq, target] for seq, target in sequential_data if target == 1]
    sells = [[seq, target] for seq, target in sequential_data if target == 0]

    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))  # size of the smaller class

    sequential_data = buys[:lower] + sells[:lower]
    random.shuffle(sequential_data)  # otherwise all buys would precede all sells

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(X), y


# Build one frame with the close price and volume of every dataset, aligned on "time",
# then add the "future" price and the 0/1 "target" label for RATIO_TO_PREDICT.
main_df = pd.DataFrame()  # begin empty

ratios = ["ada_rnn", "btc_rnn", "dot_rnn", "eth_rnn"]
for ratio in ratios:

    ratio = ratio.split('.csv')[0]  # strip a ".csv" suffix if a name was given with one
    print(ratio)
    dataset = f'autoscraper/data/{ratio}.csv'
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])

    # rename volume and close to include the ticker so we can still tell which close/volume is which:
    df = df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})

    df = df.set_index("time")
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # keep only the two columns we use

    if len(main_df) == 0:
        main_df = df  # first dataset becomes the base frame
    else:
        main_df = main_df.join(df)

main_df = main_df.ffill()  # carry the last known value forward over gaps
main_df = main_df.dropna()  # rows that are still incomplete cannot be used
# drop_duplicates() returns a new frame; the original call discarded it and so did nothing.
main_df = main_df.drop_duplicates()

close_col = f'{RATIO_TO_PREDICT}_close'
# The last FUTURE_PERIOD_PREDICT rows have no future price (NaN). Drop them before
# labelling, otherwise classify() would silently label them 0.
main_df = main_df.assign(future=main_df[close_col].shift(-FUTURE_PERIOD_PREDICT))
main_df = main_df.dropna(subset=['future'])
main_df = main_df.assign(target=list(map(classify, main_df[close_col], main_df['future'])))

ada_rnn


In [16]:
%%time
# Generate test and train datasets
times = sorted(main_df.index.values)
last_5pct = sorted(main_df.index.values)[-int(0.15*len(times))]

validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]

x_train, y_train = preprocess_df(main_df)
x_test, y_test = preprocess_df(validation_main_df)

print(f"train data: {len(x_train)} validation: {len(x_test)}")
print(f"Dont buys: {y_train.count(0)}, buys: {y_train.count(1)}")
print(f"VALIDATION Dont buys: {y_test.count(0)}, buys: {y_test.count(1)}")



train data: 848 validation: 22
Dont buys: 424, buys: 424
VALIDATION Dont buys: 11, buys: 11
Wall time: 313 ms


In [18]:
#Build model
# Three stacked bidirectional LSTM blocks (each followed by dropout and batch
# normalization), a small dense layer, and a 2-way softmax output.
model = Sequential()
# input_shape goes on the Bidirectional wrapper (the layer Sequential actually receives).
# NOTE(review): when it is given to the wrapped LSTM instead, the model appears to stay
# unbuilt until fit() is called — confirm against the Keras Bidirectional docs.
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=x_train.shape[1:]))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# Last recurrent block returns only its final output, ready for the dense layers.
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))  # one output per class (0 / 1)


# `learning_rate` is the current argument name; `lr` is a deprecated alias.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

In [19]:
# Output location for a saved model. The extension selects the format:
# '.h5', or an empty string for the tf format.
import os

format_ext = '.h5'  # '.h5' or empty for tf format
model_path = os.path.join('out', 'mnist-classifier' + format_ext)

In [20]:
%%time
# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

Wall time: 0 ns


In [21]:
# TensorBoard logs go to logs/<NAME>; os.path.join picks the right separator
# on both Windows and Linux, so no manual backslash/slash switch is needed.
tensorboard = TensorBoard(log_dir=os.path.join("logs", NAME))

# Checkpoint file name: includes the epoch and the validation accuracy for that epoch.
# NOTE(review): 'val_acc' is the key name this notebook was run with; newer tf.keras
# versions may report 'val_accuracy' instead — confirm when upgrading TensorFlow.
filepath = "RNN_Trained-{epoch:02d}-{val_acc:.3f}"
os.makedirs("models", exist_ok=True)  # make sure the checkpoint directory exists
# The options below must be arguments of ModelCheckpoint. The original passed them to
# str.format(), where they were ignored, so every epoch was saved instead of only the best.
checkpoint = ModelCheckpoint(
    os.path.join("models", "{}.model".format(filepath)),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)


In [22]:
# Converted labels to arrays before calling model.fit()
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

In [23]:
%%time
# Train model
history = model.fit(
    x_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    callbacks=[tensorboard, checkpoint]
)
# Score model
score = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))


Train on 848 samples, validate on 22 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.6212663650512695
Test accuracy: 0.6363636
Wall time: 25min 41s


In [25]:
# Write the model to model.h5 and its JSON description (model.to_json()) to model.json.
model.save('model.h5')
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)