In [1]:
# Imports
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

#### Read data from CSV Files

In [2]:
# Build one frame holding the close price and volume of every pair,
# aligned on the shared `time` index.
main_df = pd.DataFrame()

# Pairs to load; each one is read from crypto_data/<pair>.csv
ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"]

for ratio in ratios:
    dataset = f'crypto_data/{ratio}.csv'

    # `names=` supplies the column labels, so pandas reads every line of the file as data.
    df = pd.read_csv(dataset,
                names=['time', 'low', 'high', 'open', 'close', 'volume']
                )

    # Prefix close/volume with the pair name so the columns stay distinct after the join,
    # index by timestamp, and keep only those two columns (open/high/low are not used).
    df = (
        df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
        .set_index("time")
        .loc[:, [f"{ratio}_close", f"{ratio}_volume"]]
    )

    # The first pair seeds the frame; later pairs are left-joined on the time index,
    # so timestamps missing from a later pair show up as NaN.
    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df)
print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968660    6489.549805        0.587100      96.580002        9.647200   
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   

            ETH-USD_close  ETH-USD_volume  BCH-USD_close  BCH-USD_volume  
time                                                                      
1528968660            NaN             NaN     871.719971        5.675361  
1528968720      486.01001       26.019083     870.859985       26.856577  
1528968780      486.00000        8.449400     870.099976        1.124300  
1528968840      485.75000       26.994646     870.789978        1.749862  
1528968900      4

We now have the sequential data, but we still need the targets (labels) for the model to learn from.

In [3]:
# Look at the last 60 minutes of data to predict LTC-USD 3 minutes ahead
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = "LTC-USD"

def classify(current, future):
    """Return 1 when `future` is strictly greater than `current`, otherwise 0.

    Both arguments are converted with float() before the comparison.
    1 is the "good" label the RNN is taught to recognise (price went up).
    """
    price_went_up = float(future) > float(current)
    return int(price_went_up)

In [4]:
# Add a `future` column: the close price of the pair we predict, taken
# FUTURE_PERIOD_PREDICT rows ahead (a negative shift pulls later rows up).
main_df["future"] = main_df.loc[:, f"{RATIO_TO_PREDICT}_close"].shift(periods=-FUTURE_PERIOD_PREDICT)

print(main_df.loc[:, [f"{RATIO_TO_PREDICT}_close", "future"]].head())

            LTC-USD_close     future
time                                
1528968660      96.580002  96.500000
1528968720      96.660004  96.389999
1528968780      96.570000  96.519997
1528968840      96.500000  96.440002
1528968900      96.389999  96.470001


In [5]:
# With the current and the future price side by side, classify() gives the label:
# target is 1 when the future price is higher than the current price, 0 otherwise.
#
# Rows whose current or future price is missing cannot be labelled. This includes the
# last FUTURE_PERIOD_PREDICT rows, where shift() left NaN in `future`. classify() would
# silently label them 0 (any comparison with NaN is False), so drop them first.
main_df = main_df.dropna(subset=[f"{RATIO_TO_PREDICT}_close", "future"]).copy()
main_df['target'] = list(map(classify, main_df[f"{RATIO_TO_PREDICT}_close"], main_df["future"]))

print(main_df[[f"{RATIO_TO_PREDICT}_close", "future", "target"]].head())

            LTC-USD_close     future  target
time                                        
1528968660      96.580002  96.500000       0
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1


#### Separate our training data from the validation data

In [6]:
# Hold out the most recent 5% of timestamps for validation; the earlier 95% is for training.
times = sorted(main_df.index.values)
# Timestamp at which the final 5% of the sorted index begins
last_5pct = times[-int(len(times) * 0.05)]

# Everything from the cutoff onwards is validation data, everything before it is training data.
validation_main_df = main_df.loc[main_df.index >= last_5pct]
main_df = main_df.loc[main_df.index < last_5pct]

### Create the function that will pre-process our dataset by normalizing the data and creating sequences

In [7]:
def preprocess_df(df):
    """Turn a price/volume frame into balanced, shuffled (sequence, label) samples.

    Steps:
      1. Drop the `future` column (it was only needed to derive `target`).
      2. Replace every feature column by its percent change and standardize it.
      3. Drop rows that contain missing values.
      4. Slide a window of SEQ_LEN consecutive rows over the frame; every full
         window becomes one sample, labelled with the `target` of its last row.
      5. Keep the same number of target == 1 ("buy") and target == 0 ("sell")
         samples, then shuffle.

    Args:
        df: DataFrame with a `future` column and a 0/1 `target` column. `target`
            must be the last column once `future` is removed, because the last
            value of every row is used as the label. The frame passed in is not
            modified.

    Returns:
        A tuple (x, y). x is a numpy array stacking the samples, each of shape
        (SEQ_LEN, n_features). y is a plain Python list of the matching labels;
        it is kept as a list so callers can use list methods such as y.count().
    """
    # Keeping `future` would let the model see the answer it is supposed to predict.
    df = df.drop(columns="future")

    for col in df.columns:
        # target is already 0/1 and must not be transformed
        if col == "target":
            continue

        # Percent change instead of raw prices puts assets that trade at very
        # different price levels on a comparable footing.
        pct = df[col].pct_change()
        # A zero previous value makes pct_change() return +/-inf, which scale()
        # rejects with an error; treat those entries as missing instead.
        pct = pct.replace([np.inf, -np.inf], np.nan)

        # Standardize to zero mean and unit variance (not a fixed [-1, 1] range).
        # NOTE(review): relies on scale() tolerating NaN (the first pct_change value
        # is always NaN), which scikit-learn documents for recent versions - confirm
        # for the installed version.
        df[col] = preprocessing.scale(pct.values)

    df = df.dropna()

    sequential_data = []
    # A deque with maxlen drops its oldest entry automatically, so it always holds
    # at most the latest SEQ_LEN rows.
    prev_minutes = deque(maxlen=SEQ_LEN)

    for row in df.values:
        # Features are every column except the last one (the target).
        prev_minutes.append(list(row[:-1]))
        # Once the window is full, store it together with the label of its newest row.
        if len(prev_minutes) == SEQ_LEN:
            sequential_data.append([np.array(prev_minutes), row[-1]])

    # Balance the classes so the model cannot score well by always predicting
    # the more frequent label. Samples with any other target value are discarded.
    buys = [sample for sample in sequential_data if sample[1] == 1]
    sells = [sample for sample in sequential_data if sample[1] == 0]

    # Shuffle before truncating so the kept subset is a random one.
    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))
    sequential_data = buys[:lower] + sells[:lower]
    # Mix the two classes; otherwise all buys would come before all sells.
    random.shuffle(sequential_data)

    # Split into model inputs (x) and labels (y).
    x = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(x), y
    

In [8]:
# Build the model inputs/labels for the training and the validation frames.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

# Report the sample counts and the class balance of both sets.
print(
    f"train data: {len(train_x)} validation: {len(validation_x)} ",
    f"Don't buys: {train_y.count(0)}, buys: {train_y.count(1)}",
    f"VALIDATION don't buys {validation_y.count(0)}, buys: {validation_y.count(1)}",
    sep="\n",
)

train data: 76380 validation: 3714 
Don't buys: 38190, buys: 38190
VALIDATION don't buys 1857, buys: 1857


## Time to build the model

In [9]:
# Training hyper-parameters
EPOCHS = 10
BATCH_SIZE = 64
# Run name: sequence length, prediction horizon and the current Unix time
NAME = "{}-SEQ-{}-PRED-{}".format(SEQ_LEN, FUTURE_PERIOD_PREDICT, int(time.time()))

In [None]:
# Checkpoints are written below models/, so make sure the directory exists.
os.makedirs("models", exist_ok=True)

model = Sequential()

# Three stacked LSTM layers, each followed by dropout and batch normalization.
# Only the first layer needs input_shape (one sample is SEQ_LEN x n_features);
# the first two return full sequences so the next LSTM receives one input per step.
model.add(LSTM(128, input_shape=train_x.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="tanh"))
model.add(Dropout(0.2))

# Two output units, one per class (0 = don't buy, 1 = buy)
model.add(Dense(2, activation="softmax"))

# NOTE(review): `decay` is accepted by the TF version this notebook was run with;
# newer Keras optimizers no longer take it - confirm before upgrading.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

# The metric is requested as 'acc' so that it is logged as 'acc' / 'val_acc',
# the key used by the checkpoint file name and monitor below.
model.compile(loss="sparse_categorical_crossentropy",
             optimizer=opt,
             metrics=['acc']
             )

tensorboard = TensorBoard(log_dir=f'logs/{NAME}')

# Keras fills in {epoch} and {val_acc} when a checkpoint is written, which makes every file name unique.
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"
# monitor/save_best_only/mode must be arguments of ModelCheckpoint itself (not of
# str.format) for the callback to keep only the checkpoints that improve val_acc.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# preprocess_df returns the labels as Python lists; pass them to fit() as numpy arrays.
history = model.fit(
    train_x, np.asarray(train_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, np.asarray(validation_y)),
    callbacks=[tensorboard, checkpoint]
)



Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 76380 samples, validate on 3714 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10