In [None]:
# IMPORTING LIBRARIES
from ast import main
import os
from collections import deque
import pandas as pd
from sklearn import preprocessing
from collections import deque
import numpy as np
import random
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization 
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [None]:
# INITIALIZING CONSTANTS
# Run configuration shared by the cells below.
# NOTE(review): the original comments treat one row as one minute of data - confirm against the CSVs.
SEQ_LEN = 60 # number of consecutive rows in each input sequence (window length used in preprocessDf)
FUTURE_PERIOD_PREDICT = 3 # how many rows ahead the "future" close price is taken from when building targets
RATIO_TO_PREDICT = "BCH-USD" # pair whose close price is used to build the targets
EPOCHS = 10 # number of epochs passed to model.fit
BATCH_SIZE = 64 # batch size passed to model.fit
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}" # run name for the TensorBoard log directory; the timestamp keeps runs distinct

In [None]:
# CREATING TARGETS (LABELS) FOR DATA
def classify(current, future):
    """Return 1 when ``future`` is strictly greater than ``current``, else 0.

    Both values are converted with ``float()`` before the comparison, so
    numeric strings are accepted. A NaN on either side compares as False
    and therefore yields 0.
    """
    priceWentUp = float(future) > float(current)
    return 1 if priceWentUp else 0

In [None]:
# PREPROCESSING DATA
def preprocessDf(df):
    """Turn a price/volume frame into balanced, shuffled (sequence, label) samples.

    Steps:
      1. Drop the "future" helper column; it is only needed to build
         "targets" and would leak the answer if left among the features.
      2. Replace every feature column by its percent change and standardize it.
      3. Slide a window of SEQ_LEN rows over the frame; every full window
         becomes one sample labelled with the "targets" value of its last row.
      4. Balance the two classes by truncating the larger one, then shuffle.

    Args:
        df: DataFrame containing a "future" column and a "targets" column.
            "targets" must be the last column once "future" is removed,
            because the label is read from the last position of each row.

    Returns:
        A tuple ``(x, y)``: ``x`` is a NumPy array of the stacked windows and
        ``y`` is a plain list holding the label of each window.
    """
    # Keyword form: the positional axis argument (`df.drop("future", 1)`) is
    # rejected by pandas >= 2.0. drop() returns a new frame, so the in-place
    # operations below never modify the caller's DataFrame.
    df = df.drop(columns = "future")

    for col in df.columns:
        if col != "targets":
            df[col] = df[col].pct_change() # relative change instead of absolute price/volume
            df.dropna(inplace = True) # pct_change leaves NaN in the first row
            df[col] = preprocessing.scale(df[col].values) # standardize: zero mean, unit variance (not a 0..1 range)

    df.dropna(inplace = True) # safety net for any NaN still left after scaling

    sequentialData = [] # [window, label] pairs
    prevDays = deque(maxlen = SEQ_LEN) # rolling window; the oldest row falls out automatically

    for row in df.values:
        prevDays.append(list(row[:-1])) # every column except the trailing "targets"
        if len(prevDays) == SEQ_LEN:
            sequentialData.append([np.array(prevDays), row[-1]]) # label of the window's last row

    # All shuffling goes through `random`, so seeding that one generator is
    # enough to make this function reproducible.
    random.shuffle(sequentialData)

    # Split by class so that both classes can be cut to the same size.
    buys = [[seq, target] for seq, target in sequentialData if target == 1]
    sells = [[seq, target] for seq, target in sequentialData if target == 0]

    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells)) # size of the smaller class

    sequentialData = buys[:lower] + sells[:lower]

    random.shuffle(sequentialData) # avoid one long run of buys followed by sells

    # split into features and labels
    x = [seq for seq, _ in sequentialData]
    y = [target for _, target in sequentialData]

    return np.array(x), y

In [None]:
# COLLECTING DATA
mainDf = pd.DataFrame() # accumulates the close/volume columns of every ratio

ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"] # one CSV file per ratio

for ratio in ratios:
    dataset = f'crypto_data/{ratio}.csv'

    # Column names are supplied here, so the first line of each CSV is read as data.
    df = pd.read_csv(dataset, names=["time", "low", "high", "open", "close", "volume"])
    # Prefix the kept columns with the ratio so they stay unique after joining.
    df.rename(columns = {"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace = True)

    df.set_index("time", inplace = True) # rows of different ratios are aligned on this index
    df = df[[f"{ratio}_close", f"{ratio}_volume"]] # keep only close and volume

    if len(mainDf) == 0: # first ratio: nothing to join with yet
        mainDf = df
    else:
        mainDf = mainDf.join(df) # left join on the index of the first ratio

# Close price FUTURE_PERIOD_PREDICT rows ahead of each row.
mainDf['future'] = mainDf[f"{RATIO_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)
mainDf['targets'] = list(map(classify, mainDf[f"{RATIO_TO_PREDICT}_close"], mainDf["future"])) # 1 = price rises, 0 = it does not

# Rows without a known future price (the last FUTURE_PERIOD_PREDICT rows, plus
# any row whose future close is missing) were labelled 0 by classify() because
# a comparison with NaN is False. Their real label is unknown, so remove them
# instead of training/validating on made-up labels.
mainDf = mainDf[mainDf['future'].notna()]

# print(mainDf[[f"{RATIO_TO_PREDICT}_close", "future", "targets"]].head(10)) # check if targets are assigned properly

In [None]:
# CREATE TRAINING AND TESTING DATASET
times = sorted(mainDf.index.values) # ensure data is sorted by time
# Hold out the most recent 5% of the timestamps. max(1, ...) keeps the index
# away from -0, which would select the first timestamp and leave no training data.
holdoutCount = max(1, int(0.05 * len(times)))
last5Pc = times[-holdoutCount] # first timestamp that belongs to the validation set

validationMainDf = mainDf[(mainDf.index >= last5Pc)] # validation data: time >= threshold timestamp
# Use a separate name instead of overwriting mainDf: re-running this cell then
# always splits the full data set rather than an already shrunken one.
trainMainDf = mainDf[(mainDf.index < last5Pc)] # training data: time < threshold timestamp

# get data sets
train_x, train_y = preprocessDf(trainMainDf) # training set (features, labels)
validation_x, validation_y = preprocessDf(validationMainDf) # validation set (features, labels)

# Sanity checks: sample counts and class balance (labels are still plain lists here).
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)} buys: {train_y.count(1)}")
print(f"VALIDATION dont buys: {validation_y.count(0)} buys: {validation_y.count(1)}")

# Keras expects arrays, so convert the label lists (and make sure the features are arrays too).
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
validation_x = np.asarray(validation_x)
validation_y = np.asarray(validation_y)

In [None]:
# BUILDING MODEL
# Three stacked LSTM blocks (each followed by dropout and batch normalization),
# then a small dense head ending in a 2-way softmax.
model = Sequential([
    # first two LSTM layers return the full sequence so the next LSTM can consume it
    LSTM(128, input_shape = (train_x.shape[1:]), return_sequences = True),
    Dropout(0.2),
    BatchNormalization(),

    LSTM(128, input_shape = (train_x.shape[1:]), return_sequences = True),
    Dropout(0.1),
    BatchNormalization(),

    # last LSTM layer returns only its final output, which feeds the dense head
    LSTM(128, input_shape = (train_x.shape[1:])),
    Dropout(0.2),
    BatchNormalization(),

    Dense(32, activation = "tanh"),
    Dropout(0.2),

    Dense(2, activation = "softmax"), # final dense layer: one output per class
])

In [None]:
# COMPILE MODEL AND PREP FOR TRAINING
# `learning_rate` is the current argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is only accepted by older/legacy Keras optimizers - confirm
# against the installed TensorFlow version.
optim = tf.keras.optimizers.Adam(learning_rate = 0.001, decay = 1e-6)

# Labels are integer class ids (0/1), hence the sparse variant of the loss.
model.compile(loss = 'sparse_categorical_crossentropy',
            optimizer = optim,
            metrics = ['accuracy'])

tensorboard = TensorBoard(log_dir = f'logs/{NAME}') # create callback, Tensorboard object

# {epoch} and {val_accuracy} are filled in by ModelCheckpoint at save time.
filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}"
# The options must be arguments of ModelCheckpoint. Previously they sat inside
# the str.format() call, where they were silently ignored, so a checkpoint was
# written every epoch using the callback's default settings.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor = 'val_accuracy',
    verbose = 1,
    save_best_only = True, # keep only checkpoints that improve val_accuracy
    mode = 'max',
)

In [None]:
# TRAIN MODEL
# Validate only on the explicit hold-out set. `validation_split` was also passed
# before; the Keras docs state that `validation_data` overrides it, so the extra
# argument was at best redundant and made it unclear which data was validated on.
history = model.fit(
    train_x, train_y,
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
    validation_data = (validation_x, validation_y),
    callbacks = [tensorboard, checkpoint]
)