In [141]:
import pandas as pd
from collections import deque
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint, ModelCheckpoint
import time
from sklearn import preprocessing

SEQ_LEN = 20  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 1  # how far into the future are we trying to predict?
index_TO_PREDICT = "STOXX"
EPOCHS = 5  # how many passes through our data
BATCH_SIZE = 32  # how many batches? Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

main_df = pd.DataFrame() # begin empty

indices = ["SNP", "STOXX", "IBEX", "DAX"] # the 4 indices we want to consider
for index in indices:  # begin iteindexn

    index = index.split('.csv')[0]  # split away the ticker from the file-name
    print(index)
    dataset = f'./indices_data/{index}_daily.csv'  # get the full path to the file, change weekly to daily or monthly
    df = pd.read_csv(dataset, header=0, names=['time', 'open', 'high', 'low', 'close', 'adj. close', 'volume'])  # read in specific file
    df[['open','high', 'low', 'close', 'adj. close', 'volume']]=df[['open','high', 'low', 'close', 'adj. close', 'volume']].astype(np.float64)
    #print(df)
    #CONVERT date to int
    df['time']=pd.to_datetime(df['time']).astype(np.int64) // 10**9
    
    # rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"close": f"{index}_close", "volume": f"{index}_volume"}, inplace=True)
    
    df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"{index}_close", f"{index}_volume"]]  # ignore the other columns besides price and volume
    
    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

main_df.replace([0.0,0,np.Inf,-np.Inf],np.nan,inplace=True)
main_df.fillna(method="bfill", inplace=True)
main_df.fillna(method="ffill", inplace=True)# if there are gaps in data, use previously known values
main_df.dropna(inplace=True)
#print(main_df.head())  # how did we do??


def classify(current, future):
    if float(future) > float(current):  # if the future price is higher than the current, that's a buy, or a 1
        return 1
    else:  # otherwise... it's a 0!
        return 0
    
main_df['future'] = main_df[f'{index_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f'{index_TO_PREDICT}_close'], main_df['future']))

main_df.dropna(axis=0,inplace=True)


## here, split away some slice of the future data from the main main_df.
times = sorted(main_df.index.to_numpy())
#print(times)
last_5pct = sorted(main_df.index.to_numpy())[-int(0.2*len(times))]
#print(main_df.index,last_5pct)
validation_main_df = main_df[(main_df.index >= last_5pct)]
#print(validation_main_df)
main_df = main_df[(main_df.index < last_5pct)]

def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore.
    
    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            #print(df[col])
            df[col] = df[col].pct_change()  # pct change "normalizes" the different columns
            df.dropna(axis=0,inplace=True)  # remove the nas created by pct_change
            #print(df)
            df[col] = preprocessing.scale(df[col].to_numpy())  # scale between 0 and 1, one could also remove the mean and set valieues ranging from -1,1

    df.dropna(inplace=True)  # cleanup again... jic.


    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.to_numpy():  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

    random.shuffle(sequential_data)  # shuffle for good measure.

    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y  # return X and y...and make X a numpy array!


train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

model = Sequential()
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))


opt = tf.keras.optimizers.Adam(lr=0.0001, decay=1e-5)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

#tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

#filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
#checkpoint = ModelCheckpoint("{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

# Train model
history = model.fit(
    train_x, np.array(train_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y))
    #callbacks=[tensorboard, checkpoint])

# Score model
score= model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
#model.save("{}".format(NAME))

SNP
STOXX
IBEX
DAX
train data: 156 validation: 18
Dont buys: 78, buys: 78
VALIDATION Dont buys: 9, buys: 9
Train on 156 samples, validate on 18 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.6925339698791504
Test accuracy: 0.5


In [113]:
np.float64(0)

0.0

In [127]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5],
                   [np.nan, 3, np.nan, 4]],
                  columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


Unnamed: 0,A,B,C,D
0,,2.0,,
1,3.0,4.0,,1.0
2,,,,5.0
3,,3.0,,4.0
