In [441]:
import pandas as pd
import numpy as np
from collections import deque
from sklearn.preprocessing import MinMaxScaler
import random
import time
from datetime import datetime

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)

LOOK_AHEAD = 3
TRAIN_RATIO = 0.8
SEQUENCE_LENGTH = 3

EPOCHS = 50
BATCH_SIZE = 4

time_string = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
MODEL_NAME = f'LA-{LOOK_AHEAD}_SL-{SEQUENCE_LENGTH}_{time_string}'

df_main = pd.read_csv('dataset.csv', index_col='Date', parse_dates=['Date'])
df_main

Unnamed: 0_level_0,SPX,DIX,GEX,SPY Volume,USO Price,GLD Price,TY 1 mo,TY 3 mo,TY 6 mo,TY 1 yr,...,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,VIX,VIX P/C Ratio,VIX Options Volume,SPX P/C Ratio,SPX Options Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-05-02,1361.219971,0.378842,1.897313e+09,126278700.0,44.930000,150.410004,0.02,0.05,0.10,0.22,...,1.96,2.66,3.31,4.14,4.38,15.99,0.51,182366.0,1.38,567584.0
2011-05-03,1356.619995,0.383411,1.859731e+09,138375000.0,44.080002,149.880005,0.02,0.03,0.09,0.20,...,1.96,2.64,3.28,4.11,4.36,16.70,0.39,318780.0,1.84,540934.0
2011-05-04,1347.319946,0.392122,1.717764e+09,182678500.0,43.259998,147.729996,0.02,0.03,0.07,0.19,...,1.95,2.61,3.25,4.08,4.33,17.08,0.87,369293.0,2.08,432621.0
2011-05-05,1335.099976,0.405457,1.361864e+09,226900000.0,39.320000,143.470001,0.01,0.02,0.07,0.20,...,1.88,2.54,3.18,4.00,4.26,18.20,1.36,446203.0,1.94,601038.0
2011-05-06,1340.199951,0.418649,1.490329e+09,222787200.0,38.869999,145.300003,0.02,0.02,0.07,0.18,...,1.87,2.54,3.19,4.03,4.29,18.40,0.41,382407.0,1.85,601669.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-04-06,2663.680000,0.446698,1.555165e+09,188061200.0,5.480000,156.880005,0.09,0.15,0.17,0.20,...,0.44,0.58,0.67,1.08,1.27,45.24,1.14,536473.0,1.27,1421641.0
2020-04-07,2659.410000,0.482110,1.533925e+09,201427200.0,5.090000,156.039993,0.10,0.14,0.20,0.20,...,0.48,0.64,0.75,1.13,1.32,46.70,1.67,380770.0,1.12,1315103.0
2020-04-08,2749.980000,0.501114,3.331833e+09,153774500.0,5.370000,154.649994,0.14,0.22,0.24,0.23,...,0.47,0.65,0.77,1.18,1.37,43.35,1.11,325232.0,1.39,1262348.0
2020-04-09,2789.820000,0.488574,2.608413e+09,190282700.0,4.980000,158.690002,0.20,0.25,0.24,0.25,...,0.41,0.60,0.73,1.15,1.35,41.67,0.81,439073.0,1.27,1418972.0


In [424]:
df_main.describe()

Unnamed: 0,SPX,DIX,GEX,SPY Volume,USO Price,GLD Price,TY 1 mo,TY 3 mo,TY 6 mo,TY 1 yr,...,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,VIX,VIX P/C Ratio,VIX Options Volume,SPX P/C Ratio,SPX Options Volume
count,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2234.0,2234.0,2234.0,2234.0,...,2234.0,2234.0,2234.0,2234.0,2234.0,2251.0,2251.0,2251.0,2251.0,2251.0
mean,2110.910297,0.419626,2180254000.0,117198500.0,21.745957,130.882959,0.619315,0.655962,0.733151,0.817167,...,1.583308,1.927372,2.241209,2.707243,2.977686,16.683016,0.508903,576972.1,1.742443,1059653.0
std,558.412153,0.027776,1632390000.0,68845180.0,11.62565,18.528789,0.813688,0.829858,0.842033,0.842661,...,0.602544,0.522543,0.471629,0.491393,0.486524,7.118057,0.294143,354907.0,0.370808,412507.8
min,1099.22998,0.330555,-2958423000.0,20270000.0,4.21,100.5,0.0,0.0,0.02,0.08,...,0.37,0.51,0.54,0.87,0.99,9.14,0.07,41355.0,0.81,223433.0
25%,1659.420044,0.400926,1157110000.0,71214050.0,11.4,117.870003,0.03,0.04,0.08,0.15,...,1.17,1.51,1.88,2.41,2.74,12.835,0.32,354112.0,1.48,759197.5
50%,2075.810059,0.419258,2121601000.0,99141800.0,14.82,124.279999,0.11,0.11,0.22,0.36,...,1.58,1.98,2.23,2.7,2.99,14.77,0.44,493615.0,1.72,993882.0
75%,2598.055,0.437429,3112933000.0,142854800.0,34.414999,143.724998,1.16,1.29,1.45,1.49,...,1.84,2.24,2.61,2.9775,3.1775,18.06,0.61,689430.5,1.97,1285228.0
max,3386.12,0.513793,11566400000.0,717828700.0,44.93,184.589996,2.51,2.49,2.58,2.74,...,3.09,3.18,3.31,4.14,4.4,82.69,3.09,4336057.0,3.77,3593415.0


In [425]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2251 entries, 2011-05-02 to 2020-04-13
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SPX                 2251 non-null   float64
 1   DIX                 2251 non-null   float64
 2   GEX                 2251 non-null   float64
 3   SPY Volume          2251 non-null   float64
 4   USO Price           2251 non-null   float64
 5   GLD Price           2251 non-null   float64
 6   TY 1 mo             2234 non-null   float64
 7   TY 3 mo             2234 non-null   float64
 8   TY 6 mo             2234 non-null   float64
 9   TY 1 yr             2234 non-null   float64
 10  TY 2 yr             2234 non-null   float64
 11  TY 3 yr             2234 non-null   float64
 12  TY 5 yr             2234 non-null   float64
 13  TY 7 yr             2234 non-null   float64
 14  TY 10 yr            2234 non-null   float64
 15  TY 20 yr            2234 non-null   f

In [426]:
def feature_engineering(df):
    
    df['SPX Price'] = df['SPX']
    
    ln_ratio_replace_cols = [
        'SPX',
        'SPY Volume',
        'USO Price',
        'GLD Price',
        'SPX Options Volume',
        'VIX Options Volume'
    ]
    
    ln_ratio_augment_cols = [
        'VIX'
    ]
    
    diff_replace_cols = [
    ]
    
    TY_cols = [col for col in df.columns if 'TY' in col]
    diff_replace_cols.extend(TY_cols)
    
    diff_augment_cols = [
        'DIX',
        'GEX'
    ]
    
    for col in ln_ratio_replace_cols:
        df[col] = np.log1p(df[col].pct_change())
    for col in ln_ratio_augment_cols:
        df[col + ' lnRatio'] = np.log1p(df[col].pct_change())
    
    for col in diff_replace_cols:
        df[col] = df[col].diff()
    for col in diff_augment_cols:
        df[col + ' Diff'] = df[col].diff()
        
    features_to_use = [
        'SPX Price',
        'SPX',
        'SPY Volume',
        'DIX',
        'GEX',
        'DIX Diff',
        'GEX Diff',
        'SPX Options Volume',
        'VIX Options Volume',
        'SPX P/C Ratio',
        'VIX P/C Ratio'
    ]
    
    return df[features_to_use]

SPX_prices = df_main['SPX']
df_old = df_main.copy()
df_main = feature_engineering(df_main)
df_main

Unnamed: 0_level_0,SPX Price,SPX,SPY Volume,DIX,GEX,DIX Diff,GEX Diff,SPX Options Volume,VIX Options Volume,SPX P/C Ratio,VIX P/C Ratio
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-05-02,1361.219971,,,0.378842,1.897313e+09,,,,,1.38,0.51
2011-05-03,1356.619995,-0.003385,0.091476,0.383411,1.859731e+09,0.004569,-3.758192e+07,-0.048091,0.558486,1.84,0.39
2011-05-04,1347.319946,-0.006879,0.277760,0.392122,1.717764e+09,0.008711,-1.419668e+08,-0.223435,0.147089,2.08,0.87
2011-05-05,1335.099976,-0.009111,0.216782,0.405457,1.361864e+09,0.013335,-3.558996e+08,0.328796,0.189184,1.94,1.36
2011-05-06,1340.199951,0.003813,-0.018292,0.418649,1.490329e+09,0.013192,1.284643e+08,0.001049,-0.154289,1.85,0.41
...,...,...,...,...,...,...,...,...,...,...,...
2020-04-06,2663.680000,0.067968,0.327344,0.446698,1.555165e+09,-0.059777,2.018244e+09,0.042825,0.423945,1.27,1.14
2020-04-07,2659.410000,-0.001604,0.068661,0.482110,1.533925e+09,0.035412,-2.123946e+07,-0.077897,-0.342821,1.12,1.67
2020-04-08,2749.980000,0.033489,-0.269941,0.501114,3.331833e+09,0.019004,1.797908e+09,-0.040942,-0.157657,1.39,1.11
2020-04-09,2789.820000,0.014383,0.213024,0.488574,2.608413e+09,-0.012540,-7.234200e+08,0.116959,0.300127,1.27,0.81


In [427]:
def label(df, k, u=0.85, d=0.6):
    size = len(df)
    labels = []
    prices = df['SPX Price']
    
    for i in range(size):
        base_price = prices.iloc[i]
        label = 1 # hold
        for j in range(i+1, min(i+k+1, size)):
            new_price = prices.iloc[j]
            pct_change = 100 * (new_price-base_price)/base_price
            if pct_change >= u:
                label = 2 # buy
                break
            elif pct_change <= -d:
                label = 0 # sell
                break
        labels.append(label)
    
    df.drop('SPX Price', axis=1, inplace=True)
    df['Signal'] = labels

label(df_main, k=LOOK_AHEAD)
print(df_main['Signal'].value_counts())

df_to_predict = df_main.tail(400).copy()
df_main.tail(30)

1    752
2    752
0    747
Name: Signal, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Signal'] = labels


Unnamed: 0_level_0,SPX,SPY Volume,DIX,GEX,DIX Diff,GEX Diff,SPX Options Volume,VIX Options Volume,SPX P/C Ratio,VIX P/C Ratio,Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-03-02,0.045024,-0.480003,0.448496,-132090200.0,0.068082,1291355000.0,-0.453796,-0.293287,1.54,2.49,0
2020-03-03,-0.02852,0.229023,0.456905,-1094765000.0,0.008409,-962674500.0,-0.370556,0.434617,1.29,0.39,2
2020-03-04,0.041336,-0.530283,0.4203,1292494000.0,-0.036605,2387259000.0,0.274711,-0.994002,1.33,0.75,0
2020-03-05,-0.034464,0.053754,0.442533,-992248100.0,0.022233,-2284742000.0,-0.317448,0.995923,1.9,0.39,0
2020-03-06,-0.017237,0.204551,0.452162,-1026985000.0,0.009629,-34736770.0,0.47391,0.306864,2.29,0.88,0
2020-03-09,-0.079035,0.302423,0.416465,-1664364000.0,-0.035697,-637379500.0,-0.035661,-0.028578,2.33,2.31,2
2020-03-10,0.048233,-0.112682,0.41825,-1000251000.0,0.001785,664113200.0,-0.120284,-0.572283,1.9,1.16,0
2020-03-11,-0.050106,-0.079505,0.439415,-1628877000.0,0.021165,-628625500.0,0.043463,0.341208,2.27,1.1,0
2020-03-12,-0.099945,0.429322,0.377596,-2170934000.0,-0.061818,-542057500.0,0.337811,-0.04862,1.56,1.17,2
2020-03-13,0.088779,-0.174048,0.390936,-1540240000.0,0.013339,630693800.0,0.054848,0.182579,1.58,1.04,0


In [428]:
df_main = df_main[:-LOOK_AHEAD].dropna()

In [429]:
n = len(df_main)
train_index = int(n*TRAIN_RATIO)

df_train = df_main.iloc[ :train_index, : ].dropna()
df_test = df_main.iloc[train_index: , : ].dropna()

y_train = df_train['Signal']
y_test = df_test['Signal']

scaler = MinMaxScaler()
scaler.fit(df_train)
train = scaler.transform(df_train)
test = scaler.transform(df_test)

train[:,-1] = y_train
test[:,-1] = y_test

#print(train)
#print(test)

def preprocess_data(array):
    
    sequences = []
    prev_days = deque(maxlen=SEQUENCE_LENGTH)
    
    for row in array:
        
        prev_days.append([value for value in row[:-1]])
        if len(prev_days) == SEQUENCE_LENGTH:
            
            sequence = np.array(prev_days)
            sequences.append([sequence, row[-1]])
    
    random.shuffle(sequences)
    
    buys, holds, sells = [], [], []
    
    for seq, signal in sequences:
        if signal == 2:
            buys.append([seq, signal])
        elif signal == 1:
            holds.append([seq, signal])
        else:
            sells.append([seq, signal])
            
    smallest = min([len(buys), len(holds), len(sells)])
    buys = buys[:smallest]
    holds = holds[:smallest]
    sells = sells[:smallest]
    
    sequences = buys+holds+sells
    
    random.shuffle(sequences)
    
    X, y = [], []
    
    for seq, signal in sequences:
        X.append(seq)
        y.append(signal)
    
    return np.array(X), np.array(y)

X_train, y_train = preprocess_data(train)
X_test, y_test = preprocess_data(test)

y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

In [430]:
print(X_train[0])
print(y_train[0])

[[0.59553083 0.45919468 0.6376289  0.24962405 0.61594593 0.56566124
  0.5112877  0.68963985 0.51351351 0.07284768]
 [0.5646383  0.60095527 0.53048457 0.23261294 0.35527937 0.55571843
  0.48353936 0.44072406 0.39864865 0.08609272]
 [0.65079508 0.6980367  0.41232873 0.27017098 0.34214906 0.60883655
  0.64581762 0.41259331 0.28040541 0.13245033]]
[1. 0. 0.]


In [431]:
print(len(X_train))
print(len(X_test))

1764
414


In [432]:
model = Sequential()

model.add(LSTM(128, return_sequences=True, input_shape=(X_train.shape[1:]), activation='tanh'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(3, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir=f'logs/{MODEL_NAME}')

#filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
#checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor="val_acc", verbose=1, save_best_only=True, mode='max')) # saves only the best ones

history = model.fit(X_train,
                    y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(X_test, y_test),
                    callbacks=[tensorboard])

model.save(f'{MODEL_NAME}.model')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
INFO:tensorflow:Assets written to: LA-3_SL-3_14-04-2020_10-39-59.model\assets


In [433]:
model.summary()

Model: "sequential_33"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_67 (LSTM)               (None, 3, 128)            71168     
_________________________________________________________________
dropout_139 (Dropout)        (None, 3, 128)            0         
_________________________________________________________________
batch_normalization_67 (Batc (None, 3, 128)            512       
_________________________________________________________________
lstm_68 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dropout_140 (Dropout)        (None, 64)                0         
_________________________________________________________________
batch_normalization_68 (Batc (None, 64)                256       
_________________________________________________________________
dense_108 (Dense)            (None, 32)              

In [434]:
actual_signals = df_to_predict['Signal']

to_predict = scaler.transform(df_to_predict)
to_predict[:,-1] = actual_signals

def get_sequences(array):
    
    prev_days = deque(maxlen=SEQUENCE_LENGTH)
    sequences = []
    
    for row in array:
        
        prev_days.append([value for value in row[:-1]])
        if len(prev_days) == SEQUENCE_LENGTH:
            
            sequence = np.array(prev_days)
            sequences.append(sequence)
    
    return np.array(sequences)


In [435]:
X_to_predict = get_sequences(to_predict)

y_pred_arr = model.predict(X_to_predict)
y_pred = list([np.argmax(arr) for arr in y_pred_arr])

y_pred_adj = ['N/A'] * (SEQUENCE_LENGTH-1) + y_pred

df_to_predict['Predicted Signal'] = y_pred_adj

df_to_predict['Signal'] = list(actual_signals[:-SEQUENCE_LENGTH]) + ['N/A'] * SEQUENCE_LENGTH

In [436]:
df_to_predict['Predicted Signal'].value_counts()

1      214
2      182
N/A      2
0        2
Name: Predicted Signal, dtype: int64

In [437]:
df_to_predict

Unnamed: 0_level_0,SPX,SPY Volume,DIX,GEX,DIX Diff,GEX Diff,SPX Options Volume,VIX Options Volume,SPX P/C Ratio,VIX P/C Ratio,Signal,Predicted Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-09-10,0.001889,-0.381391,0.426192,2217880000.0,0.011034,-255493900.0,-0.259641,-0.388234,2.47,0.59,2.0,
2018-09-11,0.003729,0.006345,0.385127,3524474000.0,-0.041065,1306594000.0,-0.017317,0.535827,1.8,0.4,1.0,
2018-09-12,0.00035,0.168609,0.412938,3886324000.0,0.027811,361849800.0,0.122566,0.280867,1.83,0.3,1.0,1.0
2018-09-13,0.005275,-0.15869,0.403668,5055064000.0,-0.00927,1168740000.0,0.074387,0.388377,1.78,0.57,1.0,1.0
2018-09-14,0.000282,0.076289,0.39279,3677162000.0,-0.010878,-1377901000.0,0.078808,-0.310122,1.73,0.26,1.0,1.0
2018-09-17,-0.005582,0.214305,0.425656,2456435000.0,0.032866,-1220727000.0,-0.085642,-0.322925,1.75,0.37,2.0,1.0
2018-09-18,0.005341,-0.097078,0.394043,3278023000.0,-0.031614,821588200.0,-0.097082,0.560884,1.6,0.56,2.0,1.0
2018-09-19,0.001246,-0.232547,0.389525,3951373000.0,-0.004518,673350300.0,0.038595,0.132196,2.39,0.71,1.0,1.0
2018-09-20,0.007813,0.715306,0.401492,5910353000.0,0.011967,1958980000.0,0.603903,-0.897862,1.6,0.43,1.0,1.0
2018-09-21,-0.000522,0.049749,0.429212,2724444000.0,0.02772,-3185909000.0,-0.244328,0.238561,1.68,0.36,0.0,1.0


In [439]:
df_to_predict['Same?'] = list(map(lambda x,y : 1 if x == y else 0, df_to_predict['Signal'], df_to_predict['Predicted Signal']))
df_to_predict['Same?'].value_counts()

0    210
1    190
Name: Same?, dtype: int64