In [633]:
import pandas as pd
import numpy as np
from collections import deque
from sklearn.preprocessing import MinMaxScaler
import random
import time
from datetime import datetime

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)

LOOK_AHEAD = 1
TRAIN_RATIO = 0.8
SEQUENCE_LENGTH = 5

EPOCHS = 10
BATCH_SIZE = 8

time_string = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
MODEL_NAME = f'LA-{LOOK_AHEAD}_SL-{SEQUENCE_LENGTH}_{time_string}'

df_main = pd.read_csv('dataset.csv', index_col='Date', parse_dates=['Date'])
df_main

Unnamed: 0_level_0,SPX,DIX,GEX,SPY Volume,USO Price,GLD Price,TY 1 mo,TY 3 mo,TY 6 mo,TY 1 yr,...,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,VIX,VIX P/C Ratio,VIX Options Volume,SPX P/C Ratio,SPX Options Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-05-02,1361.219971,0.378842,1.897313e+09,126278700.0,44.930000,150.410004,0.02,0.05,0.10,0.22,...,1.96,2.66,3.31,4.14,4.38,15.99,0.51,182366.0,1.38,567584.0
2011-05-03,1356.619995,0.383411,1.859731e+09,138375000.0,44.080002,149.880005,0.02,0.03,0.09,0.20,...,1.96,2.64,3.28,4.11,4.36,16.70,0.39,318780.0,1.84,540934.0
2011-05-04,1347.319946,0.392122,1.717764e+09,182678500.0,43.259998,147.729996,0.02,0.03,0.07,0.19,...,1.95,2.61,3.25,4.08,4.33,17.08,0.87,369293.0,2.08,432621.0
2011-05-05,1335.099976,0.405457,1.361864e+09,226900000.0,39.320000,143.470001,0.01,0.02,0.07,0.20,...,1.88,2.54,3.18,4.00,4.26,18.20,1.36,446203.0,1.94,601038.0
2011-05-06,1340.199951,0.418649,1.490329e+09,222787200.0,38.869999,145.300003,0.02,0.02,0.07,0.18,...,1.87,2.54,3.19,4.03,4.29,18.40,0.41,382407.0,1.85,601669.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-04-06,2663.680000,0.446698,1.555165e+09,188061200.0,5.480000,156.880005,0.09,0.15,0.17,0.20,...,0.44,0.58,0.67,1.08,1.27,45.24,1.14,536473.0,1.27,1421641.0
2020-04-07,2659.410000,0.482110,1.533925e+09,201427200.0,5.090000,156.039993,0.10,0.14,0.20,0.20,...,0.48,0.64,0.75,1.13,1.32,46.70,1.67,380770.0,1.12,1315103.0
2020-04-08,2749.980000,0.501114,3.331833e+09,153774500.0,5.370000,154.649994,0.14,0.22,0.24,0.23,...,0.47,0.65,0.77,1.18,1.37,43.35,1.11,325232.0,1.39,1262348.0
2020-04-09,2789.820000,0.488574,2.608413e+09,190282700.0,4.980000,158.690002,0.20,0.25,0.24,0.25,...,0.41,0.60,0.73,1.15,1.35,41.67,0.81,439073.0,1.27,1418972.0


In [634]:
df_main.describe()

Unnamed: 0,SPX,DIX,GEX,SPY Volume,USO Price,GLD Price,TY 1 mo,TY 3 mo,TY 6 mo,TY 1 yr,...,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,VIX,VIX P/C Ratio,VIX Options Volume,SPX P/C Ratio,SPX Options Volume
count,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2234.0,2234.0,2234.0,2234.0,...,2234.0,2234.0,2234.0,2234.0,2234.0,2251.0,2251.0,2251.0,2251.0,2251.0
mean,2110.910297,0.419626,2180254000.0,117198500.0,21.745957,130.882959,0.619315,0.655962,0.733151,0.817167,...,1.583308,1.927372,2.241209,2.707243,2.977686,16.683016,0.508903,576972.1,1.742443,1059653.0
std,558.412153,0.027776,1632390000.0,68845180.0,11.62565,18.528789,0.813688,0.829858,0.842033,0.842661,...,0.602544,0.522543,0.471629,0.491393,0.486524,7.118057,0.294143,354907.0,0.370808,412507.8
min,1099.22998,0.330555,-2958423000.0,20270000.0,4.21,100.5,0.0,0.0,0.02,0.08,...,0.37,0.51,0.54,0.87,0.99,9.14,0.07,41355.0,0.81,223433.0
25%,1659.420044,0.400926,1157110000.0,71214050.0,11.4,117.870003,0.03,0.04,0.08,0.15,...,1.17,1.51,1.88,2.41,2.74,12.835,0.32,354112.0,1.48,759197.5
50%,2075.810059,0.419258,2121601000.0,99141800.0,14.82,124.279999,0.11,0.11,0.22,0.36,...,1.58,1.98,2.23,2.7,2.99,14.77,0.44,493615.0,1.72,993882.0
75%,2598.055,0.437429,3112933000.0,142854800.0,34.414999,143.724998,1.16,1.29,1.45,1.49,...,1.84,2.24,2.61,2.9775,3.1775,18.06,0.61,689430.5,1.97,1285228.0
max,3386.12,0.513793,11566400000.0,717828700.0,44.93,184.589996,2.51,2.49,2.58,2.74,...,3.09,3.18,3.31,4.14,4.4,82.69,3.09,4336057.0,3.77,3593415.0


In [635]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2251 entries, 2011-05-02 to 2020-04-13
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SPX                 2251 non-null   float64
 1   DIX                 2251 non-null   float64
 2   GEX                 2251 non-null   float64
 3   SPY Volume          2251 non-null   float64
 4   USO Price           2251 non-null   float64
 5   GLD Price           2251 non-null   float64
 6   TY 1 mo             2234 non-null   float64
 7   TY 3 mo             2234 non-null   float64
 8   TY 6 mo             2234 non-null   float64
 9   TY 1 yr             2234 non-null   float64
 10  TY 2 yr             2234 non-null   float64
 11  TY 3 yr             2234 non-null   float64
 12  TY 5 yr             2234 non-null   float64
 13  TY 7 yr             2234 non-null   float64
 14  TY 10 yr            2234 non-null   float64
 15  TY 20 yr            2234 non-null   f

In [636]:
def feature_engineering(df):
    
    df['SPX Price'] = df['SPX']
    
    ln_ratio_replace_cols = [
        'SPX',
        'SPY Volume',
        'USO Price',
        'GLD Price',
        'SPX Options Volume',
        'VIX Options Volume'
    ]
    
    ln_ratio_augment_cols = [
        'VIX'
    ]
    
    diff_replace_cols = [
    ]
    
    TY_cols = [col for col in df.columns if 'TY' in col]
    diff_replace_cols.extend(TY_cols)
    
    diff_augment_cols = [
        'DIX',
        'GEX'
    ]
    
    for col in ln_ratio_replace_cols:
        df[col] = np.log1p(df[col].pct_change())
    for col in ln_ratio_augment_cols:
        df[col + ' lnRatio'] = np.log1p(df[col].pct_change())
    
    for col in diff_replace_cols:
        df[col] = df[col].diff()
    for col in diff_augment_cols:
        df[col + ' Diff'] = df[col].diff()
        
    features_to_use = [
        'SPX',
        'DIX',
        'GEX',
        'DIX Diff',
        'GEX Diff',
        'SPX P/C Ratio',
        'VIX P/C Ratio',
        'SPX Price'
    ]
    
    return df[features_to_use]

df_old = df_main.copy()
df_main = feature_engineering(df_main)
df_main

Unnamed: 0_level_0,SPX,DIX,GEX,DIX Diff,GEX Diff,SPX P/C Ratio,VIX P/C Ratio,SPX Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-05-02,,0.378842,1.897313e+09,,,1.38,0.51,1361.219971
2011-05-03,-0.003385,0.383411,1.859731e+09,0.004569,-3.758192e+07,1.84,0.39,1356.619995
2011-05-04,-0.006879,0.392122,1.717764e+09,0.008711,-1.419668e+08,2.08,0.87,1347.319946
2011-05-05,-0.009111,0.405457,1.361864e+09,0.013335,-3.558996e+08,1.94,1.36,1335.099976
2011-05-06,0.003813,0.418649,1.490329e+09,0.013192,1.284643e+08,1.85,0.41,1340.199951
...,...,...,...,...,...,...,...,...
2020-04-06,0.067968,0.446698,1.555165e+09,-0.059777,2.018244e+09,1.27,1.14,2663.680000
2020-04-07,-0.001604,0.482110,1.533925e+09,0.035412,-2.123946e+07,1.12,1.67,2659.410000
2020-04-08,0.033489,0.501114,3.331833e+09,0.019004,1.797908e+09,1.39,1.11,2749.980000
2020-04-09,0.014383,0.488574,2.608413e+09,-0.012540,-7.234200e+08,1.27,0.81,2789.820000


In [637]:
def label(df, k, u=0.85, d=0.6):
    size = len(df)
    labels = []
    prices = df['SPX Price']
    
    for i in range(size):
        base_price = prices.iloc[i]
        label = 1 # hold
        for j in range(i+1, min(i+k+1, size)):
            new_price = prices.iloc[j]
            pct_change = 100 * (new_price-base_price)/base_price
            if pct_change >= u:
                label = 2 # buy
                break
            elif pct_change <= -d:
                label = 0 # sell
                break
        labels.append(label)
    
    df.drop('SPX Price', axis=1, inplace=True)
    df['Signal'] = labels

#label(df_main, k=LOOK_AHEAD)
#print(df_main['Signal'].value_counts())


#df_main2['SPX Next'] = df_main2['SPX Price'].shift(periods=-1)
df_main['1-Day Return'] = np.log1p(df_main['SPX Price'].pct_change())
df_main['1-Day Return'] = df_main['1-Day Return'].shift(-1)


df_main.drop('SPX Price', axis=1, inplace=True)

df_to_predict = df_main.tail(200).copy()
df_main


#df_main.tail(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_main['1-Day Return'] = np.log1p(df_main['SPX Price'].pct_change())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_main['1-Day Return'] = df_main['1-Day Return'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0_level_0,SPX,DIX,GEX,DIX Diff,GEX Diff,SPX P/C Ratio,VIX P/C Ratio,1-Day Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-05-02,,0.378842,1.897313e+09,,,1.38,0.51,-0.003385
2011-05-03,-0.003385,0.383411,1.859731e+09,0.004569,-3.758192e+07,1.84,0.39,-0.006879
2011-05-04,-0.006879,0.392122,1.717764e+09,0.008711,-1.419668e+08,2.08,0.87,-0.009111
2011-05-05,-0.009111,0.405457,1.361864e+09,0.013335,-3.558996e+08,1.94,1.36,0.003813
2011-05-06,0.003813,0.418649,1.490329e+09,0.013192,1.284643e+08,1.85,0.41,0.004534
...,...,...,...,...,...,...,...,...
2020-04-06,0.067968,0.446698,1.555165e+09,-0.059777,2.018244e+09,1.27,1.14,-0.001604
2020-04-07,-0.001604,0.482110,1.533925e+09,0.035412,-2.123946e+07,1.12,1.67,0.033489
2020-04-08,0.033489,0.501114,3.331833e+09,0.019004,1.797908e+09,1.39,1.11,0.014383
2020-04-09,0.014383,0.488574,2.608413e+09,-0.012540,-7.234200e+08,1.27,0.81,-0.010156


In [638]:
df_main = df_main.dropna()
df_main

Unnamed: 0_level_0,SPX,DIX,GEX,DIX Diff,GEX Diff,SPX P/C Ratio,VIX P/C Ratio,1-Day Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-05-03,-0.003385,0.383411,1.859731e+09,0.004569,-3.758192e+07,1.84,0.39,-0.006879
2011-05-04,-0.006879,0.392122,1.717764e+09,0.008711,-1.419668e+08,2.08,0.87,-0.009111
2011-05-05,-0.009111,0.405457,1.361864e+09,0.013335,-3.558996e+08,1.94,1.36,0.003813
2011-05-06,0.003813,0.418649,1.490329e+09,0.013192,1.284643e+08,1.85,0.41,0.004534
2011-05-09,0.004534,0.410321,1.677059e+09,-0.008328,1.867301e+08,1.94,0.17,0.008042
...,...,...,...,...,...,...,...,...
2020-04-03,-0.015253,0.506475,-4.630797e+08,0.021254,4.431874e+07,1.72,0.66,0.067968
2020-04-06,0.067968,0.446698,1.555165e+09,-0.059777,2.018244e+09,1.27,1.14,-0.001604
2020-04-07,-0.001604,0.482110,1.533925e+09,0.035412,-2.123946e+07,1.12,1.67,0.033489
2020-04-08,0.033489,0.501114,3.331833e+09,0.019004,1.797908e+09,1.39,1.11,0.014383


In [639]:
n = len(df_main)
train_index = int(n*TRAIN_RATIO)

df_train = df_main.iloc[ :train_index, : ].dropna()
df_test = df_main.iloc[train_index: , : ].dropna()

y_train = df_train['1-Day Return']
y_test = df_test['1-Day Return']

scaler = MinMaxScaler()
scaler.fit(df_train)
train = scaler.transform(df_train)
test = scaler.transform(df_test)

train[:,-1] = y_train
test[:,-1] = y_test

#print(train)
#print(test)

def preprocess_data(array):
    
    sequences = []
    prev_days = deque(maxlen=SEQUENCE_LENGTH)
    
    for row in array:
        
        prev_days.append([value for value in row[:-1]])
        if len(prev_days) == SEQUENCE_LENGTH:
            
            sequence = np.array(prev_days)
            sequences.append([sequence, row[-1]])
    
    random.shuffle(sequences)
    '''
    buys, holds, sells = [], [], []
    
    for seq, signal in sequences:
        if signal == 2:
            buys.append([seq, signal])
        elif signal == 1:
            holds.append([seq, signal])
        else:
            sells.append([seq, signal])
            
    smallest = min([len(buys), len(holds), len(sells)])
    buys = buys[:smallest]
    holds = holds[:smallest]
    sells = sells[:smallest]
    
    sequences = buys+holds+sells
    
    random.shuffle(sequences)
    '''
    X, y = [], []
    
    for seq, ret in sequences:
        X.append(seq)
        y.append(ret)
    
    return np.array(X), np.array(y)

X_train, y_train = preprocess_data(train)
X_test, y_test = preprocess_data(test)

#y_train = tf.keras.utils.to_categorical(y_train)
#y_test = tf.keras.utils.to_categorical(y_test)

In [640]:
print(X_train[0])
print(y_train[0])

[[0.70977304 0.56786167 0.26062897 0.55878954 0.61426828 0.20945946
  0.0794702 ]
 [0.66975893 0.48344423 0.28495326 0.38237928 0.59595469 0.19932432
  0.12582781]
 [0.68554488 0.44411006 0.32823667 0.43613727 0.61440968 0.19256757
  0.12251656]
 [0.72231319 0.45777511 0.35495123 0.49933438 0.5982814  0.16554054
  0.26490066]
 [0.5866044  0.47884286 0.37204691 0.50816147 0.5889183  0.35810811
  0.2781457 ]]
0.0010010579536909561


In [641]:
print(len(X_train))
print(len(X_test))

1795
446


In [642]:
model = Sequential()

model.add(LSTM(128, return_sequences=True, input_shape=(X_train.shape[1:]), activation='tanh'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))
'''
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(8, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(4, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(2, activation='relu'))
model.add(Dropout(0.1))
'''
model.add(Dense(1, activation='linear'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

model.compile(loss='mse',
              optimizer=opt,
              metrics=['mse'])

tensorboard = TensorBoard(log_dir=f'logs/{MODEL_NAME}')

#filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
#checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor="val_acc", verbose=1, save_best_only=True, mode='max')) # saves only the best ones

history = model.fit(X_train,
                    y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(X_test, y_test),
                    callbacks=[tensorboard])

#model.save(f'{MODEL_NAME}.model')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [643]:
model.summary()

Model: "sequential_46"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_93 (LSTM)               (None, 5, 128)            69632     
_________________________________________________________________
dropout_195 (Dropout)        (None, 5, 128)            0         
_________________________________________________________________
batch_normalization_93 (Batc (None, 5, 128)            512       
_________________________________________________________________
lstm_94 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dropout_196 (Dropout)        (None, 64)                0         
_________________________________________________________________
batch_normalization_94 (Batc (None, 64)                256       
_________________________________________________________________
dense_151 (Dense)            (None, 32)              

In [644]:
actual_returns = df_to_predict['1-Day Return']

to_predict = scaler.transform(df_to_predict)
to_predict[:,-1] = actual_returns

def get_sequences(array):
    
    prev_days = deque(maxlen=SEQUENCE_LENGTH)
    sequences = []
    
    for row in array:
        
        prev_days.append([value for value in row[:-1]])
        if len(prev_days) == SEQUENCE_LENGTH:
            
            sequence = np.array(prev_days)
            sequences.append(sequence)
    
    return np.array(sequences)


In [645]:
X_to_predict = get_sequences(to_predict)

y_pred = list(model.predict(X_to_predict))
y_pred = list([x[0] for x in y_pred])

y_pred_adj = ['N/A'] * (SEQUENCE_LENGTH-1) + y_pred

df_to_predict['Predicted 1-Day Return'] = y_pred_adj

df_to_predict['1-Day Return'] = list(actual_returns[:-LOOK_AHEAD]) + ['N/A'] * LOOK_AHEAD

In [646]:
#df_to_predict['Predicted Signal'].value_counts()

In [647]:
df_to_predict.tail(100)

Unnamed: 0_level_0,SPX,DIX,GEX,DIX Diff,GEX Diff,SPX P/C Ratio,VIX P/C Ratio,1-Day Return,Predicted 1-Day Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-11-18,0.000503,0.42337,3329456000.0,-0.013422,400057300.0,1.4,0.8,-0.000592739,0.010141
2019-11-19,-0.000593,0.42231,3366308000.0,-0.00106,36852330.0,1.8,1.98,-0.00376327,0.00706595
2019-11-20,-0.003763,0.435133,2652596000.0,0.012822,-713711900.0,1.96,0.62,-0.00158403,0.00195078
2019-11-21,-0.001584,0.431333,2212434000.0,-0.0038,-440162200.0,2.59,1.05,0.00217257,-0.00683027
2019-11-22,0.002173,0.417131,2485388000.0,-0.014202,272953600.0,2.0,1.63,0.0074793,-0.00558343
2019-11-25,0.007479,0.381547,4539122000.0,-0.035584,2053734000.0,1.71,0.47,0.00219312,-0.0115771
2019-11-26,0.002193,0.39053,5855883000.0,0.008983,1316762000.0,1.58,0.31,0.00416578,0.00206712
2019-11-27,0.004166,0.395141,8220195000.0,0.004611,2364311000.0,1.22,0.35,-0.00401932,0.016448
2019-11-29,-0.004019,0.422284,3675461000.0,0.027143,-4544733000.0,2.03,0.11,-0.00866853,0.0202166
2019-12-02,-0.008669,0.406087,2328339000.0,-0.016197,-1347123000.0,1.81,0.61,-0.00666017,0.016073
