In [372]:
import pandas as pd
import numpy as np
from collections import deque
from sklearn.preprocessing import MinMaxScaler
import random
import time

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

LOOK_AHEAD = 3
TRAIN_RATIO = 0.8
SEQUENCE_LENGTH = 20

EPOCHS = 50
BATCH_SIZE = 32

MODEL_NAME = f'LA-{LOOK_AHEAD}--SL-{SEQUENCE_LENGTH}--{int(time.time())}'

df_main = pd.read_csv('dataset.csv', index_col='Date', parse_dates=['Date'])
df_main

Unnamed: 0_level_0,SPX,DIX,GEX,SPY Volume,USO Price,GLD Price,TY 1 mo,TY 3 mo,TY 6 mo,TY 1 yr,...,TY 3 yr,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,VIX P/C Ratio,VIX Options Volume,SPX P/C Ratio,SPX Options Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-05-02,1361.219971,0.378842,1.897313e+09,126278700.0,44.930000,150.410004,0.02,0.05,0.10,0.22,...,1.01,1.96,2.66,3.31,4.14,4.38,0.51,182366.0,1.38,567584.0
2011-05-03,1356.619995,0.383411,1.859731e+09,138375000.0,44.080002,149.880005,0.02,0.03,0.09,0.20,...,1.01,1.96,2.64,3.28,4.11,4.36,0.39,318780.0,1.84,540934.0
2011-05-04,1347.319946,0.392122,1.717764e+09,182678500.0,43.259998,147.729996,0.02,0.03,0.07,0.19,...,1.00,1.95,2.61,3.25,4.08,4.33,0.87,369293.0,2.08,432621.0
2011-05-05,1335.099976,0.405457,1.361864e+09,226900000.0,39.320000,143.470001,0.01,0.02,0.07,0.20,...,0.97,1.88,2.54,3.18,4.00,4.26,1.36,446203.0,1.94,601038.0
2011-05-06,1340.199951,0.418649,1.490329e+09,222787200.0,38.869999,145.300003,0.02,0.02,0.07,0.18,...,0.96,1.87,2.54,3.19,4.03,4.29,0.41,382407.0,1.85,601669.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-04-03,2488.650000,0.506475,-4.630797e+08,135561200.0,5.900000,152.649994,0.09,0.10,0.15,0.15,...,0.30,0.39,0.52,0.62,1.05,1.24,0.66,351100.0,1.72,1362044.0
2020-04-06,2663.680000,0.446698,1.555165e+09,188061200.0,5.480000,156.880005,0.09,0.15,0.17,0.20,...,0.35,0.44,0.58,0.67,1.08,1.27,1.14,536473.0,1.27,1421641.0
2020-04-07,2659.410000,0.482110,1.533925e+09,201427200.0,5.090000,156.039993,0.10,0.14,0.20,0.20,...,0.36,0.48,0.64,0.75,1.13,1.32,1.67,380770.0,1.12,1315103.0
2020-04-08,2749.980000,0.501114,3.331833e+09,153774500.0,5.370000,154.649994,0.14,0.22,0.24,0.23,...,0.34,0.47,0.65,0.77,1.18,1.37,1.11,325232.0,1.39,1262348.0


In [373]:
n = len(df_main)
n

2250

In [374]:
#df_main['SPX Future Price'] = df_main['SPX'].shift(periods=-LOOK_AHEAD)
#df_main.dropna(inplace=True)

In [375]:
'''def label(curr_price, future_price):
    return 1 if future_price > curr_price else 0

df_main['Signal'] = list(map(label, df_main['SPX'], df_main['SPX Future Price']))
df_main = df_main.drop('SPX Future Price', axis=1)'''

df_main['DIX Diff.'] = df_main['DIX'].diff()
df_main['GEX Diff.'] = df_main['GEX'].diff()


def label(df, k, u=0.85, d=0.6):
    size = len(df)
    prices = df['SPX']
    labels = []
    
    for i in range(size):
        base_price = prices.iloc[i]
        label = 1 # hold
        for j in range(i+1, min(i+k+1, n)):
            new_price = prices.iloc[j]
            pct_change = 100 * (new_price-base_price)/base_price
            if pct_change >= u:
                label = 2 # buy
                break
            elif pct_change <= -d:
                label = 0 # sell
                break
        labels.append(label)
    
    df['Signal'] = labels

label(df_main, k=LOOK_AHEAD)
print(df_main['Signal'].value_counts())
df_main.tail(30)

1    752
2    752
0    746
Name: Signal, dtype: int64


Unnamed: 0_level_0,SPX,DIX,GEX,SPY Volume,USO Price,GLD Price,TY 1 mo,TY 3 mo,TY 6 mo,TY 1 yr,...,TY 10 yr,TY 20 yr,TY 30 yr,VIX P/C Ratio,VIX Options Volume,SPX P/C Ratio,SPX Options Volume,DIX Diff.,GEX Diff.,Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-28,2954.21,0.380413,-1423445000.0,385764000.0,9.45,148.380005,1.45,1.27,1.11,0.97,...,1.13,1.46,1.65,0.65,1485775.0,2.19,3593415.0,0.018535,1534977000.0,2
2020-03-02,3090.26,0.448496,-132090200.0,238703600.0,9.92,149.199997,1.41,1.13,0.95,0.89,...,1.1,1.46,1.66,2.49,1108103.0,1.54,2282581.0,0.068082,1291355000.0,0
2020-03-03,3003.37,0.456905,-1094765000.0,300139100.0,9.89,153.889999,1.11,0.95,0.83,0.73,...,1.02,1.44,1.64,0.39,1711323.0,1.29,1575781.0,0.008409,-962674500.0,2
2020-03-04,3130.12,0.4203,1292494000.0,176613400.0,9.86,154.160004,1.0,0.72,0.68,0.59,...,1.02,1.45,1.67,0.75,633348.0,1.33,2073964.0,-0.036605,2387259000.0,0
2020-03-05,3024.08,0.442533,-992248100.0,186366800.0,9.61,157.490005,0.92,0.62,0.53,0.48,...,0.92,1.34,1.56,0.39,1714614.0,1.9,1509855.0,0.022233,-2284742000.0,0
2020-03-06,2972.4,0.452162,-1026985000.0,228667200.0,8.73,157.550003,0.79,0.45,0.41,0.39,...,0.74,1.09,1.25,0.88,2330429.0,2.29,2425223.0,0.009629,-34736770.0,0
2020-03-09,2746.52,0.416465,-1664364000.0,309417300.0,6.52,157.809998,0.57,0.33,0.27,0.31,...,0.54,0.87,0.99,2.31,2264772.0,2.33,2340260.0,-0.035697,-637379500.0,2
2020-03-10,2882.24,0.41825,-1000251000.0,276444100.0,7.24,154.479996,0.57,0.44,0.43,0.43,...,0.76,1.16,1.28,1.16,1277866.0,1.9,2075036.0,0.001785,664113200.0,0
2020-03-11,2741.38,0.439415,-1628877000.0,255316300.0,6.94,153.929993,0.42,0.42,0.4,0.4,...,0.82,1.13,1.3,1.1,1797505.0,2.27,2167211.0,0.021165,-628625500.0,0
2020-03-12,2480.64,0.377596,-2170934000.0,392220700.0,6.54,147.789993,0.41,0.33,0.37,0.39,...,0.88,1.27,1.49,1.17,1712200.0,1.56,3038159.0,-0.061818,-542057500.0,2


In [376]:
df_main = df_main[:-LOOK_AHEAD]

In [377]:
def make_stationary(df):
    
    ln_ratio_cols = [
        'SPX',
        'SPY Volume',
        'USO Price',
        'GLD Price',
        #'VIX P/C Ratio',
        'VIX Options Volume',
        #'SPX P/C Ratio',
        'SPX Options Volume'
    ]
    
    diff_cols = [
        #'DIX',
        #'GEX'
    ]
    
    alt_features = [
        'SPX',
        'DIX',
        'GEX',
        'DIX Diff.',
        'GEX Diff.',
        'VIX P/C Ratio',
        'SPX P/C Ratio',
        'Signal'
    ]
    
    TY_cols = [col for col in df.columns if 'TY' in col]
    #diff_cols.extend(TY_cols)
    
    for col in ln_ratio_cols:
        if col in df.columns:
            df[col] = np.log1p(df[col].pct_change())
    
    for col in diff_cols:
        if col in df.columns:
            df[col] = df[col].diff()
    
    #df['GEX'] = list(map(lambda x : x/1000, df['GEX']))
    
    
    df = df[alt_features]
    
    return df

df_old = df_main.copy()
df_main = make_stationary(df_main)
df_main

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = np.log1p(df[col].pct_change())


Unnamed: 0_level_0,SPX,DIX,GEX,DIX Diff.,GEX Diff.,VIX P/C Ratio,SPX P/C Ratio,Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-05-02,,0.378842,1.897313e+09,,,0.51,1.38,0
2011-05-03,-0.003385,0.383411,1.859731e+09,0.004569,-3.758192e+07,0.39,1.84,0
2011-05-04,-0.006879,0.392122,1.717764e+09,0.008711,-1.419668e+08,0.87,2.08,0
2011-05-05,-0.009111,0.405457,1.361864e+09,0.013335,-3.558996e+08,1.36,1.94,2
2011-05-06,0.003813,0.418649,1.490329e+09,0.013192,1.284643e+08,0.41,1.85,2
...,...,...,...,...,...,...,...,...
2020-03-31,-0.016142,0.497417,-1.482108e+08,0.024677,-2.468311e+08,1.30,1.38,0
2020-04-01,-0.045146,0.511401,-7.728500e+08,0.013985,-6.246392e+08,1.14,1.63,2
2020-04-02,0.022573,0.485221,-5.073984e+08,-0.026180,2.654516e+08,1.62,1.57,0
2020-04-03,-0.015253,0.506475,-4.630797e+08,0.021254,4.431874e+07,0.66,1.72,2


In [378]:
train_index = int(n*TRAIN_RATIO)

df_train = df_main.iloc[ :train_index, : ].dropna()
df_test = df_main.iloc[train_index: , : ].dropna()

y_train = df_train['Signal']
y_test = df_test['Signal']

scaler = MinMaxScaler()
scaler.fit(df_train)
train = scaler.transform(df_train)
test = scaler.transform(df_test)

train[:,-1] = y_train
test[:,-1] = y_test

#print(train)
#print(test)

def preprocess_data(array):
    
    '''
    for col in df.columns:
        if col != 'Signal':
            col = scale(df[col].values)
    '''
    
    sequences = []
    prev_days = deque(maxlen=SEQUENCE_LENGTH)
    
    for row in array:
        
        prev_days.append([value for value in row[:-1]])
        if len(prev_days) == SEQUENCE_LENGTH:
            
            #sequence = scale(np.array(prev_days))
            sequence = np.array(prev_days)
            sequences.append([sequence, row[-1]])
    
    random.shuffle(sequences)
    
    buys, holds, sells = [], [], []
    
    for seq, signal in sequences:
        if signal == 2:
            buys.append([seq, signal])
        elif signal == 1:
            holds.append([seq, signal])
        else:
            sells.append([seq, signal])
            
    smallest = min(len(buys), len(holds), len(sells))
    buys = buys[:smallest]
    holds = holds[:smallest]
    sells = sells[:smallest]
    
    sequences = buys+holds+sells
    
    random.shuffle(sequences)
    
    X, y = [], []
    
    for seq, signal in sequences:
        X.append(seq)
        y.append(signal)
    
    return np.array(X), np.array(y)

X_train, y_train = preprocess_data(train)
X_test, y_test = preprocess_data(test)

y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

In [379]:
print(X_train[0])
print(y_train[0])

[[0.60760618 0.3004223  0.33233751 0.4968006  0.56885542 0.06291391
  0.14189189]
 [0.61394679 0.31932456 0.34738626 0.50557932 0.58692579 0.17549669
  0.20608108]
 [0.63698965 0.3647588  0.35584291 0.53721645 0.58050899 0.17880795
  0.18243243]
 [0.62634293 0.31179867 0.37309154 0.41988947 0.58906718 0.22847682
  0.20608108]
 [0.57733035 0.39482122 0.35586589 0.58203736 0.5555096  0.16556291
  0.19256757]
 [0.60959501 0.34800624 0.35637684 0.42721703 0.57277457 0.08278146
  0.11486486]
 [0.64667271 0.30382493 0.38870984 0.43035746 0.60375044 0.10927152
  0.09459459]
 [0.58415201 0.43566767 0.42089945 0.64025133 0.60361087 0.11589404
  0.375     ]
 [0.55026982 0.58004733 0.30948787 0.65520054 0.46382814 0.15562914
  0.28040541]
 [0.57716271 0.47640222 0.29844717 0.35945189 0.56153009 0.12582781
  0.39189189]
 [0.65610932 0.31670544 0.31883806 0.292615   0.59212588 0.05629139
  0.34459459]
 [0.52605526 0.50870675 0.29073811 0.71198535 0.54492445 0.06622517
  0.27702703]
 [0.66021669 0.4

In [380]:
print(len(X_train))
print(len(X_test))

1740
387


In [None]:
model = Sequential()

model.add(LSTM(256, input_shape=(X_train.shape[1:]), return_sequences=True, activation='tanh'))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True, activation='tanh'))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(64, activation='tanh'))
model.add(Dropout(0.1))
model.add(BatchNormalization())

'''
model.add(LSTM(128, return_sequences=True, activation='tanh'))
#model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128, activation='tanh'))
#model.add(Dropout(0.1))
model.add(BatchNormalization())
'''

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(3, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.0005, decay=1e-6)

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir=f'logs/{MODEL_NAME}')

#filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
#checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor="val_acc", verbose=1, save_best_only=True, mode='max')) # saves only the best ones

history = model.fit(X_train,
                    y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(X_test, y_test),
                    callbacks=[tensorboard])

model.save(f'{MODEL_NAME}.model')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50

In [None]:
model.summary()