In [726]:
import pandas as pd
import numpy as np
from collections import deque
from sklearn.preprocessing import MinMaxScaler
import random
import time
from datetime import datetime

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

import matplotlib.pyplot as plt

# Show up to 500 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)

# Number of future rows label() inspects when assigning a signal.
LOOK_AHEAD = 1
# Share of rows (taken from the start of the frame) used for training.
TRAIN_RATIO = 0.75
# Number of consecutive rows fed to the LSTM as one sample.
SEQUENCE_LENGTH = 5

# Passed straight to model.fit().
EPOCHS = 30
BATCH_SIZE = 4

# Timestamped run name; used as the TensorBoard log sub-directory.
time_string = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
MODEL_NAME = f'LA-{LOOK_AHEAD}_SL-{SEQUENCE_LENGTH}_{time_string}'

# Raw daily data indexed by the parsed 'Date' column.
df_main = pd.read_csv('dataset.csv', index_col='Date', parse_dates=['Date'])
df_main

Unnamed: 0_level_0,SPX,DIX,GEX,SPY Volume,USO Price,GLD Price,TY 1 mo,TY 3 mo,TY 6 mo,TY 1 yr,...,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,VIX,VIX P/C Ratio,VIX Options Volume,SPX P/C Ratio,SPX Options Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-05-02,1361.219971,0.378842,1.897313e+09,126278700.0,44.930000,150.410004,0.02,0.05,0.10,0.22,...,1.96,2.66,3.31,4.14,4.38,15.99,0.51,182366.0,1.38,567584.0
2011-05-03,1356.619995,0.383411,1.859731e+09,138375000.0,44.080002,149.880005,0.02,0.03,0.09,0.20,...,1.96,2.64,3.28,4.11,4.36,16.70,0.39,318780.0,1.84,540934.0
2011-05-04,1347.319946,0.392122,1.717764e+09,182678500.0,43.259998,147.729996,0.02,0.03,0.07,0.19,...,1.95,2.61,3.25,4.08,4.33,17.08,0.87,369293.0,2.08,432621.0
2011-05-05,1335.099976,0.405457,1.361864e+09,226900000.0,39.320000,143.470001,0.01,0.02,0.07,0.20,...,1.88,2.54,3.18,4.00,4.26,18.20,1.36,446203.0,1.94,601038.0
2011-05-06,1340.199951,0.418649,1.490329e+09,222787200.0,38.869999,145.300003,0.02,0.02,0.07,0.18,...,1.87,2.54,3.19,4.03,4.29,18.40,0.41,382407.0,1.85,601669.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-04-07,2659.410000,0.482110,1.533925e+09,201427200.0,5.090000,156.039993,0.10,0.14,0.20,0.20,...,0.48,0.64,0.75,1.13,1.32,46.70,1.67,380770.0,1.12,1315103.0
2020-04-08,2749.980000,0.501114,3.331833e+09,153774500.0,5.370000,154.649994,0.14,0.22,0.24,0.23,...,0.47,0.65,0.77,1.18,1.37,43.35,1.11,325232.0,1.39,1262348.0
2020-04-09,2789.820000,0.488574,2.608413e+09,190282700.0,4.980000,158.690002,0.20,0.25,0.24,0.25,...,0.41,0.60,0.73,1.15,1.35,41.67,0.81,439073.0,1.27,1418972.0
2020-04-13,2761.630000,0.493960,3.422978e+09,114839100.0,4.930000,161.410004,0.17,0.26,0.27,0.27,...,0.44,0.63,0.76,1.19,1.39,41.17,0.95,314831.0,1.30,1007130.0


In [727]:
# Summary statistics for every column of the raw frame.
df_main.describe()

Unnamed: 0,SPX,DIX,GEX,SPY Volume,USO Price,GLD Price,TY 1 mo,TY 3 mo,TY 6 mo,TY 1 yr,...,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,VIX,VIX P/C Ratio,VIX Options Volume,SPX P/C Ratio,SPX Options Volume
count,2252.0,2252.0,2252.0,2252.0,2252.0,2252.0,2235.0,2235.0,2235.0,2235.0,...,2235.0,2235.0,2235.0,2235.0,2235.0,2252.0,2252.0,2252.0,2252.0,2252.0
mean,2111.23674,0.419652,2181906000.0,117205900.0,21.73837,130.897078,0.619114,0.655758,0.732931,0.816913,...,1.582787,1.926783,2.240546,2.706564,2.976984,16.692376,0.509472,577004.0,1.742251,1059732.0
std,558.502991,0.027797,1633909000.0,68830750.0,11.628642,18.536787,0.813561,0.829728,0.841909,0.842558,...,0.602912,0.523169,0.472563,0.492331,0.487544,7.130322,0.295314,354831.4,0.370837,412433.0
min,1099.22998,0.330555,-2958423000.0,20270000.0,4.21,100.5,0.0,0.0,0.02,0.08,...,0.37,0.51,0.54,0.87,0.99,9.14,0.07,41355.0,0.81,223433.0
25%,1659.740051,0.40093,1157774000.0,71236220.0,11.4,117.870003,0.03,0.04,0.08,0.15,...,1.165,1.51,1.88,2.41,2.74,12.8375,0.32,354278.0,1.48,759238.8
50%,2075.935059,0.419259,2121650000.0,99167950.0,14.815,124.279999,0.11,0.11,0.22,0.36,...,1.58,1.98,2.23,2.7,2.99,14.77,0.44,493753.5,1.715,993896.0
75%,2599.26,0.437481,3114182000.0,142829900.0,34.412499,143.75,1.16,1.29,1.45,1.49,...,1.84,2.24,2.61,2.975,3.175,18.0625,0.61,689318.8,1.97,1285118.0
max,3386.12,0.513793,11566400000.0,717828700.0,44.93,184.589996,2.51,2.49,2.58,2.74,...,3.09,3.18,3.31,4.14,4.4,82.69,3.09,4336057.0,3.77,3593415.0


In [728]:
# Column dtypes and non-null counts (reveals which columns contain gaps).
df_main.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2252 entries, 2011-05-02 to 2020-04-14
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SPX                 2252 non-null   float64
 1   DIX                 2252 non-null   float64
 2   GEX                 2252 non-null   float64
 3   SPY Volume          2252 non-null   float64
 4   USO Price           2252 non-null   float64
 5   GLD Price           2252 non-null   float64
 6   TY 1 mo             2235 non-null   float64
 7   TY 3 mo             2235 non-null   float64
 8   TY 6 mo             2235 non-null   float64
 9   TY 1 yr             2235 non-null   float64
 10  TY 2 yr             2235 non-null   float64
 11  TY 3 yr             2235 non-null   float64
 12  TY 5 yr             2235 non-null   float64
 13  TY 7 yr             2235 non-null   float64
 14  TY 10 yr            2235 non-null   float64
 15  TY 20 yr            2235 non-null   f

In [729]:
def feature_engineering(df):
    """Derive the model's input features from the raw market frame.

    The transformation runs on a copy, so the frame passed in is left
    untouched, and the result is an independent frame (not a view/slice of an
    intermediate), which means columns can be added to it later without
    triggering pandas' SettingWithCopyWarning.

    Transformations applied:
      * log-return (``log1p(pct_change)``) replaces 'SPX', 'SPY Volume',
        'USO Price', 'GLD Price', 'SPX Options Volume', 'VIX Options Volume';
      * first difference replaces every column whose name contains 'TY';
      * 'DIX Diff' / 'GEX Diff' are added next to the untouched 'DIX' / 'GEX';
      * 'SPX Price' keeps the untransformed 'SPX' level.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above plus 'VIX', 'DIX', 'GEX',
        'SPX P/C Ratio' and 'VIX P/C Ratio'.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the selected feature columns. The first row is
        NaN in every differenced / log-return column.
    """
    # Work on a private copy so the caller's frame is never modified.
    df = df.copy()

    # Keep the raw index level; the log-return below overwrites 'SPX'.
    df['SPX Price'] = df['SPX']

    ln_ratio_replace_cols = [
        'SPX',
        'SPY Volume',
        'USO Price',
        'GLD Price',
        'SPX Options Volume',
        'VIX Options Volume',
    ]

    # Computed for experimentation; not part of features_to_use below.
    ln_ratio_augment_cols = [
        'VIX',
    ]

    ty_cols = [col for col in df.columns if 'TY' in col]
    diff_replace_cols = list(ty_cols)

    diff_augment_cols = [
        'DIX',
        'GEX',
    ]

    for col in ln_ratio_replace_cols:
        df[col] = np.log1p(df[col].pct_change())
    for col in ln_ratio_augment_cols:
        df[col + ' lnRatio'] = np.log1p(df[col].pct_change())

    for col in diff_replace_cols:
        df[col] = df[col].diff()
    for col in diff_augment_cols:
        df[col + ' Diff'] = df[col].diff()

    features_to_use = [
        'SPX',
        'SPX Price',
        'SPY Volume',
        'DIX',
        'GEX',
        'DIX Diff',
        'GEX Diff',
        'SPX Options Volume',
        'VIX Options Volume',
        'SPX P/C Ratio',
        'VIX P/C Ratio',
        'USO Price',
        #'GLD Price'
    ]
    features_to_use.extend(ty_cols)

    # .copy() detaches the selection so later column assignments are safe.
    return df[features_to_use].copy()

# Keep an untouched snapshot of the raw data before df_main is rebound to
# the engineered feature frame.
df_old = df_main.copy()
df_main = feature_engineering(df_main)
df_main

Unnamed: 0_level_0,SPX,SPX Price,SPY Volume,DIX,GEX,DIX Diff,GEX Diff,SPX Options Volume,VIX Options Volume,SPX P/C Ratio,...,TY 3 mo,TY 6 mo,TY 1 yr,TY 2 yr,TY 3 yr,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-05-02,,1361.219971,,0.378842,1.897313e+09,,,,,1.38,...,,,,,,,,,,
2011-05-03,-0.003385,1356.619995,0.091476,0.383411,1.859731e+09,0.004569,-3.758192e+07,-0.048091,0.558486,1.84,...,-0.02,-0.01,-0.02,0.00,0.00,0.00,-0.02,-0.03,-0.03,-0.02
2011-05-04,-0.006879,1347.319946,0.277760,0.392122,1.717764e+09,0.008711,-1.419668e+08,-0.223435,0.147089,2.08,...,0.00,-0.02,-0.01,-0.01,-0.01,-0.01,-0.03,-0.03,-0.03,-0.03
2011-05-05,-0.009111,1335.099976,0.216782,0.405457,1.361864e+09,0.013335,-3.558996e+08,0.328796,0.189184,1.94,...,-0.01,0.00,0.01,-0.02,-0.03,-0.07,-0.07,-0.07,-0.08,-0.07
2011-05-06,0.003813,1340.199951,-0.018292,0.418649,1.490329e+09,0.013192,1.284643e+08,0.001049,-0.154289,1.85,...,0.00,0.00,-0.02,-0.01,-0.01,-0.01,0.00,0.01,0.03,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-04-07,-0.001604,2659.410000,0.068661,0.482110,1.533925e+09,0.035412,-2.123946e+07,-0.077897,-0.342821,1.12,...,-0.01,0.03,0.00,0.01,0.01,0.04,0.06,0.08,0.05,0.05
2020-04-08,0.033489,2749.980000,-0.269941,0.501114,3.331833e+09,0.019004,1.797908e+09,-0.040942,-0.157657,1.39,...,0.08,0.04,0.03,-0.01,-0.02,-0.01,0.01,0.02,0.05,0.05
2020-04-09,0.014383,2789.820000,0.213024,0.488574,2.608413e+09,-0.012540,-7.234200e+08,0.116959,0.300127,1.27,...,0.03,0.00,0.02,-0.04,-0.05,-0.06,-0.05,-0.04,-0.03,-0.02
2020-04-13,-0.010156,2761.630000,-0.504979,0.493960,3.422978e+09,0.005385,8.145649e+08,-0.342828,-0.332630,1.30,...,0.01,0.03,0.02,0.02,0.02,0.03,0.03,0.03,0.04,0.04


In [730]:
def label(df, k, u=0.8, d=0.7):
    """Attach a 'Signal' column (0 = sell, 1 = hold, 2 = buy) to ``df`` in place.

    For every row, the next ``k`` values of 'SPX Price' are scanned in order.
    The first one that sits at least ``u`` percent above the row's own price
    yields a buy; the first one at least ``d`` percent below yields a sell.
    If neither threshold is reached (or no later rows exist) the row is a hold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'SPX Price' column; gains a 'Signal' column.
    k : int
        Number of future rows to look at.
    u, d : float
        Upward / downward percentage thresholds.
    """
    close = df['SPX Price']
    n_rows = len(df)
    signals = []

    for start in range(n_rows):
        anchor = close.iloc[start]
        horizon_end = min(start + k + 1, n_rows)

        signal = 1  # hold unless a threshold is crossed below
        for ahead in range(start + 1, horizon_end):
            move = 100 * (close.iloc[ahead] - anchor) / anchor
            if move >= u:
                signal = 2  # buy
                break
            if move <= -d:
                signal = 0  # sell
                break

        signals.append(signal)

    df['Signal'] = signals

# label() writes the 'Signal' column into df_main in place.
label(df_main, k=LOOK_AHEAD)

# The raw price was only needed for labelling. Rebind instead of dropping
# in place: an in-place drop on a frame derived from a column selection
# raises SettingWithCopyWarning.
df_main = df_main.drop(columns='SPX Price')
print(df_main['Signal'].value_counts())

# Most recent 200 rows, kept aside (as an independent copy) for the
# prediction walkthrough further down.
df_to_predict = df_main.tail(200).copy()
df_main.tail(30)

1    1572
2     351
0     329
Name: Signal, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Signal'] = labels
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0_level_0,SPX,SPY Volume,DIX,GEX,DIX Diff,GEX Diff,SPX Options Volume,VIX Options Volume,SPX P/C Ratio,VIX P/C Ratio,...,TY 6 mo,TY 1 yr,TY 2 yr,TY 3 yr,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-03,-0.02852,0.229023,0.456905,-1094765000.0,0.008409,-962674500.0,-0.370556,0.434617,1.29,0.39,...,-0.12,-0.16,-0.13,-0.13,-0.11,-0.1,-0.08,-0.02,-0.02,2
2020-03-04,0.041336,-0.530283,0.4203,1292494000.0,-0.036605,2387259000.0,0.274711,-0.994002,1.33,0.75,...,-0.15,-0.14,-0.04,-0.04,-0.02,-0.01,0.0,0.01,0.03,0
2020-03-05,-0.034464,0.053754,0.442533,-992248100.0,0.022233,-2284742000.0,-0.317448,0.995923,1.9,0.39,...,-0.15,-0.11,-0.08,-0.07,-0.08,-0.09,-0.1,-0.11,-0.11,0
2020-03-06,-0.017237,0.204551,0.452162,-1026985000.0,0.009629,-34736770.0,0.47391,0.306864,2.29,0.88,...,-0.12,-0.09,-0.1,-0.08,-0.09,-0.12,-0.18,-0.25,-0.31,0
2020-03-09,-0.079035,0.302423,0.416465,-1664364000.0,-0.035697,-637379500.0,-0.035661,-0.028578,2.33,2.31,...,-0.14,-0.08,-0.11,-0.13,-0.12,-0.13,-0.2,-0.22,-0.26,2
2020-03-10,0.048233,-0.112682,0.41825,-1000251000.0,0.001785,664113200.0,-0.120284,-0.572283,1.9,1.16,...,0.16,0.12,0.12,0.18,0.17,0.17,0.22,0.29,0.29,0
2020-03-11,-0.050106,-0.079505,0.439415,-1628877000.0,0.021165,-628625500.0,0.043463,0.341208,2.27,1.1,...,-0.03,-0.03,0.0,0.0,0.03,0.05,0.06,-0.03,0.02,0
2020-03-12,-0.099945,0.429322,0.377596,-2170934000.0,-0.061818,-542057500.0,0.337811,-0.04862,1.56,1.17,...,-0.03,-0.01,0.0,0.0,0.0,0.04,0.06,0.14,0.19,2
2020-03-13,0.088779,-0.174048,0.390936,-1540240000.0,0.013339,630693800.0,0.054848,0.182579,1.58,1.04,...,0.01,-0.01,-0.01,0.0,0.04,0.07,0.06,0.04,0.07,0
2020-03-16,-0.127623,-0.103237,0.372537,-1758235000.0,-0.018399,-217994700.0,-0.522832,-0.414884,1.75,0.82,...,-0.09,-0.09,-0.13,-0.15,-0.21,-0.22,-0.21,-0.21,-0.22,2


In [731]:
# Drop the last LOOK_AHEAD rows (label() had no complete future window for
# them) and any row that still has a missing value.
# NOTE: df_main is rebound from itself, so re-running this cell trims
# another LOOK_AHEAD rows each time.
df_main = df_main.iloc[:-LOOK_AHEAD].dropna()

In [732]:
# Chronological split: the first TRAIN_RATIO share of rows trains the model,
# the remainder is used for validation.
n = len(df_main)
train_index = int(n * TRAIN_RATIO)

df_train = df_main.iloc[:train_index].dropna()
df_test = df_main.iloc[train_index:].dropna()

y_train = df_train['Signal']
y_test = df_test['Signal']

# Scaling parameters come from the training rows only and are reused for
# the test rows.
scaler = MinMaxScaler().fit(df_train)
train = scaler.transform(df_train)
test = scaler.transform(df_test)

# The scaler rescaled the trailing 'Signal' column as well; restore the raw
# class ids so they can be used as labels.
train[:, -1] = y_train
test[:, -1] = y_test

def preprocess_data(array, sequence_length=None):
    """Build shuffled, class-balanced (window, label) samples from a 2-D array.

    Every row of ``array`` is one time step: all elements but the last are
    features, the last element is the class label (2 = buy, 1 = hold, anything
    else is treated as sell). A sliding window of ``sequence_length``
    consecutive rows forms one sample, labelled with the label of the window's
    final row.

    Classes are balanced by truncating each one to the size of the smallest
    class after a random shuffle, so the output depends on the state of the
    ``random`` module.

    Parameters
    ----------
    array : iterable of 1-D sequences
        Rows in chronological order, label in the last position.
    sequence_length : int, optional
        Rows per window. Defaults to the notebook-level ``SEQUENCE_LENGTH``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` stacks the windows (samples x sequence_length x features) and
        ``y`` holds the matching labels. Both are empty when any class has no
        samples.
    """
    if sequence_length is None:
        # Fall back to the global setting so existing calls keep working.
        sequence_length = SEQUENCE_LENGTH

    sequences = []
    window = deque(maxlen=sequence_length)

    for row in array:
        window.append(list(row[:-1]))
        # The deque discards its oldest row automatically once it is full.
        if len(window) == sequence_length:
            sequences.append([np.array(window), row[-1]])

    # Shuffle before truncating so the rows that survive balancing are a
    # random subset rather than the chronologically earliest ones.
    random.shuffle(sequences)

    buys, holds, sells = [], [], []
    for seq, signal in sequences:
        if signal == 2:
            buys.append([seq, signal])
        elif signal == 1:
            holds.append([seq, signal])
        else:
            sells.append([seq, signal])

    smallest = min(len(buys), len(holds), len(sells))
    balanced = buys[:smallest] + holds[:smallest] + sells[:smallest]

    # Second shuffle mixes the three classes together.
    random.shuffle(balanced)

    X = [seq for seq, _ in balanced]
    y = [signal for _, signal in balanced]

    return np.array(X), np.array(y)

# Turn the scaled arrays into balanced (window, label) samples.
# y_train / y_test are rebound here from label Series to per-sample arrays.
X_train, y_train = preprocess_data(train)
X_test, y_test = preprocess_data(test)

# One-hot encode the class ids for the softmax / categorical cross-entropy head.
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

In [733]:
# Sanity check: first training window and its one-hot label.
print(X_train[0])
print(y_train[0])

[[0.61442064 0.4619456  0.33235708 0.45233495 0.28878616 0.59257632
  0.36094239 0.33272898 0.23310811 0.07615894 0.49681946 0.43333333
  0.40909091 0.44444444 0.57894737 0.55555556 0.53125    0.53846154
  0.54545455 0.56097561 0.53191489 0.54716981]
 [0.51987783 0.59959532 0.41046668 0.40131818 0.57212255 0.52261701
  0.55326433 0.66562306 0.18243243 0.02649007 0.52686423 0.46666667
  0.36363636 0.44444444 0.47368421 0.51851852 0.5625     0.51282051
  0.45454545 0.43902439 0.38297872 0.41509434]
 [0.64678898 0.63672926 0.37569969 0.415433   0.44338883 0.5860167
  0.44199091 0.50429888 0.15878378 0.07615894 0.58061172 0.43333333
  0.36363636 0.44444444 0.52631579 0.44444444 0.40625    0.43589744
  0.45454545 0.51219512 0.42553191 0.50943396]
 [0.58627912 0.50886485 0.42239879 0.3642772  0.53629944 0.52248168
  0.70086889 0.61883577 0.20608108 0.10927152 0.50297619 0.41666667
  0.36363636 0.44444444 0.52631579 0.48148148 0.40625    0.46153846
  0.45454545 0.46341463 0.55319149 0.5660377

In [734]:
# Number of samples left in each split after class balancing.
print(len(X_train))
print(len(X_test))

684
270


In [735]:
# Two stacked LSTM layers followed by a small dense head that outputs a
# 3-way softmax (one probability per signal class).
model = Sequential()

# input_shape is (sequence_length, n_features), taken from the training tensor.
model.add(LSTM(96, return_sequences=True, input_shape=(X_train.shape[1:]), activation='tanh'))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# return_sequences=False: only the final time step feeds the dense layers.
model.add(LSTM(64, return_sequences=False, activation='tanh'))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))

#model.add(Dense(16, activation='relu'))
#model.add(Dropout(0.1))

model.add(Dense(8, activation='relu'))
#model.add(Dropout(0.1))

model.add(Dense(3, activation='softmax'))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is ignored or rejected by newer Keras optimizers.
# NOTE(review): `decay` is likewise only accepted by the legacy optimizers in
# recent TensorFlow releases - confirm against the installed version.
opt = tf.keras.optimizers.Adam(learning_rate=0.0005, decay=1e-6)

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir=f'logs/{MODEL_NAME}')

#filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
#checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor="val_acc", verbose=1, save_best_only=True, mode='max')) # saves only the best ones

# The held-out split doubles as the validation set during training.
history = model.fit(X_train,
                    y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(X_test, y_test),
                    callbacks=[tensorboard])

#model.save(f'{MODEL_NAME}.model')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [736]:
# Layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_47"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_84 (LSTM)               (None, 5, 96)             45696     
_________________________________________________________________
dropout_168 (Dropout)        (None, 5, 96)             0         
_________________________________________________________________
batch_normalization_84 (Batc (None, 5, 96)             384       
_________________________________________________________________
lstm_85 (LSTM)               (None, 64)                41216     
_________________________________________________________________
dropout_169 (Dropout)        (None, 64)                0         
_________________________________________________________________
batch_normalization_85 (Batc (None, 64)                256       
_________________________________________________________________
dense_134 (Dense)            (None, 32)              

In [737]:
# Labels of the held-back rows, kept for comparison with the predictions.
actual_signals = df_to_predict['Signal']

# Reuse the scaler fitted on the training split, then restore the unscaled
# labels in the last column (the scaler rescaled that column too).
to_predict = scaler.transform(df_to_predict)
to_predict[:,-1] = actual_signals

def get_sequences(array, sequence_length=None):
    """Slice a 2-D array into overlapping feature windows, in original order.

    The last element of every row is dropped (it is the label column); the
    remaining elements of ``sequence_length`` consecutive rows form one
    window. Unlike ``preprocess_data`` nothing is shuffled or balanced, so
    window ``i`` ends at input row ``i + sequence_length - 1``.

    Parameters
    ----------
    array : iterable of 1-D sequences
        Rows in chronological order, label in the last position.
    sequence_length : int, optional
        Rows per window. Defaults to the notebook-level ``SEQUENCE_LENGTH``.

    Returns
    -------
    numpy.ndarray
        Stacked windows (n_windows x sequence_length x features); empty when
        the input has fewer than ``sequence_length`` rows.
    """
    if sequence_length is None:
        # Fall back to the global setting so existing calls keep working.
        sequence_length = SEQUENCE_LENGTH

    window = deque(maxlen=sequence_length)
    sequences = []

    for row in array:
        window.append(list(row[:-1]))
        # Once full, the deque drops its oldest row on every append.
        if len(window) == sequence_length:
            sequences.append(np.array(window))

    return np.array(sequences)


In [738]:
X_to_predict = get_sequences(to_predict)

# One softmax vector per window; the predicted class is its arg-max.
y_pred_arr = model.predict(X_to_predict)
y_pred = [np.argmax(probabilities) for probabilities in y_pred_arr]

# The first SEQUENCE_LENGTH-1 rows never complete a window, so they have no
# prediction; pad so the list lines up with the frame's rows.
y_pred_adj = ['N/A'] * (SEQUENCE_LENGTH - 1) + y_pred
df_to_predict['Predicted Signal'] = y_pred_adj

# The last LOOK_AHEAD rows were labelled without a complete look-ahead
# window, so their labels are masked out.
df_to_predict['Signal'] = list(actual_signals[:-LOOK_AHEAD]) + ['N/A'] * LOOK_AHEAD

In [739]:
# Distribution of predicted classes ('N/A' = rows without a full input window).
df_to_predict['Predicted Signal'].value_counts()

1      87
0      86
2      23
N/A     4
Name: Predicted Signal, dtype: int64

In [740]:
# Features with actual and predicted signals side by side.
df_to_predict

Unnamed: 0_level_0,SPX,SPY Volume,DIX,GEX,DIX Diff,GEX Diff,SPX Options Volume,VIX Options Volume,SPX P/C Ratio,VIX P/C Ratio,...,TY 1 yr,TY 2 yr,TY 3 yr,TY 5 yr,TY 7 yr,TY 10 yr,TY 20 yr,TY 30 yr,Signal,Predicted Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-28,0.005741,0.385747,0.397685,2150116000.0,-0.006306,-85555500.0,0.422329,-0.331823,1.36,0.56,...,-0.01,0.01,0.0,0.0,-0.01,-0.01,0.0,0.0,1.0,
2019-07-01,0.007643,0.28734,0.404681,3009288000.0,0.006996,859171900.0,0.052289,0.637216,1.52,0.33,...,0.02,0.03,0.03,0.03,0.03,0.03,0.03,0.03,1.0,
2019-07-02,0.002924,-0.251697,0.413374,4032391000.0,0.008693,1023103000.0,-0.366312,0.281303,1.53,0.24,...,-0.03,-0.01,-0.03,-0.04,-0.05,-0.05,-0.05,-0.04,1.0,
2019-07-03,0.007643,-0.408007,0.369264,6001563000.0,-0.044111,1969172000.0,-0.000849,-0.72583,1.09,0.42,...,0.0,0.0,0.0,-0.01,-0.02,-0.02,-0.04,-0.04,1.0,
2019-07-05,-0.001807,0.233915,0.429075,3605487000.0,0.059811,-2396076000.0,0.276091,0.152478,1.64,0.53,...,0.07,0.1,0.11,0.1,0.1,0.08,0.09,0.07,1.0,1.0
2019-07-08,-0.004847,-0.119822,0.3943,3363917000.0,-0.034775,-241570200.0,-0.02353,0.13075,1.62,0.53,...,0.01,0.01,0.02,0.02,0.01,0.01,-0.02,-0.01,1.0,1.0
2019-07-09,0.001236,-0.109157,0.395008,5031135000.0,0.000709,1667218000.0,-0.570474,-0.095946,2.14,0.61,...,0.01,0.04,0.04,0.02,0.02,0.02,0.02,0.01,1.0,1.0
2019-07-10,0.0045,0.352106,0.38688,6799857000.0,-0.008129,1768722000.0,0.463186,0.394534,1.72,1.19,...,-0.07,-0.1,-0.09,-0.06,-0.03,0.0,0.02,0.03,1.0,1.0
2019-07-11,0.002283,-0.139736,0.388058,6114535000.0,0.001179,-685321700.0,-0.198826,-0.198066,1.82,0.49,...,0.04,0.03,0.05,0.06,0.06,0.06,0.06,0.08,1.0,1.0
2019-07-12,0.004609,-0.231414,0.395606,6592506000.0,0.007548,477971300.0,0.239309,0.009036,1.87,0.34,...,-0.01,-0.01,-0.03,-0.02,-0.01,-0.01,0.0,-0.01,1.0,1.0


In [741]:
def is_correct(predict, actual):
    """Score one prediction against its actual signal.

    Returns 'N/A' when either value is the 'N/A' placeholder, otherwise 1 for
    a match and 0 for a mismatch.
    """
    if predict == 'N/A':
        return 'N/A'
    if actual == 'N/A':
        return 'N/A'

    matched = predict == actual
    return 1 if matched else 0

# Score every row. Arguments follow is_correct's declared order
# (predicted first, actual second).
df_to_predict['Correct?'] = list(map(is_correct,
                                     df_to_predict['Predicted Signal'],
                                     df_to_predict['Signal']))

# Rows scored 'N/A' (no prediction or no valid label) are excluded.
correct = len(df_to_predict[df_to_predict['Correct?'] == 1])
total = len(df_to_predict[df_to_predict['Correct?'] != 'N/A'])
# Avoid ZeroDivisionError when no row could be scored.
perc = 100*correct/total if total else float('nan')

print(f'Accuracy over {total} trading days:')
print()
print(f'{correct}/{total}')
print(f'{perc:.2f}%')

Accuracy over 195 trading days:

104/195
53.33%
