# NOTES
Right now only takes close price. Update to include all data in the sequence.

After 3 epochs it's loss is basically non-existant. Use an early stop

Learn how to use GPU 

drop_last=True seems necessary for the data loader. The batch_size argument is expecting batches to be that size. But if the whole dataset can't be divided by the batch_size then you had some values remaining that don't fit. Could set batch_size to be some multiple if possible to minimize dropped samples 

It seems to just be predicting the same thing
    try increasing batch size
    overfit on 1-2 samples first

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torchmetrics

In [59]:
DATA_PATH = r'C:\Users\connor\PycharmProjects\trading\data\analytics\analytics_voo.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)

In [60]:
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'join_date', 'Id',
       'Date', 'DateLongDescription', 'DateShortDescription', 'DayLongName',
       'DayShortName', 'MonthLongName', 'MonthShortName', 'CalendarDay',
       'CalendarWeek', 'CalendarWeekStartDateId', 'CalendarWeekEndDateId',
       'CalendarDayInWeek', 'CalendarMonth', 'CalendarMonthStartDateId',
       'CalendarMonthEndDateId', 'CalendarNumberOfDaysInMonth',
       'CalendarDayInMonth', 'CalendarQuarter', 'CalendarQuarterStartDateId',
       'CalendarQuarterEndDateId', 'CalendarQuarterStartDate',
       'CalendarNumberOfDaysInQuarter', 'CalendarDayInQuarter', 'CalendarYear',
       'CalendarYearEndDateId', 'CalendarYearStartDate',
       'CalendarNumberOfDaysInYear', 'month_join_key', 'year_join_key',
       'seven_day_ema', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR', 'GDPC1',
       'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal', 'macd_hist',
       'daily_obv', 'target'],
      dtype='object')

In [61]:
non_target_columns = ['open', 'high', 'low', 'close', 'volume', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR', 'GDPC1', 'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal', 'macd_hist', 'daily_obv', 'seven_day_ema', 'close', 'target']

df_cols = df[non_target_columns]

# version with just close price and date
df_target_train = df[['date', 'target']].where(df.date <= '2020-01-01')
df_target_train.dropna(inplace=True)
df_target_test = df[['date', 'target']].where(df.date > '2020-01-01')
df_target_test.dropna(inplace=True)

# version with all columns
df_target_train_v2 = df[non_target_columns].where(df.date <= '2020-01-01')
df_target_train_v2.dropna(inplace=True)
df_target_test_v2 = df[non_target_columns].where(df.date > '2020-01-01')
df_target_test_v2.dropna(inplace=True)

In [62]:
print(df_target_train.columns)
print()
print(df_target_train_v2.columns)

Index(['date', 'target'], dtype='object')

Index(['open', 'high', 'low', 'close', 'volume', 'CPALTT01USM657N', 'DFF',
       'EXPINF10YR', 'GDPC1', 'RSXFS', 'T10YFF', 'UNRATE', 'macd',
       'macd_signal', 'macd_hist', 'daily_obv', 'seven_day_ema', 'close',
       'target'],
      dtype='object')


In [63]:
def create_sequences(df, seq_length, num_rows=len(df), include_all_features=False):
    """
    Input:
    df: pandas dataframe with at least 2 columns, a date column and a target column
    seq_length: How long the sequence should be. 
    num_rows: How many rows to use. Will pull all rows unless num_rows is provided, in which case the top num_rows will be pulled.
    include_all_features: Whether to use just the target column for the sequence, or to include all features.
        
    Output:
    Generates a sequence of seq_length length. It's shape is [num_rows, seq_length] if include_all_features is False, else [num_rows, seq_length * len(df.columns)]. 
    A sequence refers to how many should be appended as columns. 
    For example, if df = [['2020-01-01',100], ['2020-01-02', 101], ['2020-01-03', 102]] and seq_length = 2, the generated sequence will be [[101,102], [102,103]]. 
    """
    
    
    xs, ys = [], []
    
    if include_all_features == False:
        for itr in range(num_rows - seq_length):
            x = df.iloc[itr:(itr + seq_length), -1]
            y = df.iloc[itr + seq_length, -1]
            xs.append(x)
            ys.append(y)
    elif include_all_features == True:
        for itr in range(num_rows - seq_length):
            x = df.iloc[itr:(itr + seq_length), :]
            y = df.iloc[itr + seq_length, -1]
            xs.append(x)
            ys.append(y)

    else:
        print(f"error: include_all_features accepts True or False, got {include_all_features} instead.")
    return np.array(xs), np.array(ys)

In [64]:
# create training set
sequence_len = 10
num_rows = 100000
X_train, y_train = create_sequences(df_target_train, sequence_len, num_rows)
X_test, y_test = create_sequences(df_target_test, sequence_len, num_rows)

X_train_v2, y_train_v2 = create_sequences(df_target_train_v2, sequence_len, num_rows, include_all_features=True)
X_test_v2, y_test_v2 = create_sequences(df_target_test_v2, sequence_len, num_rows, include_all_features=True)

In [65]:
df[non_target_columns].head(n=100)

Unnamed: 0,open,high,low,close,volume,CPALTT01USM657N,DFF,EXPINF10YR,GDPC1,RSXFS,T10YFF,UNRATE,macd,macd_signal,macd_hist,daily_obv,seven_day_ema,close.1,target
0,83.935,84.042,83.820,83.967,232,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.967,0
1,83.888,83.995,83.743,83.889,150,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.889,1
2,83.888,83.995,83.774,83.920,1600,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.920,1
3,83.904,84.011,83.776,83.923,100,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.923,1
4,83.966,84.073,83.851,83.998,698,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.998,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,83.577,83.684,83.463,83.609,50,0.12452,0.19,1.574237,16960.864,323990.0,2.56,9.4,1.1110,1.1834,-0.0724,-165200.0,81.7122,83.609,1
96,83.608,83.715,83.494,83.640,93,0.12452,0.19,1.574237,16960.864,323990.0,2.56,9.4,1.1110,1.1834,-0.0724,-165200.0,81.7122,83.640,0
97,83.593,83.700,83.479,83.625,100,0.12452,0.19,1.574237,16960.864,323990.0,2.56,9.4,1.1110,1.1834,-0.0724,-165200.0,81.7122,83.625,1
98,83.733,83.840,83.618,83.765,100,0.12452,0.19,1.574237,16960.864,323990.0,2.56,9.4,1.1110,1.1834,-0.0724,-165200.0,81.7122,83.765,1


In [66]:
X_train[0]

array([0., 1., 1., 1., 1., 1., 0., 1., 1., 0.])

In [67]:
y_train[0]

1.0

In [68]:
X_train_v2[0]

array([[ 8.39350000e+01,  8.40420000e+01,  8.38200000e+01,
         8.39670000e+01,  2.32000000e+02,  1.24519889e-01,
         1.90000000e-01,  1.57423670e+00,  1.69608640e+04,
         3.23990000e+05,  2.48000000e+00,  9.40000000e+00,
         1.14270000e+00,  1.19470000e+00, -5.21000000e-02,
        -1.48100000e+05,  8.16086000e+01,  8.39670000e+01,
         0.00000000e+00],
       [ 8.38880000e+01,  8.39950000e+01,  8.37430000e+01,
         8.38890000e+01,  1.50000000e+02,  1.24519889e-01,
         1.90000000e-01,  1.57423670e+00,  1.69608640e+04,
         3.23990000e+05,  2.48000000e+00,  9.40000000e+00,
         1.14270000e+00,  1.19470000e+00, -5.21000000e-02,
        -1.48100000e+05,  8.16086000e+01,  8.38890000e+01,
         1.00000000e+00],
       [ 8.38880000e+01,  8.39950000e+01,  8.37740000e+01,
         8.39200000e+01,  1.60000000e+03,  1.24519889e-01,
         1.90000000e-01,  1.57423670e+00,  1.69608640e+04,
         3.23990000e+05,  2.48000000e+00,  9.40000000e+00,
    

In [69]:
y_train_v2[0]

1.0

In [70]:
print(len(X_train), len(X_train_v2))
print(len(y_train), len(y_train_v2))

99990 99990
99990 99990


# using all fields available

In [71]:
# convert df into objects Torch can read
torch_X_train  = torch.from_numpy(X_train_v2).float()
torch_y_train = torch.from_numpy(y_train_v2).float()
torch_X_test = torch.from_numpy(X_test_v2).float()
torch_y_test = torch.from_numpy(y_test_v2).float()

# normalize data
torch_X_train = torch.nn.functional.normalize(torch_X_train)
torch_X_test = torch.nn.functional.normalize(torch_X_test)

print(torch_X_train.shape, torch_y_train.shape)
# create test and train sets
train_data_set = TensorDataset(torch_X_train, torch_y_train)
test_data_set = TensorDataset(torch_X_test, torch_y_test)

# confirm it works
sample = train_data_set[0]
input_sample, label_sample = sample
print('input sample:', input_sample)
print('label sample:', label_sample)

torch.Size([99990, 10, 19]) torch.Size([99990])
input sample: tensor([[ 0.3158,  0.3158,  0.3158,  0.3158,  0.1254,  0.3162,  0.3162,  0.3162,
          0.3162,  0.3162,  0.3162,  0.3162,  0.3162,  0.3162, -0.3162, -0.3162,
          0.3162,  0.3158,  0.0000],
        [ 0.3156,  0.3156,  0.3155,  0.3155,  0.0811,  0.3162,  0.3162,  0.3162,
          0.3162,  0.3162,  0.3162,  0.3162,  0.3162,  0.3162, -0.3162, -0.3162,
          0.3162,  0.3155,  0.3780],
        [ 0.3156,  0.3156,  0.3156,  0.3156,  0.8650,  0.3162,  0.3162,  0.3162,
          0.3162,  0.3162,  0.3162,  0.3162,  0.3162,  0.3162, -0.3162, -0.3162,
          0.3162,  0.3156,  0.3780],
        [ 0.3157,  0.3157,  0.3156,  0.3156,  0.0541,  0.3162,  0.3162,  0.3162,
          0.3162,  0.3162,  0.3162,  0.3162,  0.3162,  0.3162, -0.3162, -0.3162,
          0.3162,  0.3156,  0.3780],
        [ 0.3159,  0.3159,  0.3159,  0.3159,  0.3774,  0.3162,  0.3162,  0.3162,
          0.3162,  0.3162,  0.3162,  0.3162,  0.3162,  0.3162

# Using just the close price

In [72]:
# # convert df into objects Torch can read
# torch_X_train  = torch.from_numpy(X_train).float()
# torch_y_train = torch.from_numpy(y_train).float()
# torch_X_test = torch.from_numpy(X_test).float()
# torch_y_test = torch.from_numpy(y_test).float()
# 
# # create test and train sets
# train_data_set = TensorDataset(torch_X_train, torch_y_train)
# test_data_set = TensorDataset(torch_X_test, torch_y_test)
# 
# # confirm it works
# sample = train_data_set[0]
# input_sample, label_sample = sample
# print('input sample:', input_sample)
# print('label sample:', label_sample)

In [73]:
print(input_sample.shape)

torch.Size([10, 19])


In [74]:
num_features = len(df_target_train_v2.columns)
batch_size = 1000
shuffle = True
hidden_size = round(num_features / 2)
num_layers = 2
dropout = .5

# create dataloader
train_dataloader = DataLoader(train_data_set, batch_size=batch_size, shuffle=shuffle, drop_last=True)
test_dataloader = DataLoader(test_data_set, batch_size=batch_size, shuffle=shuffle, drop_last=True)
# test loader
# x, y = next(iter(train_dataloader))
# 
# print('x', x, 'y', y)

In [75]:
# LTSM
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__() #super makes all the methods available in nn.Module available for the new class Net
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid() # we want a binary output, not %
        
    def forward(self, x):
        h0 = torch.zeros(num_layers, x.size(0), hidden_size)
        c0 = torch.zeros(num_layers, x.size(0), hidden_size)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out

In [76]:
learning_rate = 0.0001
num_epochs = 50

net = Net(input_size=num_features)
criterion = nn.BCELoss(reduction='sum') # for binary prediction. Using the 'target' column
#criterion = nn.MSELoss() # for regression, predicting the 'close' column
optimizer = optim.Adam(
    net.parameters(), lr=learning_rate
)

for epoch in range(num_epochs):
    for seqs, labels in train_dataloader:
        seqs = seqs.view(batch_size, sequence_len, num_features)
        outputs = net(seqs).squeeze()
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    # early break if loss isn't changing beyond the learning rate
    if loss.item() < .1:
        break

Epoch 1, Loss: 7.023953914642334
Epoch 2, Loss: 6.279009819030762
Epoch 3, Loss: 6.533650875091553
Epoch 4, Loss: 6.498201847076416
Epoch 5, Loss: 6.224382400512695
Epoch 6, Loss: 7.261401653289795
Epoch 7, Loss: 6.965570449829102
Epoch 8, Loss: 6.7383246421813965
Epoch 9, Loss: 6.796425819396973
Epoch 10, Loss: 7.364734649658203
Epoch 11, Loss: 6.877278804779053
Epoch 12, Loss: 7.344733715057373
Epoch 13, Loss: 7.001951694488525
Epoch 14, Loss: 6.760310649871826
Epoch 15, Loss: 6.970333576202393
Epoch 16, Loss: 6.745433807373047
Epoch 17, Loss: 6.983258247375488
Epoch 18, Loss: 6.911835193634033
Epoch 19, Loss: 6.848789215087891
Epoch 20, Loss: 6.456284999847412
Epoch 21, Loss: 7.201022148132324
Epoch 22, Loss: 6.357461452484131
Epoch 23, Loss: 7.601963043212891
Epoch 24, Loss: 6.740070343017578
Epoch 25, Loss: 7.156708717346191
Epoch 26, Loss: 6.92434024810791
Epoch 27, Loss: 6.947149753570557
Epoch 28, Loss: 7.560534477233887
Epoch 29, Loss: 6.3907060623168945
Epoch 30, Loss: 7.3618

In [77]:
# # Define MSE metric
# mse = torchmetrics.regression.MeanSquaredError()
# 
# net.eval()
# with torch.no_grad():
#     for seqs, labels in test_dataloader:
#         seqs = seqs.view(batch_size, torch_X_test.shape[1], num_features)
#         # Pass seqs to net and squeeze the result
#         outputs = net(seqs).squeeze()
#         mse(outputs, labels)
# 
# # Compute final metric value
# test_mse = mse.compute()
# print(f"Test MSE: {test_mse}")
# print(f"Test RMSE: {test_mse**.5}")

In [78]:
# used for binary classifcation
f1 = torchmetrics.F1Score(num_classes=1, task='BINARY')

net.eval()
with torch.no_grad():
    for seqs, labels in test_dataloader:
        seqs = seqs.view(batch_size, sequence_len, num_features)
        outputs = net(seqs).squeeze()
        f1(torch.round(outputs), labels)

print(f"Test F1 score: {f1.compute()}")

acc = torchmetrics.Accuracy(task="binary")

net.eval()
with torch.no_grad():
    for seqs, labels in test_dataloader:
        seqs = seqs.view(batch_size, sequence_len, num_features)
        outputs = net(seqs).squeeze()
        acc(torch.round(outputs), labels)

print(f"Test accuracy score: {acc.compute()}")



Test F1 score: 0.0
Test accuracy score: 0.5077507495880127


# Results

Baseline guessing 

In [79]:
random_guess_likelihood = df.target.mean()

random_guess_likelihood

0.4687579112426077

At the minute level, 46.875% of the time the next close price is higher than the previous close price for VOO. 
The model is able to correctly predict a candle 50.077% of the time making it not much better than a random (weighted) coinflip. The thing holding the model back is that it doesn't take into account expected payoff. The average positive candel is higher than the average negative candle in absolute terms. 