# NOTES
Right now only takes close price. Update to include all data in the sequence.

After 3 epochs it's loss is basically non-existant. Use an early stop

Learn how to use GPU 

drop_last=True seems necessary for the data loader. The batch_size argument is expecting batches to be that size. But if the whole dataset can't be divided by the batch_size then you had some values remaining that don't fit. Could set batch_size to be some multiple if possible to minimize dropped samples 

In [292]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torchmetrics

In [293]:
DATA_PATH = r'C:\Users\connor\PycharmProjects\trading\data\analytics\analytics_voo.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)

In [294]:
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'join_date', 'Id',
       'Date', 'DateLongDescription', 'DateShortDescription', 'DayLongName',
       'DayShortName', 'MonthLongName', 'MonthShortName', 'CalendarDay',
       'CalendarWeek', 'CalendarWeekStartDateId', 'CalendarWeekEndDateId',
       'CalendarDayInWeek', 'CalendarMonth', 'CalendarMonthStartDateId',
       'CalendarMonthEndDateId', 'CalendarNumberOfDaysInMonth',
       'CalendarDayInMonth', 'CalendarQuarter', 'CalendarQuarterStartDateId',
       'CalendarQuarterEndDateId', 'CalendarQuarterStartDate',
       'CalendarNumberOfDaysInQuarter', 'CalendarDayInQuarter', 'CalendarYear',
       'CalendarYearEndDateId', 'CalendarYearStartDate',
       'CalendarNumberOfDaysInYear', 'month_join_key', 'year_join_key',
       'seven_day_ema', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR', 'GDPC1',
       'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal', 'macd_hist',
       'daily_obv', 'target'],
      dtype='object')

In [295]:
non_target_columns = ['open', 'high', 'low', 'volume', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR', 'GDPC1', 'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal', 'macd_hist', 'daily_obv', 'seven_day_ema', 'close']

df_cols = df[non_target_columns]

# version with just close price and date
df_target_train = df[['date', 'close']].where(df.date <= '2020-01-01')
df_target_train.dropna(inplace=True)
df_target_test = df[['date', 'close']].where(df.date > '2020-01-01')
df_target_test.dropna(inplace=True)

# version with all columns
df_target_train_v2 = df[non_target_columns].where(df.date <= '2020-01-01')
df_target_train_v2.dropna(inplace=True)
df_target_test_v2 = df[non_target_columns].where(df.date > '2020-01-01')
df_target_test_v2.dropna(inplace=True)

In [296]:
print(df_target_train.columns)
print()
print(df_target_train_v2.columns)

Index(['date', 'close'], dtype='object')

Index(['open', 'high', 'low', 'volume', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR',
       'GDPC1', 'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal',
       'macd_hist', 'daily_obv', 'seven_day_ema', 'close'],
      dtype='object')


In [297]:
def create_sequences(df, seq_length, num_rows=len(df), include_all_features=False):
    """
    Input:
    df: pandas dataframe with at least 2 columns, a date column and a target column
    seq_length: How long the sequence should be. 
    num_rows: How many rows to use. Will pull all rows unless num_rows is provided, in which case the top num_rows will be pulled.
    include_all_features: Whether to use just the target column for the sequence, or to include all features.
        
    Output:
    Generates a sequence of seq_length length. It's shape is [num_rows, seq_length] if include_all_features is False, else [num_rows, seq_length * len(df.columns)]. 
    A sequence refers to how many should be appended as columns. 
    For example, if df = [['2020-01-01',100], ['2020-01-02', 101], ['2020-01-03', 102]] and seq_length = 2, the generated sequence will be [[101,102], [102,103]]. 
    """
    
    
    xs, ys = [], []
    
    if include_all_features == False:
        for itr in range(num_rows - seq_length):
            x = df.iloc[itr:(itr + seq_length), -1]
            y = df.iloc[itr + seq_length, -1]
            xs.append(x)
            ys.append(y)
    elif include_all_features == True:
        for itr in range(num_rows - seq_length):
            x = df.iloc[itr:(itr + seq_length), :]
            y = df.iloc[itr + seq_length, -1]
            xs.append(x)
            ys.append(y)

            # when you get more than 1 column, it'll append an array to the array which makes it 3d instead of 2d. Flatten out sample into single row
            #x_reshaped = x.to_numpy().reshape(sequence_len, -1)

    else:
        print(f"error: include_all_features accepts True or False, got {include_all_features} instead.")
    return np.array(xs), np.array(ys)

In [298]:
# create training set
sequence_len = 2
num_rows = 50
X_train, y_train = create_sequences(df_target_train, sequence_len, num_rows)
X_test, y_test = create_sequences(df_target_test, sequence_len, num_rows)

X_train_v2, y_train_v2 = create_sequences(df_target_train_v2, sequence_len, num_rows, include_all_features=True)
X_test_v2, y_test_v2 = create_sequences(df_target_test_v2, sequence_len, num_rows, include_all_features=True)

In [299]:
df[non_target_columns].head(n=100)

Unnamed: 0,open,high,low,volume,CPALTT01USM657N,DFF,EXPINF10YR,GDPC1,RSXFS,T10YFF,UNRATE,macd,macd_signal,macd_hist,daily_obv,seven_day_ema,close
0,83.935,84.042,83.820,232,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.967
1,83.888,83.995,83.743,150,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.889
2,83.888,83.995,83.774,1600,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.920
3,83.904,84.011,83.776,100,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.923
4,83.966,84.073,83.851,698,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,83.577,83.684,83.463,50,0.12452,0.19,1.574237,16960.864,323990.0,2.56,9.4,1.1110,1.1834,-0.0724,-165200.0,81.7122,83.609
96,83.608,83.715,83.494,93,0.12452,0.19,1.574237,16960.864,323990.0,2.56,9.4,1.1110,1.1834,-0.0724,-165200.0,81.7122,83.640
97,83.593,83.700,83.479,100,0.12452,0.19,1.574237,16960.864,323990.0,2.56,9.4,1.1110,1.1834,-0.0724,-165200.0,81.7122,83.625
98,83.733,83.840,83.618,100,0.12452,0.19,1.574237,16960.864,323990.0,2.56,9.4,1.1110,1.1834,-0.0724,-165200.0,81.7122,83.765


In [300]:
X_train[0]

array([83.967, 83.889])

In [301]:
y_train[0]

83.92

In [302]:
X_train_v2[0]

array([[ 8.39350000e+01,  8.40420000e+01,  8.38200000e+01,
         2.32000000e+02,  1.24519889e-01,  1.90000000e-01,
         1.57423670e+00,  1.69608640e+04,  3.23990000e+05,
         2.48000000e+00,  9.40000000e+00,  1.14270000e+00,
         1.19470000e+00, -5.21000000e-02, -1.48100000e+05,
         8.16086000e+01,  8.39670000e+01],
       [ 8.38880000e+01,  8.39950000e+01,  8.37430000e+01,
         1.50000000e+02,  1.24519889e-01,  1.90000000e-01,
         1.57423670e+00,  1.69608640e+04,  3.23990000e+05,
         2.48000000e+00,  9.40000000e+00,  1.14270000e+00,
         1.19470000e+00, -5.21000000e-02, -1.48100000e+05,
         8.16086000e+01,  8.38890000e+01]])

In [303]:
y_train_v2[0]

83.92

In [304]:
print(len(X_train), len(X_train_v2))
print(len(y_train), len(y_train_v2))

48 48
48 48


# using all fields available

In [305]:
# convert df into objects Torch can read
torch_X_train  = torch.from_numpy(X_train_v2).float()
torch_y_train = torch.from_numpy(y_train_v2).float()
torch_X_test = torch.from_numpy(X_test_v2).float()
torch_y_test = torch.from_numpy(y_test_v2).float()

print(torch_X_train.shape, torch_y_train.shape)
# create test and train sets
train_data_set = TensorDataset(torch_X_train, torch_y_train)
test_data_set = TensorDataset(torch_X_test, torch_y_test)

# confirm it works
sample = train_data_set[0]
input_sample, label_sample = sample
print('input sample:', input_sample)
print('label sample:', label_sample)

torch.Size([48, 2, 17]) torch.Size([48])
input sample: tensor([[ 8.3935e+01,  8.4042e+01,  8.3820e+01,  2.3200e+02,  1.2452e-01,
          1.9000e-01,  1.5742e+00,  1.6961e+04,  3.2399e+05,  2.4800e+00,
          9.4000e+00,  1.1427e+00,  1.1947e+00, -5.2100e-02, -1.4810e+05,
          8.1609e+01,  8.3967e+01],
        [ 8.3888e+01,  8.3995e+01,  8.3743e+01,  1.5000e+02,  1.2452e-01,
          1.9000e-01,  1.5742e+00,  1.6961e+04,  3.2399e+05,  2.4800e+00,
          9.4000e+00,  1.1427e+00,  1.1947e+00, -5.2100e-02, -1.4810e+05,
          8.1609e+01,  8.3889e+01]])
label sample: tensor(83.9200)


In [306]:
# Using just the close price

In [307]:
# # convert df into objects Torch can read
# torch_X_train  = torch.from_numpy(X_train).float()
# torch_y_train = torch.from_numpy(y_train).float()
# torch_X_test = torch.from_numpy(X_test).float()
# torch_y_test = torch.from_numpy(y_test).float()
# 
# # create test and train sets
# train_data_set = TensorDataset(torch_X_train, torch_y_train)
# test_data_set = TensorDataset(torch_X_test, torch_y_test)
# 
# # confirm it works
# sample = train_data_set[0]
# input_sample, label_sample = sample
# print('input sample:', input_sample)
# print('label sample:', label_sample)

In [308]:
print(input_sample.shape)

torch.Size([2, 17])


In [309]:
batch_size = 10
shuffle = True
hidden_size = 5
num_layers = 2

# create dataloader
train_dataloader = DataLoader(train_data_set, batch_size=batch_size, shuffle=shuffle, drop_last=True)
test_dataloader = DataLoader(test_data_set, batch_size=batch_size, shuffle=shuffle, drop_last=True)
# test loader
# x, y = next(iter(train_dataloader))
# 
# print('x', x, 'y', y)

In [310]:
# LTSM
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__() #super makes all the methods available in nn.Module available for the new class Net
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)
        
    def forward(self, x):
        h0 = torch.zeros(num_layers, x.size(0), hidden_size)
        c0 = torch.zeros(num_layers, x.size(0), hidden_size)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [311]:
learning_rate = 0.001
num_epochs = 50
num_features = len(df_target_train_v2.columns) # this right now is just the close date. 

net = Net(input_size=num_features)
#criterion = nn.BCELoss(reduction='sum') #you'll need to use the binary close price higher for this
criterion = nn.MSELoss()
optimizer = optim.Adam(
    net.parameters(), lr=learning_rate
)

for epoch in range(num_epochs):
    for seqs, labels in train_dataloader:
        seqs = seqs.view(batch_size, sequence_len, num_features)
        outputs = net(seqs).squeeze()
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    # early break if loss isn't changing beyond the learning rate
    if loss.item() < .1:
        break

Epoch 1, Loss: 7105.06005859375
Epoch 2, Loss: 7107.15234375
Epoch 3, Loss: 7102.54931640625
Epoch 4, Loss: 7097.87255859375
Epoch 5, Loss: 7101.92333984375
Epoch 6, Loss: 7092.74462890625
Epoch 7, Loss: 7105.91552734375
Epoch 8, Loss: 7081.5126953125
Epoch 9, Loss: 7101.16552734375
Epoch 10, Loss: 7087.5234375
Epoch 11, Loss: 7095.5458984375
Epoch 12, Loss: 7087.10302734375
Epoch 13, Loss: 7096.78125
Epoch 14, Loss: 7085.5615234375
Epoch 15, Loss: 7087.6708984375
Epoch 16, Loss: 7079.38134765625
Epoch 17, Loss: 7071.56787109375
Epoch 18, Loss: 7062.3251953125
Epoch 19, Loss: 7075.57958984375
Epoch 20, Loss: 7074.0732421875
Epoch 21, Loss: 7061.5244140625
Epoch 22, Loss: 7069.5419921875
Epoch 23, Loss: 7058.32421875
Epoch 24, Loss: 7056.10791015625
Epoch 25, Loss: 7055.2138671875
Epoch 26, Loss: 7056.71630859375
Epoch 27, Loss: 7046.2333984375
Epoch 28, Loss: 7034.10302734375
Epoch 29, Loss: 7044.6669921875
Epoch 30, Loss: 7043.22021484375
Epoch 31, Loss: 7035.96337890625
Epoch 32, Los

In [312]:
# Define MSE metric
mse = torchmetrics.regression.MeanSquaredError()

net.eval()
with torch.no_grad():
    for seqs, labels in test_dataloader:
        seqs = seqs.view(batch_size, torch_X_test.shape[1], num_features)
        # Pass seqs to net and squeeze the result
        outputs = net(seqs).squeeze()
        mse(outputs, labels)

# Compute final metric value
test_mse = mse.compute()
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_mse**.5}")

Test MSE: 77569.859375
Test RMSE: 278.513671875


In [313]:
## used for binary classifcation
# f1 = torchmetrics.F1Score(num_classes=2)
# 
# net.eval()
# with torch.no_grad():
#     for seqs, labels in test_dataloader:
#         seqs = seqs.view(batch_size, num_features, 1)
#         outputs = net(seqs).squeeze()
#         f1.compute()
#         
# print(f"Test F1 score: {f1.compute()}")

# acc = torchmetrics.Accuracy(task="binary")
# 
# net.eval()
# with torch.no_grad():
#     for seqs, labels in test_dataloader:
#         seqs = seqs.view(batch_size, sequence_len, num_features)
#         outputs = net(seqs).squeeze()
#         acc.compute()
# 
# print(f"Test accuracy score: {acc.compute()}")


# Error when using just the close price:
274 RMSE

# Error when using all fields
277 RMSE

when using 10000 samples: 188