# NOTES
Right now only takes close price. Update to include all data in the sequence.

After 3 epochs it's loss is basically non-existant. Use an early stop

Learn how to use GPU 

drop_last=True seems necessary for the data loader. The batch_size argument is expecting batches to be that size. But if the whole dataset can't be divided by the batch_size then you had some values remaining that don't fit. Could set batch_size to be some multiple if possible to minimize dropped samples 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torchmetrics

In [2]:
DATA_PATH = r'C:\Users\connor\PycharmProjects\trading\data\analytics\analytics_voo.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)

In [3]:
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'join_date', 'Id',
       'Date', 'DateLongDescription', 'DateShortDescription', 'DayLongName',
       'DayShortName', 'MonthLongName', 'MonthShortName', 'CalendarDay',
       'CalendarWeek', 'CalendarWeekStartDateId', 'CalendarWeekEndDateId',
       'CalendarDayInWeek', 'CalendarMonth', 'CalendarMonthStartDateId',
       'CalendarMonthEndDateId', 'CalendarNumberOfDaysInMonth',
       'CalendarDayInMonth', 'CalendarQuarter', 'CalendarQuarterStartDateId',
       'CalendarQuarterEndDateId', 'CalendarQuarterStartDate',
       'CalendarNumberOfDaysInQuarter', 'CalendarDayInQuarter', 'CalendarYear',
       'CalendarYearEndDateId', 'CalendarYearStartDate',
       'CalendarNumberOfDaysInYear', 'month_join_key', 'year_join_key',
       'seven_day_ema', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR', 'GDPC1',
       'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal', 'macd_hist',
       'daily_obv', 'target'],
      dtype='object')

In [4]:
non_target_columns = ['open', 'high', 'low', 'volume', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR', 'GDPC1', 'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal', 'macd_hist', 'daily_obv', 'seven_day_ema', 'close']

df_cols = df[non_target_columns]

# version with just close price and date
df_target_train = df[['date', 'close']].where(df.date <= '2020-01-01')
df_target_train.dropna(inplace=True)
df_target_test = df[['date', 'close']].where(df.date > '2020-01-01')
df_target_test.dropna(inplace=True)

# version with all columns
df_target_train_v2 = df[non_target_columns].where(df.date <= '2020-01-01')
df_target_train_v2.dropna(inplace=True)
df_target_test_v2 = df[non_target_columns].where(df.date > '2020-01-01')
df_target_test_v2.dropna(inplace=True)

In [5]:
print(df_target_train.columns)
print()
print(df_target_train_v2.columns)

Index(['date', 'close'], dtype='object')

Index(['open', 'high', 'low', 'volume', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR',
       'GDPC1', 'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal',
       'macd_hist', 'daily_obv', 'seven_day_ema', 'close'],
      dtype='object')


In [6]:
def create_sequences(df, seq_length, num_rows=len(df), include_all_features=False):
    """
    Input:
    df: pandas dataframe with at least 2 columns, a date column and a target column
    seq_length: How long the sequence should be. 
    num_rows: How many rows to use. Will pull all rows unless num_rows is provided, in which case the top num_rows will be pulled.
    include_all_features: Whether to use just the target column for the sequence, or to include all features.
        
    Output:
    Generates a sequence of seq_length length. It's shape is [num_rows, seq_length] if include_all_features is False, else [num_rows, seq_length * len(df.columns)]. 
    A sequence refers to how many should be appended as columns. 
    For example, if df = [['2020-01-01',100], ['2020-01-02', 101], ['2020-01-03', 102]] and seq_length = 2, the generated sequence will be [[101,102], [102,103]]. 
    """
    
    
    xs, ys = [], []
    if include_all_features == True:
        feature_num = len(df.columns)
        seq_length = seq_length * feature_num
    if include_all_features == False:
        for itr in range(num_rows - seq_length):
            x = df.iloc[itr:(itr + seq_length), -1]
            y = df.iloc[itr + seq_length, -1]
            xs.append(x)
            ys.append(y)
    elif include_all_features == True:
        for itr in range(num_rows - seq_length):
            x = df.iloc[itr:(itr + seq_length), :]
            y = df.iloc[itr + seq_length, -1]
            # when you get more than 1 column, it'll append an array to the array which makes it 3d instead of 2d. Flatten out sample into single row
            x_reshaped = x.to_numpy().reshape(-1, 1)
            xs.append(x_reshaped)
            ys.append(y)
    else:
        print(f"error: include_all_features accepts True or False, got {include_all_features} instead.")
    return np.array(xs), np.array(ys)

In [7]:
# create training set
sequence_len = 5
num_rows = 100
X_train, y_train = create_sequences(df_target_train, sequence_len, num_rows)
X_test, y_test = create_sequences(df_target_test, sequence_len, num_rows)

In [8]:
print('x train', X_train)
print('x test', X_test)

x train [[83.967 83.889 83.92  83.923 83.998]
 [83.889 83.92  83.923 83.998 84.2  ]
 [83.92  83.923 83.998 84.2   84.309]
 [83.923 83.998 84.2   84.309 84.169]
 [83.998 84.2   84.309 84.169 84.184]
 [84.2   84.309 84.169 84.184 84.278]
 [84.309 84.169 84.184 84.278 84.169]
 [84.169 84.184 84.278 84.169 84.241]
 [84.184 84.278 84.169 84.241 84.325]
 [84.278 84.169 84.241 84.325 84.301]
 [84.169 84.241 84.325 84.301 84.278]
 [84.241 84.325 84.301 84.278 84.278]
 [84.325 84.301 84.278 84.278 84.356]
 [84.301 84.278 84.278 84.356 84.29 ]
 [84.278 84.278 84.356 84.29  84.325]
 [84.278 84.356 84.29  84.325 84.363]
 [84.356 84.29  84.325 84.363 84.371]
 [84.29  84.325 84.363 84.371 84.309]
 [84.325 84.363 84.371 84.309 84.257]
 [84.363 84.371 84.309 84.257 84.293]
 [84.371 84.309 84.257 84.293 84.387]
 [84.309 84.257 84.293 84.387 84.387]
 [84.257 84.293 84.387 84.387 84.122]
 [84.293 84.387 84.387 84.122 84.107]
 [84.387 84.387 84.122 84.107 84.153]
 [84.387 84.122 84.107 84.153 84.138]
 [84

In [9]:
# create training set
sequence_len = 5
num_rows = 100
X_train_v2, y_train_v2 = create_sequences(df_target_train_v2, sequence_len, num_rows, include_all_features=True)
X_test_v2, y_test_v2 = create_sequences(df_target_test_v2, sequence_len, num_rows, include_all_features=True)

In [10]:
print('x train_v2', X_train_v2)
print('x test_v2', X_test_v2)

x train_v2 [[[ 8.39350e+01]
  [ 8.40420e+01]
  [ 8.38200e+01]
  ...
  [-1.65200e+05]
  [ 8.17122e+01]
  [ 8.35160e+01]]

 [[ 8.38880e+01]
  [ 8.39950e+01]
  [ 8.37430e+01]
  ...
  [-1.65200e+05]
  [ 8.17122e+01]
  [ 8.34850e+01]]

 [[ 8.38880e+01]
  [ 8.39950e+01]
  [ 8.37740e+01]
  ...
  [-1.65200e+05]
  [ 8.17122e+01]
  [ 8.35160e+01]]

 ...

 [[ 8.42920e+01]
  [ 8.44000e+01]
  [ 8.41770e+01]
  ...
  [-1.65200e+05]
  [ 8.17122e+01]
  [ 8.36400e+01]]

 [[ 8.42690e+01]
  [ 8.43770e+01]
  [ 8.41540e+01]
  ...
  [-1.65200e+05]
  [ 8.17122e+01]
  [ 8.36250e+01]]

 [[ 8.42460e+01]
  [ 8.43530e+01]
  [ 8.41310e+01]
  ...
  [-1.65200e+05]
  [ 8.17122e+01]
  [ 8.37650e+01]]]
x test_v2 [[[2.78785000e+02]
  [2.78985000e+02]
  [2.78648000e+02]
  ...
  [1.45631618e+08]
  [2.70204100e+02]
  [2.78284000e+02]]

 [[2.78972000e+02]
  [2.79117000e+02]
  [2.78835000e+02]
  ...
  [1.45631618e+08]
  [2.70204100e+02]
  [2.78351000e+02]]

 [[2.78954000e+02]
  [2.79135000e+02]
  [2.78798000e+02]
  ...
  [1.4

In [11]:
print(X_train.shape, y_train.shape)
print(X_train_v2.reshape(X_train_v2.shape[0], X_train_v2.shape[1]).shape, y_train_v2.shape)

(95, 5) (95,)
(15, 1445) (15,)


In [12]:
df[non_target_columns].head(n=10)

Unnamed: 0,open,high,low,volume,CPALTT01USM657N,DFF,EXPINF10YR,GDPC1,RSXFS,T10YFF,UNRATE,macd,macd_signal,macd_hist,daily_obv,seven_day_ema,close
0,83.935,84.042,83.82,232,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.967
1,83.888,83.995,83.743,150,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.889
2,83.888,83.995,83.774,1600,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.92
3,83.904,84.011,83.776,100,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.923
4,83.966,84.073,83.851,698,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,83.998
5,84.168,84.276,84.053,100,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,84.2
6,84.277,84.385,84.162,350,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,84.309
7,84.137,84.244,84.022,100,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,84.169
8,84.152,84.26,84.037,300,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,84.184
9,84.246,84.353,84.131,235,0.12452,0.19,1.574237,16960.864,323990.0,2.48,9.4,1.1427,1.1947,-0.0521,-148100.0,81.6086,84.278


In [13]:
pd.DataFrame(X_train).head(n=10)

Unnamed: 0,0,1,2,3,4
0,83.967,83.889,83.92,83.923,83.998
1,83.889,83.92,83.923,83.998,84.2
2,83.92,83.923,83.998,84.2,84.309
3,83.923,83.998,84.2,84.309,84.169
4,83.998,84.2,84.309,84.169,84.184
5,84.2,84.309,84.169,84.184,84.278
6,84.309,84.169,84.184,84.278,84.169
7,84.169,84.184,84.278,84.169,84.241
8,84.184,84.278,84.169,84.241,84.325
9,84.278,84.169,84.241,84.325,84.301


In [14]:
pd.DataFrame(y_train).head(n=10)

Unnamed: 0,0
0,84.2
1,84.309
2,84.169
3,84.184
4,84.278
5,84.169
6,84.241
7,84.325
8,84.301
9,84.278


In [15]:
pd.DataFrame(X_train_v2.reshape(X_train_v2.shape[0], X_train_v2.shape[1])).head(n=200)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1435,1436,1437,1438,1439,1440,1441,1442,1443,1444
0,83.935,84.042,83.82,232.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.516
1,83.888,83.995,83.743,150.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.485
2,83.888,83.995,83.774,1600.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.516
3,83.904,84.011,83.776,100.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.516
4,83.966,84.073,83.851,698.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.578
5,84.168,84.276,84.053,100.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.438
6,84.277,84.385,84.162,350.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.407
7,84.137,84.244,84.022,100.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.458
8,84.152,84.26,84.037,300.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.584
9,84.246,84.353,84.131,235.0,0.12452,0.19,1.574237,16960.864,323990.0,2.48,...,16960.864,323990.0,2.56,9.4,1.111,1.1834,-0.0724,-165200.0,81.7122,83.734


In [16]:
pd.DataFrame(y_train_v2).head()

Unnamed: 0,0
0,83.485
1,83.516
2,83.516
3,83.578
4,83.438


# using all fields available

In [17]:
# # convert df into objects Torch can read
# torch_X_train  = torch.from_numpy(X_train_v2.reshape(X_train_v2.shape[0], X_train_v2.shape[1])).float()
# torch_y_train = torch.from_numpy(y_train_v2).float()
# torch_X_test = torch.from_numpy(X_test_v2.reshape(X_test_v2.shape[0], X_test_v2.shape[1])).float()
# torch_y_test = torch.from_numpy(y_test_v2).float()
# 
# print(torch_X_train.shape, torch_y_train.shape)
# # create test and train sets
# train_data_set = TensorDataset(torch_X_train, torch_y_train)
# test_data_set = TensorDataset(torch_X_test, torch_y_test)
# 
# # confirm it works
# sample = train_data_set[0]
# input_sample, label_sample = sample
# print('input sample:', input_sample)
# print('label sample:', label_sample)

In [18]:
# Using just the close price

In [19]:
# convert df into objects Torch can read
torch_X_train  = torch.from_numpy(X_train).float()
torch_y_train = torch.from_numpy(y_train).float()
torch_X_test = torch.from_numpy(X_test).float()
torch_y_test = torch.from_numpy(y_test).float()

# create test and train sets
train_data_set = TensorDataset(torch_X_train, torch_y_train)
test_data_set = TensorDataset(torch_X_test, torch_y_test)

# confirm it works
sample = train_data_set[0]
input_sample, label_sample = sample
print('input sample:', input_sample)
print('label sample:', label_sample)

input sample: tensor([83.9670, 83.8890, 83.9200, 83.9230, 83.9980])
label sample: tensor(84.2000)


In [20]:
print(input_sample.shape)

torch.Size([5])


In [21]:
batch_size = 10
shuffle = True
hidden_size = 5
num_layers = 2

# create dataloader
train_dataloader = DataLoader(train_data_set, batch_size=batch_size, shuffle=shuffle, drop_last=True)
test_dataloader = DataLoader(test_data_set, batch_size=batch_size, shuffle=shuffle, drop_last=True)
# test loader
# x, y = next(iter(train_dataloader))
# 
# print('x', x, 'y', y)

In [22]:
# LTSM
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__() #super makes all the methods available in nn.Module available for the new class Net
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)
        
    def forward(self, x):
        h0 = torch.zeros(num_layers, x.size(0), hidden_size)
        c0 = torch.zeros(num_layers, x.size(0), hidden_size)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [23]:
learning_rate = 0.001
num_epochs = 50
num_features = 1 # this right now is just the close date. 

net = Net(input_size=num_features)
#criterion = nn.BCELoss(reduction='sum') #you'll need to use the binary close price higher for this
criterion = nn.MSELoss()
optimizer = optim.Adam(
    net.parameters(), lr=learning_rate
)

for epoch in range(num_epochs):
    for seqs, labels in train_dataloader:
        seqs = seqs.view(batch_size, torch_X_train.shape[1], num_features)
        outputs = net(seqs).squeeze()
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    # early break if loss isn't changing beyond the learning rate
    if loss.item() < .1:
        break

Epoch 1, Loss: 7031.55078125
Epoch 2, Loss: 6994.5400390625
Epoch 3, Loss: 6999.08837890625
Epoch 4, Loss: 7007.5703125
Epoch 5, Loss: 6989.71875
Epoch 6, Loss: 6983.2333984375
Epoch 7, Loss: 7013.97412109375
Epoch 8, Loss: 6945.89599609375
Epoch 9, Loss: 6977.58349609375
Epoch 10, Loss: 6962.5732421875
Epoch 11, Loss: 6942.65478515625
Epoch 12, Loss: 6957.1552734375
Epoch 13, Loss: 6919.90771484375
Epoch 14, Loss: 6891.54150390625
Epoch 15, Loss: 6895.62109375
Epoch 16, Loss: 6886.0771484375
Epoch 17, Loss: 6851.74755859375
Epoch 18, Loss: 6872.359375
Epoch 19, Loss: 6839.1923828125
Epoch 20, Loss: 6799.86962890625
Epoch 21, Loss: 6765.390625
Epoch 22, Loss: 6784.1611328125
Epoch 23, Loss: 6752.25
Epoch 24, Loss: 6725.86865234375
Epoch 25, Loss: 6705.203125
Epoch 26, Loss: 6706.81787109375
Epoch 27, Loss: 6653.02099609375
Epoch 28, Loss: 6617.125
Epoch 29, Loss: 6619.60791015625
Epoch 30, Loss: 6580.92578125
Epoch 31, Loss: 6573.97802734375
Epoch 32, Loss: 6552.958984375
Epoch 33, Los

In [24]:
# Define MSE metric
mse = torchmetrics.regression.MeanSquaredError()

net.eval()
with torch.no_grad():
    for seqs, labels in test_dataloader:
        seqs = seqs.view(batch_size, torch_X_test.shape[1], num_features)
        # Pass seqs to net and squeeze the result
        outputs = net(seqs).squeeze()
        mse(outputs, labels)

# Compute final metric value
test_mse = mse.compute()
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_mse**.5}")

Test MSE: 75252.6015625
Test RMSE: 274.32208251953125


In [25]:
## used for binary classifcation
# f1 = torchmetrics.F1Score(num_classes=2)
# 
# net.eval()
# with torch.no_grad():
#     for seqs, labels in test_dataloader:
#         seqs = seqs.view(batch_size, num_features, 1)
#         outputs = net(seqs).squeeze()
#         f1.compute()
#         
# print(f"Test F1 score: {f1.compute()}")

# acc = torchmetrics.Accuracy(task="binary")
# 
# net.eval()
# with torch.no_grad():
#     for seqs, labels in test_dataloader:
#         seqs = seqs.view(batch_size, sequence_len, num_features)
#         outputs = net(seqs).squeeze()
#         acc.compute()
# 
# print(f"Test accuracy score: {acc.compute()}")


# Error when using just the close price:
274 RMSE

# Error when using all fields
277 RMSE