# NOTES
Right now only takes close price. Update to include all data in the sequence.

After 3 epochs it's loss is basically non-existant. Use an early stop

Learn how to use GPU 

drop_last=True seems necessary for the data loader. The batch_size argument is expecting batches to be that size. But if the whole dataset can't be divided by the batch_size then you had some values remaining that don't fit. Could set batch_size to be some multiple if possible to minimize dropped samples 

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torchmetrics

In [27]:
DATA_PATH = r'C:\Users\connor\PycharmProjects\trading\data\analytics\analytics_voo.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)

In [28]:
df.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'join_date', 'Id',
       'Date', 'DateLongDescription', 'DateShortDescription', 'DayLongName',
       'DayShortName', 'MonthLongName', 'MonthShortName', 'CalendarDay',
       'CalendarWeek', 'CalendarWeekStartDateId', 'CalendarWeekEndDateId',
       'CalendarDayInWeek', 'CalendarMonth', 'CalendarMonthStartDateId',
       'CalendarMonthEndDateId', 'CalendarNumberOfDaysInMonth',
       'CalendarDayInMonth', 'CalendarQuarter', 'CalendarQuarterStartDateId',
       'CalendarQuarterEndDateId', 'CalendarQuarterStartDate',
       'CalendarNumberOfDaysInQuarter', 'CalendarDayInQuarter', 'CalendarYear',
       'CalendarYearEndDateId', 'CalendarYearStartDate',
       'CalendarNumberOfDaysInYear', 'month_join_key', 'year_join_key',
       'seven_day_ema', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR', 'GDPC1',
       'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal', 'macd_hist',
       'daily_obv', 'target'],
      dtype='object')

In [29]:
non_target_columns = ['open', 'high', 'low', 'volume', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR', 'GDPC1', 'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal', 'macd_hist', 'daily_obv', 'seven_day_ema', 'close']

df_cols = df[non_target_columns]

# version with just close price and date
df_target_train = df[['date', 'close']].where(df.date <= '2020-01-01')
df_target_train.dropna(inplace=True)
df_target_test = df[['date', 'close']].where(df.date > '2020-01-01')
df_target_test.dropna(inplace=True)

# version with all columns
df_target_train_v2 = df[non_target_columns].where(df.date <= '2020-01-01')
df_target_train_v2.dropna(inplace=True)
df_target_test_v2 = df[non_target_columns].where(df.date > '2020-01-01')
df_target_test_v2.dropna(inplace=True)

In [30]:
print(df_target_train.columns)
print()
print(df_target_train_v2.columns)

Index(['date', 'close'], dtype='object')

Index(['open', 'high', 'low', 'volume', 'CPALTT01USM657N', 'DFF', 'EXPINF10YR',
       'GDPC1', 'RSXFS', 'T10YFF', 'UNRATE', 'macd', 'macd_signal',
       'macd_hist', 'daily_obv', 'seven_day_ema', 'close'],
      dtype='object')


In [31]:
def create_sequences(df, seq_length, num_rows=len(df), include_all_features=False):
    """
    Input:
    df: pandas dataframe with at least 2 columns, a date column and a target column
    seq_length: How long the sequence should be. 
    num_rows: How many rows to use. Will pull all rows unless num_rows is provided, in which case the top num_rows will be pulled.
    include_all_features: Whether to use just the target column for the sequence, or to include all features.
        
    Output:
    Generates a sequence of seq_length length. It's shape is [num_rows, seq_length] if include_all_features is False, else [num_rows, seq_length * len(df.columns)]. 
    A sequence refers to how many should be appended as columns. 
    For example, if df = [['2020-01-01',100], ['2020-01-02', 101], ['2020-01-03', 102]] and seq_length = 2, the generated sequence will be [[101,102], [102,103]]. 
    """
    
    
    xs, ys = [], []
    if include_all_features == True:
        feature_num = len(df.columns)
        seq_length = seq_length * feature_num
    if include_all_features == False:
        for itr in range(num_rows - seq_length):
            x = df.iloc[itr:(itr + seq_length), -1]
            y = df.iloc[itr + seq_length, -1]
            xs.append(x)
            ys.append(y)
    elif include_all_features == True:
        for itr in range(num_rows - seq_length):
            x = df.iloc[itr:(itr + seq_length), :]
            y = df.iloc[itr + seq_length, -1]
            # when you get more than 1 column, it'll append an array to the array which makes it 3d instead of 2d. Flatten out sample into single row
            x_reshaped = x.to_numpy().reshape(-1, 1)
            xs.append(x_reshaped)
            ys.append(y)
    else:
        print(f"error: include_all_features accepts True or False, got {include_all_features} instead.")
    return np.array(xs), np.array(ys)

In [32]:
# create training set
sequence_len = 10
num_rows = 10000
X_train, y_train = create_sequences(df_target_train, sequence_len, num_rows)
X_test, y_test = create_sequences(df_target_test, sequence_len, num_rows)

In [33]:
# create training set
X_train_v2, y_train_v2 = create_sequences(df_target_train_v2, sequence_len, num_rows, include_all_features=True)
X_test_v2, y_test_v2 = create_sequences(df_target_test_v2, sequence_len, num_rows, include_all_features=True)

# using all fields available

In [34]:
# convert df into objects Torch can read
torch_X_train  = torch.from_numpy(X_train_v2.reshape(X_train_v2.shape[0], X_train_v2.shape[1])).float()
torch_y_train = torch.from_numpy(y_train_v2).float()
torch_X_test = torch.from_numpy(X_test_v2.reshape(X_test_v2.shape[0], X_test_v2.shape[1])).float()
torch_y_test = torch.from_numpy(y_test_v2).float()

print(torch_X_train.shape, torch_y_train.shape)
# create test and train sets
train_data_set = TensorDataset(torch_X_train, torch_y_train)
test_data_set = TensorDataset(torch_X_test, torch_y_test)

# confirm it works
sample = train_data_set[0]
input_sample, label_sample = sample
print('input sample:', input_sample)
print('label sample:', label_sample)

torch.Size([9915, 1445]) torch.Size([9915])
input sample: tensor([ 8.3935e+01,  8.4042e+01,  8.3820e+01,  ..., -1.6520e+05,
         8.1712e+01,  8.3516e+01])
label sample: tensor(83.4850)


In [35]:
# Using just the close price

In [36]:
# # convert df into objects Torch can read
# torch_X_train  = torch.from_numpy(X_train).float()
# torch_y_train = torch.from_numpy(y_train).float()
# torch_X_test = torch.from_numpy(X_test).float()
# torch_y_test = torch.from_numpy(y_test).float()
# 
# # create test and train sets
# train_data_set = TensorDataset(torch_X_train, torch_y_train)
# test_data_set = TensorDataset(torch_X_test, torch_y_test)
# 
# # confirm it works
# sample = train_data_set[0]
# input_sample, label_sample = sample
# print('input sample:', input_sample)
# print('label sample:', label_sample)

In [37]:
print(input_sample.shape)

torch.Size([1445])


In [38]:
batch_size = 10
shuffle = True
hidden_size = 5
num_layers = 2

# create dataloader
train_dataloader = DataLoader(train_data_set, batch_size=batch_size, shuffle=shuffle, drop_last=True)
test_dataloader = DataLoader(test_data_set, batch_size=batch_size, shuffle=shuffle, drop_last=True)
# test loader
# x, y = next(iter(train_dataloader))
# 
# print('x', x, 'y', y)

In [39]:
# LTSM
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__() #super makes all the methods available in nn.Module available for the new class Net
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)
        
    def forward(self, x):
        h0 = torch.zeros(num_layers, x.size(0), hidden_size)
        c0 = torch.zeros(num_layers, x.size(0), hidden_size)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [40]:
learning_rate = 0.001
num_epochs = 50
num_features = 1 # this right now is just the close date. 

net = Net(input_size=num_features)
#criterion = nn.BCELoss(reduction='sum') #you'll need to use the binary close price higher for this
criterion = nn.MSELoss()
optimizer = optim.Adam(
    net.parameters(), lr=learning_rate
)

for epoch in range(num_epochs):
    for seqs, labels in train_dataloader:
        seqs = seqs.view(batch_size, torch_X_train.shape[1], num_features)
        outputs = net(seqs).squeeze()
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    # early break if loss isn't changing beyond the learning rate
    if loss.item() < .1:
        break

Epoch 1, Loss: 6739.86572265625
Epoch 2, Loss: 5653.71337890625
Epoch 3, Loss: 5081.59716796875
Epoch 4, Loss: 4532.958984375
Epoch 5, Loss: 3710.39697265625
Epoch 6, Loss: 3180.00439453125
Epoch 7, Loss: 2479.31103515625
Epoch 8, Loss: 2045.7144775390625
Epoch 9, Loss: 1534.8338623046875
Epoch 10, Loss: 1133.159912109375
Epoch 11, Loss: 795.7372436523438
Epoch 12, Loss: 574.6461791992188
Epoch 13, Loss: 392.6899108886719
Epoch 14, Loss: 240.85903930664062
Epoch 15, Loss: 138.28704833984375
Epoch 16, Loss: 73.03668975830078
Epoch 17, Loss: 18.597078323364258
Epoch 18, Loss: 11.70531940460205
Epoch 19, Loss: 11.268365859985352
Epoch 20, Loss: 6.2229719161987305
Epoch 21, Loss: 6.2082366943359375
Epoch 22, Loss: 5.539273262023926
Epoch 23, Loss: 3.9265456199645996
Epoch 24, Loss: 3.910283327102661
Epoch 25, Loss: 7.9704389572143555
Epoch 26, Loss: 5.355321407318115
Epoch 27, Loss: 7.322512149810791
Epoch 28, Loss: 6.673825263977051
Epoch 29, Loss: 2.541616201400757
Epoch 30, Loss: 4.2957

In [41]:
# Define MSE metric
mse = torchmetrics.regression.MeanSquaredError()

net.eval()
with torch.no_grad():
    for seqs, labels in test_dataloader:
        seqs = seqs.view(batch_size, torch_X_test.shape[1], num_features)
        # Pass seqs to net and squeeze the result
        outputs = net(seqs).squeeze()
        mse(outputs, labels)

# Compute final metric value
test_mse = mse.compute()
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_mse**.5}")

Test MSE: 35426.28125
Test RMSE: 188.2187042236328


In [42]:
## used for binary classifcation
# f1 = torchmetrics.F1Score(num_classes=2)
# 
# net.eval()
# with torch.no_grad():
#     for seqs, labels in test_dataloader:
#         seqs = seqs.view(batch_size, num_features, 1)
#         outputs = net(seqs).squeeze()
#         f1.compute()
#         
# print(f"Test F1 score: {f1.compute()}")

# acc = torchmetrics.Accuracy(task="binary")
# 
# net.eval()
# with torch.no_grad():
#     for seqs, labels in test_dataloader:
#         seqs = seqs.view(batch_size, sequence_len, num_features)
#         outputs = net(seqs).squeeze()
#         acc.compute()
# 
# print(f"Test accuracy score: {acc.compute()}")


# Error when using just the close price:
274 RMSE

# Error when using all fields
277 RMSE

when using 10000 samples: 188