In this notebook we will be building and training LSTM to predict IBM stock. We will use PyTorch.

## 1. Libraries and settings

In [1]:
import numpy as np
import random
import pandas as pd 
from pylab import mpl, plt
plt.style.use('seaborn-v0_8-darkgrid')
mpl.rcParams['font.family'] = 'serif'
%matplotlib inline

import math, time
import itertools
import datetime
from operator import itemgetter
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
import torch
import torch.nn as nn
from torch.autograd import Variable

import vectorbtpro as vbt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
vbt.settings.set_theme('dark')
vbt.settings['plotting']['layout']['width'] = 800
vbt.settings['plotting']['layout']['height'] = 400


## 2. Load data

In [2]:
sol_data = pd.read_csv('2ySOLdata1h.csv')
sol_data['timestamp'] = pd.to_datetime(sol_data['timestamp'], unit='s')
sol_data.set_index('timestamp', inplace=True)
pd.set_option('future.no_silent_downcasting', True)

In [3]:
data = sol_data.iloc[:, 0:5].copy()
data


Unnamed: 0_level_0,Open,High,Low,Close,Volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01 04:00:00,172.790,173.130,172.480,172.930,49675
2022-01-01 05:00:00,172.940,173.110,171.470,171.470,68973
2022-01-01 06:00:00,171.490,174.310,171.340,173.220,95595
2022-01-01 07:00:00,173.220,173.740,172.260,172.470,95494
2022-01-01 08:00:00,172.450,173.440,172.140,173.160,57926
...,...,...,...,...,...
2023-12-31 02:00:00,102.718,103.063,101.208,101.411,731765
2023-12-31 03:00:00,101.413,101.850,100.044,100.738,970135
2023-12-31 04:00:00,100.734,100.939,99.635,100.743,858035
2023-12-31 05:00:00,100.734,102.533,100.532,101.974,879783


In [4]:

data_trimmed = data.copy()
pd.set_option('future.no_silent_downcasting', True)
data_trimmed.loc[:, 'signal'] = 'SignalNone'


# Define window size
window_size = 10

rolling_max = data_trimmed.loc[:,'Close'].rolling(window=2*window_size+1, center=True, min_periods=1).max()
rolling_min = data_trimmed.loc[:,'Close'].rolling(window=2*window_size+1, center=True, min_periods=1).min()

is_peak = (data_trimmed.loc[:, 'Close'] == rolling_max)

is_low = (data_trimmed.loc[:, 'Close'] == rolling_min) 

# Update signal columns where conditions are met
data_trimmed.loc[is_peak, 'signal'] = 'SignalShort'  # Mark peaks with SignalShort
data_trimmed.loc[is_low, 'signal'] = 'SignalLong'   # Mark lows with SignalLong
df = data_trimmed.copy()
df_filtered = df[df['signal'] != 'SignalNone'].copy()

# Iterate through the DataFrame and adjust the signals
for i in range(1, len(df_filtered)):
    current_signal = df_filtered.iloc[i]['signal']
    previous_signal = df_filtered.iloc[i - 1]['signal']
    current_close = df_filtered.iloc[i]['Close']
    previous_close = df_filtered.iloc[i - 1]['Close']
    
    if current_signal == previous_signal:
        if current_signal == 'SignalLong' and previous_close > current_close:
            df_filtered.iloc[i - 1, df_filtered.columns.get_loc('signal')] = 'SignalNone'
        elif current_signal != 'SignalLong' and previous_close < current_close:
            df_filtered.iloc[i - 1, df_filtered.columns.get_loc('signal')] = 'SignalNone'
        else:
            df_filtered.iloc[i, df_filtered.columns.get_loc('signal')] = 'SignalNone'


df.update(df_filtered)

df.loc[:,'signal'] = df.loc[:,'signal'].replace({'SignalLong': 2, 'SignalShort': 0, 'SignalNone': 1})
df = df.ffill()

df['signal'] = df['signal'].astype(float)
long_signals = df['signal'] == 2
short_signals = df['signal'] == 0

scaler = MinMaxScaler(feature_range=(0, 2))
for idx in df.index[:-1]:
    if short_signals.loc[idx]:
        short_index = idx
        next_long_idx = df.loc[idx:].index[long_signals[idx:]].min()
        bear_slice = df.loc[short_index : next_long_idx].copy()
        bear_slice['signal'] = bear_slice['Close']
        signal_values = bear_slice['signal'].values.reshape(-1, 1)
        scaled_signal_values = scaler.fit_transform(signal_values)
        scaled_signal_values_transformed = 2 - (scaled_signal_values)
        bear_slice['signal'] = scaled_signal_values_transformed.flatten()
        df.update(bear_slice)
    elif long_signals.loc[idx]:
        long_index = idx
        next_short_idx = df.loc[idx:].index[short_signals[idx:]].min()
        bull_slice = df.loc[long_index : next_short_idx].copy()
        bull_slice['signal'] = bull_slice['Close']
        signal_values = bull_slice['signal'].values.reshape(-1, 1)
        scaled_signal_values = scaler.fit_transform(signal_values)
        scaled_signal_values_transformed = 2 - (scaled_signal_values)
        bull_slice['signal'] = scaled_signal_values_transformed.flatten()
        df.update(bull_slice)


In [5]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,signal
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 04:00:00,172.79,173.13,172.48,172.93,49675,1.0
2022-01-01 05:00:00,172.94,173.11,171.47,171.47,68973,2.0
2022-01-01 06:00:00,171.49,174.31,171.34,173.22,95595,1.538259
2022-01-01 07:00:00,173.22,173.74,172.26,172.47,95494,1.736148
2022-01-01 08:00:00,172.45,173.44,172.14,173.16,57926,1.55409


In [6]:
df_sol=df.copy()

In [7]:
df_sol = vbt.Data.from_data(df)

features = df_sol.run("talib", mavp=vbt.run_arg_dict(periods=14))

df_sol.data['symbol'] = pd.concat([df_sol.data['symbol'], features], axis=1)
# This will drop columns from the DataFrame where all values are NaN
df_sol.data['symbol'] = df_sol.data['symbol'].dropna(axis=1, how='all')


df_sol.data['symbol'] = df_sol.data['symbol'].dropna()
predictor_list = df_sol.data['symbol'].drop('signal', axis=1).columns.tolist()
X = df_sol.data['symbol'][predictor_list]

y = df_sol.data['symbol']['signal']

X.columns = X.columns.astype(str)


In [8]:
# # function to create train, test data given stock data and sequence length
# def load_data(stock, look_back):
#     data_raw = stock.values # convert to numpy array
#     data = []
    
#     # create all possible sequences of length look_back
#     for index in range(len(data_raw) - look_back): 
#         data.append(data_raw[index: index + look_back])
    
#     data = np.array(data);
#     # print(data[0, :, -1])
#     test_set_size = int(np.round(0.2*data.shape[0]));
#     train_set_size = data.shape[0] - (test_set_size);
    
#     x_train = data[:train_set_size,:-1,:-1]
#     y_train = data[:train_set_size,-1,-1:]
    
#     x_test = data[train_set_size:,:-1,:-1]
#     y_test = data[train_set_size:,-1,-1:]
    
#     return [x_train, y_train, x_test, y_test]

# look_back = 40 # choose sequence length
# x_train, y_train, x_test, y_test = load_data(df_sol.data['symbol'], look_back)
# print('x_train.shape = ',x_train.shape)
# print('y_train.shape = ',y_train.shape)
# print('x_test.shape = ',x_test.shape)
# print('y_test.shape = ',y_test.shape)

train test split

In [9]:
# Split the data into a training set and a test set
# Assuming X is a DataFrame or a NumPy array
from sklearn.model_selection import train_test_split

indices = np.arange(X.shape[0])

# First, split your data into a training+validation set and a separate test set
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, indices, test_size=0.3, shuffle=False)


scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


sequence

In [10]:
timestep = 20

def create_sequences(input_data, timestep):
    sequences = []
    data_len = len(input_data)
    for i in range(data_len - timestep):
        seq = input_data[i:(i + timestep)]
        sequences.append(seq)
    return np.array(sequences)

X_train_list = create_sequences(X_train_scaled, timestep)
X_test_list = create_sequences(X_test_scaled, timestep)
y_train_seq_ar = y_train[timestep:]
y_test_seq_ar = y_test[timestep:]

In [11]:
# Convert to numpy arrays
x_train_ar = np.array(X_train_list)
x_test_ar = np.array(X_test_list)  

y_train_seq = np.array(y_train_seq_ar).reshape(-1, 1)
y_test_seq = np.array(y_test_seq_ar).reshape(-1, 1)

In [12]:
# make training and test sets in torch
x_train_tensor = torch.from_numpy(x_train_ar).type(torch.Tensor)
x_test_tensor = torch.from_numpy(x_test_ar).type(torch.Tensor)
y_train_tensor = torch.from_numpy(y_train_seq).type(torch.Tensor)
y_test_tensor = torch.from_numpy(y_test_seq).type(torch.Tensor)

In [13]:
y_train_tensor.size(),x_train_tensor.size()

(torch.Size([12167, 1]), torch.Size([12167, 20, 178]))

In [14]:
x_train_tensor.shape[2]

178

model

In [18]:
# Build model
#####################
input_dim = x_train_tensor.shape[2]
hidden_dim = 32
num_layers = 2 
output_dim = 1


# Here we define our model as a class
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        batch_size = x.size(0)  # Dynamically obtain the batch size from input
        hn, cn = self.init_hidden(batch_size)  # Initialize hidden and cell states based on batch size
        out, (hn, cn) = self.lstm(x, (hn, cn))
        out = self.fc(out[:, -1, :])
        return out, (hn, cn)

    def init_hidden(self, batch_size):
        # Generate the initial hidden state and cell state without requiring gradients
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, batch_size, self.hidden_dim))


    
model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)

loss_fn = torch.nn.MSELoss()

optimiser = torch.optim.Adam(model.parameters(), lr=0.02)

In [19]:
# Train model
#####################
num_epochs = 100
hist = np.zeros(num_epochs)  
model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)

from torch.utils.data import DataLoader, TensorDataset

# Assuming x_train and y_train are numpy arrays

# Create TensorDataset
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)

batch_size = 64
# Create DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)

for epoch in range(num_epochs):
    # Loop over batches in the dataset
    for data, targets in train_loader:
        model.zero_grad()
        outputs, _ = model(data)  # Hidden states are now initialized inside the model
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimiser.step()

In [24]:
with torch.no_grad():
    # make predictions
    y_test_pred = model(x_test_tensor)

    # # y_train = y_train.detach().numpy()
    # # y_test = y_test.detach().numpy()
    # y_train_pred = outputs.detach().numpy()
    # y_test_pred = y_test_pred.detach().numpy()

    # # Calculate RMSE directly without inverse transformation
    # trainScore = math.sqrt(mean_squared_error(y_train[:,0], y_train_pred[:,0]))
    # print('Train Score: %.2f RMSE' % (trainScore))
    # testScore = math.sqrt(mean_squared_error(y_test[:,0], y_test_pred[:,0]))
    # print('Test Score: %.2f RMSE' % (testScore))

In [26]:
y_test_pred.shape

AttributeError: 'tuple' object has no attribute 'shape'

In [None]:
signal = pd.Series(y_test[timestep:])
pred_sig = pd.Series(y_test_pred.flatten())
pred_sig.index = signal.index
combined_df = pd.concat([signal, pred_sig], axis=1)
fig = combined_df.vbt.plot()
fig.update_layout(yaxis_title='signal')
fig.show()

In [None]:
long_entries = pred_sig > 1.35
long_exits = pred_sig < 1
short_entries = pred_sig < 0.5
short_exits = pred_sig > 1.2
pf = vbt.Portfolio.from_signals(
    close=X_test.Close, 
    long_entries=long_entries, 
    long_exits=long_exits, 
    # short_entries=short_entries,
    # short_exits=short_exits,
    size=100,
    size_type='value',
    # accumulate=True,
    init_cash='auto'
)
pf.plot({"orders", "cum_returns"}, settings=dict(bm_returns=False)).show()




In [None]:
# # Assuming df_sol is your original DataFrame and y_test, y_test_pred are your numpy arrays
# # First, create the index you want to use for the x-axis
# x_axis_index = df[len(df)-len(y_test):].index

# # Create Series with the custom index
# y_test_series = pd.Series(y_test.flatten(), index=x_axis_index, name="Actual")
# y_test_pred_series = pd.Series(y_test_pred.flatten(), index=x_axis_index, name="Predicted")

# # Create a DataFrame from your series
# combined_df = pd.DataFrame({
#     "Actual": y_test_series,
#     "Predicted": y_test_pred_series
# })
# entries = 2
# exits = 0

# combined_df['Actual_Diff'] = combined_df['Actual'].diff()
# combined_df['Predicted_Diff'] = combined_df['Predicted'].diff()


# cross_over = (combined_df['Actual_Diff'] > 0) & (combined_df['Actual'] > combined_df['Predicted']) & (combined_df['Actual'].shift(1) <= combined_df['Predicted'].shift(1))
# cross_under = (combined_df['Actual_Diff'] < 0) & (combined_df['Actual'] < combined_df['Predicted']) & (combined_df['Actual'].shift(1) >= combined_df['Predicted'].shift(1))

# combined_df['Signal'] = 1  # Default to '1' for hold/no action
# combined_df.loc[cross_over, 'Signal'] = 2  # '2' for cross over
# combined_df.loc[cross_under, 'Signal'] = 0  # '0' for cross under


# # Plot using vectorbt
# combined_df_vbt = vbt.Data.from_data(combined_df)
# fig = combined_df_vbt.plot(trace_kwargs=dict(mode='lines'))
# fig.show()

plot