In [1]:
import importlib
import os
from datetime import datetime, date

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from data_collection import DataCollection

import multi_input_lsm
importlib.reload(multi_input_lsm)
from multi_input_lsm import MultiInputLSTM as lstm

import data_plotter
importlib.reload(data_plotter)
from data_plotter import DataPlotter

from algorithmic_trading.experiment_4.src import preprocessing

importlib.reload(preprocessing)
from algorithmic_trading.experiment_4.src.preprocessing import Preprocessing

# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

##### DATA PREPARATION #####

In [2]:
# ---- Run configuration ----
predicted_days = 1
tickers = ["GOOGL"]
start_date = "2000-01-01"
end_date = datetime.today().date()
sequence_length = 100  # todo 30, 50, 100, 365
batch_size = 32  # todo 16, 32, 64
data_path = "../data"
os.makedirs(data_path, exist_ok=True)

# ---- Download the raw price data into `data_path` ----
data_collector = DataCollection(tickers, start_date, end_date, folder_path=data_path)
data_collector.fetch_and_save_all()

# ---- Preprocess: the pipeline returns twelve values (train block first, then test block) ----
pp = Preprocessing(folder_path=data_path, split_ratio=0.8, sequence_length=sequence_length)
(
    x_open_train, x_high_train, x_low_train, x_close_train, y_train, y_train_dates,
    x_open_test, x_high_test, x_low_test, x_close_test, y_test, y_test_dates,
) = pp.preprocess_pipeline()


def _feature_tensor(values):
    """Convert one input series to a float32 tensor on `device`, appending a trailing dimension."""
    return torch.tensor(values, dtype=torch.float32).unsqueeze(-1).to(device)


def _target_tensor(values):
    """Convert the targets to a float32 tensor on `device` (no extra dimension is added)."""
    return torch.tensor(values, dtype=torch.float32).to(device)


# ---- Tensors for the training split ----
# (a volume input, x_volume_train_tensor, is currently disabled)
x_open_train_tensor = _feature_tensor(x_open_train)
x_high_train_tensor = _feature_tensor(x_high_train)
x_low_train_tensor = _feature_tensor(x_low_train)
x_close_train_tensor = _feature_tensor(x_close_train)
y_train_tensor = _target_tensor(y_train)

# ---- Tensors for the test split ----
# (a volume input, x_volume_test_tensor, is currently disabled)
x_open_test_tensor = _feature_tensor(x_open_test)
x_high_test_tensor = _feature_tensor(x_high_test)
x_low_test_tensor = _feature_tensor(x_low_test)
x_close_test_tensor = _feature_tensor(x_close_test)
y_test_tensor = _target_tensor(y_test)

# ---- Datasets and loaders: training batches are shuffled, test batches keep their order ----
train_dataset = TensorDataset(
    x_open_train_tensor,
    x_high_train_tensor,
    x_low_train_tensor,
    x_close_train_tensor,
    y_train_tensor,
)
test_dataset = TensorDataset(
    x_open_test_tensor,
    x_high_test_tensor,
    x_low_test_tensor,
    x_close_test_tensor,
    y_test_tensor,
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[*********************100%***********************]  1 of 1 completed


Data saved to ./data\GOOGL_data.csv


##### MODEL SETUP #####

In [3]:
# ---- Hyperparameters ----
# One input per price column: 'Open', 'High', 'Low', 'Close'.
input_size = 4
# Number of hidden units in the LSTM. todo 32, 50, 64, or 128
hidden_size = 128
# Step size for the optimizer. todo 0.001, 0.0005, 0.0001
learning_rate = 0.0001
# Currently disabled knobs (kept here as a reminder of what to explore):
#   output_size = 4   # number of values predicted for the next day
#   num_layers = 2    # LSTM layers, todo 1 to 3
#   dropout = 0.2     # dropout rate for regularization, todo 0.1 to 0.5

# ---- Model ---- (todo Bidirectional LSTM, Gated Recurrent Unit (GRU))
model = lstm(input_sz=input_size, hidden_sz=hidden_size)
model = model.to(device)

# ---- Loss ---- (todo Mean Absolute Error (MAE = L1Loss()), Mean Squared Error (MSE))
criterion = nn.MSELoss()

# ---- Optimizer ---- (todo Adam, AdamW)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

##### TRAINING THE MODEL #####

In [None]:
num_epochs = 200  # todo 50, 200
batches_per_epoch = len(train_loader)

# Put the model in training mode once, then run plain mini-batch training:
# reset gradients, forward pass, loss, backward pass, optimizer step.
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0

    for x_open, x_high, x_low, x_close, y in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(x_open, x_high, x_low, x_close), y)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    # Report the mean of the per-batch losses for this epoch.
    mean_epoch_loss = running_loss / batches_per_epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_epoch_loss:.6f}")

Epoch 1/200, Loss: 0.066297
Epoch 2/200, Loss: 0.004652
Epoch 3/200, Loss: 0.000997
Epoch 4/200, Loss: 0.000521
Epoch 5/200, Loss: 0.000409
Epoch 6/200, Loss: 0.000336
Epoch 7/200, Loss: 0.000295
Epoch 8/200, Loss: 0.000273
Epoch 9/200, Loss: 0.000262
Epoch 10/200, Loss: 0.000246
Epoch 11/200, Loss: 0.000252
Epoch 12/200, Loss: 0.000250
Epoch 13/200, Loss: 0.000226
Epoch 14/200, Loss: 0.000219
Epoch 15/200, Loss: 0.000231
Epoch 16/200, Loss: 0.000219
Epoch 17/200, Loss: 0.000208
Epoch 18/200, Loss: 0.000202
Epoch 19/200, Loss: 0.000198
Epoch 20/200, Loss: 0.000197
Epoch 21/200, Loss: 0.000192
Epoch 22/200, Loss: 0.000184
Epoch 23/200, Loss: 0.000178
Epoch 24/200, Loss: 0.000175
Epoch 25/200, Loss: 0.000174
Epoch 26/200, Loss: 0.000179
Epoch 27/200, Loss: 0.000169
Epoch 28/200, Loss: 0.000172
Epoch 29/200, Loss: 0.000153
Epoch 30/200, Loss: 0.000159
Epoch 31/200, Loss: 0.000161
Epoch 32/200, Loss: 0.000146
Epoch 33/200, Loss: 0.000151


##### EVALUATING THE MODEL #####

In [None]:
# Re-import and reload the plotting module so edits to data_plotter.py are picked up
# without restarting the kernel.
import data_plotter
importlib.reload(data_plotter)
from data_plotter import DataPlotter

# Switch to evaluation mode and accumulate the loss plus raw outputs over the test set.
model.eval()
test_loss = 0.0
predictions = []
actuals = []

with torch.no_grad():
    for x_open, x_high, x_low, x_close, y in test_loader:
        # Forward pass
        outputs = model(x_open, x_high, x_low, x_close)
        # Compute the loss exactly as in the training loop. The previous
        # `outputs.squeeze()` was dropped: it is inconsistent with training and
        # removes the batch dimension whenever the last batch holds one sample.
        loss = criterion(outputs, y)
        test_loss += loss.item()
        # Store predictions and actuals
        predictions.append(outputs.cpu())
        actuals.append(y.cpu())

# Mean of the per-batch losses, printed with the same precision as the training log
# (four decimals hide almost all of the signal at this loss scale).
print(f"Test Loss: {test_loss / len(test_loader):.6f}")

# Post-processing predictions and actuals for inverse scaling
predictions = torch.cat(predictions).numpy()
actuals = torch.cat(actuals).numpy()

# Inverse transform OHLC using pp.scaler
predictions_original = pp.scaler.inverse_transform(predictions)
actuals_original = pp.scaler.inverse_transform(actuals)

# Show up to five samples; bounded by the available rows so a tiny test split
# does not raise IndexError.
samples_to_show = min(5, len(predictions_original))
for i in range(samples_to_show):
    print(f"Date: {y_test_dates[i]}")
    prediction_str = ", ".join([f"{x:.2f}" for x in predictions_original[i]])
    actual_str = ", ".join([f"{x:.2f}" for x in actuals_original[i]])
    print(f"Sample prediction (original scale): {prediction_str}")
    print(f"Actual values (original scale): {actual_str}")
    print("-" * 50)

# Plotting results
plotter = DataPlotter()
df_results = plotter.create_results_dataframe(y_test_dates, actuals_original, predictions_original)
plotter.plot_results(df_results)

##### PREDICT #####

In [None]:
# Re-import and reload the predictor module so edits to predictor.py are picked up
# without restarting the kernel.
import predictor
importlib.reload(predictor)
from predictor import Predictor

# Initialize the Predictor.
# NOTE: this rebinds the name `predictor` from the module to an instance; the
# `import predictor` line above restores the module binding whenever the cell is re-run.
predictor = Predictor(
    model=model,
    scaler=pp.scaler,
    # volume_scaler=pp.volume_scaler,
    sequence_length=sequence_length,
    device=device
)

# Specify parameters for prediction
start_date = date(2023, 1, 3)

days_to_predict = 10

# Load the dataset and normalise its 'Date' column to datetime.date so it can be
# compared against `start_date`.
dataset = pp.load_data()
dataset['Date'] = pd.to_datetime(dataset['Date']).dt.date

# Fail early with a clear message when start_date has no row in the data,
# instead of an opaque IndexError further down.
matching_rows = dataset.index[dataset['Date'] == start_date]
if len(matching_rows) == 0:
    raise ValueError(f"start_date {start_date} is not present in the dataset")

# Perform the prediction
predicted_values, predicted_values_scaled = predictor.predict_future(dataset, start_date, days_to_predict)

# Extract actual values for comparison: the rows immediately after start_date.
# NOTE(review): assumes the first predicted row corresponds to the first row after
# start_date -- confirm against Predictor.predict_future.
# NOTE(review): `start_index` is used positionally with .iloc, which assumes the
# dataset has a default 0..n-1 index -- confirm against pp.load_data.
start_index = matching_rows[0] + 1
comparison_rows = dataset.iloc[start_index:start_index + days_to_predict]
actual_values = comparison_rows[['Open', 'High', 'Low', 'Close']].values
# Dates are taken from the same rows as the actual values. Previously they were read
# one row later (start_index + 1 + i), so every printed/plotted date was off by one.
actual_dates = comparison_rows['Date']
actual_values_scaled = pp.scaler.transform(actual_values)

# Evaluate predictions using multiple metrics
mse = np.mean((actual_values_scaled - predicted_values_scaled) ** 2)  # Mean Squared Error
mae = np.mean(np.abs(actual_values_scaled - predicted_values_scaled))  # Mean Absolute Error
variance = np.var(actual_values_scaled - predicted_values_scaled)  # Variance of errors

# Compare predicted values with actual values
print(f"\nPredictions vs Actuals starting from {start_date.strftime('%Y-%m-%d')} for {days_to_predict} days:")

for prediction_date, prediction, actual in zip(actual_dates, predicted_values, actual_values):
    # Relative difference of the prediction from the actual value, in percent
    predicted_change = ((prediction - actual) / actual) * 100

    print(f"Date: {prediction_date}")
    print(f"Predicted: {prediction}")
    print(f"Actual: {actual}")
    print(f"Change (%): {predicted_change}")
    print("---")

# Print evaluation metrics
print("Test Metrics:")
print(f"MSE: {mse:.6f}")  # Mean Squared Error -> Smaller values indicate better model performance.
print(f"MAE: {mae:.6f}")  # Mean Absolute Error -> Provides a more interpretable metric compared to MSE (does not square the errors).
print(f"Variance of errors: {variance:.6f}")  # Variance -> Lower variance indicates that the model consistently makes predictions close to the actual values.

# Plot
plotter = DataPlotter()
# Create a DataFrame for results, using the dates of the rows the actuals came from
df_results = plotter.create_results_dataframe(
    dates=actual_dates,
    actuals_original=actual_values,
    predictions_original=predicted_values
)
# Plot results for the prediction range
plotter.plot_results(df_results)

Simulating the "Without Predictions" Strategy

In [None]:
import random
import simulator
importlib.reload(simulator)
from simulator import InvestmentSimulator

# Pick a random row as the purchase day and take every OHLC row from that day onward.
random_index = random.randint(0, len(dataset) - 1)
price_columns = ['Open', 'High', 'Low', 'Close']
rows_from_purchase_day = dataset.iloc[random_index:]
values = rows_from_purchase_day[price_columns].values

# Configure the simulation with the starting capital and the profit/loss thresholds.
# NOTE: this rebinds `simulator` from the module to an instance; the import above
# restores the module binding whenever the cell is re-run.
simulation_settings = {
    "prices": values,
    "initial_capital": 1000,
    "profit_threshold": 0.10,
    "loss_threshold": -0.05,
}
simulator = InvestmentSimulator(**simulation_settings)

purchase_day = dataset['Date'][random_index]
print(f"Simulated buying stock in day: {purchase_day}")
result = simulator.simulate()
print(result)

Simulating the "With Predictions" Strategy

In [None]:
# Re-import and reload the predictor module so edits to predictor.py are picked up
# without restarting the kernel.
import predictor
importlib.reload(predictor)
from predictor import Predictor

# Initialize Predictor with the trained model and the scaler from preprocessing
predictor = Predictor(
    model=model,
    scaler=pp.scaler,
    sequence_length=sequence_length,
    device=device
)

# Random day (chosen in the "Without Predictions" cell)
start_date = pd.to_datetime(dataset['Date'][random_index]).date()
# Values up to and including the day at random_index
past_values = dataset.iloc[:random_index + 1][['Date', 'Open', 'High', 'Low', 'Close']]

# Upper bound on the prediction horizon. Without it the loop below never ends when
# the simulated status never becomes 'profit' or 'loss'. Tune as needed.
max_simulated_days = 365
terminal_statuses = ('profit', 'loss')

i = 0
result = None

# Extend the predicted horizon one day at a time until the simulation reports a
# terminal status or the horizon cap is reached.
while i < max_simulated_days and (result is None or result.get('status') not in terminal_statuses):
    i += 1
    predicted_values, predicted_values_scaled = predictor.predict_future(past_values, start_date, i)

    simulator = InvestmentSimulator(
        prices=predicted_values,
        initial_capital=1000,
        profit_threshold=0.10,
        loss_threshold=-0.05,
    )
    result = simulator.simulate()
    print("---------")

# After the loop, print the final result
print(f"Simulated buying stock in day: {dataset['Date'][random_index]}")
if result is not None and result.get('status') in terminal_statuses:
    print(f"Simulation completed after {i} days:")
else:
    # The cap was hit before any threshold was crossed; say so explicitly.
    print(f"Simulation stopped after {i} days without reaching a profit or loss threshold:")
print(result)