# Training
This notebook contains the implementation for training and hyperparameter optimization.

In [1]:
import pandas as pd
import torch
import numpy as np
from benchmarks import RandomForest, LSTMModel, TransformerEncoderModel, EncoderDecoderTransformer
from models import IMHAMomentumTransformer, SelfAttentionMomentumTransformer, GRUMomentumTransformer, CNNMomentumTransformer
import random

## Data Loading

Load the batched data prepared in the notebook "data_preparation_real_data".

In [2]:
# Read the pre-built train / validation / test batches from disk, in that order.
# NOTE(review): torch.load unpickles the files, so only load artefacts you produced yourself.
batches_train, batches_val, batches_test = map(
    torch.load,
    (
        "processed_data/batches_train.pt",
        "processed_data/batches_validation.pt",
        "processed_data/batches_test.pt",
    ),
)

## Training

In [6]:
from tqdm import tqdm

def _split_features_target(batch: torch.Tensor, device: torch.device):
    """Split one 3-D batch into model inputs and target, both moved to ``device``.

    Inputs are every entry of the last axis except the final one. The target is the
    final entry of the last axis taken at index -1 of the second axis (presumably the
    most recent timestep -- TODO confirm), reshaped to a column of shape (-1, 1).
    """
    x = batch[:, :, :-1].to(device)
    t = batch[:, -1, -1].reshape(-1, 1).to(device)
    return x, t


def train(model, epochs: int, train_batches: list[torch.Tensor], validation_batches: list[torch.Tensor], n_timestamps: int = 10, learning_rate: float = 0.001):
    """Train ``model`` with Adam on an MSE objective and evaluate it after every epoch.

    Args:
        model: A ``torch.nn.Module``; it is moved to the GPU when one is available.
        epochs: Number of passes over ``train_batches``.
        train_batches: Training batches. They are visited in a fresh random order every
            epoch; the list passed in is NOT reordered.
        validation_batches: Batches used for the per-epoch validation pass.
        n_timestamps: Currently unused; kept so existing callers keep working.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A tuple ``(training_loss_progression, validation_loss_progression)``. Each list
        has one entry per epoch holding the SUM of the per-batch losses of that epoch
        (divide by the number of batches to obtain an average).
    """
    # Check if GPU is available and use it; otherwise, use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on {device}")

    # Move the model to the appropriate device
    model.to(device)

    # Set up the loss function and optimizer
    lossFunction = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    training_loss_progression = []
    validation_loss_progression = []

    for epoch in range(epochs):
        # ---- Training pass ----
        model.train()
        # Shuffle a copy so the caller's list keeps its original order.
        epoch_batches = list(train_batches)
        random.shuffle(epoch_batches)

        total_loss = 0.0
        for batch in tqdm(epoch_batches, desc=f"Epoch {epoch + 1}/{epochs} [train]"):
            x, t = _split_features_target(batch, device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + backward + optimize
            y = model(x)
            loss = lossFunction(y, t)
            loss.backward()
            # Clip the gradient norm to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
        training_loss_progression.append(total_loss)

        # ---- Validation pass (no gradient tracking, dropout etc. disabled) ----
        model.eval()
        with torch.no_grad():
            total_loss = 0.0
            for batch in tqdm(validation_batches, desc=f"Epoch {epoch + 1}/{epochs} [val]"):
                x, t = _split_features_target(batch, device)
                total_loss += lossFunction(model(x), t).item()
            validation_loss_progression.append(total_loss)

    return training_loss_progression, validation_loss_progression

In [7]:
# Hyperparameters and utilities for the single training run below.

# Data layout
sequence_length, batch_size = 10, 64

# Model architecture (the model below is built with d_model=65, which n_head divides evenly)
hidden_size, n_head, dropout_rate = 25, 13, 0.3

# Optimisation schedule
epochs, learning_rate = 5, 0.005

In [11]:
# Build the model from the hyperparameters defined above and report its size.
model = IMHAMomentumTransformer(
    d_model=65,
    n_head=n_head,
    dropout_rate=dropout_rate,
    hidden_size=hidden_size,
)

# Count only the parameters that receive gradients.
trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f'Number of Trainable Parameters: {trainable_params}')

Number of Trainable Parameters: 140206


In [None]:
# Run training with the hyperparameters configured above; keeps the per-epoch loss histories.
training_loss, validation_loss = train(
    model,
    epochs=epochs,
    train_batches=batches_train,
    validation_batches=batches_val,
    n_timestamps=sequence_length,
    learning_rate=learning_rate,
)

In [None]:
import numpy as np
from matplotlib import pyplot as plt

# train() reports the SUM of the batch losses per epoch; divide by the number of
# batches to get a per-batch average for each split.
avg_train_loss = np.array(training_loss) / len(batches_train)
avg_val_loss = np.array(validation_loss) / len(batches_val)

fig, ax1 = plt.subplots()

# Plotting the average training loss on the primary y-axis
(train_line,) = ax1.plot(avg_train_loss, 'b-', label='Training Loss')  # 'b-' sets the color blue and a solid line
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Training Loss', color='b')  # Set the color of the y-axis label to blue
ax1.tick_params(axis='y', labelcolor='b')

# Create a second y-axis for the average validation loss
ax2 = ax1.twinx()
(val_line,) = ax2.plot(avg_val_loss, 'r-', label='Validation Loss')  # 'r-' sets the color red and a solid line
ax2.set_ylabel('Validation Loss', color='r')  # Set the color of the y-axis label to red
ax2.tick_params(axis='y', labelcolor='r')

# Adding a title and a combined legend; the legend is attached to the twin axis
# (drawn last) and given both line handles so it lists both curves.
plt.title("Loss Progressions")
ax2.legend(handles=[train_line, val_line], loc='upper right')
fig.tight_layout()  # Adjust the layout to make room for the second y-axis
plt.show()


## Hyperparameter Optimization with Random Gridsearch

In [5]:
from sklearn.model_selection import ParameterSampler
from data_handling import complete_data_preparation

In [11]:
# Name used when reporting the search results and when naming the saved model file.
model_name = "IMHAMomentumTransformer"

# How many hyperparameter combinations are drawn from the grid and evaluated.
num_samples = 10

# Candidate values for every hyperparameter in the search.
param_grid = dict(
    hidden_size=[5, 10, 20],
    learning_rate=[0.01, 0.001, 0.0001],
    epochs=[10, 50, 100],
    attention_heads=[5, 13],  # the feature dimension (65) must be divisible by this
    sequence_length=[10, 30],
    batch_size=[64, 128],
    dropout_rate=[0.2, 0.3],
)

# Draw the random subset of combinations to evaluate.
sampler = ParameterSampler(param_grid, n_iter=num_samples)
param_list = list(sampler)

In [None]:
# Perform random search over the hyperparameters.
# For every sampled combination: rebuild the data for that sequence length / batch size,
# train a fresh model, and keep the model with the lowest final-epoch validation loss.
best_loss = float('inf')
best_params = None
best_model = None  # stays None if param_list is empty
for params in param_list:
    # NOTE: this rebinds the notebook-level `batches_train` to the data prepared for
    # the current combination.
    batches_train, batches_validation, _ = complete_data_preparation(sequence_length=params['sequence_length'], batch_size=params['batch_size'])
    model = IMHAMomentumTransformer(d_model=65, n_head=params['attention_heads'], dropout_rate=params['dropout_rate'], hidden_size=params['hidden_size'])
    # Use the sampled number of epochs (the grid key is 'epochs') and validate on the
    # batches prepared for THIS combination, not on the batches loaded from disk earlier.
    training_loss, validation_loss = train(
        model,
        epochs=params['epochs'],
        train_batches=batches_train,
        validation_batches=batches_validation,
        n_timestamps=params['sequence_length'],
        learning_rate=params['learning_rate'],
    )
    # train() returns the SUM of the batch losses per epoch. The number of batches changes
    # with batch_size and sequence_length, so average per batch to compare runs fairly.
    loss = validation_loss[-1] / max(len(batches_validation), 1)
    print(f"Validation Losses: {validation_loss}")
    print(f"Tested Params: {params}, Loss: {loss}")
    if loss < best_loss:
        best_loss = loss
        best_params = params
        best_model = model

print(f"Hyperparameter Opimization for {model_name}")
print(f"Best Parameters: {best_params}, Loss: {best_loss}")

## Dump the best model to save its weights

In [None]:
import os

# Target file: modelDumps/<model_name>.pt
model_saving_path = "modelDumps/" + model_name + ".pt"

# Create the dump directory if needed; exist_ok avoids the separate existence check.
os.makedirs(os.path.dirname(model_saving_path), exist_ok=True)

# NOTE(review): this pickles the whole model object, not just a state_dict, so loading
# it later presumably needs the model class to be importable -- confirm with the loader.
torch.save(best_model, model_saving_path)