# Training and Evaluation for Transformer (Encoder & Decoder)
This model takes both the feature sequence and the target sequence as input. This differs from the other models, which is why there is a separate notebook for training and evaluation.

In [1]:
import torch
import torch.nn as nn
from benchmarks import EncoderDecoderTransformer

## Training

In [33]:
# Hyperparameters and utilities.
# NOTE(review): batch_size, n_head, dropout_rate and hidden_size are not
# referenced by the other cells visible in this notebook — confirm whether
# they are still needed.

# Optimisation settings
epochs = 5
learning_rate = 0.005
batch_size = 64

# Data / model settings
sequence_length = 10
hidden_size = 25
n_head = 13
dropout_rate = 0.3

In [34]:
# Load the pre-batched train / validation / test splits from disk, in that order.
batches_train, batches_val, batches_test = (
    torch.load(split_path)
    for split_path in (
        "processed_data/batches_train.pt",
        "processed_data/batches_validation.pt",
        "processed_data/batches_test.pt",
    )
)

In [23]:
from tqdm import tqdm
import random

def _split_batch(batch: torch.Tensor, device: torch.device) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Slice one 3-D batch into (source features, target sequence, final target) on ``device``."""
    # Everything except the last index of the final dimension is the model's source input.
    # assumes batch is laid out as (batch, sequence, features) — TODO confirm
    x = batch[:, :, :-1].to(device)
    # The last index of the final dimension, kept as a trailing dimension of size 1.
    tgt = batch[:, :, -1].unsqueeze(-1).to(device)
    # The value the loss is computed against: last index of dim 1 and of dim 2.
    # NOTE(review): this same value is the final element of ``tgt``, which is
    # passed to the model as input — confirm the model masks or shifts it so
    # the label does not leak into the prediction.
    t = batch[:, -1, -1].reshape(-1, 1).to(device)
    return x, tgt, t


def train(model, epochs: int, train_batches: list[torch.Tensor], validation_batches: list[torch.Tensor], n_timestamps: int = 10, learning_rate: float = 0.001) -> tuple[list[float], list[float]]:
    """Train ``model`` with Adam on an MSE loss, validating after every epoch.

    Args:
        model: Module invoked as ``model(x, tgt)``. It is moved to the GPU when
            CUDA is available, otherwise to the CPU.
        epochs: Number of passes over ``train_batches``.
        train_batches: Training batches. They are visited in a freshly shuffled
            order every epoch; the list passed in is left in its original order.
        validation_batches: Batches evaluated (without gradients) after each
            training epoch.
        n_timestamps: Not used by this function; kept so existing callers
            that pass it keep working.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        ``(training_loss_progression, validation_loss_progression)``: one entry
        per epoch, each being the sum (not the mean) of the per-batch MSE losses.
    """
    # Check if GPU is available and use it; otherwise, use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training on {device}")

    # Move the model to the appropriate device
    model.to(device)

    # Set up the loss function and optimizer
    loss_function = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    training_loss_progression = []
    validation_loss_progression = []

    # Shuffle a private copy so the caller's list is never reordered. The copy
    # is re-shuffled in place each epoch, which yields the same batch order as
    # shuffling the original list would for a given random seed.
    shuffled_batches = list(train_batches)

    for _ in range(epochs):
        # Training pass
        model.train()
        total_loss = 0
        random.shuffle(shuffled_batches)
        for batch in tqdm(shuffled_batches):
            x, tgt, t = _split_batch(batch, device)

            optimizer.zero_grad()

            # Forward + backward + optimize
            y = model(x, tgt)
            loss = loss_function(y, t)
            loss.backward()
            # Clip the gradient norm to 1.0 to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
        training_loss_progression.append(total_loss)

        # Validation pass: no gradient tracking, model in eval mode
        model.eval()
        total_loss = 0
        with torch.no_grad():
            for batch in tqdm(validation_batches):
                x, tgt, t = _split_batch(batch, device)
                total_loss += loss_function(model(x, tgt), t).item()
        validation_loss_progression.append(total_loss)

    return training_loss_progression, validation_loss_progression

In [24]:
# Instantiate the encoder-decoder transformer with the hard-coded configuration below.
model = EncoderDecoderTransformer(
    src_feat_dim=65,
    tgt_feat_dim=1,
    dim_model=4,
    num_heads=4,
    num_layers=3,
    dropout=0.1,
)

# Count only the parameters that require gradients.
trainable_params = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(f'Number of Trainable Parameters: {trainable_params}')

In [None]:
# Run training; the two results are the per-epoch training and validation loss lists.
training_loss, validation_loss = train(
    model,
    epochs=epochs,
    train_batches=batches_train,
    validation_batches=batches_val,
    n_timestamps=sequence_length,
    learning_rate=learning_rate,
)

In [None]:
import os

model_saving_path = "modelDumps/EncoderDecoderTransformer.pt"

# Create the output directory derived from the save path. exist_ok=True makes
# this a no-op when the directory is already there, without a separate
# existence check that could race with another process.
os.makedirs(os.path.dirname(model_saving_path), exist_ok=True)

# NOTE(review): this pickles the whole module object, so loading the file
# later requires the EncoderDecoderTransformer class to be importable under
# the same path. Saving model.state_dict() is the more portable option, but
# switching would change the file format for whatever loads this file.
torch.save(model, model_saving_path)

## Evaluation

In [None]:
def evaluate(model, batches_test):
    """Return the summed per-batch MSE loss of ``model`` over ``batches_test``.

    The model is moved to the GPU when CUDA is available (otherwise the CPU),
    switched to eval mode, and run without gradient tracking. Each batch is
    sliced on its last dimension: all but the final index form the source
    input, the final index forms the target sequence, and the final index at
    the last position of dim 1 is the value the prediction is compared with.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = torch.nn.MSELoss()

    model.to(device)
    model.eval()

    summed_loss = 0
    with torch.no_grad():
        for batch in tqdm(batches_test):
            # Slice the batch and move each piece to the chosen device
            features = batch[:, :, :-1].to(device)
            target_sequence = batch[:, :, -1].unsqueeze(-1).to(device)
            expected = batch[:, -1, -1].reshape(-1, 1).to(device)

            # Forward pass, then accumulate this batch's loss
            prediction = model(features, target_sequence)
            summed_loss += criterion(prediction, expected).item()

    return summed_loss

In [None]:
# Evaluate on the test batches and report the resulting loss.
test_loss = evaluate(model=model, batches_test=batches_test)
print(f'Test Loss: {test_loss}')