# Create a simple LSTM by hand

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

import lightning as L
# For Lighting version 2.x
from lightning.pytorch.tuner import Tuner
from lightning.pytorch.loggers import TensorBoardLogger
from torch.utils.data import TensorDataset, DataLoader

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
class LSTMbyHand(L.LightningModule):

    def __init__(self):

        super().__init__()

        # Set up the mean and std for the normal distribution which we will use to initialise the weights
        mean = torch.tensor(0.0)
        std = torch.tensor(1.0)

        self.wlr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wlr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.blr1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wpr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wpr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bpr1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wp1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wp2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bp1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wo1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wo2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bo1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)


    def lstm_unit(self, input_value, long_memory, short_memory):
        
        long_remember_percent = torch.sigmoid((short_memory * self.wlr1) + (input_value * self.wlr2) + self.blr1)

        potential_remember_percent = torch.sigmoid((short_memory * self.wpr1) + (input_value * self.wpr2) + self.bpr1)

        potential_memory = torch.tanh((short_memory * self.wp1) + (input_value * self.wp2) + self.bp1)

        updated_long_memory = ((long_memory * long_remember_percent) + (potential_memory * potential_remember_percent))

        output_percent = torch.sigmoid((short_memory * self.wo1) + (input_value * self.wo2) + self.bo1)

        updated_short_memory = torch.tanh(updated_long_memory) * output_percent

        return ([updated_long_memory, updated_short_memory])

    def forward(self, input):

        # Initialise long term and short memory to 0
        long_memory = 0
        short_memory = 0

        # Seperate the input into single data point
        day1 = input[0]
        day2 = input[1]
        day3 = input[2]
        day4 = input[3]

        # Pass the data point into unrolled LSTM unit one by one (could have used a for loop)
        long_memory, short_memory = self.lstm_unit(day1, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day2, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day3, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day4, long_memory, short_memory)

        return short_memory
    
    def configure_optimizers(self):
        return Adam(self.parameters())
    
    def training_step(self, batch, batch_idx):
        input_i, label_i = batch
        # A batch is a list of tensors - feature tensor and label tensor
        print("Batch: ", batch, " The type of batch is: ", type(batch))
        # Input tensor is of torch size [1,4], with the first dimension as the leading batch dimension
        print("input_i: ", input_i, " The dimension of input is: ", input_i.shape)
        print("input_i[0]: ", input_i[0], " The dimension of input[0] is: ", input_i[0].shape)
        # # If the label is a scalar value (e.g. single target regression or classification), pytorch would use a 1D vector to represent it [batch_size, ] rather than [batch_size, 1]
        print("label_i: ", label_i, " The dimension of label is: ", label_i.shape)
        
        # No .detach() is needed here as the gradients are required
        # Why we need to index it with [0] is because input_i is a batch, e.g. [[1,2,3]]. Our forward method is defined to take a one dimensional tensor, e.g. [1,2,3]
        output_i = self(input_i[0])
        loss = (output_i - label_i) ** 2

        # Log the training loss after each training step
        self.log("train_loss", loss)

        # Log the predictions for the corresponding companies
        if (label_i == 0):
            self.log("out_0", output_i)
        else:
            self.log("out_1", output_i)

        return loss

In [30]:
# See how the model prediction is before training for fun
model = LSTMbyHand()

print("Now let's compare the observed and predicted values...")
print("Company A: Observed value = 0, Predicted =",
      # Need to detach the model weights and biases as their require_grad's are set to true
      model(torch.tensor([0., 0.5, 0.25, 1.])).detach())

print("Now let's compare the observed and predicted values...")
print("Company B: Observed value = 1, Predicted =",
      model(torch.tensor([1., 0.5, 0.25, 1.])).detach())


Now let's compare the observed and predicted values...
Company A: Observed value = 0, Predicted = tensor(0.0528)
Now let's compare the observed and predicted values...
Company B: Observed value = 1, Predicted = tensor(0.0540)


# Train the model

In [31]:
# Create a training data set
inputs = torch.tensor([[0., 0.5, 0.25, 1.], [1., 0.5, 0.25, 1.]])
labels = torch.tensor([0., 1.])

dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

In [32]:
# Create a TensorBoardLogger that writes to lightning_logs/
tb_logger = TensorBoardLogger("lightning_logs", name=None)

# Create a trainer to train the model
trainer = L.Trainer(max_epochs=2000, logger=tb_logger)
# trainer = L.Trainer(max_epochs=1, logger=False)
trainer.fit(model, dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

Batch:  [tensor([[0.0000, 0.5000, 0.2500, 1.0000]], device='mps:0'), tensor([0.], device='mps:0')]  The type of batch is:  <class 'list'>
input_i:  tensor([[0.0000, 0.5000, 0.2500, 1.0000]], device='mps:0')  The dimension of input is:  torch.Size([1, 4])
input_i[0]:  tensor([0.0000, 0.5000, 0.2500, 1.0000], device='mps:0')  The dimension of input[0] is:  torch.Size([4])
label_i:  tensor([0.], device='mps:0')  The dimension of label is:  torch.Size([1])


`Trainer.fit` stopped: `max_epochs=1` reached.


Batch:  [tensor([[1.0000, 0.5000, 0.2500, 1.0000]], device='mps:0'), tensor([1.], device='mps:0')]  The type of batch is:  <class 'list'>
input_i:  tensor([[1.0000, 0.5000, 0.2500, 1.0000]], device='mps:0')  The dimension of input is:  torch.Size([1, 4])
input_i[0]:  tensor([1.0000, 0.5000, 0.2500, 1.0000], device='mps:0')  The dimension of input[0] is:  torch.Size([4])
label_i:  tensor([1.], device='mps:0')  The dimension of label is:  torch.Size([1])


In [33]:
# Check the model performance after the first traning
print("Now let's compare the observed and the predicted value...")
print("Company A: Observed = 0, Predicted =",
      model(torch.tensor([0.0, 0.5, 0.25, 1.0]).detach()))
print("Company B: Observed = 1, Predicted =",
      model(torch.tensor([1.0, 0.5, 0.25, 1.0]).detach()))

Now let's compare the observed and the predicted value...
Company A: Observed = 0, Predicted = tensor(0.0526, grad_fn=<MulBackward0>)
Company B: Observed = 1, Predicted = tensor(0.0539, grad_fn=<MulBackward0>)


In [34]:
# Do further training (1000 epochs) starting from the most recent checkpoint
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path

trainer = L.Trainer(max_epochs=3000)
trainer.fit(model, train_dataloaders=dataloader, ckpt_path=path_to_best_checkpoint)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/edison/Git/pytorch-playground/checkpoints/epoch=0-step=2-v1.ckpt
/Users/edison/Git/pytorch-playground/myenv/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:360: The dirpath has changed from '/Users/edison/Git/pytorch-playground/checkpoints' to '/Users/edison/Git/pytorch-playground/lightning_logs/version_4/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     Total estimated model params size (MB)
Restored all sta

Training: |          | 0/? [00:00<?, ?it/s]

Batch:  [tensor([[0.0000, 0.5000, 0.2500, 1.0000]], device='mps:0'), tensor([0.], device='mps:0')]  The type of batch is:  <class 'list'>
input_i:  tensor([[0.0000, 0.5000, 0.2500, 1.0000]], device='mps:0')  The dimension of input is:  torch.Size([1, 4])
input_i[0]:  tensor([0.0000, 0.5000, 0.2500, 1.0000], device='mps:0')  The dimension of input[0] is:  torch.Size([4])
label_i:  tensor([0.], device='mps:0')  The dimension of label is:  torch.Size([1])
Batch:  [tensor([[1.0000, 0.5000, 0.2500, 1.0000]], device='mps:0'), tensor([1.], device='mps:0')]  The type of batch is:  <class 'list'>
input_i:  tensor([[1.0000, 0.5000, 0.2500, 1.0000]], device='mps:0')  The dimension of input is:  torch.Size([1, 4])
input_i[0]:  tensor([1.0000, 0.5000, 0.2500, 1.0000], device='mps:0')  The dimension of input[0] is:  torch.Size([4])
label_i:  tensor([1.], device='mps:0')  The dimension of label is:  torch.Size([1])
Batch:  [tensor([[0.0000, 0.5000, 0.2500, 1.0000]], device='mps:0'), tensor([0.], devi

/Users/edison/Git/pytorch-playground/myenv/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
# As the training loss is not fully flattened out yet based on the graphs from the tensorboard, we can try to further train the model by a bit and see if the optimizer just encounters a pleateau

# Do further training (2000 epochs) starting from the most recent checkpoint
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path

trainer = L.Trainer(max_epochs=5000)
trainer.fit(model, train_dataloaders=dataloader, ckpt_path=path_to_best_checkpoint)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/edison/Git/pytorch-playground/lightning_logs/version_1/checkpoints/epoch=2999-step=6000.ckpt
/Users/edison/Git/pytorch-playground/myenv/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:360: The dirpath has changed from '/Users/edison/Git/pytorch-playground/lightning_logs/version_1/checkpoints' to '/Users/edison/Git/pytorch-playground/lightning_logs/version_2/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     To

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5000` reached.


In [None]:
# Do further training (1000 epochs) starting from the most recent checkpoint
path_to_best_checkpoint = trainer.checkpoint_callback.best_model_path

trainer = L.Trainer(max_epochs=6000)
trainer.fit(model, train_dataloaders=dataloader, ckpt_path=path_to_best_checkpoint)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/edison/Git/pytorch-playground/lightning_logs/version_2/checkpoints/epoch=4999-step=10000.ckpt
/Users/edison/Git/pytorch-playground/myenv/lib/python3.11/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:360: The dirpath has changed from '/Users/edison/Git/pytorch-playground/lightning_logs/version_2/checkpoints' to '/Users/edison/Git/pytorch-playground/lightning_logs/version_3/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     T

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=6000` reached.


# Now fit a LSTM with Lightning and nn Module

In [None]:
class LightningLSTM(L.LightningModule):

    def __init__(self):

        super().__init__()

        # input_size states the number of features and hidden_size states the dimension of the outcome, most of the time it is larger than 1 as it will be used as an input to another MLP layers
        self.lstm = nn.LSTM(input_size=1, hidden_size=1)

    
    def forward(self, input):

        input_trans = input.view(len(input), 1)

        lstm_out, temp = self.lstm(input_trans)

        prediction = lstm_out[-1]

        return prediction
    
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.1)
    
    def training_step(self, batch, batch_idx):
        input_i, label_i = batch
        
        output_i = self(input_i[0])
        loss = (output_i - label_i) ** 2

        # Log the training loss after each training step
        self.log("train_loss", loss)

        # Log the predictions for the corresponding companies
        if (label_i == 0):
            self.log("out_0", output_i)
        else:
            self.log("out_1", output_i)

        return loss


In [None]:
# Let's try to see the model performance without training (maybe we are lucky)
model = LightningLSTM()

print("Now let's compare the observed and predicted values...")
print("Company A: Observed value = 0, Predicted =",
      # Need to detach the model weights and biases as their require_grad's are set to true
      model(torch.tensor([0., 0.5, 0.25, 1.])).detach())

print("Now let's compare the observed and predicted values...")
print("Company B: Observed value = 1, Predicted =",
      model(torch.tensor([1., 0.5, 0.25, 1.])).detach())

Now let's compare the observed and predicted values...
Company A: Observed value = 0, Predicted = tensor([0.0245])
Now let's compare the observed and predicted values...
Company B: Observed value = 1, Predicted = tensor([0.0201])


In [None]:
# The results are terrible, hence we need to train the model

# Create a trainer to train the model, max_epochs = 300 is enough this time as the learning rate is set to be larger
trainer = L.Trainer(max_epochs=300, log_every_n_steps=2)
trainer.fit(model, train_dataloaders=dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type | Params | Mode 
--------------------------------------
0 | lstm | LSTM | 16     | train
--------------------------------------
16        Trainable params
0         Non-trainable params
16        Total params
0.000     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


In [None]:
# Let's try to see the model performance now after training
print("Now let's compare the observed and predicted values...")
print("Company A: Observed value = 0, Predicted =",
      # Need to detach the model weights and biases as their require_grad's are set to true
      model(torch.tensor([0., 0.5, 0.25, 1.])).detach())

print("Now let's compare the observed and predicted values...")
print("Company B: Observed value = 1, Predicted =",
      model(torch.tensor([1., 0.5, 0.25, 1.])).detach())

Now let's compare the observed and predicted values...
Company A: Observed value = 0, Predicted = tensor([7.3010e-05])
Now let's compare the observed and predicted values...
Company B: Observed value = 1, Predicted = tensor([0.9787])


# Create a simple LSTM by hand again (batch_size larger than 1)

In [215]:
class LSTMbyHand_batch(L.LightningModule):

    def __init__(self):

        super().__init__()

        # Set up the mean and std for the normal distribution which we will use to initialise the weights
        mean = torch.tensor(0.0)
        std = torch.tensor(1.0)

        self.wlr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wlr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.blr1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wpr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wpr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bpr1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wp1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wp2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bp1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wo1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wo2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bo1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

    def lstm_unit(self, input_value, long_memory, short_memory):
        
        long_remember_percent = torch.sigmoid((short_memory * self.wlr1) + (input_value * self.wlr2) + self.blr1)

        potential_remember_percent = torch.sigmoid((short_memory * self.wpr1) + (input_value * self.wpr2) + self.bpr1)

        potential_memory = torch.tanh((short_memory * self.wp1) + (input_value * self.wp2) + self.bp1)

        updated_long_memory = ((long_memory * long_remember_percent) + (potential_memory * potential_remember_percent))

        output_percent = torch.sigmoid((short_memory * self.wo1) + (input_value * self.wo2) + self.bo1)

        updated_short_memory = torch.tanh(updated_long_memory) * output_percent

        return ([updated_long_memory, updated_short_memory])

    def forward(self, input):
        
        device = input.device
        
        # Input is a 2D tensor (size [2, 4], i.e. batch_size = 2, seq_length = 4)
        batch_size = input.size()[0]
        seq_len = input.size()[1]

        # By default, the zero tensors are stored in the cpu, so we need to move them into the gpu as the inputs and parameters tensors are in the gpu
        long_memory = torch.zeros(batch_size).to(device)
        short_memory = torch.zeros(batch_size).to(device)

        # We keep the assumption that the lstm_unit method will only take a 0D tensor (scalar) as an input, we need a double loop to deal with a 2D tensor here
        # Seq_len should be the outer loop
        for i in range(seq_len):
            
            # Avoid in-place ops
            updated_long_memory = long_memory.clone()
            updated_short_memory = short_memory.clone()

            for j in range(batch_size):
                
                # The input values on each day could be different across samples
                day_i = input[j, i]

                updated_long_memory[j], updated_short_memory[j] = self.lstm_unit(day_i, long_memory[j], short_memory[j])

            # Update the original memory tensor by reassigning the values of the clone tensors to them
            long_memory = updated_long_memory
            short_memory = updated_short_memory

        return short_memory

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.1)
    
    def training_step(self, batch, batch_idx):
        input, label = batch
        
        # No indexing is needed now as the forward method can take batches of size larger than 1
        output = self(input)
        
        # Output and label are both 1D vector of size [batch_size]
        loss = torch.sum((output - label) ** 2)

        return loss

In [216]:
# Let's try to see if our model can take batches with batch size larger than 1 now
model = LSTMbyHand_batch()

print("Now let's compare the observed and predicted values...")
print("Company A: Observed value = 0; Company B: Observed value = 1, Predicted =",
      # Need to add an extra sqaure bracket now as the forward function is supposed to take batches as input
      model(torch.tensor([[0., 0.5, 0.25, 1.], 
                          [1., 0.5, 0.25, 1.]]))
                          .detach())


Now let's compare the observed and predicted values...
Company A: Observed value = 0; Company B: Observed value = 1, Predicted = tensor([0.0328, 0.0357])


In [150]:
# Rewrite the lstm_unit method so that it can take batches (2D tensor) as an input
class LSTMbyHand_batch_vec(L.LightningModule):

    def __init__(self):

        super().__init__()

        # Set up the mean and std for the normal distribution which we will use to initialise the weights
        mean = torch.tensor(0.0)
        std = torch.tensor(1.0)

        self.wlr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wlr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.blr1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wpr1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wpr2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bpr1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wp1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wp2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bp1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

        self.wo1 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.wo2 = nn.Parameter(torch.normal(mean=mean, std=std), requires_grad=True)
        self.bo1 = nn.Parameter(torch.tensor(0.0), requires_grad=True)

    # Vectorise the operations - input_value is supposed to be a 1D tensor (e.g. tensor([1, 2]), size [batch_size (2)]) now; long_term memory and short_term memory are 1D tensor (e.g. tensor([0, 0]), size [batch_size (2)])
    def lstm_unit(self, input_value, long_memory, short_memory):
        
        long_remember_percent = torch.sigmoid((short_memory * self.wlr1) + (input_value * self.wlr2) + self.blr1)

        potential_remember_percent = torch.sigmoid((short_memory * self.wpr1) + (input_value * self.wpr2) + self.bpr1)

        potential_memory = torch.tanh((short_memory * self.wp1) + (input_value * self.wp2) + self.bp1)

        updated_long_memory = ((long_memory * long_remember_percent) + (potential_memory * potential_remember_percent))

        output_percent = torch.sigmoid((short_memory * self.wo1) + (input_value * self.wo2) + self.bo1)

        updated_short_memory = torch.tanh(updated_long_memory) * output_percent

        return ([updated_long_memory, updated_short_memory])

    def forward(self, input):
        
        device = input.device
        
        # Input is a 2D tensor (e.g. tensor([[1, 2, 3, 4], [5, 6, 7, 8]]), size [2,4], i.e. batch_size = 2, seq_length = 4)
        batch_size = input.size()[0]
        seq_len = input.size()[1]

        # By default, the zero tensors are stored in the cpu, so we need to move them into the gpu as the inputs and parameters tensors are in the gpu
        long_memory = torch.zeros(batch_size).to(device)
        short_memory = torch.zeros(batch_size).to(device)

        # Seq_len should be the outer loop
        for i in range(seq_len):
            
            # Cloning is not needed anymore because now the lstm_unit method can process all samples in the batch at the same time
            # updated_long_memory = long_memory.clone()
            # updated_short_memory = short_memory.clone()

            # It will return a 1D tensor (e.g. tensor([1, 5]), size [2]
            day_i = input[:, i]

            long_memory, short_memory = self.lstm_unit(day_i, long_memory, short_memory)

        return short_memory

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.1)
    
    def training_step(self, batch, batch_idx):
        input, label = batch
        
        # No indexing is needed now as the forward method can take batches of batch size larger than 1
        output = self(input)
        
        # Output and label are both 1D vector of size [batch_size]
        loss = torch.sum((output - label) ** 2)

        return loss

In [158]:
# Let's try to see if our model can take batches with batch size larger than 1 now
model = LSTMbyHand_batch_vec()

print("Now let's compare the observed and predicted values...")
print("Company A: Observed value = 0; Company B: Observed value = 1, Predicted =",
      # Need to add an extra sqaure bracket now as the forward function is supposed to take batches as input
      model(torch.tensor([[0., 0.5, 0.25, 1.], [1., 0.5, 0.25, 1.]])).detach())


Now let's compare the observed and predicted values...
Company A: Observed value = 0; Company B: Observed value = 1, Predicted = tensor([0.4616, 0.4913])


In [143]:
# Create a training data set
inputs = torch.tensor([[0., 0.5, 0.25, 1.], [1., 0.5, 0.25, 1.]])
labels = torch.tensor([0., 1.])

dataset = TensorDataset(inputs, labels)
# Explicitly set the batch_size to 2
dataloader = DataLoader(dataset, batch_size=2)

In [155]:
# Create a trainer to train the model
trainer = L.Trainer(max_epochs=300, logger=False)
trainer.fit(model, dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=300` reached.


In [156]:
# Let's take a look at the predicions again
print("Now let's compare the observed and predicted values...")
print("Company A: Observed value = 0, Predicted =",
      # Need to add an extra sqaure bracket now as the forward function is supposed to take batches as input
      model(torch.tensor([[0., 0.5, 0.25, 1.]])).detach())

print("Now let's compare the observed and predicted values...")
print("Company B: Observed value = 1, Predicted =",
      model(torch.tensor([[1., 0.5, 0.25, 1.]])).detach())

Now let's compare the observed and predicted values...
Company A: Observed value = 0, Predicted = tensor([2.9159e-06])
Now let's compare the observed and predicted values...
Company B: Observed value = 1, Predicted = tensor([0.9973])


# Now fit a LSTM with Lightning and nn Module with batch size larget than 1 (batch_first = True)

In [210]:
class LightningLSTM_BatchFirst_true(L.LightningModule):

    def __init__(self, input_size=1, hidden_size=1):

        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # input_size states the number of features and hidden_size states the dimension of the outcome, most of the time it is larger than 1 as it will be used as an input to another MLP layers
        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True)

    def forward(self, input):

        batch_size = input.size()[0]
        seq_len = input.size()[1]

        # Pytorch's nn.LSTM requires the shape of the input tensor to be (assuming batch_first = True) [batch_size, seq_len, input_size (no. of feature)]
        input_trans = input.view(batch_size, seq_len, self.input_size)

        lstm_out, temp = self.lstm(input_trans)

        # Because of batch_first is set to True, the shape of lstm_out becomes: [batch_size, seq_len, hidden_size], hence the shape of prediction is: [batch_size, hidden_size (=1)]
        prediction = lstm_out[:, -1, :]

        # Simplify the shape of prediction by turning it into a 1D vector because the size of the last dimension (hidden_size) is 1 anyway
        prediction = prediction.squeeze(-1)

        return prediction
    
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.1)
    
    def training_step(self, batch, batch_idx):
        input_i, label_i = batch
        
        output_i = self(input_i[0])
        loss = (output_i - label_i) ** 2

        # Log the training loss after each training step
        self.log("train_loss", loss)

        # Log the predictions for the corresponding companies
        if (label_i == 0):
            self.log("out_0", output_i)
        else:
            self.log("out_1", output_i)

        return loss


In [213]:
model = LightningLSTM_BatchFirst_true()

print("Now let's compare the observed and predicted values...")
print("Company A: Observed value = 0; Company B: Observed value = 1, Predicted =",
      # Need to add an extra sqaure bracket now as the forward function is supposed to take batches as input
      model(torch.tensor([[0., 0.5, 0.25, 1.], [1., 0.5, 0.25, 1.]])).detach())


Now let's compare the observed and predicted values...
Company A: Observed value = 0; Company B: Observed value = 1, Predicted = tensor([0.0586, 0.0844])


In [212]:
class LightningLSTM_BatchFirst_false(L.LightningModule):

    def __init__(self, input_size=1, hidden_size=1):

        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # Batch_first is set to False by default
        self.lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size)

    def forward(self, input):

        batch_size = input.size()[0]
        seq_len = input.size()[1]

        # Pytorch's nn.LSTM requires the shape of the input tensor to be (assuming batch_first = False) [seq_len, batch_size, input_size (no. of feature)]
        input_trans = input.view(seq_len, batch_size, self.input_size)

        lstm_out, temp = self.lstm(input_trans)

        # Because of batch_first is set to False, the shape of lstm_out becomes: [seq_len, batch_size, hidden_size], hence we do the slicing at the first index (seq_len)
        prediction = lstm_out[-1]

        # Simplify the shape of prediction by turning it into a 1D vector because the size of the last dimension (hidden_size) is 1 anyway
        prediction = prediction.squeeze(-1)

        return prediction
    
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.1)
    
    def training_step(self, batch, batch_idx):
        input_i, label_i = batch
        
        output_i = self(input_i[0])
        loss = (output_i - label_i) ** 2

        # Log the training loss after each training step
        self.log("train_loss", loss)

        # Log the predictions for the corresponding companies
        if (label_i == 0):
            self.log("out_0", output_i)
        else:
            self.log("out_1", output_i)

        return loss


In [214]:
model = LightningLSTM_BatchFirst_false()

print("Now let's compare the observed and predicted values...")
print("Company A: Observed value = 0; Company B: Observed value = 1, Predicted =",
      # Need to add an extra sqaure bracket now as the forward function is supposed to take batches as input
      model(torch.tensor([[0., 0.5, 0.25, 1.], [1., 0.5, 0.25, 1.]])).detach())


Now let's compare the observed and predicted values...
Company A: Observed value = 0; Company B: Observed value = 1, Predicted = tensor([-0.3613, -0.2819])
