# Question 3: LSTM

Following question 1 (2), train a sequence to sequence (e.g., 4 timesteps to 4 timesteps) predictive model (e.g., LSTM)  in the reduced space, and decode predicted results in the full space. Evaluate your algorithm performance on the test dataset using different metrics (e.g., MSE, RMSE, SSIM…).

Reference: https://towardsdatascience.com/multivariate-time-series-forecasting-with-deep-learning-3e7b3e2d2bcf

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

### Video sequence loading

In [2]:
# load the data as sequences
import os
import cv2 as cv
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [3]:
# custom dataset to process videos
class VideoDataset(Dataset):
  def __init__(self, folder_path, sequence_length=4, video_length=16, transform=None):
    self.folder_path = folder_path
    self.sequence_length = sequence_length
    self.video_length = video_length
    self.transform = transform
    self.video_list = os.listdir(folder_path)

  def __len__(self):
    return len(self.video_list) * ((self.video_length // self.sequence_length) - 1) # - 1 because we don't want to train on the final sequence - no target sequence!

  def __getitem__(self, idx):
    video_idx = idx // (self.sequence_length - 1)
    frame_idx = idx % (self.sequence_length - 1)
    video_path = os.path.join(self.folder_path, self.video_list[video_idx])
    frames = self._load_video(video_path)
    input_seq, target_seq = self._get_sequence_pair(frames, frame_idx)
    return torch.tensor(input_seq, dtype=torch.float32), torch.tensor(target_seq, dtype=torch.float32)

  def _load_video(self, video_path):
    frames = []
    cap = cv.VideoCapture(video_path)
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      frame = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
      frames.append(frame.flatten())
    cap.release()

    frames = np.array(frames) # should be 16 x (128*128) i.e. 16 frames, each 128x128 frame flattened to 1d
    return frames

  def _get_sequence_pair(self, frames, frame_idx):
    input_seq = np.array(frames[frame_idx:frame_idx+self.sequence_length])
    target_seq = np.array(frames[frame_idx+self.sequence_length:frame_idx+2 * self.sequence_length])
    return input_seq, target_seq

In [4]:
transform = transforms.Compose([
    transforms.ToTensor(),
])

batch_size = 1
sequence_length = 4
folder_path = "./VIDEOS/training/"

video_dataset = VideoDataset(folder_path, sequence_length=sequence_length, transform=transform)
train_loader = DataLoader(video_dataset, batch_size=batch_size, shuffle=True)

### Seq2Seq model definition

In [11]:
# Define the seq2seq model
class Seq2SeqModel(nn.Module):
  def __init__(self, input_dim, latent_dim, num_layers=1):
    super(Seq2SeqModel, self).__init__()

    self.encoder = nn.LSTM(input_dim, latent_dim, num_layers, batch_first=True)
    self.decoder = nn.LSTM(latent_dim, input_dim, num_layers, batch_first=True)

  def forward(self, input_seq):
    # Encoder
    _, (last_hidden, _) = self.encoder(input_seq)
    
    # reshape for batch
    encoded = last_hidden.repeat(1, input_seq.size(1), 1)

    # Decoder
    decoder_output, _ = self.decoder(encoded)
    
    return decoder_output

### Seq2Seq model training

In [12]:
# Try to use GPU if available
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)

cpu


In [13]:
# define function to train the model
def train(model, train_data, criterion, optimizer, epochs):
  model.train()
  for epoch in range(epochs):
    running_loss = 0.0
    for input_seq, target_seq in train_data:
      input_seq, target_seq = input_seq.to(device), target_seq.to(device)
      optimizer.zero_grad()
      output_seq = model(input_seq)
      loss = criterion(output_seq, target_seq)
      loss.backward()
      optimizer.step()
      running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_data)}")

In [14]:
# Set hyperparameters
input_dim = 128*128
latent_dim = 64
num_layers = 1
learning_rate = 0.001
num_epochs = 10

In [15]:
# Initialize model, loss function, and optimizer
model = Seq2SeqModel(input_dim, latent_dim, num_layers)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [16]:
# Train the model
train(model, train_loader, criterion, optimizer, num_epochs)

torch.Size([32, 4, 64])


  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 4299161600 bytes.