#### nltk library installation for BLEU

In [36]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


### Extracting Dataset

In [1]:
import tarfile

# Path to dataset file
dataset_path = 'MLDS_hw2_1_data.tar.gz'

with tarfile.open(dataset_path, 'r:gz') as tar:
    tar.extractall(path='data')

### Loads captions and feature files

In [93]:
import os
import json
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class VideoCaptionDataset(Dataset):
    def __init__(self, id_file, feat_folder, captions_data, vocab):
        self.feat_folder = feat_folder
        self.vocab = vocab
        
        # Load video IDs from id.txt
        with open(id_file, 'r') as f:
            self.video_ids = [line.strip() for line in f.readlines()]
        
        # Numericalize captions
        self.captions_data = {item['id']: self.vocab.numericalize(item['caption'][0]) for item in captions_data}

    def __len__(self):
        return len(self.video_ids)
    
    def __getitem__(self, idx):
        # Get the video ID
        video_id = self.video_ids[idx]
        
        # Load the precomputed features from the .npy file
        feat_path = os.path.join(self.feat_folder, video_id + '.npy')
        features = np.load(feat_path)
        features = torch.tensor(features, dtype=torch.float32)
        
        # Get the corresponding numericalized caption and convert to tensor
        caption = torch.tensor(self.captions_data[video_id], dtype=torch.long)
        
        return features, caption
    

In [94]:
import torch.nn.functional as F

def pad_collate_fn(batch):

    
    batch_features, batch_captions = zip(*batch)
    
    # Stack the features
    batch_features = torch.stack(batch_features, dim=0)
    
    # Find the length of the longest caption
    max_length = max([len(caption) for caption in batch_captions])
    
    # Pad all captions to the same length with <PAD> token
    padded_captions = []
    for caption in batch_captions:
        padded_caption = F.pad(torch.tensor(caption), (0, max_length - len(caption)), value=vocab.word2idx[vocab.pad_token])
        padded_captions.append(padded_caption)
    
    # Stack padded captions
    padded_captions = torch.stack(padded_captions, dim=0)
    
    return batch_features, padded_captions


### DataLoader for Training and Testing

In [95]:
# Paths to the training data
train_id_file = './data/MLDS_hw2_1_data/training_data/id.txt'
train_feat_folder = './data/MLDS_hw2_1_data/training_data/feat'
train_captions_file = './data/MLDS_hw2_1_data/training_label.json'

# Paths to the testing data
test_id_file = './data/MLDS_hw2_1_data/testing_data/id.txt'
test_feat_folder = './data/MLDS_hw2_1_data/testing_data/feat'
test_captions_file = './data/MLDS_hw2_1_data/testing_label.json'

# Load captions JSON file for training
with open(train_captions_file, 'r') as f:
    train_captions = json.load(f)

# Load captions JSON file for testing
with open(test_captions_file, 'r') as f:
    test_captions = json.load(f)

# Creates the vocabulary and build it from the captions
vocab = Vocabulary(min_freq=4)
all_captions = list(itertools.chain.from_iterable([item['caption'] for item in train_captions]))
vocab.build_vocab(all_captions)

# dataset and dataloader for training
train_dataset = VideoCaptionDataset(id_file=train_id_file, feat_folder=train_feat_folder, captions_data=train_captions, vocab=vocab)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=pad_collate_fn)

# dataset and dataloader for testing
test_dataset = VideoCaptionDataset(id_file=test_id_file, feat_folder=test_feat_folder, captions_data=test_captions, vocab=vocab)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=pad_collate_fn)


### Vocabulary

#### Vocabulary Class

In [96]:
from collections import Counter
import itertools

class Vocabulary:
    def __init__(self, min_freq=1):
        self.word2idx = {}
        self.idx2word = {}
        self.min_freq = min_freq
        self.pad_token = "<PAD>"
        self.bos_token = "<BOS>"
        self.eos_token = "<EOS>"
        self.unk_token = "<UNK>"
        self.special_tokens = [self.pad_token, self.bos_token, self.eos_token, self.unk_token]
        self.word_counter = Counter()

    def build_vocab(self, captions):
        for caption in captions:
            tokens = caption.split() 
            self.word_counter.update(tokens)
        

        idx = 0
        for token in self.special_tokens:
            self.word2idx[token] = idx
            self.idx2word[idx] = token
            idx += 1
        

        for word, count in self.word_counter.items():
            if count >= self.min_freq:
                self.word2idx[word] = idx
                self.idx2word[idx] = word
                idx += 1

    def numericalize(self, text):

        return [self.word2idx.get(word, self.word2idx[self.unk_token]) for word in text.split()]

    def denumericalize(self, indices):

        return [self.idx2word.get(idx, self.unk_token) for idx in indices]

#### Builds vocabulary from the captions

In [97]:
# Extracts all captions from the JSON data
all_captions = list(itertools.chain.from_iterable([item['caption'] for item in train_captions]))

# Creates the vocabulary and build it from the captions
vocab = Vocabulary(min_freq=3)
vocab.build_vocab(all_captions)

print(f"Vocabulary size: {len(vocab.word2idx)}")

Vocabulary size: 3594


#### Numericalizes the captions

In [98]:
# Extracts all captions from the JSON data
all_captions = list(itertools.chain.from_iterable([item['caption'] for item in train_captions]))

# Creates the vocabulary and build it from the captions
vocab = Vocabulary(min_freq=3)
vocab.build_vocab(all_captions)

print(f"Vocabulary size: {len(vocab.word2idx)}")


Vocabulary size: 3594


### Testing dataloader

In [99]:
# Test the training DataLoader
for i, (features, captions) in enumerate(train_dataloader):
    print(f'Batch {i+1}')
    print(f'Feature shape: {features.shape}')
    print(f'Captions: {captions}') 
    break  # to check the first batch


Batch 1
Feature shape: torch.Size([4, 80, 4096])
Captions: tensor([[   4,  185,   15, 1049,  868,    0,    0,    0],
        [   4,   57,   15,  273, 1096,  148,    8,    3],
        [   4,  117,   15,  158,   31,    8,  551,    0],
        [   4, 2073,   15,   51,  603,   31,    8,  967]])


  padded_caption = F.pad(torch.tensor(caption), (0, max_length - len(caption)), value=vocab.word2idx[vocab.pad_token])


### Encoder

In [100]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, feature_size, hidden_size):
        super(Encoder, self).__init__()
        self.fc = nn.Linear(feature_size, hidden_size)
        self.relu = nn.ReLU()

    def forward(self, features):
        # features: (batch_size, num_frames, feature_size)
        features = self.fc(features) 
        features = self.relu(features)
        return features

### Decoder

In [101]:
class DecoderWithAttention(nn.Module):
    def __init__(self, hidden_size, vocab_size, embedding_size, num_layers=1):
        super(DecoderWithAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size + hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.attention = nn.Linear(hidden_size, hidden_size)

    def forward(self, features, captions, hidden=None):
        batch_size = features.size(0)
        num_frames = features.size(1)
        hidden_size = features.size(2)
        

        if hidden is None:
            h_0 = torch.zeros(1, batch_size, hidden_size).to(features.device)  
            c_0 = torch.zeros(1, batch_size, hidden_size).to(features.device)  
            hidden = (h_0, c_0)
        
        # Embed the captions
        embeddings = self.embedding(captions)  
        
        outputs = []
        
        for t in range(embeddings.size(1)):
            # Apply attention over the video features
            attention_weights = torch.bmm(features, hidden[0][-1].unsqueeze(2)).squeeze(2) 
            attention_weights = torch.softmax(attention_weights, dim=1)
            attention_applied = torch.bmm(attention_weights.unsqueeze(1), features)  
            
            # Concatenate the attention-applied video features with the current word embedding
            lstm_input = torch.cat((attention_applied.squeeze(1), embeddings[:, t, :]), dim=1) 
            
            # Pass through LSTM
            lstm_output, hidden = self.lstm(lstm_input.unsqueeze(1), hidden)  
            
            # Generate the output (next word prediction)
            output = self.fc(lstm_output.squeeze(1))
            outputs.append(output)
        
        # Stack outputs along the time dimension
        outputs = torch.stack(outputs, dim=1) 
        
        return outputs, hidden


### Seq2Seq

In [102]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, features, captions):
        # Passes the features through the encoder
        encoder_outputs = self.encoder(features)
        
        # Passes the encoded features and captions to the decoder
        outputs, _ = self.decoder(encoder_outputs, captions) 
        
        return outputs


In [103]:
import torch.optim as optim
import torch.nn as nn

# Hyperparameters
feature_size = 4096  # feature size
hidden_size = 512    # size of hidden state in the LSTM
vocab_size = len(vocab.word2idx)  # Size of the vocabulary
embedding_size = 256  # Size of the word embeddings
num_layers = 1  # Number of LSTM layers

# Initialize encoder and decoder
encoder = Encoder(feature_size, hidden_size)
decoder = DecoderWithAttention(hidden_size, vocab_size, embedding_size, num_layers)

# Create Seq2Seq model
seq2seq_model = Seq2Seq(encoder, decoder)

criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx[vocab.pad_token])
optimizer = optim.Adam(seq2seq_model.parameters(), lr=0.00005)


### Model Training

In [85]:
num_epochs = 100
seq2seq_model.train()

for epoch in range(num_epochs):
    for i, (features, captions) in enumerate(train_dataloader):
        optimizer.zero_grad()

        # Forward pass
        outputs = seq2seq_model(features, captions[:, :-1])
        targets = captions[:, 1:].reshape(-1)
        
        # flattens the outputs for loss calculation
        outputs = outputs.view(-1, outputs.size(-1))
        
        # computes the loss
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}, Step {i+1}, Loss: {loss.item()}')


  padded_caption = F.pad(torch.tensor(caption), (0, max_length - len(caption)), value=vocab.word2idx[vocab.pad_token])


Epoch 1/100, Step 1, Loss: 8.02110481262207
Epoch 1/100, Step 11, Loss: 7.864769458770752
Epoch 1/100, Step 21, Loss: 7.748021602630615
Epoch 1/100, Step 31, Loss: 7.611619472503662
Epoch 1/100, Step 41, Loss: 7.205143451690674
Epoch 1/100, Step 51, Loss: 7.001648902893066
Epoch 1/100, Step 61, Loss: 6.374505996704102
Epoch 1/100, Step 71, Loss: 6.467390060424805
Epoch 1/100, Step 81, Loss: 6.102726459503174
Epoch 1/100, Step 91, Loss: 6.46403694152832
Epoch 1/100, Step 101, Loss: 5.951260089874268
Epoch 1/100, Step 111, Loss: 5.48821496963501
Epoch 1/100, Step 121, Loss: 5.579092025756836
Epoch 1/100, Step 131, Loss: 5.6846795082092285
Epoch 1/100, Step 141, Loss: 5.632481098175049
Epoch 1/100, Step 151, Loss: 5.295943260192871
Epoch 1/100, Step 161, Loss: 6.054490566253662
Epoch 1/100, Step 171, Loss: 4.541777610778809
Epoch 1/100, Step 181, Loss: 4.910405158996582
Epoch 1/100, Step 191, Loss: 5.198885440826416
Epoch 1/100, Step 201, Loss: 5.6328020095825195
Epoch 1/100, Step 211, Lo

Epoch 5/100, Step 281, Loss: 4.439336776733398
Epoch 5/100, Step 291, Loss: 4.118088245391846
Epoch 5/100, Step 301, Loss: 4.35374641418457
Epoch 5/100, Step 311, Loss: 4.140695095062256
Epoch 5/100, Step 321, Loss: 4.83750581741333
Epoch 5/100, Step 331, Loss: 4.868332862854004
Epoch 5/100, Step 341, Loss: 4.352811813354492
Epoch 5/100, Step 351, Loss: 5.0759077072143555
Epoch 5/100, Step 361, Loss: 4.855072975158691
Epoch 6/100, Step 1, Loss: 3.718043804168701
Epoch 6/100, Step 11, Loss: 4.647392272949219
Epoch 6/100, Step 21, Loss: 4.5905632972717285
Epoch 6/100, Step 31, Loss: 4.913887977600098
Epoch 6/100, Step 41, Loss: 4.607448101043701
Epoch 6/100, Step 51, Loss: 4.435571193695068
Epoch 6/100, Step 61, Loss: 4.495782375335693
Epoch 6/100, Step 71, Loss: 4.603936195373535
Epoch 6/100, Step 81, Loss: 4.718899250030518
Epoch 6/100, Step 91, Loss: 4.880528450012207
Epoch 6/100, Step 101, Loss: 4.410231590270996
Epoch 6/100, Step 111, Loss: 4.568432807922363
Epoch 6/100, Step 121, L

Epoch 10/100, Step 181, Loss: 3.433279514312744
Epoch 10/100, Step 191, Loss: 4.651848316192627
Epoch 10/100, Step 201, Loss: 4.2104105949401855
Epoch 10/100, Step 211, Loss: 3.5983073711395264
Epoch 10/100, Step 221, Loss: 3.4761078357696533
Epoch 10/100, Step 231, Loss: 3.55800724029541
Epoch 10/100, Step 241, Loss: 3.6871140003204346
Epoch 10/100, Step 251, Loss: 4.039853572845459
Epoch 10/100, Step 261, Loss: 3.794186592102051
Epoch 10/100, Step 271, Loss: 3.5654118061065674
Epoch 10/100, Step 281, Loss: 3.6807358264923096
Epoch 10/100, Step 291, Loss: 4.493956089019775
Epoch 10/100, Step 301, Loss: 3.8432729244232178
Epoch 10/100, Step 311, Loss: 4.189023017883301
Epoch 10/100, Step 321, Loss: 3.8341801166534424
Epoch 10/100, Step 331, Loss: 3.4202959537506104
Epoch 10/100, Step 341, Loss: 3.429445743560791
Epoch 10/100, Step 351, Loss: 3.994781732559204
Epoch 10/100, Step 361, Loss: 3.885603189468384
Epoch 11/100, Step 1, Loss: 4.19248104095459
Epoch 11/100, Step 11, Loss: 4.0041

Epoch 15/100, Step 41, Loss: 3.7060956954956055
Epoch 15/100, Step 51, Loss: 3.558755397796631
Epoch 15/100, Step 61, Loss: 2.9089372158050537
Epoch 15/100, Step 71, Loss: 3.4249231815338135
Epoch 15/100, Step 81, Loss: 3.2281486988067627
Epoch 15/100, Step 91, Loss: 3.5786423683166504
Epoch 15/100, Step 101, Loss: 3.23661208152771
Epoch 15/100, Step 111, Loss: 2.9900150299072266
Epoch 15/100, Step 121, Loss: 2.8833954334259033
Epoch 15/100, Step 131, Loss: 2.7528951168060303
Epoch 15/100, Step 141, Loss: 2.829198122024536
Epoch 15/100, Step 151, Loss: 3.0560312271118164
Epoch 15/100, Step 161, Loss: 2.23795747756958
Epoch 15/100, Step 171, Loss: 3.145447254180908
Epoch 15/100, Step 181, Loss: 3.632474184036255
Epoch 15/100, Step 191, Loss: 3.937760829925537
Epoch 15/100, Step 201, Loss: 2.9670252799987793
Epoch 15/100, Step 211, Loss: 3.1272683143615723
Epoch 15/100, Step 221, Loss: 3.7986674308776855
Epoch 15/100, Step 231, Loss: 2.9441683292388916
Epoch 15/100, Step 241, Loss: 3.754

Epoch 19/100, Step 261, Loss: 2.29032564163208
Epoch 19/100, Step 271, Loss: 2.7634308338165283
Epoch 19/100, Step 281, Loss: 2.3991410732269287
Epoch 19/100, Step 291, Loss: 3.1455435752868652
Epoch 19/100, Step 301, Loss: 2.346346139907837
Epoch 19/100, Step 311, Loss: 3.4039244651794434
Epoch 19/100, Step 321, Loss: 2.9404807090759277
Epoch 19/100, Step 331, Loss: 2.3965885639190674
Epoch 19/100, Step 341, Loss: 2.047685146331787
Epoch 19/100, Step 351, Loss: 3.2175233364105225
Epoch 19/100, Step 361, Loss: 3.1747334003448486
Epoch 20/100, Step 1, Loss: 2.6091017723083496
Epoch 20/100, Step 11, Loss: 3.0474154949188232
Epoch 20/100, Step 21, Loss: 2.113675594329834
Epoch 20/100, Step 31, Loss: 2.347351312637329
Epoch 20/100, Step 41, Loss: 2.237776517868042
Epoch 20/100, Step 51, Loss: 2.8153810501098633
Epoch 20/100, Step 61, Loss: 2.6249876022338867
Epoch 20/100, Step 71, Loss: 2.7033579349517822
Epoch 20/100, Step 81, Loss: 3.120803117752075
Epoch 20/100, Step 91, Loss: 2.8095893

Epoch 24/100, Step 121, Loss: 1.5018043518066406
Epoch 24/100, Step 131, Loss: 1.9742450714111328
Epoch 24/100, Step 141, Loss: 2.3827602863311768
Epoch 24/100, Step 151, Loss: 1.5930135250091553
Epoch 24/100, Step 161, Loss: 2.8260014057159424
Epoch 24/100, Step 171, Loss: 1.4859927892684937
Epoch 24/100, Step 181, Loss: 1.777292251586914
Epoch 24/100, Step 191, Loss: 2.0554990768432617
Epoch 24/100, Step 201, Loss: 2.668644428253174
Epoch 24/100, Step 211, Loss: 2.5380334854125977
Epoch 24/100, Step 221, Loss: 2.0837066173553467
Epoch 24/100, Step 231, Loss: 2.0615506172180176
Epoch 24/100, Step 241, Loss: 2.5047855377197266
Epoch 24/100, Step 251, Loss: 2.7849576473236084
Epoch 24/100, Step 261, Loss: 2.0214157104492188
Epoch 24/100, Step 271, Loss: 2.034599781036377
Epoch 24/100, Step 281, Loss: 1.8874285221099854
Epoch 24/100, Step 291, Loss: 2.6097512245178223
Epoch 24/100, Step 301, Loss: 2.4113290309906006
Epoch 24/100, Step 311, Loss: 1.4095110893249512
Epoch 24/100, Step 321,

Epoch 28/100, Step 341, Loss: 1.8501349687576294
Epoch 28/100, Step 351, Loss: 1.3280413150787354
Epoch 28/100, Step 361, Loss: 1.849678874015808
Epoch 29/100, Step 1, Loss: 2.28460693359375
Epoch 29/100, Step 11, Loss: 2.7102251052856445
Epoch 29/100, Step 21, Loss: 2.336930751800537
Epoch 29/100, Step 31, Loss: 1.5581122636795044
Epoch 29/100, Step 41, Loss: 1.2602171897888184
Epoch 29/100, Step 51, Loss: 1.9756364822387695
Epoch 29/100, Step 61, Loss: 1.797582745552063
Epoch 29/100, Step 71, Loss: 1.500029444694519
Epoch 29/100, Step 81, Loss: 2.1754848957061768
Epoch 29/100, Step 91, Loss: 1.9700956344604492
Epoch 29/100, Step 101, Loss: 1.3857269287109375
Epoch 29/100, Step 111, Loss: 2.20163893699646
Epoch 29/100, Step 121, Loss: 2.064619541168213
Epoch 29/100, Step 131, Loss: 2.1262166500091553
Epoch 29/100, Step 141, Loss: 1.4202625751495361
Epoch 29/100, Step 151, Loss: 1.3071367740631104
Epoch 29/100, Step 161, Loss: 2.2709224224090576
Epoch 29/100, Step 171, Loss: 2.43939495

Epoch 33/100, Step 191, Loss: 1.693976879119873
Epoch 33/100, Step 201, Loss: 0.9707023501396179
Epoch 33/100, Step 211, Loss: 1.4501733779907227
Epoch 33/100, Step 221, Loss: 1.3485803604125977
Epoch 33/100, Step 231, Loss: 1.2469815015792847
Epoch 33/100, Step 241, Loss: 1.4300544261932373
Epoch 33/100, Step 251, Loss: 1.6326584815979004
Epoch 33/100, Step 261, Loss: 1.7960292100906372
Epoch 33/100, Step 271, Loss: 1.1495803594589233
Epoch 33/100, Step 281, Loss: 0.9055989384651184
Epoch 33/100, Step 291, Loss: 1.5467348098754883
Epoch 33/100, Step 301, Loss: 1.6585365533828735
Epoch 33/100, Step 311, Loss: 1.4232083559036255
Epoch 33/100, Step 321, Loss: 1.319531798362732
Epoch 33/100, Step 331, Loss: 1.7763291597366333
Epoch 33/100, Step 341, Loss: 0.9498095512390137
Epoch 33/100, Step 351, Loss: 0.9406037330627441
Epoch 33/100, Step 361, Loss: 1.5985623598098755
Epoch 34/100, Step 1, Loss: 1.3483681678771973
Epoch 34/100, Step 11, Loss: 0.9693371653556824
Epoch 34/100, Step 21, Lo

Epoch 38/100, Step 31, Loss: 1.3107028007507324
Epoch 38/100, Step 41, Loss: 1.0696032047271729
Epoch 38/100, Step 51, Loss: 1.4279401302337646
Epoch 38/100, Step 61, Loss: 0.8846351504325867
Epoch 38/100, Step 71, Loss: 1.5099462270736694
Epoch 38/100, Step 81, Loss: 0.6084270477294922
Epoch 38/100, Step 91, Loss: 0.7251667380332947
Epoch 38/100, Step 101, Loss: 0.7757107615470886
Epoch 38/100, Step 111, Loss: 0.9137650728225708
Epoch 38/100, Step 121, Loss: 1.3095182180404663
Epoch 38/100, Step 131, Loss: 1.244731068611145
Epoch 38/100, Step 141, Loss: 1.0465623140335083
Epoch 38/100, Step 151, Loss: 1.660347819328308
Epoch 38/100, Step 161, Loss: 0.8161086440086365
Epoch 38/100, Step 171, Loss: 0.7365796566009521
Epoch 38/100, Step 181, Loss: 1.2606785297393799
Epoch 38/100, Step 191, Loss: 1.6459550857543945
Epoch 38/100, Step 201, Loss: 1.337121605873108
Epoch 38/100, Step 211, Loss: 1.0109496116638184
Epoch 38/100, Step 221, Loss: 0.8966121673583984
Epoch 38/100, Step 231, Loss: 

Epoch 42/100, Step 241, Loss: 1.1176172494888306
Epoch 42/100, Step 251, Loss: 0.7534047365188599
Epoch 42/100, Step 261, Loss: 1.5655808448791504
Epoch 42/100, Step 271, Loss: 1.1848092079162598
Epoch 42/100, Step 281, Loss: 0.8168073296546936
Epoch 42/100, Step 291, Loss: 1.0001156330108643
Epoch 42/100, Step 301, Loss: 0.9220693111419678
Epoch 42/100, Step 311, Loss: 0.8550592064857483
Epoch 42/100, Step 321, Loss: 0.9231992363929749
Epoch 42/100, Step 331, Loss: 0.6729952692985535
Epoch 42/100, Step 341, Loss: 0.7078519463539124
Epoch 42/100, Step 351, Loss: 0.8652878403663635
Epoch 42/100, Step 361, Loss: 0.9702801704406738
Epoch 43/100, Step 1, Loss: 0.48186200857162476
Epoch 43/100, Step 11, Loss: 1.2446736097335815
Epoch 43/100, Step 21, Loss: 0.6417281627655029
Epoch 43/100, Step 31, Loss: 1.3353004455566406
Epoch 43/100, Step 41, Loss: 0.8472495079040527
Epoch 43/100, Step 51, Loss: 0.9274616837501526
Epoch 43/100, Step 61, Loss: 0.6092960238456726
Epoch 43/100, Step 71, Loss

Epoch 47/100, Step 81, Loss: 0.5214908123016357
Epoch 47/100, Step 91, Loss: 0.4791414141654968
Epoch 47/100, Step 101, Loss: 0.684723436832428
Epoch 47/100, Step 111, Loss: 0.3674705922603607
Epoch 47/100, Step 121, Loss: 0.6201081871986389
Epoch 47/100, Step 131, Loss: 0.831281304359436
Epoch 47/100, Step 141, Loss: 0.7356937527656555
Epoch 47/100, Step 151, Loss: 0.7277629375457764
Epoch 47/100, Step 161, Loss: 0.3944413959980011
Epoch 47/100, Step 171, Loss: 0.6445716619491577
Epoch 47/100, Step 181, Loss: 0.7153991460800171
Epoch 47/100, Step 191, Loss: 1.0768206119537354
Epoch 47/100, Step 201, Loss: 1.0017979145050049
Epoch 47/100, Step 211, Loss: 0.420367032289505
Epoch 47/100, Step 221, Loss: 0.4356672167778015
Epoch 47/100, Step 231, Loss: 0.48660075664520264
Epoch 47/100, Step 241, Loss: 0.6296613216400146
Epoch 47/100, Step 251, Loss: 0.8472769856452942
Epoch 47/100, Step 261, Loss: 0.7212298512458801
Epoch 47/100, Step 271, Loss: 0.5089578032493591
Epoch 47/100, Step 281, 

Epoch 51/100, Step 291, Loss: 0.6307932138442993
Epoch 51/100, Step 301, Loss: 0.975734293460846
Epoch 51/100, Step 311, Loss: 0.46180421113967896
Epoch 51/100, Step 321, Loss: 0.6560173034667969
Epoch 51/100, Step 331, Loss: 0.3325207233428955
Epoch 51/100, Step 341, Loss: 0.32646581530570984
Epoch 51/100, Step 351, Loss: 0.7416433691978455
Epoch 51/100, Step 361, Loss: 0.3166297376155853
Epoch 52/100, Step 1, Loss: 0.3893885910511017
Epoch 52/100, Step 11, Loss: 0.2623175084590912
Epoch 52/100, Step 21, Loss: 0.4982377588748932
Epoch 52/100, Step 31, Loss: 0.504220724105835
Epoch 52/100, Step 41, Loss: 0.5839739441871643
Epoch 52/100, Step 51, Loss: 0.6965935230255127
Epoch 52/100, Step 61, Loss: 0.6609988212585449
Epoch 52/100, Step 71, Loss: 0.6129423975944519
Epoch 52/100, Step 81, Loss: 0.5369455218315125
Epoch 52/100, Step 91, Loss: 0.28164538741111755
Epoch 52/100, Step 101, Loss: 0.20884393155574799
Epoch 52/100, Step 111, Loss: 0.5312389731407166
Epoch 52/100, Step 121, Loss:

Epoch 56/100, Step 121, Loss: 0.46336251497268677
Epoch 56/100, Step 131, Loss: 0.31905797123908997
Epoch 56/100, Step 141, Loss: 0.19205188751220703
Epoch 56/100, Step 151, Loss: 0.4704340398311615
Epoch 56/100, Step 161, Loss: 0.382571280002594
Epoch 56/100, Step 171, Loss: 0.5398601293563843
Epoch 56/100, Step 181, Loss: 0.35174182057380676
Epoch 56/100, Step 191, Loss: 0.3943469524383545
Epoch 56/100, Step 201, Loss: 0.5408291220664978
Epoch 56/100, Step 211, Loss: 0.3777329921722412
Epoch 56/100, Step 221, Loss: 0.37811213731765747
Epoch 56/100, Step 231, Loss: 0.6734633445739746
Epoch 56/100, Step 241, Loss: 0.5866354703903198
Epoch 56/100, Step 251, Loss: 0.6304921507835388
Epoch 56/100, Step 261, Loss: 0.6778460144996643
Epoch 56/100, Step 271, Loss: 0.21828483045101166
Epoch 56/100, Step 281, Loss: 0.34425339102745056
Epoch 56/100, Step 291, Loss: 0.32867830991744995
Epoch 56/100, Step 301, Loss: 0.1994665414094925
Epoch 56/100, Step 311, Loss: 0.2519721984863281
Epoch 56/100,

Epoch 60/100, Step 311, Loss: 0.15764130651950836
Epoch 60/100, Step 321, Loss: 0.13333937525749207
Epoch 60/100, Step 331, Loss: 0.3378867506980896
Epoch 60/100, Step 341, Loss: 0.17179448902606964
Epoch 60/100, Step 351, Loss: 0.12750151753425598
Epoch 60/100, Step 361, Loss: 0.28891581296920776
Epoch 61/100, Step 1, Loss: 0.2508259415626526
Epoch 61/100, Step 11, Loss: 0.28905361890792847
Epoch 61/100, Step 21, Loss: 0.21661560237407684
Epoch 61/100, Step 31, Loss: 0.19029274582862854
Epoch 61/100, Step 41, Loss: 0.2475425899028778
Epoch 61/100, Step 51, Loss: 0.2881168723106384
Epoch 61/100, Step 61, Loss: 0.17302559316158295
Epoch 61/100, Step 71, Loss: 0.12496200948953629
Epoch 61/100, Step 81, Loss: 0.2237272709608078
Epoch 61/100, Step 91, Loss: 0.26754653453826904
Epoch 61/100, Step 101, Loss: 0.2916221022605896
Epoch 61/100, Step 111, Loss: 0.3532579839229584
Epoch 61/100, Step 121, Loss: 0.33733320236206055
Epoch 61/100, Step 131, Loss: 0.3171541094779968
Epoch 61/100, Step 

Epoch 65/100, Step 131, Loss: 0.15688428282737732
Epoch 65/100, Step 141, Loss: 0.26785901188850403
Epoch 65/100, Step 151, Loss: 0.1884002983570099
Epoch 65/100, Step 161, Loss: 0.18410560488700867
Epoch 65/100, Step 171, Loss: 0.21584011614322662
Epoch 65/100, Step 181, Loss: 0.23060324788093567
Epoch 65/100, Step 191, Loss: 0.23272904753684998
Epoch 65/100, Step 201, Loss: 0.22766511142253876
Epoch 65/100, Step 211, Loss: 0.1674998551607132
Epoch 65/100, Step 221, Loss: 0.11202456802129745
Epoch 65/100, Step 231, Loss: 0.1476382166147232
Epoch 65/100, Step 241, Loss: 0.0939263105392456
Epoch 65/100, Step 251, Loss: 0.21711963415145874
Epoch 65/100, Step 261, Loss: 0.2820400595664978
Epoch 65/100, Step 271, Loss: 0.12878896296024323
Epoch 65/100, Step 281, Loss: 0.12350282073020935
Epoch 65/100, Step 291, Loss: 0.1313333809375763
Epoch 65/100, Step 301, Loss: 0.233578622341156
Epoch 65/100, Step 311, Loss: 0.21239805221557617
Epoch 65/100, Step 321, Loss: 0.3116298317909241
Epoch 65/

Epoch 69/100, Step 311, Loss: 0.07948959618806839
Epoch 69/100, Step 321, Loss: 0.10797745734453201
Epoch 69/100, Step 331, Loss: 0.17262160778045654
Epoch 69/100, Step 341, Loss: 0.20311005413532257
Epoch 69/100, Step 351, Loss: 0.07859025150537491
Epoch 69/100, Step 361, Loss: 0.1315818428993225
Epoch 70/100, Step 1, Loss: 0.09851064532995224
Epoch 70/100, Step 11, Loss: 0.10421687364578247
Epoch 70/100, Step 21, Loss: 0.11501353234052658
Epoch 70/100, Step 31, Loss: 0.13823561370372772
Epoch 70/100, Step 41, Loss: 0.26747092604637146
Epoch 70/100, Step 51, Loss: 0.10162068903446198
Epoch 70/100, Step 61, Loss: 0.23457352817058563
Epoch 70/100, Step 71, Loss: 0.14315521717071533
Epoch 70/100, Step 81, Loss: 0.27578863501548767
Epoch 70/100, Step 91, Loss: 0.13284005224704742
Epoch 70/100, Step 101, Loss: 0.08119923621416092
Epoch 70/100, Step 111, Loss: 0.24677924811840057
Epoch 70/100, Step 121, Loss: 0.10103500634431839
Epoch 70/100, Step 131, Loss: 0.09196673333644867
Epoch 70/100

Epoch 74/100, Step 131, Loss: 0.11835917830467224
Epoch 74/100, Step 141, Loss: 0.17400135099887848
Epoch 74/100, Step 151, Loss: 0.1196489930152893
Epoch 74/100, Step 161, Loss: 0.08591332286596298
Epoch 74/100, Step 171, Loss: 0.08058497309684753
Epoch 74/100, Step 181, Loss: 0.2843994200229645
Epoch 74/100, Step 191, Loss: 0.09727000445127487
Epoch 74/100, Step 201, Loss: 0.052719686180353165
Epoch 74/100, Step 211, Loss: 0.07099110633134842
Epoch 74/100, Step 221, Loss: 0.07119055837392807
Epoch 74/100, Step 231, Loss: 0.0668235495686531
Epoch 74/100, Step 241, Loss: 0.16568918526172638
Epoch 74/100, Step 251, Loss: 0.2896624803543091
Epoch 74/100, Step 261, Loss: 0.14789904654026031
Epoch 74/100, Step 271, Loss: 0.3090450167655945
Epoch 74/100, Step 281, Loss: 0.18405230343341827
Epoch 74/100, Step 291, Loss: 0.08928730338811874
Epoch 74/100, Step 301, Loss: 0.16824761033058167
Epoch 74/100, Step 311, Loss: 0.22134734690189362
Epoch 74/100, Step 321, Loss: 0.06810959428548813
Epoc

Epoch 78/100, Step 311, Loss: 0.026018353179097176
Epoch 78/100, Step 321, Loss: 0.06975358724594116
Epoch 78/100, Step 331, Loss: 0.07054958492517471
Epoch 78/100, Step 341, Loss: 0.04473228380084038
Epoch 78/100, Step 351, Loss: 0.1201004907488823
Epoch 78/100, Step 361, Loss: 0.06311196833848953
Epoch 79/100, Step 1, Loss: 0.20825116336345673
Epoch 79/100, Step 11, Loss: 0.0499516986310482
Epoch 79/100, Step 21, Loss: 0.08231004327535629
Epoch 79/100, Step 31, Loss: 0.03244840353727341
Epoch 79/100, Step 41, Loss: 0.08076972514390945
Epoch 79/100, Step 51, Loss: 0.03961808606982231
Epoch 79/100, Step 61, Loss: 0.08409851789474487
Epoch 79/100, Step 71, Loss: 0.0935150682926178
Epoch 79/100, Step 81, Loss: 0.03959199786186218
Epoch 79/100, Step 91, Loss: 0.04892711713910103
Epoch 79/100, Step 101, Loss: 0.15784940123558044
Epoch 79/100, Step 111, Loss: 0.06104040890932083
Epoch 79/100, Step 121, Loss: 0.07426733523607254
Epoch 79/100, Step 131, Loss: 0.08094759285449982
Epoch 79/100,

Epoch 83/100, Step 121, Loss: 0.5265539288520813
Epoch 83/100, Step 131, Loss: 0.31713318824768066
Epoch 83/100, Step 141, Loss: 0.09790930151939392
Epoch 83/100, Step 151, Loss: 0.6452054381370544
Epoch 83/100, Step 161, Loss: 0.29539018869400024
Epoch 83/100, Step 171, Loss: 0.33747467398643494
Epoch 83/100, Step 181, Loss: 0.321271151304245
Epoch 83/100, Step 191, Loss: 0.13685068488121033
Epoch 83/100, Step 201, Loss: 0.11547006666660309
Epoch 83/100, Step 211, Loss: 0.2233303338289261
Epoch 83/100, Step 221, Loss: 0.14867988228797913
Epoch 83/100, Step 231, Loss: 0.13568419218063354
Epoch 83/100, Step 241, Loss: 0.07809022814035416
Epoch 83/100, Step 251, Loss: 0.09117498248815536
Epoch 83/100, Step 261, Loss: 0.2728705108165741
Epoch 83/100, Step 271, Loss: 0.19388408958911896
Epoch 83/100, Step 281, Loss: 0.19730278849601746
Epoch 83/100, Step 291, Loss: 0.12254873663187027
Epoch 83/100, Step 301, Loss: 0.11566881090402603
Epoch 83/100, Step 311, Loss: 0.2602601647377014
Epoch 8

Epoch 87/100, Step 301, Loss: 0.1260048896074295
Epoch 87/100, Step 311, Loss: 0.15387536585330963
Epoch 87/100, Step 321, Loss: 0.08898380398750305
Epoch 87/100, Step 331, Loss: 0.0738033801317215
Epoch 87/100, Step 341, Loss: 0.09582659602165222
Epoch 87/100, Step 351, Loss: 0.0719393938779831
Epoch 87/100, Step 361, Loss: 0.11398891359567642
Epoch 88/100, Step 1, Loss: 0.07856415212154388
Epoch 88/100, Step 11, Loss: 0.07868774235248566
Epoch 88/100, Step 21, Loss: 0.06853215396404266
Epoch 88/100, Step 31, Loss: 0.06836822628974915
Epoch 88/100, Step 41, Loss: 0.061834923923015594
Epoch 88/100, Step 51, Loss: 0.07789359241724014
Epoch 88/100, Step 61, Loss: 0.19579914212226868
Epoch 88/100, Step 71, Loss: 0.02791762910783291
Epoch 88/100, Step 81, Loss: 0.0869540125131607
Epoch 88/100, Step 91, Loss: 0.06936310976743698
Epoch 88/100, Step 101, Loss: 0.04081251472234726
Epoch 88/100, Step 111, Loss: 0.0752982571721077
Epoch 88/100, Step 121, Loss: 0.06592937558889389
Epoch 88/100, S

Epoch 92/100, Step 101, Loss: 0.11823468655347824
Epoch 92/100, Step 111, Loss: 0.04680758714675903
Epoch 92/100, Step 121, Loss: 0.038971640169620514
Epoch 92/100, Step 131, Loss: 0.03560321405529976
Epoch 92/100, Step 141, Loss: 0.10075259953737259
Epoch 92/100, Step 151, Loss: 0.08951966464519501
Epoch 92/100, Step 161, Loss: 0.022507142275571823
Epoch 92/100, Step 171, Loss: 0.05621959641575813
Epoch 92/100, Step 181, Loss: 0.06394900381565094
Epoch 92/100, Step 191, Loss: 0.02360684610903263
Epoch 92/100, Step 201, Loss: 0.04137718677520752
Epoch 92/100, Step 211, Loss: 0.04037009924650192
Epoch 92/100, Step 221, Loss: 0.03868776932358742
Epoch 92/100, Step 231, Loss: 0.01956910453736782
Epoch 92/100, Step 241, Loss: 0.04909563064575195
Epoch 92/100, Step 251, Loss: 0.038201697170734406
Epoch 92/100, Step 261, Loss: 0.01930229365825653
Epoch 92/100, Step 271, Loss: 0.05203394964337349
Epoch 92/100, Step 281, Loss: 0.051193807274103165
Epoch 92/100, Step 291, Loss: 0.05325635150074

Epoch 96/100, Step 261, Loss: 0.01693134568631649
Epoch 96/100, Step 271, Loss: 0.02141587994992733
Epoch 96/100, Step 281, Loss: 0.02692672424018383
Epoch 96/100, Step 291, Loss: 0.06052294746041298
Epoch 96/100, Step 301, Loss: 0.01999417133629322
Epoch 96/100, Step 311, Loss: 0.025944044813513756
Epoch 96/100, Step 321, Loss: 0.027111582458019257
Epoch 96/100, Step 331, Loss: 0.024091392755508423
Epoch 96/100, Step 341, Loss: 0.035403985530138016
Epoch 96/100, Step 351, Loss: 0.03156869485974312
Epoch 96/100, Step 361, Loss: 0.04596955329179764
Epoch 97/100, Step 1, Loss: 0.022454453632235527
Epoch 97/100, Step 11, Loss: 0.05348461866378784
Epoch 97/100, Step 21, Loss: 0.027130452916026115
Epoch 97/100, Step 31, Loss: 0.03595319762825966
Epoch 97/100, Step 41, Loss: 0.014694973826408386
Epoch 97/100, Step 51, Loss: 0.01837245002388954
Epoch 97/100, Step 61, Loss: 0.02216074988245964
Epoch 97/100, Step 71, Loss: 0.019360220059752464
Epoch 97/100, Step 81, Loss: 0.027408115565776825
E

### Save and Load Seq2Seq model

In [89]:
# Save the entire Seq2Seq model
torch.save(seq2seq_model.state_dict(), 'seq2seq_model.pth')

seq2seq_model.load_state_dict(torch.load('seq2seq_model.pth'))

  seq2seq_model.load_state_dict(torch.load('seq2seq_model.pth'))


<All keys matched successfully>

### Save/Load Encoder/Decoder separately

In [87]:
# Saving the model
torch.save(encoder.state_dict(), 'encoder.pth')
torch.save(decoder.state_dict(), 'decoder.pth')

# Loading the model
encoder.load_state_dict(torch.load('encoder.pth'))
decoder.load_state_dict(torch.load('decoder.pth'))

  encoder.load_state_dict(torch.load('encoder.pth'))
  decoder.load_state_dict(torch.load('decoder.pth'))


<All keys matched successfully>

### Evaluation function

In [113]:
import torch
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# BLEU score calculation function
def calculate_bleu(reference, candidate):
    smooth = SmoothingFunction().method4
    return sentence_bleu([reference], candidate, smoothing_function=smooth)

# Function to evaluate the model and calculate BLEU scores
def evaluate_model(test_dataloader, seq2seq_model, vocab):
    seq2seq_model.eval()
    total_bleu_score = 0
    total_samples = 0

    with torch.no_grad():
        for features, captions in test_dataloader:
            # Forward pass through the model
            outputs = seq2seq_model(features, captions[:, :-1])
            _, predicted_indices = torch.max(outputs, dim=2)

            # Iterate over each sample in the batch
            for i in range(features.size(0)):

                predicted_caption = vocab.denumericalize(predicted_indices[i])
                reference_caption = vocab.denumericalize([idx for idx in captions[i].tolist() if idx != vocab.word2idx[vocab.pad_token]])

                # Calculates BLEU score
                bleu_score = calculate_bleu(reference_caption, predicted_caption)
                total_bleu_score += bleu_score
                total_samples += 1

                # Print some examples (optional)
#                 print(f"Reference: {' '.join(reference_caption)}")
#                 print(f"Predicted: {' '.join(predicted_caption)}")
#                 print(f"BLEU Score: {bleu_score}\n")

    # Computes average BLEU score
    average_bleu_score = total_bleu_score / total_samples if total_samples > 0 else 0
    print(f"Average BLEU Score: {average_bleu_score}")


evaluate_model(test_dataloader, seq2seq_model, vocab)


  padded_caption = F.pad(torch.tensor(caption), (0, max_length - len(caption)), value=vocab.word2idx[vocab.pad_token])


Average BLEU Score: 0.00944761940980335
