# Introduction
In this notebook, we will create a video captioning model that takes sign-language video frames as input and outputs their translation in Arabic. For simplicity, we can translate the Arabic sentences to English and then translate the predictions back. For this assignment, we will use the English sentences that were translated from the original Arabic captions.

### Imports

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms

from torchmetrics import WER

import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import random

### Feature extraction using Vgg16
We will extract features from both train and test using VGG16. Each sample (video) consists of 80 frames with 4096 features per frame, so each label will have (num_samples × 80 × 4096) features.
Since there are 534 samples in each of train and test, the final df for each split has shape (534, 80 × 4096 + 1) = (534, 327681). The last column of every row holds the label.
When we create our custom dataset class, we will use this entry to fetch the sentence label from the groundTruth.txt file.

In [None]:
# Load the ImageNet-pretrained VGG16 and print its layer layout so we can
# inspect the layers before deciding where to cut the classifier.
vgg16 = torchvision.models.vgg16(pretrained=True)
print(vgg16)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [None]:
class VGG16FeatureExtractor(nn.Module):
    """Frozen, pretrained VGG16 whose classifier is cut after ``classifier[3]``.

    The forward pass therefore returns the activations of the fourth
    classifier layer instead of the final class scores.
    """

    def __init__(self) -> None:
        super().__init__()

        self.vgg16 = torchvision.models.vgg16(pretrained=True)
        self.fine_tune()

    def forward(self, x):
        """Run images of shape (batch_size, channels, height, width) through the truncated network."""
        return self.vgg16(x)

    def fine_tune(self):
        """Freeze every VGG16 weight and keep only classifier layers 0-3."""
        for weight in self.vgg16.parameters():
            weight.requires_grad_(False)

        # Keeping only the layers up to and including classifier(3).
        kept_layers = list(self.vgg16.classifier.children())[:4]
        self.vgg16.classifier = nn.Sequential(*kept_layers)

In [None]:
def extract_features(root_dir_path, saved_feat_dir_path):
    """Extract VGG16 features for every class directory and cache them as CSV.

    For each of the ten class directories ``0001`` ... ``0010`` found under
    ``root_dir_path`` the images are read in order, in batches of 80, pushed
    through ``VGG16FeatureExtractor``, flattened into a single row and
    suffixed with the numeric class label. One ``<dir>.csv`` file per class
    directory is written to ``saved_feat_dir_path``.

    NOTE(review): a batch of 80 consecutive images is treated as one video.
    This assumes every video sub-folder holds exactly 80 frames - confirm
    against the dataset.

    Args:
        root_dir_path: Directory containing the class sub-directories
            (e.g. ``'train'`` or ``'test'``).
        saved_feat_dir_path: Directory the CSV files are written to; it is
            created when missing.
    """
    class_dirs = ['0001', '0002', '0003', '0004', '0005', '0006', '0007', '0008', '0009', '0010']

    feature_extractor_vgg16 = VGG16FeatureExtractor()
    # Inference mode: the truncated classifier still contains a Dropout layer,
    # which would otherwise randomly zero activations and make the cached
    # features non-deterministic.
    feature_extractor_vgg16.eval()

    # The preprocessing is identical for every directory, so build it once.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    os.makedirs(saved_feat_dir_path, exist_ok=True)

    for class_dir in class_dirs:
        print(f'Extracting features from {root_dir_path}/{class_dir}')

        dirwise_feats = []

        label = int(class_dir[-1])
        # NOTE: For the dir '0010', label would be 0.0 i.e. this signifies 10th label.

        # Read from the requested split instead of a hard-coded 'train' folder.
        dataset = ImageFolder(root=os.path.join(root_dir_path, class_dir), transform=transform)
        loader = DataLoader(dataset, batch_size=80)

        # No gradients are needed for feature extraction.
        with torch.no_grad():
            for img, _ in tqdm(loader):
                features = feature_extractor_vgg16(img) # Shape: (80, 4096)
                features_np = features.numpy() # Shape: (80, 4096)
                features_np = features_np.flatten() # Shape: (327680,)
                features_label_np = np.append(features_np, label) # Shape: (327681,)
                dirwise_feats.append(features_label_np)

        dirwise_feats = np.array(dirwise_feats) # Shape: (num_videos_in_dir, 327681)
        print(f'dir {class_dir} dirwise_feats.shape = {dirwise_feats.shape}')

        # os.path.join keeps the output path portable (the previous literal
        # backslash only produced a sub-directory path on Windows).
        csv_path = os.path.join(saved_feat_dir_path, f'{class_dir}.csv')
        pd.DataFrame(dirwise_feats).to_csv(csv_path, sep=',', header=None, index=None)
        

In [None]:
# Extract and cache the features for the training split
# (slow: each class directory took roughly 6-9 minutes in the recorded run).
extract_features(root_dir_path='train', saved_feat_dir_path='features_train')

Extracting features from train/0001


100%|██████████| 48/48 [06:10<00:00,  7.71s/it]


dir 0001 dirwise_feats.shape = (48, 327681)
Extracting features from train/0002


100%|██████████| 50/50 [06:39<00:00,  7.98s/it]


dir 0002 dirwise_feats.shape = (50, 327681)
Extracting features from train/0003


100%|██████████| 48/48 [06:23<00:00,  8.00s/it]


dir 0003 dirwise_feats.shape = (48, 327681)
Extracting features from train/0004


100%|██████████| 48/48 [06:30<00:00,  8.14s/it]


dir 0004 dirwise_feats.shape = (48, 327681)
Extracting features from train/0005


100%|██████████| 60/60 [08:04<00:00,  8.07s/it]


dir 0005 dirwise_feats.shape = (60, 327681)
Extracting features from train/0006


100%|██████████| 49/49 [06:32<00:00,  8.01s/it]


dir 0006 dirwise_feats.shape = (49, 327681)
Extracting features from train/0007


100%|██████████| 70/70 [09:12<00:00,  7.89s/it]


dir 0007 dirwise_feats.shape = (70, 327681)
Extracting features from train/0008


100%|██████████| 62/62 [07:59<00:00,  7.73s/it]


dir 0008 dirwise_feats.shape = (62, 327681)
Extracting features from train/0009


100%|██████████| 48/48 [06:05<00:00,  7.62s/it]


dir 0009 dirwise_feats.shape = (48, 327681)
Extracting features from train/0010


100%|██████████| 51/51 [06:30<00:00,  7.65s/it]


dir 0010 dirwise_feats.shape = (51, 327681)


In [None]:
# Same extraction, requested for the test split and cached under 'features_test'.
extract_features(root_dir_path='test', saved_feat_dir_path='features_test')

Extracting features from test/0001


100%|██████████| 48/48 [05:59<00:00,  7.50s/it]


dir 0001 dirwise_feats.shape = (48, 327681)
Extracting features from test/0002


100%|██████████| 50/50 [06:23<00:00,  7.67s/it]


dir 0002 dirwise_feats.shape = (50, 327681)
Extracting features from test/0003


100%|██████████| 48/48 [06:10<00:00,  7.72s/it]


dir 0003 dirwise_feats.shape = (48, 327681)
Extracting features from test/0004


100%|██████████| 48/48 [06:09<00:00,  7.69s/it]


dir 0004 dirwise_feats.shape = (48, 327681)
Extracting features from test/0005


100%|██████████| 60/60 [07:31<00:00,  7.52s/it]


dir 0005 dirwise_feats.shape = (60, 327681)
Extracting features from test/0006


100%|██████████| 49/49 [06:00<00:00,  7.35s/it]


dir 0006 dirwise_feats.shape = (49, 327681)
Extracting features from test/0007


100%|██████████| 70/70 [08:38<00:00,  7.40s/it]


dir 0007 dirwise_feats.shape = (70, 327681)
Extracting features from test/0008


100%|██████████| 62/62 [07:36<00:00,  7.37s/it]


dir 0008 dirwise_feats.shape = (62, 327681)
Extracting features from test/0009


100%|██████████| 48/48 [05:51<00:00,  7.32s/it]


dir 0009 dirwise_feats.shape = (48, 327681)
Extracting features from test/0010


100%|██████████| 51/51 [06:14<00:00,  7.34s/it]


dir 0010 dirwise_feats.shape = (51, 327681)


### Dataset/Dataloaders preparation

In [None]:
def load_dataframes(root_dir, name):
    """Read every CSV file in ``root_dir`` and stack them into one DataFrame.

    Args:
        root_dir: Directory holding the per-class feature CSV files.
        name: Label used only in the printed shape summary.

    Returns:
        A DataFrame with the rows of all CSV files concatenated in file-name
        order. The per-file row index is kept as-is (not reset).

    Raises:
        ValueError: Raised by ``pd.concat`` when the directory has no CSV files.
    """
    # os.listdir returns entries in arbitrary order and may contain non-CSV
    # entries (e.g. Jupyter's '.ipynb_checkpoints'), so filter and sort.
    files = sorted(entry for entry in os.listdir(root_dir) if entry.lower().endswith('.csv'))

    all_df = [] # A list to store all dfs so that they can be concatenated at the end.
    for file in files:
        df = pd.read_csv(os.path.join(root_dir, file), sep=',', header=None, engine='python')
        all_df.append(df)
        print(f'{file} done')

    combined_df = pd.concat(all_df, axis=0)
    print(f'{name} shape = {combined_df.shape}')
    return combined_df

In [None]:
# Takes around 7 mins to load all csv files.
# Each split is read from its cached per-class CSV files and concatenated
# into one DataFrame with one row per sample.

print('Loading train data:')
train_df = load_dataframes(root_dir='features_train', name='train_df')
print()

print('Loading test data:')
test_df = load_dataframes(root_dir='features_test', name='test_df')
print()

Loading train data:
0001.csv done
0002.csv done
0003.csv done
0004.csv done
0005.csv done
0006.csv done
0007.csv done
0008.csv done
0009.csv done
0010.csv done
train_df shape = (534, 327681)

Loading test data:
0001.csv done
0002.csv done
0003.csv done
0004.csv done
0005.csv done
0006.csv done
0007.csv done
0008.csv done
0009.csv done
0010.csv done
test_df shape = (534, 327681)



In [None]:
def prepare_captions(filepath):
    """Read the tokenised captions and build the vocabulary lookup tables.

    Every line of the file is split on whitespace into one caption (a list of
    tokens). The captions, the vocabulary and both mappings are printed.

    Args:
        filepath: Path of the UTF-8 text file with one caption per line.

    Returns:
        A tuple ``(captions, stoi, itos)`` where ``captions`` is a list of
        token lists, ``stoi`` maps token -> index and ``itos`` is its inverse.
        ``<PAD>``, ``<SOS>`` and ``<EOS>`` always map to 0, 1 and 2; the other
        tokens are numbered from 3 in order of first appearance.
    """
    # Add all the captions. The context manager closes the file even if
    # reading fails (the handle used to be left open).
    with open(filepath, "r", encoding='utf-8') as f:
        captions = [line.split() for line in f]

    vocab = set() # Total unique words including <SOS>, <EOS>, <PAD> forms the vocab.
    for caption in captions:
        print(caption, f'len = {len(caption)}')
        vocab.update(caption)
    print(f'\nVocab:\n{vocab} len = {len(vocab)}')

    # Mapping string/word to an index.
    stoi = {
        '<PAD>': 0,
        '<SOS>': 1,
        '<EOS>': 2,
    }

    idx = 3 # Since indices 0,1,2 are already reserved for tokens <PAD>, <SOS>, <EOS> respectively.
    for caption in captions:
        for tok in caption:
            # Special tokens are already present, so a single membership test
            # covers both "is special" and "already numbered".
            if tok not in stoi:
                stoi[tok] = idx
                idx += 1

    print(f'\nString-to-index mapping:\n{stoi}\n')

    # Mapping index to string/word.
    itos = {value : key for (key, value) in stoi.items()}
    print(f'\nIndex-to-string mapping:\n{itos}\n')

    return captions, stoi, itos

In [None]:
# Tokenised captions plus the token<->index lookup tables used by the dataset and the decoding helpers.
captions, stoi, itos = prepare_captions(filepath='groundTruth.txt')

['<SOS>', 'god', 'name', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'] len = 14
['<SOS>', 'thank', 'god', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'] len = 14
['<SOS>', 'all', 'deaf', 'arab', 'listeners', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'] len = 14
['<SOS>', 'peace', 'be', 'upon', 'you', 'may', 'gods', 'mercy', 'and', 'blessings', 'be', 'upon', 'you', '<EOS>'] len = 14
['<SOS>', 'today', 'i', 'present', 'to', 'you', 'another', 'program', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'] len = 14
['<SOS>', 'the', 'subject', 'of', 'studying', 'arabic', 'sign', 'language', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'] len = 14
['<SOS>', 'words', 'of', 'the', 'day', 'are', 'scattered', 'in', 'religion', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'] len = 14
['<SOS>', 'also', 'normal', 'words', '<EOS>', '<PAD>', '<PAD>', '<PAD>'

In [None]:
class VideoCaptionDataset(Dataset):
    """Dataset pairing cached frame features with their index-encoded caption.

    Args:
        df: DataFrame with one row per sample: 80 * 4096 feature columns
            followed by one label column.
        stoi: Mapping from caption token to vocabulary index.
        captions: List of tokenised captions; the caption of label ``k`` is
            stored at position ``k - 1``.
    """

    def __init__(self, df, stoi, captions) -> None:
        super().__init__()
        # Use the frame that was passed in (previously the global train_df was
        # stored here, so every dataset served the training data).
        self.df = df
        self.stoi = stoi
        self.captions = captions

    def __len__(self):
        """Number of samples, i.e. rows of the DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(features, caption_indices)`` for the row at ``index``.

        ``features`` is a float tensor of shape (80, 4096) and
        ``caption_indices`` a tensor holding one vocabulary index per token.
        """
        # Reshaping the data to (frames, extracted_features) i.e. (80, 4096) in our case.
        sample = self.df.iloc[index, :-1].to_numpy().reshape(80, 4096)
        sample = torch.tensor(sample).float()

        # Label corresponds to the last column of the df. This is just a number from 1.0 - 9.0 with 0.0 signifying label 10.0
        label = self.df.iloc[index, -1]
        if label == 0.0:
            label = 10.0

        tokenized_caption = self.captions[int(label) - 1]

        # Convert the sentence to indices through this dataset's own stoi
        # rather than whatever global mapping happens to exist.
        mapped_caption = [self.stoi[tok] for tok in tokenized_caption]

        return sample, torch.tensor(mapped_caption)

In [None]:
# Wrap both feature frames in datasets and batch them 16 samples at a time.
train_dataset = VideoCaptionDataset(train_df, stoi, captions)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# NOTE(review): the test loader is shuffled as well, so the order in which
# evaluation samples are visited differs between runs.
test_dataset = VideoCaptionDataset(test_df, stoi, captions)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

### Encoder-Decoder

#### Model creation

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class EncoderRNN(nn.Module):
    """LSTM encoder that reduces a sequence of frame features to its final states."""

    def __init__(self, input_size, hidden_size, num_layers, dropout) -> None:
        super().__init__()
        lstm_config = dict(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
        self.lstm = nn.LSTM(**lstm_config)

    def forward(self, x):
        """Encode ``x`` of shape (frames, batch_size, input_size).

        Returns the final ``(hidden, cell)`` states, each of shape
        (num_layers, batch_size, hidden_size); the per-frame outputs of the
        LSTM are discarded.
        """
        _, final_state = self.lstm(x)
        final_hidden, final_cell = final_state
        return final_hidden, final_cell


class DecoderRNN(nn.Module):
    """Single-step LSTM decoder: embeds one token per sample and scores the next word."""

    def __init__(self, num_embeddings, embedding_dim, input_size, hidden_size, num_layers, dropout, vocab_size) -> None:
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, captions, enc_hidden, enc_cell):
        """Decode one time step.

        Args:
            captions: Token indices for the current step, shape (batch_size,).
            enc_hidden: Initial hidden state, shape (num_layers, batch_size, hidden_size).
            enc_cell: Initial cell state, same shape as ``enc_hidden``.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has shape
            (batch_size, vocab_size) and the states keep their input shape.
        """
        # The LSTM wants a sequence axis, so the tokens become a length-1
        # sequence of shape (1, batch_size).
        step_tokens = torch.unsqueeze(captions, 0)

        # (1, batch_size, embedding_dim)
        step_embeddings = self.embed(step_tokens)

        # The incoming states seed the LSTM; its output is (1, batch_size, hidden_size).
        lstm_out, (hidden, cell) = self.lstm(step_embeddings, (enc_hidden, enc_cell))

        # Project to vocabulary scores and drop the length-1 sequence axis so
        # the result is (batch_size, vocab_size), as the loss expects.
        predictions = self.fc(lstm_out).squeeze(0)

        return predictions, hidden, cell


class EncoderDecoderModel(nn.Module):
    """Sequence-to-sequence wrapper that decodes a caption one token at a time.

    Args:
        encoder: Module mapping frame features to ``(hidden, cell)`` states.
        decoder: Module mapping ``(tokens, hidden, cell)`` to
            ``(predictions, hidden, cell)`` for a single time step.
        vocab_size: Number of scores the decoder emits per token.
    """

    def __init__(self, encoder, decoder, vocab_size) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = vocab_size

    def forward(self, img_frames, captions, teacher_force_ratio=0.5):
        """Predict vocabulary scores for every caption position.

        Args:
            img_frames: Frame features, shape (frames, batch_size, input_size).
            captions: Target token indices, shape (seq_len, batch_size).
            teacher_force_ratio: Probability, per time step, of feeding the
                ground-truth token instead of the decoder's own best guess.

        Returns:
            Tensor of shape (seq_len, batch_size, vocab_size). Row 0 (the
            ``<SOS>`` position) is never predicted and stays all zeros.
        """
        seq_len, batch_size = captions.size()

        hidden, cell = self.encoder(img_frames)

        # Allocate next to the inputs instead of relying on a global `device`
        # variable, so the module works on whatever device it is called on.
        outputs = torch.zeros(seq_len, batch_size, self.vocab_size).to(img_frames.device)

        x = captions[0] # Grab the start token in the batch i.e. the <SOS> token whose index is 1.

        for t in range(1, seq_len):
            # The first step starts from the encoder states; later steps reuse
            # the states returned by the previous decoder call.
            predictions, hidden, cell = self.decoder(x, hidden, cell)

            # Store the prediction.
            outputs[t] = predictions

            # Get the best word the decoder predicted (index in the vocabulary)
            best_guess = predictions.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            x = captions[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

#### Running the model

In [None]:
def start_training(train_loader, epochs=10, lr=1e-4, weight_decay=1e-4, patience=5):
    """Build the encoder-decoder model and train it on ``train_loader``.

    Args:
        train_loader: Iterable of ``(img_frames, captions)`` batches with
            shapes (batch_size, frames, features) and (batch_size, seq_len).
        epochs: Number of passes over ``train_loader`` (default 10).
        lr: Adam learning rate (default 1e-4).
        weight_decay: Adam weight decay (default 1e-4).
        patience: Epochs without improvement of the training loss before
            ``ReduceLROnPlateau`` lowers the learning rate (default 5).

    Returns:
        The trained ``EncoderDecoderModel``.
    """
    # Inputs for the encoder, decoder & encoder-decoder combined model.
    input_size_encoder = 4096
    hidden_size = 512
    num_layers = 2
    dropout_encoder = 0.4
    num_embeddings = 45
    embedding_dim = 300
    input_size_decoder = 300
    dropout_decoder = 0.1
    vocab_size = 45

    encoder = EncoderRNN(input_size_encoder, hidden_size, num_layers, dropout_encoder).to(device)
    decoder = DecoderRNN(num_embeddings, embedding_dim, input_size_decoder, hidden_size, num_layers, dropout_decoder, vocab_size).to(device)
    encoder_decoder = EncoderDecoderModel(encoder, decoder, vocab_size).to(device)

    optimizer = optim.Adam(encoder_decoder.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.01, patience=patience, verbose=True)

    loss_fn = nn.CrossEntropyLoss(ignore_index=0) # We ignore the index of <PAD> which is 0.

    # Training phase.
    for epoch in range(1, epochs + 1):
        encoder_decoder.train()

        running_loss = 0.0

        prog_bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
        for batch_idx, (img_frames, captions) in prog_bar:
            optimizer.zero_grad()

            img_frames = img_frames.permute(1, 0, 2) # Reshape into (frames, batch_size, features) i.e. (80, batch_size, 4096)
            captions = captions.permute(1, 0) # Reshape into (seq_len, batch_size) i.e. (14, batch_size)

            img_frames, captions = img_frames.to(device), captions.long().to(device)

            predicted_captions = encoder_decoder(img_frames, captions)
            # Shape of predicted_captions is (seq_len, batch_size, vocab_size) i.e. (14, batch_size, 45)

            # Position 0 is the <SOS> token, which is never predicted, so both
            # predictions and targets are taken from position 1 onwards and
            # flattened for the cross-entropy loss.
            predicted_captions = predicted_captions[1:].reshape(-1, predicted_captions.shape[2])
            captions = captions[1:].reshape(-1)

            loss = loss_fn(predicted_captions, captions)

            running_loss += loss.item()

            loss.backward()
            optimizer.step()

            prog_bar.set_description(f'Epoch {epoch}/{epochs}')

        # The scheduler watches the mean training loss of the epoch.
        tr_loss = running_loss / len(train_loader)
        scheduler.step(tr_loss)
        print(f'\ttrain_loss = {tr_loss:.6f}')

    return encoder_decoder

In [None]:
# Train the encoder-decoder model (without attention) on the training loader.
model = start_training(train_loader)

Epoch 1/10: 100%|██████████| 34/34 [00:03<00:00, 10.13it/s]


	train_loss = 3.445835


Epoch 2/10: 100%|██████████| 34/34 [00:03<00:00, 10.20it/s]


	train_loss = 2.729215


Epoch 3/10: 100%|██████████| 34/34 [00:03<00:00, 10.69it/s]


	train_loss = 2.106164


Epoch 4/10: 100%|██████████| 34/34 [00:03<00:00, 10.65it/s]


	train_loss = 1.550732


Epoch 5/10: 100%|██████████| 34/34 [00:03<00:00, 10.39it/s]


	train_loss = 1.214832


Epoch 6/10: 100%|██████████| 34/34 [00:03<00:00, 10.24it/s]


	train_loss = 0.813443


Epoch 7/10: 100%|██████████| 34/34 [00:03<00:00, 10.63it/s]


	train_loss = 0.552890


Epoch 8/10: 100%|██████████| 34/34 [00:03<00:00, 10.88it/s]


	train_loss = 0.383524


Epoch 9/10: 100%|██████████| 34/34 [00:03<00:00, 10.86it/s]


	train_loss = 0.280035


Epoch 10/10: 100%|██████████| 34/34 [00:03<00:00, 10.88it/s]

	train_loss = 0.192840





#### Testing the model and getting the word-error-rate

In [None]:
def remove_special_tokens(sentence):
    """Drop ``<PAD>``, ``<SOS>`` and ``<EOS>`` from a whitespace-separated sentence.

    Every remaining token is followed by a single space, so a non-empty result
    ends with a trailing space.
    """
    special_tokens = ('<PAD>', '<SOS>', '<EOS>')
    return ''.join(tok + ' ' for tok in sentence.split() if tok not in special_tokens)


def decode_sentence(sentence):
    """Turn a sequence of vocabulary indices into a readable sentence.

    ``sentence`` may hold plain ints or tensor elements; each index is looked
    up in the global ``itos`` mapping and the special tokens are stripped
    from the result.
    """
    words = []
    for idx in sentence:
        key = idx.item() if isinstance(idx, torch.Tensor) else idx
        words.append(itos[key] + ' ')

    return remove_special_tokens(''.join(words))


def get_caption(model, img_frame, true_caption):
    """Greedily decode the caption of a single sample.

    Args:
        model: Trained encoder-decoder model; it is switched to eval mode.
        img_frame: Frame features of one sample, shape (frames, features) i.e. (80, 4096).
        true_caption: Reference token indices, shape (seq_len,) i.e. (14,).
            Only its length is used, as the maximum number of decoding steps.

    Returns:
        The predicted sentence with special tokens removed.
    """
    # Add a batch axis of size 1: (80, 4096) -> (80, 1, 4096) and (14,) -> (14, 1).
    frames = img_frame.unsqueeze(1)
    reference = true_caption.unsqueeze(1)

    model.eval()
    with torch.no_grad():
        frames, reference = frames.to(device), reference.long().to(device)

        hidden, cell = model.encoder(frames)

        seq_len, _ = reference.size()

        predicted_caption = [stoi['<SOS>']]
        for _step in range(1, seq_len):
            # Feed back the most recently produced token.
            x = torch.LongTensor(predicted_caption[-1:]).to(device)

            prediction, hidden, cell = model.decoder(x, hidden, cell)

            best_guess = prediction.argmax(1).item()
            predicted_caption.append(best_guess)

            # Stop as soon as the model closes the sentence.
            if best_guess == stoi['<EOS>']:
                break

    return decode_sentence(predicted_caption)


def test_the_model(test_loader, model):
    """Caption every sample of ``test_loader``, print the results and the WER.

    For each sample the reference and the greedy prediction are printed as an
    ACTUAL/PREDICTED pair, followed by the word error rate over all samples.
    """
    wer = WER()

    # (reference, prediction) sentence pairs in the order they were visited.
    sentence_pairs = []
    for img_frames, captions in test_loader:
        for img_f, cap in zip(img_frames, captions):
            predicted_caption = get_caption(model, img_f, cap)
            sentence_pairs.append((decode_sentence(cap), predicted_caption))

    for true_c, pred_c in sentence_pairs:
        print(f'ACTUAL:    {true_c}')
        print(f'PREDICTED: {pred_c}')
        print()

    all_true = [reference for reference, _ in sentence_pairs]
    all_predicted = [prediction for _, prediction in sentence_pairs]

    error_rate = wer(all_predicted, all_true).item()
    print(f'WER = {error_rate}') 

In [None]:
# Print ACTUAL/PREDICTED pairs and the word error rate for the model without attention.
test_the_model(test_loader, model)

ACTUAL:    words of the day are scattered in religion 
PREDICTED: words of the day are scattered in religion 

ACTUAL:    god name 
PREDICTED: god name 

ACTUAL:    god is the greatest 
PREDICTED: god is the greatest 

ACTUAL:    peace be upon you may gods mercy and blessings be upon you 
PREDICTED: peace be upon you may gods mercy and blessings be upon you 

ACTUAL:    god is the greatest 
PREDICTED: god is the greatest 

ACTUAL:    also normal words 
PREDICTED: also normal words 

ACTUAL:    the subject of studying arabic sign language 
PREDICTED: the subject of studying arabic sign language 

ACTUAL:    words of the day are scattered in religion 
PREDICTED: words of the day are scattered in religion 

ACTUAL:    peace be upon you may gods mercy and blessings be upon you 
PREDICTED: peace be upon you may gods mercy and blessings be upon you 

ACTUAL:    the subject of studying arabic sign language 
PREDICTED: the subject of studying arabic sign language 

ACTUAL:    thank god 
PREDIC

A WER of 0 can be achieved if the model is trained for more epochs. Since there are only 10 sentences, our model can easily predict them, and the sentences themselves are not very complex. WER therefore becomes a more useful metric once we have 50-100 unique captions. For example, if we train the model for 20 epochs, it will obviously learn all the patterns and we will get WER = 0%.

### Encoder decoder with attention

#### Model creation

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class Attention(nn.Module):
    """Scores a set of states against a query and returns softmax weights."""

    def __init__(self, hidden_size) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.attention = torch.nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self, hidden, enc_outputs):
        """Compute attention weights.

        ``enc_outputs`` is linearly projected, multiplied element-wise with
        ``hidden`` (broadcasting applies) and summed over the last axis. The
        resulting 2-D score matrix is transposed, normalised with a softmax
        over its second axis and given an extra axis at position 1.
        """
        projected = self.attention(enc_outputs)
        scores = (hidden * projected).sum(dim=2)
        # Transpose so each row holds the scores belonging to one sample.
        weights = torch.softmax(scores.t(), dim=1)
        return weights.unsqueeze(1)


class EncoderRNN(nn.Module):
    """LSTM encoder whose final hidden and cell states summarise the frame sequence."""

    def __init__(self, input_size, hidden_size, num_layers, dropout) -> None:
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, x):
        """Encode ``x`` of shape (frames, batch_size, input_size).

        Only the final states are returned, each of shape
        (num_layers, batch_size, hidden_size); the per-frame LSTM outputs are
        not passed on.
        """
        lstm_result = self.lstm(x)
        last_hidden, last_cell = lstm_result[1]
        return last_hidden, last_cell


class DecoderRNN(nn.Module):
    """Single-step LSTM decoder that mixes an attention context into its output."""

    def __init__(self, num_embeddings, embedding_dim, input_size, hidden_size, num_layers, dropout, vocab_size) -> None:
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
        self.concat = nn.Linear(hidden_size*2, hidden_size)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

        self.attention_model = Attention(hidden_size)

    def forward(self, captions, enc_hidden, enc_cell):
        """Decode one time step.

        Args:
            captions: Token indices for the current step, shape (batch_size,).
            enc_hidden: Incoming hidden state, shape (num_layers, batch_size, hidden_size).
            enc_cell: Incoming cell state, same shape as ``enc_hidden``.

        Returns:
            ``(predictions, hidden, cell)``. ``predictions`` has shape
            (batch_size, vocab_size); because of the final ``squeeze(0)`` it
            collapses to (vocab_size,) when batch_size is 1.
        """
        # Give the tokens a length-1 sequence axis: (1, batch_size).
        step_tokens = captions.unsqueeze(0)

        # (1, batch_size, embedding_dim)
        step_embeddings = self.embed(step_tokens)

        # The incoming states seed the LSTM; outputs is (1, batch_size, hidden_size).
        outputs, (hidden, cell) = self.lstm(step_embeddings, (enc_hidden, enc_cell))

        # Attention is computed against the incoming hidden state, so the
        # weights range over its first (num_layers) axis.
        attention_weights = self.attention_model(outputs, enc_hidden)
        context = torch.bmm(attention_weights, enc_hidden.transpose(0, 1)).squeeze(1)

        # Fuse the LSTM output with the context vector before scoring the vocabulary.
        combined = torch.cat((outputs.squeeze(0), context), 1)
        predictions = self.fc(torch.tanh(self.concat(combined)))

        return predictions.squeeze(0), hidden, cell


class EncoderDecoderModel(nn.Module):
    """Sequence-to-sequence wrapper that decodes a caption one token at a time.

    Args:
        encoder: Module mapping frame features to ``(hidden, cell)`` states.
        decoder: Module mapping ``(tokens, hidden, cell)`` to
            ``(predictions, hidden, cell)`` for a single time step.
        vocab_size: Number of scores the decoder emits per token.
    """

    def __init__(self, encoder, decoder, vocab_size) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = vocab_size

    def forward(self, img_frames, captions, teacher_force_ratio=0.5):
        """Predict vocabulary scores for every caption position.

        Args:
            img_frames: Frame features, shape (frames, batch_size, input_size).
            captions: Target token indices, shape (seq_len, batch_size).
            teacher_force_ratio: Probability, per time step, of feeding the
                ground-truth token instead of the decoder's own best guess.

        Returns:
            Tensor of shape (seq_len, batch_size, vocab_size). Row 0 (the
            ``<SOS>`` position) is never predicted and stays all zeros.
        """
        seq_len, batch_size = captions.size()

        hidden, cell = self.encoder(img_frames)

        # Allocate next to the inputs instead of relying on a global `device`
        # variable, so the module works on whatever device it is called on.
        outputs = torch.zeros(seq_len, batch_size, self.vocab_size).to(img_frames.device)

        x = captions[0] # Grab the start token in the batch i.e. the <SOS> token whose index is 1.

        for t in range(1, seq_len):
            # The first step starts from the encoder states; later steps reuse
            # the states returned by the previous decoder call.
            predictions, hidden, cell = self.decoder(x, hidden, cell)

            # Store the prediction.
            outputs[t] = predictions

            # Get the best word the decoder predicted (index in the vocabulary)
            best_guess = predictions.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            x = captions[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

#### Running the model

In [None]:
def start_training(train_loader, epochs=10, lr=1e-4, weight_decay=1e-4, patience=5):
    """Build the encoder-decoder model and train it on ``train_loader``.

    Args:
        train_loader: Iterable of ``(img_frames, captions)`` batches with
            shapes (batch_size, frames, features) and (batch_size, seq_len).
        epochs: Number of passes over ``train_loader`` (default 10).
        lr: Adam learning rate (default 1e-4).
        weight_decay: Adam weight decay (default 1e-4).
        patience: Epochs without improvement of the training loss before
            ``ReduceLROnPlateau`` lowers the learning rate (default 5).

    Returns:
        The trained ``EncoderDecoderModel``.
    """
    # Inputs for the encoder, decoder & encoder-decoder combined model.
    input_size_encoder = 4096
    hidden_size = 512
    num_layers = 2
    dropout_encoder = 0.4
    num_embeddings = 45
    embedding_dim = 300
    input_size_decoder = 300
    dropout_decoder = 0.1
    vocab_size = 45

    encoder = EncoderRNN(input_size_encoder, hidden_size, num_layers, dropout_encoder).to(device)
    decoder = DecoderRNN(num_embeddings, embedding_dim, input_size_decoder, hidden_size, num_layers, dropout_decoder, vocab_size).to(device)
    encoder_decoder = EncoderDecoderModel(encoder, decoder, vocab_size).to(device)

    optimizer = optim.Adam(encoder_decoder.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.01, patience=patience, verbose=True)

    loss_fn = nn.CrossEntropyLoss(ignore_index=0) # We ignore the index of <PAD> which is 0.

    # Training phase.
    for epoch in range(1, epochs + 1):
        encoder_decoder.train()

        running_loss = 0.0

        prog_bar = tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
        for batch_idx, (img_frames, captions) in prog_bar:
            optimizer.zero_grad()

            img_frames = img_frames.permute(1, 0, 2) # Reshape into (frames, batch_size, features) i.e. (80, batch_size, 4096)
            captions = captions.permute(1, 0) # Reshape into (seq_len, batch_size) i.e. (14, batch_size)

            img_frames, captions = img_frames.to(device), captions.long().to(device)

            predicted_captions = encoder_decoder(img_frames, captions)

            # Position 0 is the <SOS> token, which is never predicted, so both
            # predictions and targets are taken from position 1 onwards and
            # flattened for the cross-entropy loss.
            predicted_captions = predicted_captions[1:].reshape(-1, predicted_captions.shape[2])
            captions = captions[1:].reshape(-1)

            loss = loss_fn(predicted_captions, captions)

            running_loss += loss.item()

            loss.backward()
            optimizer.step()

            prog_bar.set_description(f'Epoch {epoch}/{epochs}')

        # The scheduler watches the mean training loss of the epoch.
        tr_loss = running_loss / len(train_loader)
        scheduler.step(tr_loss)
        print(f'\ttrain_loss = {tr_loss:.6f}')

    return encoder_decoder

In [None]:
# Train the attention variant; this rebinds `model`, replacing the model trained earlier.
model = start_training(train_loader)

Epoch 1/10: 100%|██████████| 34/34 [00:03<00:00,  9.06it/s]


	train_loss = 3.297794


Epoch 2/10: 100%|██████████| 34/34 [00:03<00:00,  9.35it/s]


	train_loss = 2.537997


Epoch 3/10: 100%|██████████| 34/34 [00:03<00:00,  9.28it/s]


	train_loss = 1.924046


Epoch 4/10: 100%|██████████| 34/34 [00:03<00:00,  9.30it/s]


	train_loss = 1.333827


Epoch 5/10: 100%|██████████| 34/34 [00:03<00:00,  9.14it/s]


	train_loss = 0.902239


Epoch 6/10: 100%|██████████| 34/34 [00:03<00:00,  9.26it/s]


	train_loss = 0.601928


Epoch 7/10: 100%|██████████| 34/34 [00:03<00:00,  9.34it/s]


	train_loss = 0.365076


Epoch 8/10: 100%|██████████| 34/34 [00:03<00:00,  9.25it/s]


	train_loss = 0.232666


Epoch 9/10: 100%|██████████| 34/34 [00:03<00:00,  9.27it/s]


	train_loss = 0.147593


Epoch 10/10: 100%|██████████| 34/34 [00:03<00:00,  9.39it/s]

	train_loss = 0.175970





#### Testing the model and getting word-error rate

In [None]:
def remove_special_tokens(sentence):
    """Strip ``<PAD>``, ``<SOS>`` and ``<EOS>`` from a whitespace-separated sentence.

    The remaining tokens are joined with single spaces and, when there is at
    least one, followed by a trailing space.
    """
    kept_tokens = [tok for tok in sentence.split() if tok not in ['<PAD>', '<SOS>', '<EOS>']]
    if not kept_tokens:
        return ''
    return ' '.join(kept_tokens) + ' '


def decode_sentence(sentence):
    """Map vocabulary indices back to words and remove the special tokens.

    Elements of ``sentence`` may be plain ints or tensor elements; the lookup
    uses the global ``itos`` mapping.
    """
    def as_key(idx):
        # Tensor elements must be unwrapped before they can index the dict.
        return idx.item() if isinstance(idx, torch.Tensor) else idx

    raw_sentence = ''.join(itos[as_key(idx)] + ' ' for idx in sentence)
    return remove_special_tokens(raw_sentence)


def get_caption(model, img_frame, true_caption):
    """Greedily decode the caption of a single sample with the attention model.

    Args:
        model: Trained encoder-decoder model; it is switched to eval mode.
        img_frame: Frame features of one sample, shape (frames, features) i.e. (80, 4096).
        true_caption: Reference token indices, shape (seq_len,) i.e. (14,).
            Only its length is used, as the maximum number of decoding steps.

    Returns:
        The predicted sentence with special tokens removed.
    """
    # Add a batch axis of size 1: (80, 4096) -> (80, 1, 4096) and (14,) -> (14, 1).
    frames = img_frame.unsqueeze(1)
    reference = true_caption.unsqueeze(1)

    model.eval()
    with torch.no_grad():
        frames, reference = frames.to(device), reference.long().to(device)

        hidden, cell = model.encoder(frames)

        seq_len, _ = reference.size()

        predicted_caption = [stoi['<SOS>']]
        for _step in range(1, seq_len):
            # Feed back the most recently produced token.
            x = torch.LongTensor(predicted_caption[-1:]).to(device)

            prediction, hidden, cell = model.decoder(x, hidden, cell)

            # With a batch of one the attention decoder returns a 1-D score
            # vector, hence argmax over axis 0.
            best_guess = prediction.argmax(0).item()
            predicted_caption.append(best_guess)

            # Stop as soon as the model closes the sentence.
            if best_guess == stoi['<EOS>']:
                break

    return decode_sentence(predicted_caption)


def test_the_model(test_loader, model):
    """Caption every sample of ``test_loader``, print the results and the WER.

    Each sample is printed as an ACTUAL/PREDICTED pair; the word error rate
    over all samples is printed last.
    """
    wer = WER()

    all_true, all_predicted = [], []

    for img_frames, captions in test_loader:
        # Samples are decoded one at a time because get_caption works on a single sample.
        for img_f, cap in zip(img_frames, captions):
            predicted_caption = get_caption(model, img_f, cap)
            all_true.append(decode_sentence(cap))
            all_predicted.append(predicted_caption)

    for sample_idx in range(min(len(all_true), len(all_predicted))):
        print(f'ACTUAL:    {all_true[sample_idx]}')
        print(f'PREDICTED: {all_predicted[sample_idx]}')
        print()

    error_rate = wer(all_predicted, all_true).item()
    print(f'WER = {error_rate}') 

In [None]:
# Print ACTUAL/PREDICTED pairs and the word error rate for the attention model.
test_the_model(test_loader, model)

ACTUAL:    peace be upon you may gods mercy and blessings be upon you 
PREDICTED: peace be upon you may gods mercy and blessings be upon you 

ACTUAL:    today i present to you another program 
PREDICTED: today i present to you another program 

ACTUAL:    god is the greatest 
PREDICTED: god is the greatest 

ACTUAL:    also normal words 
PREDICTED: also normal words 

ACTUAL:    god does not shirk 
PREDICTED: god does not shirk 

ACTUAL:    god name 
PREDICTED: god name 

ACTUAL:    words of the day are scattered in religion 
PREDICTED: words of the day are scattered in religion 

ACTUAL:    god is the greatest 
PREDICTED: god is the greatest 

ACTUAL:    peace be upon you may gods mercy and blessings be upon you 
PREDICTED: peace be upon you may gods mercy and blessings be upon you 

ACTUAL:    god is the greatest 
PREDICTED: god is the greatest 

ACTUAL:    the subject of studying arabic sign language 
PREDICTED: the subject of studying arabic sign language 

ACTUAL:    today i pres

Hence, we have come to an end of this assignment.  
We have seen an encoder-decoder model without attention and with attention.  
We have also seen the WER of both models, which is pretty good. The main reason for this low error rate is the small number of sentences to predict.  
There are just 10 captions. However, we can extend the dataset with more videos and captions and then re-evaluate the model architecture.  
