# DLOps Assignment 2: RNN, LSTM and Docker [100 Marks]

### Submitted By Debonil Ghosh [M21AIE225]

##### Q1. You have been provided a DATASET, which contains pairs of the words (x,y) i.e. akhbaar अख़बार in which the first word is a Latin word( words we usually type while chatting with friends in WhatsApp) and the second word is its corresponding word in native script. Your main goal is to train a seq2seq model which takes as input the romanized string and produces the corresponding word in native script.
For Example, Jabki yah Jainon se km hai. ⇒ जबकि यह जनै ों सेकम है। [75]

In [57]:
import pandas as pd
data_dev = pd.read_csv('../downloads/Dakshina Dataset/hi/lexicons/hi.translit.sampled.dev.tsv',
                       sep='\t', names=['word', 'romanization', 'no_of_attestation'])
data_test = pd.read_csv('../downloads/Dakshina Dataset/hi/lexicons/hi.translit.sampled.test.tsv',
                        sep='\t', names=['word', 'romanization', 'no_of_attestation'])
data_train = pd.read_csv('../downloads/Dakshina Dataset/hi/lexicons/hi.translit.sampled.train.tsv',
                         sep='\t', names=['word', 'romanization', 'no_of_attestation'])

print(data_dev.sample(20))


           word  romanization  no_of_attestation
1508     टकराने      takraane                  1
2995       मधेश       madhesh                  3
3360     रामायण       ramayan                  2
1818         दन           dan                  3
3397  रिलेशनशिप  relationship                  3
1755   तीरंदाजी     tirandaji                  1
1816     दगाबाज       dagabaj                  1
2358    पेपरवेट   paperweight                  2
2281   पारदर्शी     pardarshi                  2
3496     लटकाया       latkaya                  2
338        इट्स           its                  3
1292      जनसंघ     janasangh                  1
1748   तिरस्कृत     tiraskrit                  1
2198     पधारना     padharana                  2
2188      पतीला       patilaa                  1
742       कितनी         kitni                  2
2718    बायोटेक       biotech                  3
235        आचमन       aachman                  3
3825   श्रद्धेय    shraddheya                  1
3024   महागणपति   ma

### Loading Dakshina Dataset and preprocess data

In [58]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd

import os
import re
import random
import torch
from torch.utils.data import TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class LoadDakshinaDataset():

    def __init__(self, DATAPATH, source_lang='en', target_lang="bn"):

        self.source_lang = source_lang
        self.target_lang = target_lang

        self.trainpath = os.path.join(
            DATAPATH, target_lang, "lexicons", target_lang+".translit.sampled.train.tsv")
        self.valpath = os.path.join(
            DATAPATH, target_lang, "lexicons", target_lang+".translit.sampled.dev.tsv")
        self.testpath = os.path.join(
            DATAPATH, target_lang, "lexicons", target_lang+".translit.sampled.test.tsv")
        self.train = pd.read_csv(
            self.trainpath,
            sep="\t",
            names=["tgt", "src", "count"],
        )
        self.val = pd.read_csv(
            self.valpath,
            sep="\t",
            names=["tgt", "src", "count"],
        )
        self.test = pd.read_csv(
            self.testpath,
            sep="\t",
            names=["tgt", "src", "count"],
        )

        # create train data
        self.train_data = self.preprocess(
            self.train["src"].to_list(), self.train["tgt"].to_list())
        # print(self.train_data.shape)
        # print(self.train_data[0])
        (
            self.train_encoder_input,
            self.train_decoder_input,
            self.train_decoder_target,
            self.source_vocab,
            self.target_vocab,
        ) = self.train_data
        self.source_char2int, self.source_int2char = self.source_vocab
        self.target_char2int, self.target_int2char = self.target_vocab

        self.trainloader = torch.utils.data.DataLoader(TensorDataset(torch.Tensor(
            self.train_encoder_input, device=device), torch.Tensor(self.train_decoder_target, device=device)), batch_size=32, shuffle=True, num_workers=2)

        # create val data (only encode function suffices as the dictionary lookup should be kep the same.
        self.val_data = self.encode(
            self.val["src"].to_list(),
            self.val["tgt"].to_list(),
            list(self.source_char2int.keys()),
            list(self.target_char2int.keys()),
            source_char2int=self.source_char2int,
            target_char2int=self.target_char2int,
        )
        self.val_encoder_input, self.val_decoder_input, self.val_decoder_target = self.val_data
        self.source_char2int, self.source_int2char = self.source_vocab
        self.target_char2int, self.target_int2char = self.target_vocab

        self.valloader = torch.utils.data.DataLoader(TensorDataset(torch.Tensor(
            self.val_encoder_input, device=device), torch.Tensor(self.val_decoder_target, device=device)), batch_size=32, shuffle=True, num_workers=2)

        # create test data
        self.test_data = self.encode(
            self.test["src"].to_list(),
            self.test["tgt"].to_list(),
            list(self.source_char2int.keys()),
            list(self.target_char2int.keys()),
            source_char2int=self.source_char2int,
            target_char2int=self.target_char2int,
        )
        self.test_encoder_input, self.test_decoder_input, self.test_decoder_target = self.test_data
        self.source_char2int, self.source_int2char = self.source_vocab
        self.target_char2int, self.target_int2char = self.target_vocab

        self.testloader = torch.utils.data.DataLoader(TensorDataset(torch.Tensor(
            self.test_encoder_input, device=device), torch.Tensor(self.test_decoder_target, device=device)), batch_size=32, shuffle=True, num_workers=2)

    def dictionary_lookup(self, vocab):
        char2int = dict([(char, i) for i, char in enumerate(vocab)])
        int2char = dict((i, char) for char, i in char2int.items())
        return char2int, int2char

    def encode(self, source, target, source_chars, target_chars, source_char2int=None, target_char2int=None):
        #print(f"encode ==> source:{source[500:510]}")
        #print(f"encode ==> target:{target[500:510]}")
        num_encoder_tokens = len(source_chars)
        num_decoder_tokens = len(target_chars)
        max_source_length = max([len(txt) for txt in source])
        max_target_length = max([len(txt) for txt in target])

        source_vocab, target_vocab = None, None
        if source_char2int == None and target_char2int == None:
            print(
                "Generating the dictionary lookups for character to integer mapping and back")
            source_char2int, source_int2char = self.dictionary_lookup(
                source_chars)
            target_char2int, target_int2char = self.dictionary_lookup(
                target_chars)

            source_vocab = (source_char2int, source_int2char)
            target_vocab = (target_char2int, target_int2char)

        encoder_input_data = np.zeros(
            (len(source), max_source_length+1, num_encoder_tokens), dtype="int32"
        )
        decoder_input_data = np.zeros(
            (len(source), max_target_length, num_decoder_tokens), dtype="int32"
        )
        decoder_target_data = np.zeros(
            (len(source), max_target_length, num_decoder_tokens), dtype="int32"
        )

        for i, (input_text, target_text) in enumerate(zip(source, target)):
            for t, char in enumerate(input_text):
                encoder_input_data[i, t, source_char2int[char]] = 1.0
            encoder_input_data[i, t + 1:, source_char2int[" "]] = 1.0
            for t, char in enumerate(target_text):
                # decoder_target_data is ahead of decoder_input_data by one timestep
                decoder_input_data[i, t, target_char2int[char]] = 1.0
                if t > 0:
                    # decoder_target_data will be ahead by one timestep
                    # and will not include the start character.
                    decoder_target_data[i, t - 1, target_char2int[char]] = 1.0
            decoder_input_data[i, t + 1:, target_char2int[" "]] = 1.0
            decoder_target_data[i, t:, target_char2int[" "]] = 1.0

        if source_vocab != None and target_vocab != None:
            return (
                encoder_input_data,
                decoder_input_data,
                decoder_target_data,
                source_vocab,
                target_vocab,
            )
        else:
            return encoder_input_data, decoder_input_data, decoder_target_data

    def preprocess(self, source, target):
        source_chars = set()
        target_chars = set()

        source = [str(x) for x in source]
        target = [str(x) for x in target]

        source_words = []
        target_words = []
        for src, tgt in zip(source, target):
            tgt = "\t" + tgt + "\n"
            source_words.append(src)
            target_words.append(tgt)
            for char in src:
                if char not in source_chars:
                    source_chars.add(char)
            for char in tgt:
                if char not in target_chars:
                    target_chars.add(char)

        source_chars = sorted(list(source_chars))
        target_chars = sorted(list(target_chars))

        # The space needs to be appended so that the encode function doesn't throw errors
        source_chars.append(" ")
        target_chars.append(" ")

        num_encoder_tokens = len(source_chars)
        num_decoder_tokens = len(target_chars)
        max_source_length = max([len(txt) for txt in source_words])
        max_target_length = max([len(txt) for txt in target_words])

        print("Number of samples:", len(source))
        print("Source Vocab length:", num_encoder_tokens)
        print("Target Vocab length:", num_decoder_tokens)
        print("Max sequence length for inputs:", max_source_length)
        print("Max sequence length for outputs:", max_target_length)

        return self.encode(source_words, target_words, source_chars, target_chars)


In [59]:
dataset = LoadDakshinaDataset(
    '../downloads/Dakshina Dataset', source_lang='en', target_lang='hi')


Number of samples: 44204
Source Vocab length: 27
Target Vocab length: 66
Max sequence length for inputs: 20
Max sequence length for outputs: 21
Generating the dictionary lookups for character to integer mapping and back


### a) Build a seq2seq model which contains the following layers - [20]

i) input layer for character embeddings

ii) one encoder which sequentially encodes the input character sequence (Latin)

iii) one decoder which takes the last state of the encoder as an input and produces one
character output at a time (native).


Please note that the dimension of input character embeddings, the hidden state of
encoders and decoders, the cell(RNN and LSTM), and the number of layers in the encoder and
decoder should be passed as an argument.
(Note:- For Reference you may refer to this Blog, but the implementation must be in
PyTorch only.)

In [60]:
import torch
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(self, cell_type, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.cell_type = cell_type
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        #self.embedding = nn.Embedding(input_dim, emb_dim)

        if self.cell_type == "RNN":
            self.rnn = nn.RNN(
                input_dim, emb_dim, num_layers=self.n_layers, dropout=dropout)
        else:
            self.rnn = nn.LSTM(
                input_dim, emb_dim, num_layers=self.n_layers, dropout=dropout)

        # self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src : [sen_len, batch_size]
        #embedded = self.dropout(self.embedding(src))

        # embedded : [sen_len, batch_size, emb_dim]
        outputs, (hidden, cell) = self.rnn(src)
        # outputs = [sen_len, batch_size, hid_dim * n_directions]
        # hidden = [n_layers * n_direction, batch_size, hid_dim]
        # cell = [n_layers * n_direction, batch_size, hid_dim]
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, cell_type, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.cell_type = cell_type
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        #self.embedding = nn.Embedding(output_dim, emb_dim)

        if self.cell_type == "RNN":
            self.rnn = nn.RNN(
                emb_dim, hid_dim, num_layers=self.n_layers, dropout=dropout)
        else:
            self.rnn = nn.LSTM(
                emb_dim, hid_dim, num_layers=self.n_layers, dropout=dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

       # self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):

        # input = [batch_size]
        # hidden = [n_layers * n_dir, batch_size, hid_dim]
        # cell = [n_layers * n_dir, batch_size, hid_dim]

        input = input.unsqueeze(0)
        # input : [1, ,batch_size]

        # embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch_size, emb_dim]

        output, (hidden, cell) = self.rnn(input, (hidden, cell))
        # output = [seq_len, batch_size, hid_dim * n_dir]
        # hidden = [n_layers * n_dir, batch_size, hid_dim]
        # cell = [n_layers * n_dir, batch_size, hid_dim]

        # seq_len and n_dir will always be 1 in the decoder
        prediction = output.squeeze(0)
        # prediction = [batch_size, output_dim]
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            'hidden dimensions of encoder and decoder must be equal.'
        assert encoder.n_layers == decoder.n_layers, \
            'n_layers of encoder and decoder must be equal.'

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [sen_len, batch_size]
        # trg = [sen_len, batch_size]
        # teacher_forcing_ratio : the probability to use the teacher forcing.
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size,
                              trg_vocab_size).to(self.device)

        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)

        # first input to the decoder is the <sos> token.
        input = trg[0, :]
        for t in range(1, trg_len):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states.
            output, hidden, cell = self.decoder(input, hidden, cell)

            # replace predictions in a tensor holding predictions for each token
            outputs[t] = output

            # decide if we are going to use teacher forcing or not.
            teacher_force = random.random() < teacher_forcing_ratio

            # get the highest predicted token from our predictions.
            top1 = output.argmax(1)
            # update input : use ground_truth when teacher_force
            input = trg[t] if teacher_force else top1

        return outputs


In [61]:
import time
import math


class S2STranslation():

    def __init__(self, modelConfigDict, srcChar2Int, tgtChar2Int):
        print(modelConfigDict)
        #self.native_vocabulary = modelConfigDict["native_vocabulary"]
        self.numEncoders = modelConfigDict["numEncoders"]
        self.cell_type = modelConfigDict["cell_type"]
        self.latentDim = modelConfigDict["latentDim"]
        self.dropout = modelConfigDict["dropout"]
        self.numDecoders = modelConfigDict["numDecoders"]
        self.hidden = modelConfigDict["hidden"]

        self.tgtChar2Int = tgtChar2Int
        self.srcChar2Int = srcChar2Int

        self.epochs = modelConfigDict["epochs"]
        self.batch_size = modelConfigDict["batch_size"]
        self.optimiser = modelConfigDict["optimiser"]
        self.optimiser_patience = modelConfigDict["optimiser_patience"]

    def build_configurable_model(self):

        encoder = Encoder(self.cell_type, len(
            self.srcChar2Int), self.latentDim, self.hidden, self.numEncoders, self.dropout)
        decoder = Decoder(self.cell_type, len(
            self.tgtChar2Int), self.latentDim, self.hidden, self.numDecoders, self.dropout)

        self.model = Seq2Seq(encoder, decoder, device).to(device)
        return self.model

    def init_weights(self):
        for name, param in self.model.named_parameters():
            nn.init.uniform_(param.data, -0.08, 0.08)

    def count_parameters(self):
        parameters = sum(p.numel()
                         for p in self.model.parameters() if p.requires_grad)
        print(f'The model has {parameters:,} trainable parameters')
        return parameters

    def summary(self):
        print(self.model)
        self.count_parameters()

    def train_epoch(self, iterator, optimizer, criterion, clip):

        self.model.train()

        epoch_loss = 0

        for local_batch, local_labels in iterator:

            optimizer.zero_grad()
            # trg = [sen_len, batch_size]
            # output = [trg_len, batch_size, output_dim]
            output = self.model(local_batch, local_labels)
            output_dim = output.shape[-1]

            # transfrom our output : slice off the first column, and flatten the output into 2 dim.
            output = output[1:].view(-1, output_dim)
            local_labels = local_labels[1:].view(-1)
            # trg = [(trg_len-1) * batch_size]
            # output = [(trg_len-1) * batch_size, output_dim]

            loss = criterion(output, local_labels)

            loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip)

            optimizer.step()

            epoch_loss += loss.item()

        return epoch_loss / len(iterator)

    def train(self, train_iter, valid_iter):
        CLIP = 1

        best_valid_loss = float('inf')

        optimizer = optim.Adam(self.model.parameters())
        criterion = nn.CrossEntropyLoss()

        for epoch in range(self.epochs):

            start_time = time.time()

            train_loss = self.train_epoch(
                train_iter, optimizer, criterion, CLIP)
            valid_loss = self.evaluate(valid_iter, criterion)

            end_time = time.time()
            epoch_mins, epoch_secs = self.epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(self.model.state_dict(), 'Seq2SeqModel.pt')
            print(f"Epoch: {epoch+1:02} | Time {epoch_mins}m {epoch_secs}s")
            print(
                f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
            print(
                f"\tValid Loss: {valid_loss:.3f} | Valid PPL: {math.exp(valid_loss):7.3f}")


In [62]:
import matplotlib.pyplot as plt


def plot_history(history):
    # Plot the training history
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


In [63]:
[dataset.train_encoder_input.shape, dataset.train_decoder_input.shape]


[(44204, 21, 27), (44204, 21, 66)]

In [64]:
def train(config):

    modelInit = S2STranslation(
        config, srcChar2Int=dataset.source_char2int, tgtChar2Int=dataset.target_char2int)

    model = modelInit.build_configurable_model()

    modelInit.summary()

    train_hist = modelInit.train(dataset.trainloader, dataset.valloader)

    plot_history(train_hist)

    return model, train_hist


In [65]:
config_default = {
    "cell_type": "LSTM",
    "latentDim": 256,
    "hidden": 64,
    "optimiser": 'adam',
    "numEncoders": 1,
    "numDecoders": 1,
    "dropout": 0.1,
    "epochs": 100,
    "batch_size": 32,
    'optimiser_patience': 5
}
model, _ = train(config_default)


{'cell_type': 'LSTM', 'latentDim': 256, 'hidden': 64, 'optimiser': 'adam', 'numEncoders': 1, 'numDecoders': 1, 'dropout': 0.1, 'epochs': 100, 'batch_size': 32, 'optimiser_patience': 5}
Seq2Seq(
  (encoder): Encoder(
    (rnn): LSTM(27, 256, dropout=0.1)
  )
  (decoder): Decoder(
    (rnn): LSTM(256, 64, dropout=0.1)
    (fc_out): Linear(in_features=64, out_features=66, bias=True)
  )
)
The model has 378,562 trainable parameters




RuntimeError: input.size(-1) must be equal to input_size. Expected 256, got 66

In [None]:
dataset.train_encoder_input.shape


In [None]:
dataset.train_decoder_input.shape


In [None]:
''.join([dataset.source_int2char[x]
    for x in dataset.train_encoder_input[-1].argmax(axis=1)])


'om                  '

In [None]:
len(''.join([dataset.target_int2char[x]
    for x in dataset.train_decoder_input[-1].argmax(axis=1)]))


21

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
trainloader = torch.utils.data.DataLoader(TensorDataset(torch.tensor(dataset.train_encoder_input), torch.tensor(
    dataset.train_decoder_target)), batch_size=32, shuffle=True, num_workers=2)


In [None]:
for local_batch, local_labels in trainloader:
    print([local_batch.shape, local_labels.shape])
