[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepak4669/Conversational-Agents/blob/master/Colab-Notebooks/transformer_wow.ipynb)


In [1]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Framework for running model on GPU
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# For Visualising gradient flow across the model
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# For Visualising training loss, accuracy Vs epochs
# from visdom import Visdom

# Basic computation library needed for visdom visualising class
import numpy as np

# Other libraries for data loading, processing etc
import json
import os
import re
import math
import copy
import os
import string
import time

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [0]:
running_device = "colab"

In [0]:
def raw_data_loader(running_device):
    """ 
        Function for loading json data into a list
    """

    file_dir = ""
    file_name = "data.json"
    if running_device == "local":
        file_dir = "/home/naive/Documents/rohit/Wizard Of Wikipedia/Dataset"
    elif running_device == "colab":
        file_dir = "/content/drive/My Drive/Data/Wizard of Wikipedia"
    else:
        print("Invalid running device")
        return

    data = os.path.join(file_dir, file_name)

    json_data = None
    with open(data) as f:
        json_data = json.load(f)

    return json_data

In [6]:
json_data = None

print("Loading Raw Data into a list....")

t1 = time.time()
json_data = raw_data_loader(running_device)
t2 = time.time()

print("Loading raw data took " + str(t2 - t1) + " seconds")

Loading Raw Data into a list....
Loading raw data took 16.714849710464478 seconds


In [0]:
def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

In [8]:
def load_data(data_json):

    context, knowledge, wizard = [], [], []
    dict2cnt = {}
    count = 0

    N = len(data_json)
    for id in range(N):

        current_knowledge, current_wizard, context_knowledge, previous_context = (
            [],
            [],
            [],
            [],
        )
        tmp_context = []

        current_conv = data_json[id]

        topic_chosen = normalizeString(current_conv["chosen_topic"])
        tmp_context.append(topic_chosen)

        conversation_length = len(current_conv["dialog"])

        for i in range(conversation_length):

            if current_conv["dialog"][i]["speaker"] == "0_Wizard":

                dict2cnt[count] = len(current_conv["dialog"][i]["retrieved_passages"])
                count = count + 1

                for x in range(len(current_conv["dialog"][i]["retrieved_passages"])):

                    idx = current_conv["dialog"][i]["retrieved_passages"][x]

                    for value in idx.values():

                        for j in range(len(value)):
                            value[j] = normalizeString(value[j])

                        current_knowledge.append(value)

                wizard_dialog = normalizeString(current_conv["dialog"][i]["text"])

                current_wizard.append(wizard_dialog)
                tmp_context.append(wizard_dialog)

            else:
                apperentice_dialog = normalizeString(current_conv["dialog"][i]["text"])
                tmp_context.append(apperentice_dialog)

        knowledge.append(current_knowledge)
        wizard.append(current_wizard)

        final_context = []
        flag = True

        if current_conv["dialog"][0]["speaker"] == "0_Wizard":
            flag = False

        for i in range(len(tmp_context)):
            temp = []

            for j in range(i + 1):
                temp.append(tmp_context[j])

            if flag:
                if i % 2 != 0 and len(final_context) < len(current_wizard):
                    final_context.append(temp)

            else:
                if i % 2 == 0 and len(final_context) < len(current_wizard):
                    final_context.append(temp)

        context.append(final_context)

    return context, knowledge, wizard, dict2cnt


print("Getting 22311 conversation's context, knowledge, wizard utterances..")
t1 = time.time()
context, knowledge, wizard, dict2cnt = load_data(json_data)
t2 = time.time()
print("This took: " + str(t2 - t1) + " seconds")

Getting 22311 conversation's context, knowledge, wizard utterances..
This took: 39.65553689002991 seconds


In [9]:
def after_load_process(context, knowledge, wizard):
    input_context = []
    input_knowledge = []
    output_wizard = []
    for i in range(len(wizard)):
        for c in context[i]:
            input_context.append(c)
        for k in knowledge[i]:
            input_knowledge.append(k)
        for w in wizard[i]:
            output_wizard.append(w)

    input_knowledge2 = []
    index = 0
    for i in range(50246):
        r = dict2cnt[i]
        tmp = []
        fr = index
        to = index + r
        for j in range(fr, to):
            tmp.append(input_knowledge[index])
            index = index + 1
        input_knowledge2.append(tmp)
    input_knowledge3 = []
    for i in range(len(input_knowledge2)):
        g = []
        for j in range(len(input_knowledge2[i])):
            s = ""
            for k in range(len(input_knowledge2[i][j])):
                s = s + " " + input_knowledge2[i][j][k]
            s = s.strip()
            g.append(s)
        input_knowledge3.append(g)
    return input_context, input_knowledge3, output_wizard


print("Converting the extracted sentences into lists...")
t1 = time.time()
final_context, final_knowledge, final_wizard = after_load_process(
    context, knowledge, wizard
)
t2 = time.time()
print("This took: " + str(t2 - t1) + " seconds")

Converting the extracted sentences into lists...
This took: 1.1126060485839844 seconds


In [10]:
con_context = []
max_context = 0
for context in final_context:
    cat_context = ""

    for sentence in context:
        cat_context = cat_context + " " + sentence
        cat_context = cat_context.strip()
    con_context.append(cat_context)
    max_context = max(max_context, len(cat_context.split(" ")))
print(len(con_context))
print(con_context[1])
print(max_context)

50246
science fiction i think science fiction is an amazing genre for anything . future science technology time travel ftl travel they re all such interesting concepts . i m a huge fan of science fiction myself !
428


In [0]:
max_length = 359


def process(sentence):
    """
        Preprocessing sentencing to make them of equal length and appending and terminating them with
        start (<START>) and end (<END>) token.
        
    """

    words = sentence.split()
    sentence_length = max_length - 2
    if len(words) <= sentence_length:
        for i in range(sentence_length - len(words)):
            words.append("PAD")
    else:
        words = words[:sentence_length]

    res = ""
    words.append("EOS")
    words.insert(0, "SOS")
    assert len(words) == max_length
    for w in words:
        res = res + " " + w
    res = res.strip()
    return res

In [0]:
PAD_Token = 0
START_Token = 1
END_Token = 2


class Vocabulary:
    def __init__(self):
        self.trimmed = False
        self.word2count = {}
        self.index2word = {PAD_Token: "PAD", START_Token: "SOS", END_Token: "EOS"}
        self.word2index = {"PAD": PAD_Token, "SOS": START_Token, "EOS": END_Token}
        self.num_words = 3

    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.index2word[self.num_words] = word
            self.word2count[word] = 1
            self.num_words = self.num_words + 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):

        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for word, freq in self.word2count.items():
            if freq >= min_count:
                keep_words.append(word)

        self.word2count = {}
        self.index2word = {PAD_Token: "PAD", START_Token: "SOS", END_Token: "EOS"}
        self.word2index = {"PAD": PAD_Token, "SOS": START_Token, "EOS": END_Token}
        self.num_words = 3

        for word in keep_words:
            self.addWord(word)

In [14]:
vocab = Vocabulary()


def process_loaded(input_context, input_knowledge, output_wizard):
    """ Processing sentences and forming vocabulary
    """

    for i in range(len(output_wizard)):
        vocab.addSentence(output_wizard[i])
        output_wizard[i] = process(output_wizard[i])

        assert len(output_wizard[i].split()) == max_length

    # for i in range(len(input_knowledge)):
    #     for j in range(len(input_knowledge[i])):
    #         input_knowledge[i][j]=process(input_knowledge[i][j])
    #         vocab.addSentence(input_knowledge[i][j])
    #         assert len(input_knowledge[i][j].split())==max_length

    for i in range(len(input_context)):
        vocab.addSentence(input_context[i])
        input_context[i] = process(input_context[i])

        assert len(input_context[i].split()) == max_length
    return input_context, input_knowledge, output_wizard


print("Processing fetched sentences...")
t1 = time.time()
p_context, p_knowledge, p_wizard = process_loaded(
    con_context, final_knowledge, final_wizard
)
t2 = time.time()
print("This process took: " + str(t2 - t1))

Processing fetched sentences...
This process took: 9.443119525909424


In [15]:
print(vocab.num_words)

32506


In [0]:
for i in range(len(p_context)):
    if len(p_context[i].split()) != max_length:
        print(i)

In [18]:
##########################SANITY CHECK##########################

assert len(p_wizard) == 50246
assert len(p_context) == 50246
assert len(p_knowledge) == 50246

print("Running Sanity Checks on the data...")

t1 = time.time()

for i in range(len(p_wizard)):
    assert len(p_wizard[i].split()) == max_length

for i in range(len(p_context)):
    assert len(p_context[i].split(" ")) == max_length


# for i in range(len(p_knowledge)):
#     for j in range(len(p_knowledge[i])):
#         assert len(p_knowledge[i][j].split())==max_length

t2 = time.time()
print("Everything seems fine.")
print("Sanity Checks took: " + str(t2 - t1) + " seconds")

################################################################

Running Sanity Checks on the data...
Everything seems fine.
Sanity Checks took: 1.2309849262237549 seconds


In [19]:
print(vocab.num_words)

32506


# Model


In [0]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, source_embed, target_embed, generator):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        self.source_embed = source_embed
        self.target_embed = target_embed

        self.generator = generator  # Linear + Log_softmax

    def forward(self, source, target, source_mask, target_mask):
        return self.decode(
            self.encode(source, source_mask), source_mask, target, target_mask
        )

    def encode(self, source, source_mask):
        return self.encoder(self.source_embed(source), source_mask)

    def decode(self, memory, source_mask, target, target_mask):
        return self.decoder(self.target_embed(target), memory, source_mask, target_mask)

In [0]:
class Generator(nn.Module):
    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.projection = nn.Linear(d_model, vocab_size)

    def forward(self, decoder_output):
        return F.log_softmax(self.projection(decoder_output), dim=-1)

In [0]:
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [0]:
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()

        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):

        for layer in self.layers:
            x = layer(x, mask)

        return self.norm(x)

In [0]:
class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [0]:
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super().__init__()

        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNorm(size)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

In [0]:
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()

        self.attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):

        x = self.sublayer[0](x, lambda x: self.attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

In [0]:
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()

        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, curr_mask, tgt_mask):

        for layer in self.layers:
            x = layer(x, memory, curr_mask, tgt_mask)

        return self.norm(x)

In [0]:
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()

        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward

        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):

        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [0]:
def attention(query, key, value, mask=None, dropout=None):

    d_k = query.size(-1)

    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)

    p_attn = F.softmax(scores, dim=-1)

    if dropout is not None:
        p_attn = dropout(p_attn)

    return torch.matmul(p_attn, value), p_attn

In [0]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()

        assert d_model % h == 0

        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, values, mask=None):

        if mask is not None:
            mask = mask.unsqueeze(1)

        nbatches = query.size(0)

        query, key, values = [
            l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for l, x in zip(self.linears, (query, key, values))
        ]

        x, self.attn = attention(query, key, values, mask=mask, dropout=self.dropout)

        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)

        return self.linears[-1](x)

In [0]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()

        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [0]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()

        self.embed = nn.Embedding(vocab, d_model)
        self.embed.weight.requires_grad = False
        self.d_model = d_model

    def forward(self, x):
        #         print(x.device)
        return self.embed(x) * math.sqrt(self.d_model)

In [0]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()

        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        position = torch.arange(0.0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0.0, d_model, 2) * -(math.log(10000.0) / d_model)
        )

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):

        x = x + Variable(self.pe[:, : x.size(1)], requires_grad=False)
        return self.dropout(x)

In [0]:
"""
triu function generates a copy of matrix with elemens below kth diagonal zeroed.
The main diagonal is zeroeth diagonal above is first(k=1) and so on.

Eg:
A=[[1,2,3],[4,5,6],[7,8,9]]
for above matrix:
triu(A,k=1)
will give [[0,2,3],[0,0,6],[0,0,0]]
"""


def subsequent_mask(size):
    attn_shape = (1, size, size)
    mask = np.triu(np.ones(attn_shape), k=1).astype("uint8")

    return torch.from_numpy(mask) == 0

In [0]:
def make_model2(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):

    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [0]:
class Batch:
    "Object for holding a batch of data with mask during training."

    def __init__(self, src, trg=None, pad=0):
        src = src.to(torch.int64)
        trg = trg.to(torch.int64)
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
        self.src.to(device)
        self.trg.to(device)
        self.src_mask.to(device)
        self.trg_mask.to(device)

    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)
        )
        return tgt_mask

In [0]:
def word2index(sentence):
    tokenised_sentence = []
    for word in sentence.split(" "):
        tokenised_sentence.append(vocab.word2index[word])
    return tokenised_sentence

In [0]:
batches = []
n_batches = 100
batch_size = 2
for i in range(n_batches):
    curr_batch_context = []
    curr_batch_wizard = []
    for j in range(batch_size):
        curr_id = i * batch_size + j
        curr_batch_context.append(word2index(p_context[curr_id]))
        curr_batch_wizard.append(word2index(p_wizard[curr_id]))

    curr_batch_context = torch.LongTensor(curr_batch_context)
    curr_batch_wizard = torch.LongTensor(curr_batch_wizard)
    batches.append(Batch(curr_batch_context, curr_batch_wizard))

In [60]:
print(len(batches))
print(batches[0].src.size())

100
torch.Size([2, 359])


In [0]:
def calculate_length(x):
    length = 0
    for i in range(x.size()[0]):
        if x[i].item() == 2:
            break
        length += 1
    return length

In [0]:
def F1_score(x, y):
    #     print(x)
    #     print(y)

    inp = torch.argmax(x, dim=-1)
    score = 0

    for i in range(inp.size()[0]):
        common_tokens = 0
        true_len = calculate_length(y[i][:])
        pred_len = calculate_length(inp[i][:])

        for j in range(inp.size()[1]):
            t = y[i][j].item()
            p = inp[i][j].item()

            if t == 0 or t == 1 or t == 2 or p == 0 or p == 1 or p == 2:
                continue

            if t == p:
                common_tokens = common_tokens + 1
        try:
            score += common_tokens / (true_len + pred_len)
        except ZeroDivisionError:
            print(str(true_len) + " " + str(pred_len))
            print(y[i][:])
            print(inp[i][:])

    score = score / inp.size()[0]
    return 2 * score

In [0]:
def run_single_batch(data, model, loss_compute):

    start_time = time.time()

    total_tokens = 0
    total_loss = 0
    tokens = 0

    source = data.src
    source_mask = data.src_mask
    target = data.trg
    target_mask = data.trg_mask
    target_y = data.trg_y

    source = source.to(device)
    target = target.to(device)
    source_mask = source_mask.to(device)
    target_mask = target_mask.to(device)
    target_y = target_y.to(device)

    out = model(source, target, source_mask, target_mask)

    loss, f1_score, ppl = loss_compute(out, target_y, data.ntokens)

    return loss.item(), f1_score, data.ntokens.item(), ppl

In [0]:
class SimpleLossCompute:
    "A simple loss compute and train function."

    def __init__(self, generator, opt=None):
        self.generator = generator
        self.criterion = nn.CrossEntropyLoss()
        self.opt = opt

    def __call__(self, x, y, norm):
        x = self.generator(x)
        sentence_length = x.size()[1]
        #         print(str(x.size())+" "+str(y.size()))
        f1_score = F1_score(x, y)
        #         ppl=perplexity(x,y)

        loss = self.criterion(
            x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
        )
        loss.backward()
        _ = nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # plot_grad_flow(model.named_parameters())
        if self.opt is not None:
            self.opt.step()
            self.opt.zero_grad()

        return loss, f1_score, math.exp((loss.item() * norm.item()) / sentence_length)

In [0]:
def training(batches, model, n_epochs, n_batches, model_opt, loadFile, save_every):

    start_epoch = 0
    total_time = 0
    if loadFile:
        start_epoch = torch.load(loadFile)["epoch"]
        total_time = torch.load(loadFile)["time"]
        start_epoch = start_epoch + 1

    for epoch in range(start_epoch, n_epochs):
        t1 = time.time()
        loss = 0
        f1_score = 0
        ppl = 0
        n_tokens = 0
        for i in range(n_batches):
            current_batch = batches[i]
            loss_val, current_f1_score, current_tokens, curr_ppl = run_single_batch(
                current_batch, model, SimpleLossCompute(model.generator, model_opt)
            )
            loss += loss_val
            f1_score += current_f1_score
            n_tokens += current_tokens
            ppl += curr_ppl

        loss = loss / (n_batches)
        ppl = ppl / n_batches
        f1_score = f1_score / n_batches

        # if epoch%save_every==0:
        #     directory=os.path.join(save_dir,'transformer','cornell-movie')
        #     if not os.path.exists(directory):
        #         os.makedirs(directory)
        #     torch.save({
        #         "epoch":epoch,
        #         "model":model.state_dict(),
        #         "opt":model_opt.state_dict(),
        #         "loss":loss,
        #         "ppl":ppl,
        #         "f1":f1_score,
        #         "time":total_time
        #     },os.path.join(directory,'{}_{}.tar'.format(epoch,'checkpoint')))

        print("=" * 100)
        print(
            "| End of Epoch : "
            + str(epoch)
            + "| Loss Value: "
            + str(loss)
            + "| F1 Score: "
            + str(f1_score / n_batches)
            + "| PPL: "
            + str(ppl)
            + "| Time Took: "
            + str(time.time() - t1)
            + " |"
        )
        print("=" * 100)
        total_time += time.time() - t1

    print("| Training Finished | Total Training Time: " + str(total_time) + " |")

In [0]:
def get_parameter_count(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [0]:
### Hyperparameters

N = 2
d_model = 256
d_ff = 512
num_head = 8
dropout = 0.1

batch_size = 2
n_epochs = 10

In [0]:
print("Initialising and creating models....")
V = vocab.num_words
t1 = time.time()
# criterion=LabelSmoothing()

model = make_model2(V, V, N, d_model, d_ff, num_head, dropout)
model.to(device)

learnable_parameter_count = get_parameter_count(model)

print(
    "Number of learnable parameters for this model: " + str(learnable_parameter_count)
)

# model_opt=torch.optim.Adam(model.parameters(),lr=0.0001,betas=(0.9,0.988),eps=1e-9)
model_opt = torch.optim.Adam(
    model.parameters(), lr=0.00001, betas=(0.9, 0.98), eps=1e-9
)
print("=" * 100)
print("Creating Models took: " + str(time.time() - t1))

save_dir = "/content/drive/My Drive/Model Data"
# loadFile=os.path.join(save_dir,'transformer','cornell-movie','2_checkpoint')
# loadFile="C:\\Users\\deepa\\Conversational Agents\\transformer\\cornell-movie\\1_checkpoint.tar"
# loadFile='/content/drive/My Drive/Model Data/transformer/cornell-movie-10K/600_checkpoint.tar'
loadFile = None

if loadFile:
    checkpoint = torch.load(loadFile)
    model.load_state_dict(checkpoint["model"])
    model_opt.load_state_dict(checkpoint["opt"])

    # /content/drive/My Drive/Model Data/transformer/cornell-movie-10K/600_checkpoint.tar

model.train()
training(batches, model, 1000, 100, model_opt, loadFile, 5)

Initialising and creating models....
Number of learnable parameters for this model: 10990842
Creating Models took: 0.4139430522918701
| End of Epoch : 0| Loss Value: 9.468409328460693| F1 Score: 0.0| PPL: 3.213265551018498| Time Took: 11.805978059768677 |
| End of Epoch : 1| Loss Value: 8.363695669174195| F1 Score: 0.0| PPL: 2.785162050402352| Time Took: 11.840904712677002 |
| End of Epoch : 2| Loss Value: 7.802096219062805| F1 Score: 0.0| PPL: 2.5942100592387884| Time Took: 11.847726345062256 |
| End of Epoch : 3| Loss Value: 7.301797771453858| F1 Score: 0.0| PPL: 2.43603911978409| Time Took: 11.818906784057617 |
| End of Epoch : 4| Loss Value: 6.811555542945862| F1 Score: 0.0| PPL: 2.2908041097894625| Time Took: 11.678988695144653 |
| End of Epoch : 5| Loss Value: 6.324893193244934| F1 Score: 0.0| PPL: 2.1557795727153373| Time Took: 11.754007577896118 |
| End of Epoch : 6| Loss Value: 5.840961141586304| F1 Score: 0.0| PPL: 2.029966000241145| Time Took: 11.798341035842896 |
| End of E

In [0]:
for name, p in model.named_parameters():
    if p.requires_grad:
        print(name, p.size())

In [0]:
model.eval()


def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len - 1):
        out = model.decode(
            memory,
            src_mask,
            Variable(ys),
            Variable(subsequent_mask(ys.size(1)).type_as(src.data)),
        )
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys

In [58]:
for i in range(10):
    batch = batches[i]
    for i in range(2):
        source = batch.src[i].view(-1, 359).to(device)
        source_mask = batch.src_mask[i].view(1, -1, 359).to(device)
        # output=beam_search(model,source,source_mask,10,1,2,200,1)
        output = greedy_decode(model, source, source_mask, 359, 1)

        src = batch.src[i].view(-1)
        trg = batch.trg[i].view(-1)
        pred = output.view(-1)
        # print(src.size())
        # for id in src:
        #     print(id)
        src_sentence = [vocab.index2word[id.item()] for id in src]
        trg_sentence = [vocab.index2word[id.item()] for id in trg]
        pred_sentence = [vocab.index2word[id.item()] for id in pred]
        print(src_sentence)
        print(trg_sentence)
        print(pred_sentence)
        print("-" * 80)
#         print("-"*80)
#         print(str(output)+" "+str(batch.src[i])+" "+str(batch.trg[i]))

['SOS', 'science', 'fiction', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD