In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!nvidia-smi 
!pip install wandb -q

Sat Aug 22 20:35:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import re
import copy
import math
import time
import random
import pickle
import itertools
import numpy as np
from collections import namedtuple, Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import wandb
import logging
logging.propagate=False
logging.getLogger().setLevel(logging.ERROR)


In [4]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("The device found: "+str(device))

The device found: cuda


# Preprocessing


In [5]:
path='/content/drive/My Drive/Data'
dataset='cornell movie-dialogs corpus'

data_folder=os.path.join(path,dataset)

train_path=os.path.join(data_folder,'train')
test_path=os.path.join(data_folder,'test')

print("The final data corpus folder: "+str(data_folder))
print("The training data folder: "+str(train_path))
print("The testing data folder: "+str(test_path))

The final data corpus folder: /content/drive/My Drive/Data/cornell movie-dialogs corpus
The training data folder: /content/drive/My Drive/Data/cornell movie-dialogs corpus/train
The testing data folder: /content/drive/My Drive/Data/cornell movie-dialogs corpus/test


In [6]:
def get_lines_conversations():
    """
    Loads movie lines and conversations from the dataset.
    
    @returns list[lines],list[conversations]: The lines and conversations in the cornell movie dataset
    """
    movie_lines=[]
    movie_conversations=[]

    with open(os.path.join(data_folder,'movie_lines.txt'),'r',encoding='iso-8859-1') as f:
        for line in f:
            movie_lines.append(line)
    
    with open(os.path.join(data_folder,'movie_conversations.txt'),'r', encoding='iso-8859-1') as f:
        for line in f:
            movie_conversations.append(line)
                                       

    return movie_lines,movie_conversations

In [7]:
t1=time.time()
print("Extracting movie lines and movie conversations...")
movie_lines,movie_conversations=get_lines_conversations()

print("Number of distinct lines: "+str(len(movie_lines)))
print("Number of conversations: "+str(len(movie_conversations)))
print("Average Number of lines per conversations: "+str(len(movie_lines)/len(movie_conversations)))

print(movie_lines[0])
print(movie_conversations[0])

print("Extracting took place in: "+str(time.time()-t1)+" seconds")

Extracting movie lines and movie conversations...
Number of distinct lines: 304713
Number of conversations: 83097
Average Number of lines per conversations: 3.6669554857576085
L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']

Extracting took place in: 1.772658109664917 seconds


In [8]:
def loadLines(movie_lines,fields):
    """ The function to load based on id from the movie_lines from above

    @param movie_lines(List of Lines): The lines extracted from above.
    @param fields(List of Strings): Each line in movie_lines is a dictionary with keys described by this.

    @returns dictionary[lineID:line]: Line corresponding to each line_id.

    """
    lines={}
    for lineid in range(len(movie_lines)):
        
        line=movie_lines[lineid]
        values=line.split(" +++$+++ ")
        lineVals={}
           
        for i,field in enumerate(fields):
            lineVals[field]=values[i]
            
        lines[lineVals['lineID']]=lineVals
    
    return lines

def loadConversations(movie_conversations,lines,fields):
    """ The function to load lines of the conversation

    @param movie_conversations List[String]: Movie Conversation extracted from the original dataset
    @param lines Dict[lineID:line]: Extracted from loadLines function
    @param fields List[String]: Fields we have for each string from conversation element from original conversation data

    @returns List[Dict]: Conversations extracted from original data from raw Strings
    """

    conversations=[]
    
    for convo in movie_conversations:
        values=convo.split(" +++$+++ ")
        conVals={}
       
        for i,field in enumerate(fields):
            conVals[field]=values[i]
        
        lineIDs=eval(conVals["utteranceIDs"])
        
        conVals["lines"]=[]
        
        for lineID in lineIDs:
            conVals["lines"].append(lines[lineID])
        conversations.append(conVals)
        
    return conversations

def sentencePairs(conversations):
    """
        The function to give query based pairs from each conversation.

    @param conversations List[Dict]: The conversations retrieved from previous function

    @returns List[List]: The query response pairs for each conversational exchange

    """
    qr_pairs=[]
    
    for conversation in conversations:
        for i in range(len(conversation["lines"])-1):
            query=conversation["lines"][i]["text"].strip()
            response=conversation["lines"][i+1]["text"].strip()
            
            if query and response:
                qr_pairs.append([query,response])
        
    return qr_pairs

In [9]:
t1=time.time()
print("Separating meaningfull information for our model...")

lines={}
conversations=[]
qr_pairs=[]

movie_lines_fields=["lineID","characterID","movieID","character","text"]
movie_convo_fields=["charcaterID","character2ID","movieID","utteranceIDs"]

lines=loadLines(movie_lines,movie_lines_fields)
conversations=loadConversations(movie_conversations,lines,movie_convo_fields)
qr_pairs=sentencePairs(conversations)

print("The number of query-response pairs are: "+str(len(qr_pairs)))
print("Separation took place in: "+str(time.time()-t1))

Separating meaningfull information for our model...
The number of query-response pairs are: 221282
Separation took place in: 1.88006591796875


In [10]:
PAD_Token=0
START_Token=1
END_Token=2

Min_Count=3
Max_Length=10

In [11]:
class Vocabulary:
    """
        Vocabulary class for the words in the data
    """

    def __init__(self):

        self.trimmed=False
        self.word2count={}
        self.index2word={PAD_Token:"PAD",START_Token:"SOS",END_Token:"EOS"}
        self.word2index={"PAD":PAD_Token,"SOS":START_Token,"EOS":END_Token}
        self.num_words=3
        
    def addSentence(self,sentence):
        """ Take into the account all the words occuring in the input sentence

        @param String [sentence]: Input sentence

        """
        for word in sentence.split(" "):
            self.addWord(word)

    def addWord(self,word):
        """ Takes into consideration of the word appeared in the corpus

        @param word: A single word

        """
        if word not in self.word2index:
            self.word2index[word]=self.num_words
            self.index2word[self.num_words]=word
            self.word2count[word]=1
            self.num_words=self.num_words+1
        else:
            self.word2count[word]+=1
            
    def trim(self,min_count):
        """ Reconstructs the entire vocabulary by removing words with frequency less than min_count

        @param min_count(int): The count threshold you want to keep for the words

        """
        
        if self.trimmed:
            return
        self.trimmed=True
        
        keep_words=[]
        
        for word,freq in self.word2count.items():
            if freq>=min_count:
                keep_words.append(word)
        
        self.word2count={}
        self.index2word={PAD_Token:"PAD",START_Token:"SOS",END_Token:"EOS"}
        self.word2index={"PAD":PAD_Token,"SOS":START_Token,"EOS":END_Token}
        self.num_words=3
        
        for word in keep_words:
            self.addWord(word)

In [12]:

def normalizeString(s):
    """ Preprocess sentence given. Space between punctuations, lowercase all the letters.

    @param String : The sentence to undergo preprocessing

    @returns String: The preprocessed string
    """
    s=s.lower().strip()
    s=re.sub(r"([.!?])", r" \1", s)
    s=re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s=re.sub(r"\s+", r" ", s).strip()
    return s

def readVocs(qr_pairs):
    """ Normalise each of the sentence in the query-reponse pair and create vocab

    @param qr_pair List[List[]]: The list query response pairs
    
    @returns Voc(Vocabulary Object), qr_pairs(List[List]): The vocabulary and the processed q-r pairs list

    """
    
    for qr_pair in qr_pairs:
        qr_pair[0]=normalizeString(qr_pair[0])
        qr_pair[1]=normalizeString(qr_pair[1])
    
    voc=Vocabulary()
    return voc,qr_pairs

def filterPair(pair):
    """ Checks whether a pair(both sentences) have words <Max_length(Globally defined)

    @param pair list[q,r]: The particular pair

    @returns boolean: Whether pair has both sentences having length less than the Max_length
    """
    return len(pair[0].split(" "))<Max_Length and len(pair[1].split(" "))<Max_Length

def filterPairs(qr_pairs):
    """ Filter pairs which have any sentence having length greater than Max_length

    @param qr_pair List[List[q,r]]: The list of query-response pairs

    @returns List[List[q,r]]: The filtered out list of query response pairs
    """
    return [pair for pair in qr_pairs if filterPair(pair)]

def prepareDataset(qr_pairs):
    """ Prepares vocabulary and preprocesses sentences from the dataset

    @param qr_pairs List[List[q,r]]: The list of query-response pairs

    @returns Vocabulary Object, List[List[q,r]]: The prepared vocabulary and the processed query-response pairs

    """

    voc, qr_pairs=readVocs(qr_pairs)
    qr_pairs=filterPairs(qr_pairs)
       
    for pair in qr_pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])

    return voc,qr_pairs

t1=time.time()
print("Preparing dataset and corresponding vocabulary...")
voc, pairs=prepareDataset(qr_pairs)
print("Preparation took place in: "+str(time.time()-t1))

Preparing dataset and corresponding vocabulary...
Preparation took place in: 5.679647207260132


In [13]:
def trimRareWords(voc,qr_pairs,min_count):
    """ Trims the rare words from the vocabulary with min_count threshold. This also
        removes those pairs which contains these words.

    @param voc Vocabulary Object: The vocabulary so far created
    @param qr_pairs List[List[]]: The list of query-response pairs
    @param min_count Integer: The threshold below which frequency words are removed

    @returns List[List[]]: The query-response with pairs removed pairs which contains less freq
                           words

    """
    
    voc.trim(min_count)
    keep_pairs=[]
    
    for pair in qr_pairs:
        input_sentence=pair[0]
        output_sentence=pair[1]
        
        keep_input=True
        keep_output=True
        
        for word in input_sentence.split(" "):
            if word not in voc.word2index:
                keep_input=False
                break
        
        for word in output_sentence.split(" "):
            if word not in voc.word2index:
                keep_output=False
                break
                
        if keep_input and keep_output:
            keep_pairs.append(pair)
            
    return keep_pairs

t1=time.time()
print("Trimming rare words from vocabulary and dataset..")

pairs=trimRareWords(voc,pairs,Min_Count)

print("Trimming took place in: "+str(time.time()-t1))

Trimming rare words from vocabulary and dataset..
Trimming took place in: 0.1347339153289795


In [14]:
random.shuffle(pairs)
training_pairs=pairs[0:40000]
testing_pairs=pairs[40000:len(pairs)]

if not os.path.exists(train_path):
    os.makedirs(train_path)
    os.makedirs(test_path)
    with open(os.path.join(train_path,"training_data"),'wb') as fp:
        pickle.dump(training_pairs,fp)

    with open(os.path.join(test_path,"testing_pairs"),"wb") as fp:
        pickle.dump(testing_pairs,fp)
        

In [15]:
with open(os.path.join(train_path,"training_data"),"rb") as fp:
    training_data=pickle.load(fp)

with open(os.path.join(test_path,"testing_pairs"),"rb") as fp:
    testing_data=pickle.load(fp)


In [16]:
def indexesFromSentence(voc, sentence):
    """ Replace words in the sentence with the index from the vocabulary
    
    @param voc: The vocabulary we have created
    @param sentence: The sentence to be indexed

    @returns List[]: The list of tokens/index corresponding to each word in the sentence

    """

    return [voc.word2index[word] for word in sentence.split(' ')] + [END_Token]


def zeroPadding(l, fillvalue=PAD_Token):
    """ The function takes each tokenised sentences and take out each token from their respective position in a single list.

    [[1,2,3],[1,2]]==>[[1,1],[2,2],[3,0]]

    @l list[list]: The batch of tokens for each sentence (bs,random sizes)
    @fillvalue Integer: The padding value to be used

    @returns list[list]: Of size (max_length_sentence,bs)

    """
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_Token):
    """ Create a binary mask. 0 where pad value appears otherwise its 1.

    @param l list[list]: The batch of tokens in the sentences (max_length_sentence,bs)
    @param value Integer: The value to which to be masked

    @returns m list[list]: The mask corresponding to l (max_length_sentence,bs)
    """
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_Token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

def inputVar(l, voc):
    """ The function to preprocess input sentences. Returns padded input sentences and lengths of each sentences

    @param l list[String]: The list of sentences
    @param voc Vocabulary: The vocabulary of our corpus

    @returns tensor(max_length_sentence,bs),tensor(bs): 
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths


def outputVar(l, voc):
    """ Returns padded output variable, boolean mask and the maximum target length

    @param l list[Str]: The list of sentences
    @voc voc Vocabulary: The voc of our corpus

    @returns tensor(max_length_sentence,bs), tensor(max_length_sentence,bs), integer

    """

    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

def batch2TrainData(voc, pair_batch):
    """ Returns padded, indexed input and out put vectors,lengths of input sentences,mask, maximum target sentence length

    @param voc Vocabulary: The vocabulary of our corpus
    @param pair_batch list[list[q,r]]: The query-response pairs list

    @returns tensor(max_sentence_length,bs),tensort(bs),output(max_target_length,bs),mask(max_sentence_length,bs), integer
    """
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


In [17]:
print("Number of query-response pairs after all the preprocessing: "+str(len(pairs)))
print("Number of Unique Words in our vocabulary: "+str(voc.num_words))

#Sample batch
batch=[random.choice(pairs) for _ in range(5)]
tokenised_input,input_lengths,tokenised_output,mask,max_out_length=batch2TrainData(voc,batch)

print("Input length: "+str(input_lengths)+" Size: "+str(input_lengths.shape))
print("-"*80)
print("Tokenised Input: "+str(tokenised_input)+" Size: "+str(tokenised_input.shape))
print("-"*80)
print("Max out length: "+str(max_out_length)+" Size: ")
print("-"*80)
print("Mask: "+str(mask)+" Size: "+str(mask.shape))
print("-"*80)
print("Tokenised Output: "+str(tokenised_output)+" Size: "+str(tokenised_output.shape))
print("-"*80)

Number of query-response pairs after all the preprocessing: 53113
Number of Unique Words in our vocabulary: 7816
Input length: tensor([8, 5, 5, 5, 5]) Size: torch.Size([5])
--------------------------------------------------------------------------------
Tokenised Input: tensor([[7353,   25,  383,   98,    7],
        [   6,  200,    7,   12,  974],
        [  50,  483, 2651,   99,  774],
        [ 115,    4,    4,    4,    4],
        [  36,    2,    2,    2,    2],
        [ 530,    0,    0,    0,    0],
        [   6,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]]) Size: torch.Size([8, 5])
--------------------------------------------------------------------------------
Max out length: 5 Size: 
--------------------------------------------------------------------------------
Mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [False,  True,  True,  True, False],
        [F

# Model

In [18]:
class EncoderRNN(nn.Module):
    
    def __init__(self,hidden_size,embedding,n_layers=1,dropout=0):
        """
        Encoder module for seq2seq architechture.
        """
    
        super().__init__()
        
        self.n_layers=n_layers
        self.hidden_size=hidden_size
        
        self.embedding=embedding
        self.gru=nn.GRU(hidden_size,hidden_size,n_layers,dropout=(0 if n_layers==1 else dropout),bidirectional=True)
        
    def forward(self,input_seq,input_lengths,hidden=None):
        
        embedded_input=self.embedding(input_seq)
        packed=nn.utils.rnn.pack_padded_sequence(embedded_input,input_lengths)
        outputs,hidden=self.gru(packed,hidden)
        
        outputs,_=nn.utils.rnn.pad_packed_sequence(outputs)
        
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        
        return outputs,hidden

    def hidden_init(self,batch_size):
        return torch.zeros(self.n_layers*2,batch_size,self.hidden_size,device=device)


In [19]:
# Luong attention layer
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [20]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        
        rnn_output, hidden = self.gru(embedded, last_hidden)
        
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        # output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [21]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [22]:
def make_model(vocabulary_size,d_model=500,num_encoders=1,num_decoders=1,dropout_encoder=0.1,dropout_decoder=0.1,attn_model='general'):

    embedding=nn.Embedding(vocabulary_size,d_model)
    embedding.weight.requires_grad=False

    encoder=EncoderRNN(d_model,embedding,num_encoders,dropout_encoder)
    decoder=LuongAttnDecoderRNN(attn_model,embedding,d_model,vocabulary_size,num_decoders,dropout_decoder)

    for p in encoder.parameters():
        if p.dim()>1:
            nn.init.xavier_uniform_(p)
    
    for p in decoder.parameters():
        if p.dim()>1:
            nn.init.xavier_uniform_(p)
    
    num_parameters=0

    num_parameters+=count_parameters(encoder)
    num_parameters+=count_parameters(decoder)

    return encoder,decoder,num_parameters



# Training

In [23]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://app.wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 849679b0d800cf9a8ecb95e52bc8d8840d316a1e
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m


In [24]:
def f1_score(predictions, targets, average=True):
    predictions=predictions.tolist()
    targets=targets.tolist()

    def f1_score_items(pred_items, gold_items):
        common = Counter(gold_items) & Counter(pred_items)
        num_same = sum(common.values())

        if num_same == 0:
            return 0

        precision = num_same / len(pred_items)
        recall = num_same / len(gold_items)
        f1 = (2 * precision * recall) / (precision + recall)

        return f1
    
    scores = [f1_score_items(p, t) for p, t in zip(predictions, targets)]

    if average:
        return sum(scores) / len(scores)    

    return scores

In [75]:
def test(input_variable,lengths,target_variable,mask,max_target_len,encoder,decoder,batch_size,max_length=Max_Length):


    encoder.eval()
    decoder.eval()

    loss_func=nn.CrossEntropyLoss()

    input_variable=torch.tensor(input_variable).to(device)
    lengths=lengths.to(device)
    target_variable=torch.tensor(target_variable).to(device)
    mask=mask.to(device)

    loss=0
    print_losses=[]
    n_totals=0

    encoder_hidden=encoder.hidden_init(input_variable.size()[1])
    encoder_outputs, encoder_hidden=encoder(input_variable,lengths,encoder_hidden)

    decoder_input=torch.LongTensor([[START_Token for _ in range(batch_size)]])
    decoder_input=decoder_input.to(device)
    decoder_hidden=encoder_hidden[:decoder.n_layers]

    predicted=torch.ones(max_target_len,input_variable.size()[1])

    for t in range(max_target_len):
        decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden,encoder_outputs)
    
        _,topi=decoder_output.topk(1)
        decoder_input=torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
        decoder_input=decoder_input.to(device)
        
        mask_loss=loss_func(decoder_output,target_variable[t])
        loss+=mask_loss
        nTotal=(mask.sum()).item()
        print_losses.append(mask_loss.item()*nTotal)
        n_totals+=nTotal
        predicted[t]=torch.argmax(decoder_output,dim=-1)
    
    F1=f1_score(predicted,target_variable.transpose(0,1))

    return sum(print_losses)/n_totals,F1


In [79]:
def train(input_variable,lengths,target_variable,mask,max_target_len,encoder,decoder,
          encoder_optimizer,decoder_optimizer,batch_size,clip,max_length=Max_Length,teacher_forcing=True):
    
    
    encoder.train()
    decoder.train()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss_fn=nn.CrossEntropyLoss()
    
    input_variable=torch.tensor(input_variable).to(device)
    lengths=lengths.to(device)

    target_variable=torch.tensor(target_variable).to(device)
    mask=mask.to(device)
    
    loss=0
    print_losses=[]
    n_totals=0
    encoder_hidden=encoder.hidden_init(input_variable.size()[1])
    encoder_outputs, encoder_hidden=encoder(input_variable,lengths,encoder_hidden)
    
    decoder_input=torch.LongTensor([[START_Token for _ in range(batch_size)]])
    decoder_input=decoder_input.to(device)
    use_teacher_forcing=teacher_forcing #if random.random()<teacher_forcing_ratio else False
    
    decoder_hidden=encoder_hidden[:decoder.n_layers]

    predicted=torch.ones(max_target_len,input_variable.size()[1])
    
    if use_teacher_forcing:
        
        for t in range(max_target_len):
            decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden,encoder_outputs)
            
            decoder_input=target_variable[t].view(1,-1)
            
            
            # mask_loss=F.cross_entropy(decoder_output,target_variable[t])
            # mask_loss,nTotal=maskNLLLoss(decoder_output,target_variable[t],mask)
            mask_loss=loss_fn(decoder_output,target_variable[t])
            loss+=mask_loss
            nTotal=(mask.sum()).item()
            print_losses.append(mask_loss.item()*nTotal)
            n_totals+=nTotal
            predicted[t]=torch.argmax(decoder_output,dim=-1)
            

            
    else:
        
        for t in range(max_target_len):
            decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden,encoder_outputs)
            
            _,topi=decoder_output.topk(1)
            decoder_input=torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input=decoder_input.to(device)
            
            mask_loss=loss_fn(decoder_output,target_variable[t])
            loss+=mask_loss
            nTotal=(mask.sum()).item()
            print_losses.append(mask_loss.item()*nTotal)
            n_totals+=nTotal


            
    F1=f1_score(predicted,target_variable.transpose(0,1))
    if teacher_forcing:
        loss.backward()
        
        _=nn.utils.clip_grad_norm_(encoder.parameters(),clip)
        _=nn.utils.clip_grad_norm_(decoder.parameters(),clip)
        
        
        # plot_grad_flow(encoder.named_parameters())
        # plot_grad_flow(decoder.named_parameters())
        
        encoder_optimizer.step()
        decoder_optimizer.step()
        
    return sum(print_losses)/n_totals,F1
    


In [81]:
def trainIters(model_name,voc,pairs,encoder,decoder,encoder_optimizer,decoder_optimizer,
               encoder_n_layers,decoder_n_layers,save_dir,n_batches,batch_size,
               save_every,clip,corpus_name,loadFileName,n_epochs,training_batches,teacher_forcing,save_want,
               testing_batches,batch_size_test,n_batch_test):
    
    
    
    start_epoch=0
    loss=0
    perplexity=0
    time_taken=0
    f1_score=0
    
    if loadFileName:
        start_epoch=checkpoint['epoch']+1
        time_taken=checkpoint['time']
        
        
    for epoch in range(start_epoch,n_epochs):

        t1=time.time()
        loss=0
        perplexity=0
        f1_score=0

        test_loss=0
        test_perplexity=0
        test_f1_score=0

        for i in range(n_batches):
            
            training_batch=training_batches[i]
            
            input_variable,lengths,target_variable,mask,max_target_len=training_batch

            curr_loss,F1=train(input_variable,lengths,target_variable,mask,max_target_len,encoder,decoder,
                    encoder_optimizer,decoder_optimizer,batch_size,clip,10,teacher_forcing)
            
            loss+=curr_loss
            perplexity+=math.exp(curr_loss)
            f1_score+=F1
            
        
        
        loss=loss/n_batches
        perplexity=perplexity/n_batches
        f1_score=f1_score/n_batches

        for i in range(n_batch_test):

            testing_batch=testing_batches[i]
            input_variable,lengths,target_variable,mask,max_target_len=testing_batch

            curr_loss,F1=test(input_variable,lengths,target_variable,mask,max_target_len,encoder,decoder,
                              batch_size_test)
            
            test_loss+=curr_loss
            test_perplexity+=math.exp(curr_loss)
            test_f1_score+=F1

        test_loss=test_loss/n_batch_test
        test_perplexity=test_perplexity/n_batch_test
        test_f1_score=test_f1_score/n_batch_test
         
        
        if epoch%save_every==0 and save_want:

            directory=os.path.join(save_dir,model_name,corpus_name)
            if not os.path.exists(directory):
                os.makedirs(directory)
            
            torch.save({
                "epoch":epoch,
                "encoder":encoder.state_dict(),
                "decoder":decoder.state_dict(),
                "loss":loss,
                "encoder_opt":encoder_optimizer.state_dict(),
                "decoder_opt":decoder_optimizer.state_dict(),
                "ppl":perplexity,
                "time":time_taken,
                "F1":f1_score

            },os.path.join(directory,'{}_{}.tar'.format(epoch,"checkpoint")))
        
        print("="*100)
        print("| End of epoch : "+str(epoch)+"| Loss Value: "+str(loss)+"| PPL: "+str(perplexity)+"| F1: "+str(f1_score)+"| Time Took: "+
            str(time.time()-t1)+" |")
        print("="*100)
        time_taken+=(time.time()-t1)

        
        wandb.log({
            "Training loss": loss,
            "Training PPL(Perplexity)": perplexity,
            "Training F1 Score": f1_score,
            "Testing loss": test_loss,
            "Testing PPL(Perplexity)":test_perplexity,
            "Testing F1 Score": test_f1_score

        })

    print("| Training Finished | Took:"+str(time_taken))          
        

        

        

In [82]:

def data_generation(pairs,batch_size,n_batches,start=0):
    
    # sample_batches=[batch2TrainData(voc,[random.choice(pairs) for _ in range(batch_size)]) for _ in range(n_batches)]
    sample_batches=[]
    
    for i in range(n_batches):
        curr_batch=[]
        for j in range(batch_size):
            curr_id=i*batch_size+j+start
            curr_batch.append(pairs[curr_id])

        sample_batches.append(batch2TrainData(voc,curr_batch))

    return sample_batches

In [83]:
wandb.init(project="seq2seq_withattn")
wandb.watch_called=False
config=wandb.config


In [84]:
config.model_name='seq2seq_attn'
config.corpus_name='cornell-movie'

config.attn_model='dot'
config.num_encoder=2
config.num_decoder=2
config.d_model=500

config.dropout_encoder=0.1
config.dropout_decoder=0.1

config.batch_size=10
config.n_batches=4000

config.batch_size_test=10
config.n_batches_test=1000

config.clip=50.0
config.teacher_forcing_ratio=1.0
config.learning_rate=0.0001
config.decoder_learning_ratio=5.0


config.save_every=50
config.n_epochs=500

config.teacher_forcing=True
config.save_want=True

loadFile="/content/drive/My Drive/Model Data/seq2seq_attn/cornell-movie/200_checkpoint.tar"
loadFile=None

save_dir="/content/drive/My Drive/Model Data"

In [None]:
training_batches=data_generation(training_data,config.batch_size,config.n_batches,0)
testing_batches=data_generation(testing_data,config.batch_size_test,config.n_batches_test,0)

print("Making models....")
encoder,decoder,num_parameters=make_model(voc.num_words,config.d_model,config.num_encoder,config.num_decoder,config.dropout_encoder,config.dropout_decoder,config.attn_model)
print("Model building finished, the model has: "+str(num_parameters)+" parameters..")

encoder_optimizer=torch.optim.Adam(encoder.parameters(),lr=config.learning_rate)
decoder_optimizer=torch.optim.Adam(decoder.parameters(),lr=config.learning_rate*config.decoder_learning_ratio)

if loadFile:
    checkpoint=torch.load(loadFile)

    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])
    encoder_optimizer.load_state_dict(checkpoint['encoder_opt'])
    decoder_optimizer.load_state_dict(checkpoint['decoder_opt'])



encoder.to(device)
decoder.to(device)

encoder.train()
decoder.train()

for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

wandb.watch((encoder,decoder),log="all")

trainIters(config.model_name, voc, training_data, encoder, decoder, encoder_optimizer, decoder_optimizer,
        config.num_encoder, config.num_decoder, save_dir, config.n_batches, config.batch_size, config.save_every, 
        config.clip, config.corpus_name, loadFile,config.n_epochs,training_batches,config.teacher_forcing,config.save_want,
        testing_batches,config.batch_size_test,config.n_batches_test)



Making models....
Model building finished, the model has: 14934316 parameters..


  if sys.path[0] == '':
  from ipykernel import kernelapp as app
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


| End of epoch : 0| Loss Value: 2.7284285665109267| PPL: 28.565142200150433| F1: 0.3382457854194264| Time Took: 164.19071793556213 |
| End of epoch : 1| Loss Value: 2.379492056145009| PPL: 11.613224135172375| F1: 0.3441194499422537| Time Took: 163.4325864315033 |
| End of epoch : 2| Loss Value: 2.250132328340516| PPL: 10.126186555647134| F1: 0.3437790417526609| Time Took: 162.47869491577148 |
| End of epoch : 3| Loss Value: 2.142036951653526| PPL: 9.024630292537646| F1: 0.3437299841515508| Time Took: 163.82451605796814 |
| End of epoch : 4| Loss Value: 2.035143141217644| PPL: 8.053009597373732| F1: 0.3436980512555854| Time Took: 164.0412516593933 |
| End of epoch : 5| Loss Value: 1.9272947216304022| PPL: 7.181435472786187| F1: 0.3436582065150581| Time Took: 164.14976525306702 |
| End of epoch : 6| Loss Value: 1.8183058969509305| PPL: 6.400973403511774| F1: 0.34404383845643094| Time Took: 160.4093358516693 |
| End of epoch : 7| Loss Value: 1.7137640784832784| PPL: 5.735988582482881| F1:

In [None]:
class GreedySearchDecoder(nn.Module):
    
    def __init__(self,encoder,decoder):
        super().__init__()
        
        self.encoder=encoder
        self.decoder=decoder
        
    def forward(self,input_seq,input_length,max_length,target=None):
        
        encoder_outputs,encoder_hidden=self.encoder(input_seq,input_length) 
        decoder_hidden=encoder_hidden[:decoder.n_layers]
        decoder_input=torch.ones(1,1,device=device,dtype=torch.long)*START_Token
    
        all_tokens=torch.zeros([0],device=device,dtype=torch.long)
        all_scores=torch.zeros([0],device=device)
        loss=0
        
        for i in range(max_length):
            
            decoder_output,decoder_hidden=self.decoder(decoder_input,decoder_hidden,encoder_outputs)
            if target!=None:
                loss+=F.cross_entropy(decoder_output,target[i]).item()
            decoder_scores,decoder_input=torch.max(decoder_output,dim=1)
            all_scores=torch.cat((all_scores,decoder_scores),dim=0)
            all_tokens=torch.cat((all_tokens,decoder_input),dim=0)
            
            decoder_input=torch.unsqueeze(decoder_input,0)
            
        return all_tokens, all_scores,loss/max_length
        
        

In [None]:
def testing(encoder,decoder,searcher,pairs,starting_point):

    loss=0
    F1=0
    num=0

    for i in range(starting_point,len(pairs)):
        test_data=data_generation(pairs,1,1,i)
        input_variable,lengths,target_variable,mask,max_target_len=test_data[0]
        input_variable=input_variable.to(device)
        lengths=lengths.to(device)
        target_variable=target_variable.to(device)

        tokens,scores,curr_loss=searcher(input_variable,lengths,max_target_len,target_variable)
        loss+=curr_loss
        F1+=f1_score(tokens.view(-1,max_target_len),target_variable)
        num=num+1

    return loss/num,F1/num





In [None]:
encoder.eval()
decoder.eval()
searcher=GreedySearchDecoder(encoder,decoder)
testing(encoder,decoder,searcher,pairs,0)


(17.45475177370771, 0.0887326835004401)

In [None]:
def evaluate(encoder, decoder, searcher,voc,sentence,max_length=MAX_LENGTH):
    
    index_batch=[indexesFromSentence(voc,sentence)]
    lengths=torch.tensor([len(index) for index in index_batch])
    input_batch=torch.LongTensor(index_batch).transpose(0,1)
    
    input_batch=input_batch.to(device)
    lengths=lengths.to(device)
    
    tokens, scores,loss=searcher(input_batch,lengths,max_length)
    decoded_words=[voc.index2word[token.item()] for token in tokens]
    return decoded_words

def evaluateInput(encoder,decoder,searcher,voc):
    input_sentence=''
    while True:
        try:
            input_sentence=input('Human> ')
            
            if input_sentence=='q' or input_sentence=='quit':
                break
            input_sentence=normalizeString(input_sentence)
            output_words=evaluate(encoder,decoder,searcher,voc,input_sentence)
            
            output_words[:]=[x for x in output_words if not(x=="PAD" or x=="EOS")]
            print("Bot:"," ".join(output_words))
            
        except KeyError:
            print("Unknown Word")
            
            
    

In [None]:
encoder.eval()
decoder.eval()

LuongAttnDecoderRNN(
  (embedding): Embedding(7816, 500)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(500, 500, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=1000, out_features=500, bias=True)
  (out): Linear(in_features=500, out_features=7816, bias=True)
  (attn): Attn()
)

In [None]:
searcher=GreedySearchDecoder(encoder,decoder)
evaluateInput(encoder,decoder,searcher,voc)

Human> what do you work ?
Bot: i ain t a dream .
Human> what is it for ?
Bot: nothing . not bad .
Human> quit


In [None]:
def testing(encoder,decoder,encoder_optimizer,decoder_optimizer,pairs,starting_point,n_batches=1000,batch_size=10):
    test_data=data_generation(pairs,batch_size,n_batches,starting_point)

    loss=0
    f1_score=0
    ppl=0

    for i in range(n_batches):
        current_batch=test_data[i]
        input_variable,lengths,target_variable,mask,max_target_len=current_batch

        curr_loss,F1=train(input_variable,lengths,target_variable,mask,max_target_len,encoder,decoder,
                    encoder_optimizer,decoder_optimizer,batch_size,clip,10,True)
        
        loss+=curr_loss
        f1_score+=F1
        ppl+=math.exp(curr_loss)
    
    loss=loss/n_batches
    f1_score=f1_score/n_batches
    ppl=ppl/n_batches

    print("Loss Value: "+str(loss)+" F1_Score: "+str(f1_score)+" Current PPL: "+str(ppl))
        


