In [2]:
import pandas as pd
import os
import re
import unicodedata
import itertools
import random
import torch
import torch.nn as nn

In [3]:
# Corpus directory name under ../data, and the label later handed to the vocabulary.
corpus = "movie_corpus"
corpus_name = "movie_corpus"
# Path of the formatted conversation file (one tab-separated sentence pair per line).
datafile = os.path.join("..", "data", corpus,  "formatted_movie_lines.txt")
# NOTE(review): bare expression in the middle of the cell; it displays nothing.
datafile
# Peek at the first 10 raw lines. The file is opened in binary mode, so every
# line is printed as its bytes repr (b"...").
with open(datafile, "rb") as file:
    lines = file.readlines()
    for line in lines[:10]:
        print(str(line), "\n")
    


b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n" 

b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n" 

b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n" 

b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n" 

b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n" 

b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n" 

b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\n" 

b'Why?\tUnsolved myster

In [4]:
# building vocabulary

In [5]:
# Indices reserved for the special tokens; real words start at index 3.
PAD_token = 0
SOS_token = 1
EOS_token = 2


class Vocabulary():
    """Word <-> index mapping with per-word occurrence counts.

    Attributes:
        name: label given at construction.
        trimmed: True once `trim` has been applied.
        word2index: word -> integer index.
        word2count: word -> number of times the word was added.
        index2word: integer index -> word (pre-seeded with PAD/SOS/EOS).
        num_words: number of entries in `index2word`, special tokens included.
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def addWord(self, w):
        """Register one occurrence of `w`, assigning the next free index if it is new."""
        if w in self.word2index:
            self.word2count[w] += 1
            return
        next_index = self.num_words
        self.word2index[w] = next_index
        self.word2count[w] = 1
        self.index2word[next_index] = w
        self.num_words = next_index + 1

    def addSentence(self, sent):
        """Add every single-space-separated token of `sent` via `addWord`."""
        for token in sent.split(" "):
            self.addWord(token)

    def trim(self, min_cnt):
        """Drop words seen fewer than `min_cnt` times and re-index the survivors.

        Only the first call has an effect. The surviving words are re-added one
        by one, so their counts restart at 1 after trimming.
        """
        if self.trimmed:
            return
        self.trimmed = True

        survivors = [word for word, count in self.word2count.items() if count >= min_cnt]

        # Reset every mapping to its initial state, then re-insert the survivors.
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3
        for word in survivors:
            self.addWord(word)
            
            
            

In [6]:
# Load data


In [7]:
def unicodeToAscii(s):
    """Return `s` NFD-normalized with all characters of category 'Mn' removed.

    In practice this strips accents: each accented letter is decomposed into
    its base letter plus combining marks, and the marks are dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)
def cleanString(s):
    """Normalize a raw sentence for the vocabulary.

    The text is lowercased, stripped, and passed through `unicodeToAscii`; then
    the substitutions below are applied in order and the result is stripped.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),        # put a space in front of . ! ?
        (r"[^a-zA-Z.!?]+", r" "),    # any run of other non-letters becomes one space
        (r"\s+", r" "),              # collapse repeated whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def readVocs(datafile, corpus_name):
    """Read the formatted corpus file and return an empty vocabulary plus cleaned pairs.

    Args:
        datafile: path to a UTF-8 text file with one tab-separated entry per line.
        corpus_name: name given to the new `Vocabulary`.

    Returns:
        (voc, pairs): `voc` is a fresh, still empty `Vocabulary`; `pairs` holds one
        list per line, containing each tab-separated field run through `cleanString`.
    """
    # The context manager closes the file; the previous version never closed it.
    with open(datafile, encoding='utf-8') as handle:
        lines = handle.read().strip().split("\n")

    pairs = [[cleanString(s) for s in l.split("\t")]
             for l in lines]

    voc = Vocabulary(corpus_name)
    return voc, pairs
    


In [8]:
# for sake of training, use only short sentences
def filterPair(p, max_length):
    """Return True when both sentences of `p` have fewer than `max_length` tokens.

    Tokens are counted by splitting on single spaces. The second sentence is
    only inspected when the first one is short enough.
    """
    if len(p[0].split(" ")) >= max_length:
        return False
    return len(p[1].split(" ")) < max_length

def filterPairs(pairs, max_length):
    """Return, in order, the pairs that `filterPair` accepts for `max_length`."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate, max_length):
            kept.append(candidate)
    return kept



In [9]:
def loadData(corpus, corpus_name, datafile, max_length):
    """Build the vocabulary and the length-filtered sentence pairs.

    Reads the pairs with `readVocs`, drops pairs rejected by `filterPairs`,
    and adds both sentences of each remaining pair to the vocabulary. Progress
    counts are printed along the way.

    Args:
        corpus: not used by this function.
        corpus_name: name handed to `readVocs` for the vocabulary.
        datafile: path of the formatted pairs file.
        max_length: length threshold passed to `filterPairs`.

    Returns:
        (voc, pairs): the populated vocabulary and the filtered pairs.
    """
    voc, pairs = readVocs(datafile, corpus_name)
    print(f"{len(pairs)}  Sentence pairs")

    pairs = filterPairs(pairs, max_length)
    print(f"{len(pairs)}  Sentence pairs after trimming")

    # Feed the first, then the second sentence of every pair to the vocabulary.
    for p in pairs:
        for side in (0, 1):
            voc.addSentence(p[side])

    print(f"{voc.num_words}  distinct words in vocabilary")
    return voc, pairs

# Length threshold: filterPair keeps a pair only when both sentences have
# fewer than max_length space-separated tokens.
max_length = 10
# Build the vocabulary and the filtered pairs used by the rest of the notebook.
voc, pairs = loadData(corpus, corpus_name, datafile, max_length)


221282  Sentence pairs
64271  Sentence pairs after trimming
18008  distinct words in vocabilary


In [10]:
# Sanity check: show the last 10 cleaned [first sentence, second sentence] pairs.
print("Example pairs")
for pair in pairs[-10:]:
    print(pair)

Example pairs
['four', 'three minutes to go !']
['three minutes to go !', 'yes .']
['another fifteen seconds to go .', 'do something ! stall them !']
['yes sir name please ?', 'food !']
['food !', 'do you have a reservation ?']
['do you have a reservation ?', 'food ! !']
['grrrhmmnnnjkjmmmnn !', 'franz ! help ! lunatic !']
['what o clock is it mr noggs ?', 'eleven o clock my lorj']
['stuart ?', 'yes .']
['yes .', 'how quickly can you move your artillery forward ?']


In [11]:
# Remove rare words to reduce model complexity and training time, given that we don't have a big dataset


In [12]:
def removeRareWords(voc, all_pairs, minimum):
    """Trim rare words from `voc` and drop every pair that uses a removed word.

    Args:
        voc: vocabulary object; `voc.trim(minimum)` is called first, then
            `voc.word2index` is used for membership tests.
        all_pairs: sequence of pairs; items 0 and 1 of each pair are sentences
            whose tokens are separated by single spaces.
        minimum: count threshold forwarded to `voc.trim`.

    Returns:
        List of the pairs whose two sentences contain only words still present
        in `voc.word2index`, in their original order.
    """
    voc.trim(minimum)
    known_words = voc.word2index

    pairs_to_keep = [
        p for p in all_pairs
        if all(word in known_words
               for sentence in (p[0], p[1])
               for word in sentence.split(" "))
    ]

    # Guard the percentage: an empty input used to raise ZeroDivisionError here.
    kept_percent = 100*len(pairs_to_keep)/len(all_pairs) if len(all_pairs) else 0.0
    print(f"Trimmed from {len(all_pairs)} pairs to {len(pairs_to_keep)} {kept_percent}")
    return pairs_to_keep
        
    

In [13]:
minimum_count = 3
pairs = removeRareWords(voc, pairs, minimum_count)

Trimmed from 64271 pairs to 53165 82.72004481025657


In [14]:
# transform sentence pairs to vectors

In [15]:
def indexFromSentence(voc, sent):
    """Map each space-separated word of `sent` to its index and append EOS_token.

    Raises:
        KeyError: if a word is missing from `voc.word2index`.
    """
    indexes = []
    for token in sent.split(' '):
        indexes.append(voc.word2index[token])
    indexes.append(EOS_token)
    return indexes

def zeroPad(l, fillvalue=PAD_token):
    """Transpose a batch of index lists, padding the shorter ones with `fillvalue`.

    Returns a list with one tuple per position; each tuple holds the value of
    every sequence in `l` at that position.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [step for step in columns]

def inputVar(l, voc):
    """Convert a batch of input sentences to a padded index tensor plus lengths.

    Args:
        l: list of sentences (strings).
        voc: vocabulary used by `indexFromSentence`.

    Returns:
        (padTensor, lengths): `padTensor` is a LongTensor built from the
        transposed, padded batch (one row per position, one column per
        sentence); `lengths` is a tensor with the unpadded length of every
        sentence, EOS included.
    """
    indexes_batch = [indexFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padTensor = torch.LongTensor(zeroPad(indexes_batch))
    # Return the lengths tensor. The previous version returned `indexes_batch`
    # here, leaving `lengths` unused and handing callers a plain list.
    return padTensor, lengths

def getMask(l, value=PAD_token):
    """Build a 0/1 mask with the same nested shape as `l`.

    Padded tokens only exist to give every sequence the same length and must
    not take part in training, so entries equal to `value` become 0 and all
    other entries become 1.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

def outputVar(l, voc):
    """Convert a batch of target sentences to a padded tensor, mask and max length.

    Returns:
        (padTensor, mask, max_target_length): the LongTensor of the transposed,
        padded batch; a BoolTensor of the same layout built from `getMask`;
        and the length of the longest indexed sentence, EOS included.
    """
    indexes_batch = [indexFromSentence(voc, sentence) for sentence in l]
    max_target_length = max(map(len, indexes_batch))
    padded = zeroPad(indexes_batch)
    mask = torch.BoolTensor(getMask(padded))
    return torch.LongTensor(padded), mask, max_target_length


def batch2Train(voc, batch):
    """Turn a batch of sentence pairs into the tensors consumed by training.

    Args:
        voc: vocabulary used to index the sentences.
        batch: iterable of pairs; item 0 is the input sentence, item 1 the target.

    Returns:
        (inp, lengths, output, mask, max_target_length): the two values returned
        by `inputVar` for the input sentences, followed by the three values
        returned by `outputVar` for the target sentences.
    """
    # Order pairs by input length, longest first. `sorted` works on a copy, so
    # the caller's list is no longer reordered as a side effect.
    ordered = sorted(batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)

    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]

    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_length = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_length
        

In [16]:
# Smoke test of the batching helpers on a small random batch.
# Seed the sampler so the same pairs are drawn on every run.
random.seed(0)
test_batch_size = 5
batches = batch2Train(voc, [random.choice(pairs) for _ in range(test_batch_size)])

# NOTE(review): this binds the module-level names `lengths`, `mask` and
# `max_target_len`; `train` below reads `max_target_len` as a global.
input_variable, lengths, target_variable, mask, max_target_len = batches
print("Input:")
print(input_variable)

print("Target:")
print(target_variable)

print("Mask:")
print(mask)


Input:
tensor([[  25,  167,   25,   92,   47],
        [ 260,    4,   68,    7,    7],
        [  76,   25,  746,  192,    6],
        [ 116,  534,  174,    6,    2],
        [ 117,   76,   66,    2,    0],
        [1434,    4,    2,    0,    0],
        [   7,    2,    0,    0,    0],
        [   4,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
Target:
tensor([[1434,   60,   50,  387,  281],
        [  83,    4,   47,   25,    7],
        [   6,    2,    7,  192,   14],
        [   2,    0,  260,    6,  159],
        [   0,    0,    6,    2,    4],
        [   0,    0,    2,    0,    2]])
Mask:
tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True, False,  True,  True,  True],
        [False, False,  True,  True,  True],
        [False, False,  True, False,  True]])


In [17]:
# constructing the model

In [18]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded input sequences.

    Args:
        hidden_size: GRU hidden size; also used as the GRU input size, so the
            embedding is expected to produce vectors of this size.
        embedding: module mapping token indices to vectors.
        n_layers: number of stacked GRU layers.
        dropout: dropout between GRU layers (forced to 0 for a single layer).
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout),
                          bidirectional=True)

    # `forward` used to be nested inside `__init__`, so the module had no
    # forward method and calling it failed. It is now a regular method.
    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: index tensor, expected as (seq_len, batch) since the GRU
                uses the default `batch_first=False`.
            input_lengths: unpadded length of each sequence, in decreasing
                order (`pack_padded_sequence` default).
            hidden: optional initial hidden state for the GRU.

        Returns:
            (outputs, hidden): `outputs` is the sum of the forward and backward
            GRU outputs; `hidden` is the final GRU hidden state.
        """
        embedded = self.embedding(input_seq)
        # pack_padded_sequence requires tensor lengths to live on the CPU.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()
        # Pack so the GRU skips the padded positions.
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        # Unpack back to a padded tensor, then sum the two directions.
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden
        
        

In [19]:
# build  attention

In [20]:
class Attn(nn.Module):
    """Dot-product attention over the encoder outputs."""

    def __init__(self, hidden_state):
        super(Attn, self).__init__()
        self.hidden_state = hidden_state

    def dot_score(self, hidden, encoder_outputs):
        """Return the dot product of `hidden` with every encoder output (sum over dim 2)."""
        return torch.sum(hidden * encoder_outputs, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Compute normalized attention weights.

        Args:
            hidden: current decoder output, expected as (1, batch, hidden).
            encoder_outputs: encoder outputs, expected as (seq_len, batch, hidden).

        Returns:
            Softmax-normalized weights; with the shapes above the result is
            (batch, 1, seq_len).
        """
        attn_energies = self.dot_score(hidden, encoder_outputs)
        # Transpose so the sequence dimension comes last before normalizing.
        attn_energies = attn_energies.t()
        # `torch.softmax` replaces `F.softmax`: `F` was never imported in this
        # notebook, so the old call raised NameError.
        return torch.softmax(attn_energies, dim=1).unsqueeze(1)
    
    

In [21]:
# decoder

In [22]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    Args:
        embedding: module mapping token indices to vectors of size `hidden_size`.
        hidden_size: GRU input and hidden size.
        output_size: size of the output distribution (one entry per word).
        n_layers: number of stacked GRU layers.
        dropout: dropout applied to the embeddings and, for more than one
            layer, between GRU layers.
    """

    def __init__(self, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding

        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout),
                          bidirectional=False)  # the decoder reads one direction only
        self.concat = nn.Linear(2*hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input_step: indices of the current input tokens, expected as (1, batch).
            last_hidden: previous GRU hidden state.
            encoder_outputs: encoder outputs, expected as (seq_len, batch, hidden).

        Returns:
            (output, hidden): `output` holds softmax probabilities over the
            vocabulary for each batch element; `hidden` is the new GRU state.
        """
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Attention weights for the current GRU output.
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of the encoder outputs (the context vector).
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate the context vector with the GRU output.
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict the next word as a probability distribution.
        output = self.out(concat_output)
        # `torch.softmax` replaces `F.softmax`: `F` was never imported in this
        # notebook, so the old call raised NameError.
        output = torch.softmax(output, dim=1)

        return output, hidden
        

In [23]:
# First define the loss measure of the model (the padded tokens should not count towards it)

In [24]:
def NLLMaskLoss(inp, target, mask):
    """Mean negative log-likelihood of the target tokens, ignoring masked-out entries.

    Args:
        inp: (batch, vocab) tensor of probabilities (the decoder's softmax output).
        target: (batch,) tensor of target token indices.
        mask: (batch,) boolean tensor; False marks padded positions.

    Returns:
        (loss, n): the mean of -log(p[target]) over the entries where `mask`
        is True, and the number of such entries as a Python int.
    """
    TotalN = mask.sum()
    # Probability the model assigned to the correct token of each batch element.
    target_prob = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    # Three fixes: the log-probability is negated so that minimizing the loss
    # maximizes likelihood; `masked_select` replaces the non-existent
    # `mask_select`; and the move to an undefined global `device` is dropped,
    # since the result already lives on the device of `inp`.
    CELoss = -torch.log(target_prob)
    loss = CELoss.masked_select(mask).mean()
    return loss, TotalN.item()



In [26]:
def train(input_variable, lengths, target_variable,
          mask, max_target_length, encoder, decoder,
          embedding, encoder_optimizer, decoder_optimizer,
          batch_size, clip, max_length=max_length,
          teacher_forcing_ratio=1.0):
    """Run one optimization step on a single batch and return its average loss.

    Args:
        input_variable: padded input index tensor for the encoder.
        lengths: unpadded length of every input sequence.
        target_variable: padded target index tensor, indexed by time step.
        mask: boolean mask matching `target_variable`; False marks padding.
        max_target_length: number of decoding steps to run.
        encoder, decoder: the two models being trained.
        embedding: not used by this function.
        encoder_optimizer, decoder_optimizer: optimizers of the two models.
        batch_size: number of sequences in the batch.
        clip: maximum gradient norm passed to `clip_grad_norm_`.
        max_length: not used by this function.
        teacher_forcing_ratio: probability of feeding the ground-truth token
            (instead of the model's own prediction) as the next decoder input.

    Returns:
        Masked loss averaged over all non-padded target tokens of the batch.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Put the data where the model lives instead of relying on a global
    # `device`, which this notebook never defines.
    device = next(decoder.parameters()).device
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # pack_padded_sequence needs the lengths on the CPU, whatever the model device.
    lengths = lengths.to("cpu")

    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through the encoder.
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # The decoder starts from one SOS token per sequence and from the
    # encoder's final hidden state.
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide once per batch whether to use teacher forcing.
    use_TF = random.random() < teacher_forcing_ratio

    # Both modes step through time. The non-teacher-forcing branch previously
    # had no loop (and read an unbound `t`), and the loop bound came from the
    # global `max_target_len` rather than this function's own argument.
    for t in range(max_target_length):
        decoder_output, decoder_hidden = decoder(decoder_input,
                                                 decoder_hidden,
                                                 encoder_outputs)
        if use_TF:
            # Next input is the ground-truth token.
            decoder_input = target_variable[t].view(1, -1)
        else:
            # Next input is the decoder's own most likely token.
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)

        mask_loss, nTotal = NLLMaskLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Backpropagate, clip gradients, update weights.
    loss.backward()
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals


In [None]:

# need to create train callers on batches

# split data 

In [32]:
def trainIters(model_name, voc, pairs, encoder, decoder,
               encoder_optimizer, decoder_optimizer,
               embedding, encoder_n_layers, decoder_n_layers, save_dir,
               n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    """Train for `n_iteration` batches, printing progress and saving checkpoints.

    Args:
        model_name, corpus_name, save_dir: used to build the checkpoint directory.
        voc: vocabulary used to index the sampled pairs.
        pairs: sentence pairs to sample training batches from (with replacement).
        encoder, decoder, embedding: the models whose state is trained and saved.
        encoder_optimizer, decoder_optimizer: their optimizers.
        encoder_n_layers, decoder_n_layers: only used in the checkpoint directory name.
        n_iteration: total number of training iterations.
        batch_size: number of pairs per batch.
        print_every: print the mean loss every this many iterations.
        save_every: save a checkpoint every this many iterations.
        clip: gradient clipping norm forwarded to `train`.
        loadFilename: path of a checkpoint to resume from, or a falsy value to
            start at iteration 1.
    """
    # Pre-sample one batch per iteration.
    training_batches = [batch2Train(voc, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(n_iteration)]

    print("starting...")
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        # Read the resume point from the checkpoint file itself. The previous
        # version read a `checkpoint` name this notebook never defines.
        checkpoint = torch.load(loadFilename, map_location="cpu")
        start_iteration = checkpoint['iteration'] + 1

    print("beginning training...")
    for iteration in range(start_iteration, n_iteration + 1):
        # The old line misspelled both `training_batches` and `iteration`,
        # so it raised NameError on the first pass.
        training_batch = training_batches[iteration - 1]
        input_variable, lengths, target_variable, mask, max_target_length = training_batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_length,
                     encoder, decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent done: {:.1f}%; Mean loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        if (iteration % save_every == 0):
            # Take the hidden size from the encoder; `hidden_size` is not
            # defined anywhere in this notebook.
            directory = os.path.join(save_dir, model_name, corpus_name,
                                     '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, encoder.hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
    
    

In [33]:
# Evaluation is tricky as there are no exact answers; many replies are valid

In [34]:
class GreedySearchDencoder(nn.Module):
    """Greedy decoding: at every step pick the token with the highest probability."""

    def __init__(self, encoder, decoder):
        super(GreedySearchDencoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode one input sequence for `max_length` steps.

        Args:
            input_seq: index tensor of the input sentence, passed to the encoder.
            input_length: length information passed to the encoder.
            max_length: number of tokens to generate.

        Returns:
            (all_tokens, all_scores): the chosen token index of every step and
            the decoder score of each chosen token.
        """
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Use this module's own decoder. The previous version read a global
        # `decoder` here, and a global `device` below.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        device = encoder_outputs.device

        # Start from a single SOS token; tokens and scores are appended to
        # these initially empty tensors.
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)

        # Decode one word at a time, always taking the most likely token.
        for _ in range(max_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # The chosen token becomes the next input (add the time dimension back).
            decoder_input = torch.unsqueeze(decoder_input, 0)
        return all_tokens, all_scores
    

        

In [None]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length = max_length):
    indices = [indexFromSentence(voc, sentence)]
    lengths = torch.tensor([len()])
