In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pickle
import numpy as np
import math
%matplotlib inline

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

# GLOVE Vectors
import torchtext.vocab as vocab

#For Batching
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from masked_cross_entropy import *

use_cuda = torch.cuda.is_available()
print(" I have a GPU? ", use_cuda)

MAX_SENT_LENGTH = 25
min_sent_len = 1
min_word_cnt = 3

 I have a GPU?  True


In [37]:
glove = vocab.GloVe(name='6B', dim=100)

print('Loaded {} words'.format(len(glove.itos)))

Loaded 400000 words


In [2]:
class Lang:
    
    def __init__(self, name):
        
        '''
        Store the string token to index token
        mapping in the word2index and index2word
        dictionaries. 
        '''
        
        self.name = name
        self.trimmed = False # gets changed to True first time Lang.trim(min_count) is called
        self.word2index = {"<PAD>" : 0 ,  "<SOS>" : 1, "<EOS>" : 2 , "<UNK>" : 3}
        self.word2count = {}
        self.index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.n_words = len(self.index2word) # Count default tokens
        self.num_nonwordtokens = len(self.index2word)
        self.PAD_token = 0
        self.SOS_token = 1
        self.EOS_token = 2
        self.UNK_token = 3

    def index_sentence(self, sentence):
        '''
        Absorbs a sentence string into the token dictionary
        one word at a time using the index_word function
        increments the word count dictionary as well
        '''
        for word in sentence.split(' '):
            self.index_word(word)

    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    
    def trim(self, min_count):
        '''
        Removes words from our 3 dictionaries that
        are below a certain count threshold (min_count)
        '''
        if self.trimmed: return
        self.trimmed = True
        
        keep_words = []
        
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words %s / %s = %.4f' % (
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.n_words = len(self.index2word) # Count default tokens
        self.num_nonwordtokens = len(self.index2word)
        self.PAD_token = 0
        self.SOS_token = 1
        self.EOS_token = 2
        self.UNK_token = 3

        for word in keep_words:
            self.index_word(word)
            
    def unicode_to_ascii(self, s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn')
            
    def normalize_string(self, s):
        s = self.unicode_to_ascii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        s = re.sub("newlinechar", "", s)
        s = s.replace("'","")
        s = s.replace(".","")
        s = s.replace("n t ","nt ")
        s = s.replace("i m ","im ")
        s = s.replace("t s ","ts ")
        s = s.replace(" s ","s ")
        s = s.replace(" re "," are ")
        s = s.replace("i ve ","ive ")
        s = s.replace(" d ","d ")
        s = ' '.join(s.split())
        return s

    def filterPair(self, p, max_sent_len, min_sent_len):
        
        '''
        Your Preferences here
        '''

        return len(p[0].split(' ')) < max_sent_len and \
               len(p[1].split(' ')) < max_sent_len and \
               len(p[1].split(' ')) > min_sent_len and \
               len(p) == 2 and \
               "https://" not in p[1] 

    
    def make_pairs(self, path_to_tab_sep_dialogue, 
                   max_sent_len = 20, min_sent_len = 4):

        print("making final_pairs list ...")
        lines = open(path_to_tab_sep_dialogue).read().strip().split('\n')
        
        final_pairs = []
        i = 0
        for l in lines:
            
            pair = [self.normalize_string(sentence) for sentence in l.split('\t')]
            
            if self.filterPair(pair,max_sent_len, min_sent_len):
                
                filtered_pair = []
                
                for sentence in pair:

                    self.index_sentence(sentence)
                    filtered_pair.append(sentence)
                  
                final_pairs.append(filtered_pair)
        print("number of pairs", len(final_pairs))
        return final_pairs
    
    def tokens2glove(self, min_word_count,glove, mbed_dim = 50):
    
        print("trimming...")    
        self.trim(min_word_count)
        
        if glove is None:
            glove = vocab.GloVe(name='6B', dim=embed_dim)
            print('Loaded {} words'.format(len(glove.itos)))
        else:
            embed_dim = glove.vectors.size(1)
                    
        print("building embedding from glove...")
        embedding = np.zeros((len(self.index2word), embed_dim)).astype(np.float32)
        for i in range(self.num_nonwordtokens):
            embedding[i,:] = np.random.uniform(-1,1,embed_dim).astype(np.float32)
        for i in range(self.num_nonwordtokens,len(self.index2word)):
            if self.index2word[i] in glove.stoi:
                embedding[i,:] = glove.vectors[glove.stoi[self.index2word[i]]]
            else:
                embedding[i,:] = np.random.uniform(-1,1,embed_dim).astype(np.float32)
        
        return self.index2word, self.word2index, embedding, self.n_words #torch.from_numpy(embeddings).float() 
    

In [39]:
MAX_SENT_LENGTH = 20
min_sent_len = 4
min_word_cnt = 3
lang = Lang("chat")
pairs = lang.make_pairs("../data/1-25.txt", 
                              max_sent_len = MAX_SENT_LENGTH, min_sent_len = min_sent_len)

index2word, word2index, embedding, vocab_size = lang.tokens2glove(min_word_cnt ,glove)

making final_pairs list ...
number of pairs 257736
trimming...
keep_words 28704 / 46316 = 0.6197
building embedding from glove...


In [17]:
picklefile = (index2word, word2index, embedding, pairs)
pickle.dump(picklefile, 
            open( "saved_pickle/index2word28704_word2index_embedding_257736pairs.p", "wb" ) )

In [3]:
index2word, word2index, embedding, final_pairs  = \
pickle.load( open( "saved_pickle/index2word28704_word2index_embedding_257736pairs.p", "rb" ) )

lang = Lang("chat")
lang.index2word = index2word
lang.word2index = {**lang.word2index, **word2index}
MAX_SENT_LENGTH = 20

In [4]:
print(embedding.shape, len(index2word), len(lang.word2index))

(28708, 100) 28708 28708


In [4]:
##########  Converts [" input string ", " output string "] (pair) , appends <EOS> index and returns indices ######

def indexesFromSentence(lang, sentence):
    '''
    account for strings not in the vocabulary by using the unknown token
    '''
    sentence_as_indices = []
    sentence = lang.normalize_string(sentence)
    for word in sentence.split(' '):
        if word in lang.word2index:
            sentence_as_indices.append(lang.word2index[word])
        else:
            sentence_as_indices.append(lang.UNK_token)
            
    sentence_as_indices.append(lang.EOS_token)
    
    return sentence_as_indices


def variableFromSentence(lang, sentence):
    '''
    add EOS token to sequence of idices and make a column vector
    '''
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(lang.EOS_token)
    result = Variable(torch.LongTensor(indexes).view(-1, 1))
    if use_cuda:
        return result.cuda()
    else:
        return result


def variablesFromPair(pair,lang):
    input_variable = variableFromSentence(lang, pair[0])
    target_variable = variableFromSentence(lang, pair[1])
    return (input_variable, target_variable)

######## the pair indices are returned as 2 LongTensor Variables in torch #############


######## Tells you how long youve been training and how much longer you have left ####

import time
import math

def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

######################################################################3

############### plot_losses #######################################

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    
####################################################################

In [5]:
class EncoderRNN(nn.Module):
    
    def __init__(self, hidden_size, embedding,
                 num_layers = 3, bidirectional = False, train_embedding = True):
        
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        embedding = torch.from_numpy(embedding).float()
        if use_cuda:
            embedding.cuda()
        self.embedding = nn.Embedding(embedding.shape[0], embedding.shape[1])
        self.embedding.weight = nn.Parameter(embedding, requires_grad=train_embedding)
        self.gru = nn.GRU(embedding.shape[1], hidden_size, num_layers, bidirectional=bidirectional)
        
        if bidirectional:
            num_directions = 2
        else:
            num_directions = 1
        
        # make the initial hidden state learnable as well 
        hidden0 = torch.zeros(self.num_layers*num_directions, 1, self.hidden_size)
        
        if use_cuda:
            hidden0 = hidden0.cuda()
        else:
            hidden0 = hidden0

        self.hidden0 = nn.Parameter(hidden0, requires_grad=True)

    def forward(self, input_seqs, input_lengths, hidden):
        
        if use_cuda:
            input_seqs.cuda()
        batch_size = input_seqs.size(1)
        hidden = self.hidden0.repeat(1, batch_size, 1)

        self.embedded = self.embedding(input_seqs)
        #self.packed = torch.nn.utils.rnn.pack_padded_sequence(self.embedded, input_lengths)
        #output, hidden = self.gru(self.packed, hidden)
        output, hidden = self.gru(self.embedded, hidden)
        #output, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(output) # unpack (back to padded)
        
        if self.bidirectional:
            output = output[:, :, :self.hidden_size] + output[:, : ,self.hidden_size:] # Sum bidirectional outputs
        
        # ouput (max_len x batch_size x hidden_size)
        # hidden ( n_layers * 2(if bidirectional) x batch_size x hidden_size )   
        return output, hidden  
    

    def initHidden(self):
        
        if use_cuda:
            return self.hidden0.cuda()
        else:
            return self.hidden0

In [6]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):
        max_len = encoder_outputs.size(0)
        this_batch_size = encoder_outputs.size(1)

        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(this_batch_size, max_len)) # B x S

        if use_cuda:
            attn_energies = attn_energies.cuda()

        # For each batch of encoder outputs
        for b in range(this_batch_size):
            # Calculate energy for each encoder output
            for i in range(max_len):
                attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))

        # Normalize energies to weights in range 0 to 1, resize to 1 x B x S
        return F.softmax(attn_energies).unsqueeze(1)
    
    def score(self, hidden, encoder_output):
        
        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.dot(energy)
            return energy
        
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = self.v.dot(energy)
            return energy

In [7]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)
        batch_size = input_seq.size(0)
        #hidden = self.hidden0.repeat(1, batch_size, 1)
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size) # S=1 x B x N, view is like reshape()

        # Get current hidden state from input word and last hidden state
        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x S=1 x N

        # Attentional vector using the RNN hidden state and context vector
        # concatenated together (Luong eq. 5)
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
        context = context.squeeze(1)       # B x S=1 x N -> B x N
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = F.tanh(self.concat(concat_input))

        # Finally predict next token (Luong eq. 6, without softmax)
        output = self.out(concat_output)

        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

In [8]:
# Pad a with the PAD symbol
def pad_seq(seq, max_length):
    seq += [lang.PAD_token for i in range(max_length - len(seq))]
    return seq

def random_batch(batch_size):
    input_seqs = []
    target_seqs = []

    # Choose random pairs
    for i in range(batch_size):
        pair = random.choice(pairs)
        input_seqs.append(indexesFromSentence(lang, pair[0]))
        target_seqs.append(indexesFromSentence(lang, pair[1]))

    # Zip into pairs, sort by length (descending), unzip
    seq_pairs = sorted(zip(input_seqs, target_seqs), key=lambda p: len(p[0]), reverse=True)
    input_seqs, target_seqs = zip(*seq_pairs)
    
    # For input and target sequences, get array of lengths and pad with 0s to max length
    input_lengths = [len(s) for s in input_seqs]
    input_padded = [pad_seq(s, max(input_lengths)) for s in input_seqs]
    target_lengths = [len(s) for s in target_seqs]
    target_padded = [pad_seq(s, max(target_lengths)) for s in target_seqs]

    # Turn padded arrays into (batch_size x max_len) tensors, transpose into (max_len x batch_size)
    input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
    target_var = Variable(torch.LongTensor(target_padded)).transpose(0, 1)
    
    if use_cuda:
        input_var = input_var.cuda()
        target_var = target_var.cuda()
        
    return input_var, input_lengths, target_var, target_lengths

In [9]:
def train(input_batches, input_lengths, target_batches, target_lengths, 
          encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 
          max_length=MAX_SENT_LENGTH ):
    
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0 # Added onto for each word

    # Run words through encoder
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([lang.SOS_token] * batch_size))
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

    max_target_length = max(target_lengths)
    all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))

    # Move new Variables to CUDA
    if use_cuda:
        decoder_input = decoder_input.cuda()
        all_decoder_outputs = all_decoder_outputs.cuda()

    # Run through decoder one time step at a time
    for t in range(max_target_length):
        decoder_output, decoder_hidden, decoder_attn = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )

        all_decoder_outputs[t] = decoder_output
        decoder_input = target_batches[t] # Next input is current target

    # Loss calculation and backpropagation
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(), # -> batch x seq
        target_batches.transpose(0, 1).contiguous(), # -> batch x seq
        target_lengths
    )
    loss.backward()
    
    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)

    # Update parameters with optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0], ec, dc

In [10]:
# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every
#print('encoder_outputs', encoder_outputs.size()) # max_len x batch_size x hidden_size
#print('encoder_hidden', encoder_hidden.size()) # n_layers * 2 x batch_size x hidden_size

# Configure models
dropout = 0.1
attn_model = 'general'
hidden_size = 512
num_layers = 2
bidirectional = True
vocab_size = len(lang.index2word)

encoder = EncoderRNN(hidden_size, embedding, num_layers = num_layers, 
                     bidirectional = bidirectional,
                     train_embedding = True)

decoder = LuongAttnDecoderRNN(attn_model, hidden_size, vocab_size, 
                              n_layers = num_layers, dropout=dropout)

if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()




# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 0.5



In [None]:
# Begin!
ecs = []
dcs = []
eca = 0
dca = 0

batch_size = 128
n_epochs = 2000
epoch = 0
print_every = 5
gamma = .99
learning_rate = 0.001
decoder_learning_ratio = 4.0
# Initialize optimizers and criterion

encoder_optimizer = optim.Adam(encoder.parameters(),
                               lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), 
                               lr=learning_rate * decoder_learning_ratio)

escheduler = optim.lr_scheduler.StepLR(encoder_optimizer, 
                                       step_size=print_every, gamma=gamma) 

dscheduler = optim.lr_scheduler.StepLR(decoder_optimizer, 
                                        step_size=print_every, gamma=gamma) 

criterion = nn.CrossEntropyLoss()

encoder.cuda()
decoder.cuda()

while epoch < n_epochs:
    epoch += 1
    
    # Get training data for this cycle
    input_batches, input_lengths, target_batches, target_lengths = random_batch(batch_size)

    # Run the train function
    loss, ec, dc = train(
        input_batches, input_lengths, target_batches, target_lengths,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer, criterion
    )
    
    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss
    eca += ec
    dca += dc

    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)
        print(sample())
        #print(chat("what are you doing ?"))
        
    escheduler.step()
    dscheduler.step()

  log_probs_flat = functional.log_softmax(logits_flat)
  seq_range = torch.range(0, max_len - 1).long()


2m 48s (- 1123m 26s) (5 0%) 7.8858
one of those outside agitators
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
4m 7s (- 819m 24s) (10 0%) 7.6177
why should we care ? because love thats why
<EOS>
5m 26s (- 719m 54s) (15 0%) 6.8275
two hundred dollars a toe
<EOS>
6m 45s (- 669m 46s) (20 1%) 6.5146
no youre not but a presence in your house is not something to be taken lightly
<EOS>
8m 6s (- 639m 58s) (25 1%) 6.6599
i think you have the wrong number
you <EOS>
9m 24s (- 617m 32s) (30 1%) 6.5687
i had it coming from someone
i i i <EOS>
10m 43s (- 602m 4s) (35 1%) 6.6667
whats all that wood ?
i i i i i i i i i i i i i i i i <EOS>
12m 2s (- 590m 7s) (40 2%) 6.5676
yes it is would you tell me your name again please ?
you you <EOS>
13m 18s (- 578m 3s) (45 2%) 6.5986
the duke ? what did you do ?
i i i i i i i i i i i you you <EOS>
14m 36s (- 569m 32s) (50 2%) 6.5689
oh angela ! go with these trappers ! theyll lead you safely down the mountain
you you you you you you you you you you <EOS>
15m 54s (- 56

In [108]:
name = "_backwardseq2seq_2L_512h_bi_"

torch.save(encoder.state_dict(), "saved_params/encoder"+name+".pth")
torch.save(decoder.state_dict(), "saved_params/decoder"+name+".pth")

encodercpu = encoder.cpu()
decodercpu = decoder.cpu()

torch.save(encodercpu.state_dict(), "saved_params/encoder"+name+"cpu.pth")
torch.save(decodercpu.state_dict(), "saved_params/decoder"+name+"cpu.pth")

encoder.cuda()
decoder.cuda()


LuongAttnDecoderRNN(
  (embedding): Embedding(28708, 512)
  (embedding_dropout): Dropout(p=0.1)
  (gru): GRU(512, 512, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=1024, out_features=512)
  (out): Linear(in_features=512, out_features=28708)
  (attn): Attn(
    (attn): Linear(in_features=512, out_features=512)
  )
)

In [11]:
name = "_backwardseq2seq_2L_512h_bi_"
encoder.load_state_dict(torch.load("saved_params/encoder"+name+".pth"))
decoder.load_state_dict(torch.load("saved_params/decoder"+name+".pth"))
#encoder.cuda()
#decoder.cuda()

In [None]:
def chat(put_seq, max_length=MAX_SENT_LENGTH):

    #input_batches, input_lengths, target_batches, target_lengths = random_batch(batch_size)
    #input_lengths = [len(input_seq)]
    #input_seqs = [indexesFromSentence(lang, input_seq)]
    #input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
    #[torch.cuda.LongTensor of size 9x1 (GPU 0)]
    
    put_seqs = []

    print(put_seq)
    put_seqs.append(indexesFromSentence(lang, put_seq))
    
    # For input and target sequences, get array of lengths and pad with 0s to max length
    put_lengths = [len(s) for s in put_seqs]
    put_padded = [pad_seq(s, max(put_lengths)) for s in put_seqs]

    # Turn padded arrays into (batch_size x max_len) tensors, transpose into (max_len x batch_size)
    put_batches = Variable(torch.LongTensor(put_padded)).transpose(0, 1)
    
    if use_cuda:
        
        put_batches =  put_batches.cuda()
    
    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)
    
    # Run through encoder
    encoder_puts, encoder_hid = encoder(put_batches, put_lengths, None)
    #input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
    
    # Prepare input and output variables
    decoder_put = Variable(torch.LongTensor([lang.SOS_token]))
    decoder_hid = encoder_hid[:decoder.n_layers] # Use last (forward) hidden state from encoder

    #max_target_length = max(target_lengths)
    #all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, decoder.output_size))

    # Move new Variables to CUDA
    if use_cuda:
        decoder_put = decoder_put.cuda()
    
    decoded_words = []
    
    # Run through decoder
    for di in range(max_length):
        
        decoder_put, decoder_hid, decoder_attention = decoder(
            decoder_put, decoder_hid, encoder_puts
        )

        # Choose top word from output
        topv, topi = decoder_put.data.topk(1)
        ni = topi[0][0]
        if ni == lang.EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(lang.index2word[ni])
            
        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        if use_cuda:
            decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)
    
    return ' '.join(decoded_words)
        
chat("what are you doing ?")

In [12]:
def sample(max_length=MAX_SENT_LENGTH):
  
    in_seqs = []
    pair = random.choice(pairs)
    in_seqs.append(indexesFromSentence(lang,pair[0]))
    print(pair[0])
    # For input and target sequences, get array of lengths and pad with 0s to max length
    in_lengths = [len(s) for s in in_seqs]
    in_padded = [pad_seq(s, max(in_lengths)) for s in in_seqs]

    # Turn padded arrays into (batch_size x max_len) tensors, transpose into (max_len x batch_size)
    in_var = Variable(torch.LongTensor(in_padded)).transpose(0, 1)
    
    if use_cuda:
        in_var = in_var.cuda()
    
    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)
    
    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(in_var, in_lengths, None)
    #input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([lang.SOS_token]))
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder
    #print(decoder_hidden.size())
    # Move new Variables to CUDA
    if use_cuda:
        decoder_input = decoder_input.cuda()
    
    decoded_words = []
    
    # Run through decoder
    for di in range(max_length):
        
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )

        # Choose top word from output
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == lang.EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(lang.index2word[ni])
            
        # Next input is chosen word
        decoder_input = Variable(torch.LongTensor([ni]))
        if use_cuda:
            decoder_input = decoder_input.cuda()

    # Set back to training mode
    encoder.train(True)
    decoder.train(True)

    return ' '.join(decoded_words)

In [14]:
#REVERSE DIRECTION

pairs = final_pairs

len(pairs)

reverse_pairs = []

for pair in pairs:
    reverse_pairs.append([pair[1],pair[0]])
    
pairs = reverse_pairs

In [15]:
pairs[0]

['sonnys and rivers only bbq places i know about in gnv',
 'why are you eating at sonnys ?']

In [25]:
# holds a candidate to be used in beam search.
# we need to store the current log_prob, hidden_state, decoded_words, 

logSoftMaxFunc = nn.LogSoftmax()
realSoftMax = nn.Softmax()

class BeamSearchCandidate:
    
    # should start using [SOS]
    def __init__(self, encoder_outputs, hidden_state, next_word_idx, log_prob, decoded_words, is_eos):
        self.encoder_outputs = encoder_outputs
        self.hidden_state = hidden_state
        self.seq_log_prob = log_prob # init
        self.decoded_seq_idx = decoded_words
        self.next_word_idx = next_word_idx
        self.is_eos = is_eos
    
    # return a list of BeamSearchCandidate by feeding next word into rnn_decoder 
    def feed_word_and_get_new_candidates(self, rnn_decoder, beam_width):
        
        # if you've already reached the EOS, don't return anymore
        # todo: add some parameters for beam search to fix the length problem
        if (self.is_eos):
            print("Got EOS output for this candidate already...not creating more candidates")
            return [self]
        
        else:      
            
            next_word_var = Variable(torch.LongTensor([self.next_word_idx]))
            
            if use_cuda:
                next_word_var.cuda()
                self.hidden_state.cuda()
                self.encoder_outputs.cuda()
                
            print(self.encoder_outputs)
            decoder_output, decoder_hidden, decoder_attention = rnn_decoder(
                next_word_var, self.hidden_state, self.encoder_outputs)

            log_probs = logSoftMaxFunc(decoder_output)

            top_log_probs, top_i = log_probs.data.topk(beam_width)

            new_candidates = []
            for b in range(0, beam_width):
                curr_word = top_i[0][b]
                curr_word_log_prob = top_log_probs[0][b]
                is_eos_tmp = False
                if (curr_word == lang.EOS_token):
                    is_eos_tmp = True
                # update log prob
                new_log_prob = self.seq_log_prob + curr_word_log_prob
                # update word seq
                new_decoded_word_seq = list(self.decoded_seq_idx)
                new_decoded_word_seq.append(curr_word)
                # create new candidate and append to list
                new_beam_search_candidate = BeamSearchCandidate(self.encoder_outputs, decoder_hidden, curr_word, new_log_prob, new_decoded_word_seq, is_eos_tmp)
                new_candidates.append(new_beam_search_candidate)

            return new_candidates

    # return the decoded sequence along with it's probability 
    # could just keep it as log_prob since this is probably going to be really small
    def get_decoded_words_and_prob(self):
        decoded_words = []
        for idx in self.decoded_seq_idx:
            decoded_words.append(lang.index2word[idx])
        return (decoded_words, np.exp(self.seq_log_prob))

def chat_using_beam_search(input_seq, max_length=MAX_SENT_LENGTH):

    #input_batches, input_lengths, target_batches, target_lengths = random_batch(batch_size)
    #input_lengths = [len(input_seq)]
    #input_seqs = [indexesFromSentence(lang, input_seq)]
    #input_batches = Variable(torch.LongTensor(input_seqs), volatile=True).transpose(0, 1)
    #[torch.cuda.LongTensor of size 9x1 (GPU 0)]
    
    input_seqs = []

    print(input_seq)
    input_seqs.append(indexesFromSentence(lang, input_seq))
    
    # For input and target sequences, get array of lengths and pad with 0s to max length
    input_lengths = [len(s) for s in input_seqs]
    input_padded = [pad_seq(s, max(input_lengths)) for s in input_seqs]

    # Turn padded arrays into (batch_size x max_len) tensors, transpose into (max_len x batch_size)
    input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
    
    if use_cuda:
        input_var = input_var.cuda()
    
    # Set to not-training mode to disable dropout
    encoder.train(False)
    decoder.train(False)
    
    # Run through encoder
    encoder_outputs, encoder_hidden = encoder(input_var, input_lengths, None)
    #encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)
    #input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([lang.SOS_token]))
    decoder_hidden = encoder_hidden[:decoder.n_layers] # Use last (forward) hidden state from encoder

    # Move new Variables to CUDA
    if use_cuda:
        decoder_input = decoder_input.cuda()
    
    ################# define beam search constants ######################
    
    beam_width = 3  # how many words to explore at each stage
    max_candidates = 10 # max number of sequences you want to consider at one time
    num_iterations = 10 # this should really be equal to the max length ???
    num_final_candidates = 5 # number of candidates you actually want to return
    
    ################ start beam search ##############################
    
    # initially we feed in SOS, we have no words in output so far, and log prob is 1.0 
    starting_candidate = BeamSearchCandidate(encoder_outputs, decoder_hidden, lang.SOS_token, 1.0, [], False)    
    curr_candidates = [starting_candidate]
    
    for i in range(0, num_iterations):

        candidates_for_next_iteration = []
        for can in curr_candidates:
            next_candidates = can.feed_word_and_get_new_candidates(decoder, beam_width)
            # todo: does python have flatMap(...) like scala? would be way easier
            for next_can in next_candidates:
                candidates_for_next_iteration.append(next_can)
        curr_candidates = candidates_for_next_iteration
        if (len(curr_candidates) > max_candidates):
            # prune the list of candidates down, sort by log prob of sequence
            # could be sped up using a priority queue or something ?
            print("We have {} candidates but the max is {}. Only keeping the top {} candidates".format(len(curr_candidates), max_candidates, max_candidates))  
            curr_candidates = sorted(curr_candidates, key = lambda can : can.seq_log_prob, reverse=True)[0:max_candidates]
    
    # sort by log prob once more for final output
    final_candidates = sorted(curr_candidates, key = lambda can : can.seq_log_prob, reverse=True)[0:num_final_candidates]
    
    for can in final_candidates:
        print(can.get_decoded_words_and_prob())


    # Set back to training mode
    encoder.train(True)
    decoder.train(True)
    
    return final_candidates
        
chat_using_beam_search("is this working ?")

is this working ?
Variable containing:
( 0 ,.,.) = 
 -7.7843e-02 -1.7828e-01 -9.6904e-01  ...  -4.5919e-02  8.6338e-01 -1.0753e+00

( 1 ,.,.) = 
 -8.1155e-02 -1.6344e-01 -9.5222e-01  ...  -4.4829e-02  9.8395e-01 -8.8712e-01

( 2 ,.,.) = 
 -8.3334e-02 -1.4063e-01 -4.5165e-01  ...  -4.2915e-02  9.9480e-01 -5.2019e-01

( 3 ,.,.) = 
 -8.4853e-02 -1.1292e-01 -2.9206e-05  ...  -4.1338e-02  9.9724e-01 -2.1333e-01
[torch.cuda.FloatTensor of size 4x1x512 (GPU 0)]



TypeError: torch.index_select received an invalid combination of arguments - got ([32;1mtorch.cuda.FloatTensor[0m, [32;1mint[0m, [31;1mtorch.LongTensor[0m), but expected (torch.cuda.FloatTensor source, int dim, torch.cuda.LongTensor index)

In [None]:
you are ?
i know im got to be <UNK> a little <UNK> <UNK> <UNK> <UNK> a little <UNK> <UNK> <UNK> a little

In [39]:
input_batches, input_lengths, target_batches, target_lengths = random_batch(5)
input_batches

Variable containing:
  6749   4702   3820  17427   7921
 17823  12660  26822  23654      2
 26504   2791  20437      2      0
 10831  19227  17412      0      0
 23730  26520  26664      0      0
  7921    251  11707      0      0
  9743  26822   1696      0      0
 20229   7475   5651      0      0
 19240  15364  20434      0      0
 17275   7838      2      0      0
  9367   1628      0      0      0
  4404   2791      0      0      0
 23264   8045      0      0      0
 20561  23916      0      0      0
 20229  11864      0      0      0
 26289  20229      0      0      0
 22168   1696      0      0      0
 25704   7341      0      0      0
  6397      2      0      0      0
     2      0      0      0      0
[torch.cuda.LongTensor of size 20x5 (GPU 0)]

In [97]:
indexesFromSentence(lang, "are you my friend?")

[18331, 26520, 27686, 23722, 20434, 2]