### Discussion


### Changes
- Used an LSTM instead of a GRU, and Adam instead of SGD.
- Dropout is not used; it was left out just for experimentation.

In [205]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable
import torch.optim as optim
import io
import unicodedata
import unicodedata
import string
import re
import random

import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from IPython.core.debugger import Tracer
from nltk.translate.bleu_score import sentence_bleu
from nltk import word_tokenize

# Shared softmax layer applied to decoder scores before picking the top word.
softmax = nn.Softmax()
# True when a CUDA device is available; tensor helpers check this before calling .cuda().
use_cuda = torch.cuda.is_available()

In [227]:
# Data acquisition
# Reserved vocabulary indices for the start-of-sentence and end-of-sentence markers.
SOS_token, EOS_token = 0, 1


class Lang:
    """Vocabulary of one language: word <-> index maps plus word frequencies.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
    first real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Bump the count of a known word, or assign the next free index to a new one."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1
            
# Strip accents from a Unicode string, following
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` NFD-decomposed with every combining mark (category 'Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Lowercase, trim, strip accents, and reduce everything but letters and . ! ? to spaces
def normalizeString(s):
    """Normalise a raw sentence for tokenisation on single spaces.

    The text is lowercased, stripped and de-accented; a space is inserted
    before each '.', '!' or '?'; every run of other non-letter characters is
    replaced by a single space.
    """
    ascii_text = unicodeToAscii(s.lower().strip())
    punctuation_spaced = re.sub(r"([.!?])", r" \1", ascii_text)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", punctuation_spaced)


# Sentences must have fewer than this many space-separated tokens to be kept.
MAX_LENGTH = 10

# Accepted openings for the second sentence of a pair (passed to str.startswith).
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re ",
)


def filterPair(p):
    """Tell whether a sentence pair is short enough and starts with an accepted prefix.

    Both p[0] and p[1] must split into fewer than MAX_LENGTH tokens, and p[1]
    must begin with one of `eng_prefixes`.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return, in order, the pairs for which filterPair is true."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate):
            kept.append(candidate)
    return kept


def readLangs(lang1, lang2, reverse=False):
    """Load the tab-separated corpus 'data_att/<lang1>-<lang2>.txt'.

    Each line is split on tabs and every field is passed through
    normalizeString.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: when true, each pair is reversed and the two languages
            swap their input/output roles.

    Returns:
        (input_lang, output_lang, pairs): two empty Lang vocabularies and
        the list of normalised sentence pairs.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager guarantees the
    # handle is closed even if reading or decoding fails.
    with io.open('data_att/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

def prepareData(lang1, lang2, reverse=False):
    """Read the corpus, filter the sentence pairs and build both vocabularies.

    Returns (input_lang, output_lang, pairs), where `pairs` holds only the
    pairs accepted by filterPairs and the two Lang objects have been fed
    every kept sentence.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for kept_pair in pairs:
        input_lang.addSentence(kept_pair[0])
        output_lang.addSentence(kept_pair[1])

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)
    return input_lang, output_lang, pairs

# Build the vocabularies and the filtered pair list with the pairs reversed,
# then print one random pair as a sanity check.
input_lang, output_lang, pairs = prepareData('eng', 'fra', reverse=True)
print(random.choice(pairs))


# Helpers that turn sentences into (optionally CUDA) index tensors
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of `sentence` to its index in `lang.word2index`."""
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lang.word2index[token])
    return indexes

def variableFromSentence(lang, sentence):
    """Encode `sentence` as a column of word indices terminated by EOS_token.

    Returns a Variable wrapping a LongTensor reshaped to (n_tokens + 1, 1),
    moved to the GPU when `use_cuda` is set.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = Variable(torch.LongTensor(token_ids).view(-1, 1))
    return column.cuda() if use_cuda else column

def variablesFromPair(pair):
    """Encode pair[0] with input_lang and pair[1] with output_lang.

    Returns the two index Variables as an (input, target) tuple.
    """
    return (
        variableFromSentence(input_lang, pair[0]),
        variableFromSentence(output_lang, pair[1]),
    )

def compute_bleu(reference_sentence, predicted_sentence):
    """
    Compute the sentence-level BLEU similarity between a reference sentence and a predicted sentence.

    Both sentences are lowercased and tokenized with word_tokenize; the
    reference is passed to sentence_bleu as the only reference.
    """
    reference_tokens = word_tokenize(reference_sentence.lower())
    hypothesis_tokens = word_tokenize(predicted_sentence.lower())
    references = [reference_tokens]
    return sentence_bleu(references, hypothesis_tokens)

Reading lines...
Read 135842 sentence pairs
Trimmed to 10853 sentence pairs
Counting words...
Counted words:
('fra', 4489)
('eng', 2925)
[u'je suis sur que tu vas reussir .', u'i m sure that you ll succeed .']


In [90]:
class Encoder(nn.Module):
    """LSTM encoder that consumes one token index per forward call."""

    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # A trainable embedding of width hidden_size replaces a one-hot input
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed `input`, reshape it to (1, 1, -1) and run one LSTM step.

        Returns the LSTM's (output, hidden) pair unchanged.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.lstm(step_input, hidden)

    def initHidden(self):
        """Return an all-zero (h, c) pair of shape (1, 1, hidden_size), on GPU if use_cuda."""
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        if not use_cuda:
            return (zeros, zeros)
        return (zeros.cuda(), zeros.cuda())

class Decoder(nn.Module):
    """LSTM decoder without attention; emits unnormalised scores over the output vocabulary."""

    def __init__(self, hidden_size, output_size):  # the LSTM input width equals hidden_size
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Embed one token, advance the LSTM one step and project to vocabulary scores.

        Returns (scores, hidden); no softmax is applied here.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        lstm_out, hidden = self.lstm(embedded, hidden)
        return self.out(lstm_out), hidden

    def initHidden(self):
        """Return an all-zero (h, c) pair of shape (1, 1, hidden_size), on GPU if use_cuda."""
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        if not use_cuda:
            return zeros, zeros
        return (zeros.cuda(), zeros.cuda())
        
class AttentionDecoder(nn.Module):
    """LSTM decoder with attention over a fixed-length buffer of encoder outputs.

    Like Decoder, forward() returns unnormalised vocabulary scores of shape
    (1, 1, output_size), so the same loss can be used with either decoder.
    """

    def __init__(self, hidden_size, output_size, max_length):
        super(AttentionDecoder, self).__init__()
        # input size = hidden size assumed

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        # dropout not used for now

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Scores one attention weight per encoder position from [embedding; h]
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Merges [embedding; attention context] back down to hidden_size
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        # ignored dropout
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step with attention.

        Args:
            input: index tensor holding a single token.
            hidden: LSTM state tuple (h, c), each of shape (1, 1, hidden_size).
            encoder_outputs: tensor of shape (max_length, hidden_size).

        Returns:
            (output, hidden, attn_weights): scores of shape
            (1, 1, output_size), the updated (h, c) state, and the attention
            weights of shape (1, max_length).
        """
        embedded = self.embedding(input).view(1, 1, -1)

        # Attention is conditioned on the LSTM's h state only, not on c.
        h_prev = hidden[0]
        attn_scores = self.attn(torch.cat((embedded[0], h_prev[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)

        # Weighted sum of encoder outputs: (1, 1, L) x (1, L, H) -> (1, 1, H)
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], context[0]), 1)
        lstm_input = F.relu(self.attn_combine(combined).unsqueeze(0))

        output, hidden = self.lstm(lstm_input, hidden)
        output = self.out(output)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero (h, c) pair of shape (1, 1, hidden_size), on GPU if use_cuda."""
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return (result.cuda(), result.cuda())
        else:
            return (result, result)

In [188]:
def train(input_var, target_var, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) sentence pair.

    Args:
        input_var: index tensor for the source sentence, one token per row.
        target_var: index tensor for the target sentence, one token per row.
        encoder, decoder: the models to train.
        encoder_optimizer, decoder_optimizer: optimisers for the two models.
        criterion: loss applied to each decoder output and its target token.
        max_length: number of rows allocated for the encoder-output buffer.

    Returns:
        (mean_loss, output_sentence): the summed loss divided by the target
        length, and the list of words the decoder ranked first at each step.

    `teacher_forcing_ratio` and `output_lang` are read from module globals.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # no of words in input and target
    input_length = input_var.size()[0]
    target_length = target_var.size()[0]

    # Size the buffer from the arguments rather than from the globals
    # MAX_LENGTH / hidden_size, so the max_length parameter is honoured.
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    encoder_hidden = encoder.initHidden()
    for ei in range(input_length):
        enc_output, encoder_hidden = encoder(input_var[ei], encoder_hidden)
        # Collected but not passed to the plain Decoder used here.
        encoder_outputs[ei] = enc_output[0][0]

    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    # The decoder starts from the encoder's final (h, c) state.
    decoder_hidden = encoder_hidden

    loss = 0
    output_sentence = []
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        # Softmax is monotonic, so the top index of the raw scores is the
        # same as the top index of the probabilities.
        topi = decoder_output[0][0].data.topk(1)[1]
        # int() gives a plain Python key even when indexing yields a tensor.
        ni = int(topi[0])
        output_sentence.append(output_lang.index2word[ni])

        # Per step: feed the true token (teacher forcing) or the prediction.
        if random.random() < teacher_forcing_ratio:
            decoder_input = target_var[di]
        else:
            decoder_input = Variable(torch.LongTensor([ni]))
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        loss += criterion(decoder_output[0], target_var[di])

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    # .item() where available; older PyTorch exposes the value as loss.data[0].
    total_loss = loss.item() if hasattr(loss, 'item') else loss.data[0]
    return (total_loss / target_length, output_sentence)

In [229]:
# Model dimensions are assigned here, before the models are built, so this cell
# runs on a fresh kernel without relying on names assigned in a later cell.
hidden_size = 256
input_size = input_lang.n_words    # source vocabulary size
output_size = output_lang.n_words  # target vocabulary size

encoder = Encoder(input_size, hidden_size)
decoder = Decoder(hidden_size, output_size)

# Input tensors are moved to the GPU when use_cuda is set, so the models must
# live there too.
if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

In [235]:
# Training hyper-parameters, optimisers and loss
epochs = 10
learning_rate = 0.001
hidden_size = 256
teacher_forcing_ratio = 0.5
print_every = 20
input_size = input_lang.n_words  # vocabulary size of the input language
output_size = output_lang.n_words  # vocabulary size of the output language
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Sentences carry no SOS token of their own.
# Each epoch makes one pass over the first 100 pairs.
for epoch in range(epochs):
    losses = []
    for pi, sentence_pair in enumerate(pairs[0:100]):
        train_pair = variablesFromPair(sentence_pair)
        input_var, target_var = train_pair
        loss, predicted_sentence = train(input_var, target_var, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        losses.append(loss)

        # Report the running mean, the latest loss and a sample prediction
        # every `print_every` pairs.
        if pi % print_every != 0:
            continue
        print(str(pi) + " Mean - " + str(np.mean(losses)) + ' | Value - ' + str(losses[-1]))
        print (sentence_pair[1])
        print (' '.join(predicted_sentence))
        print('')



0 Mean - 0.801884174347 | Value - 0.801884174347
i m .
i m . EOS

20 Mean - 1.32915485019 | Value - 1.09625177383
i m calm .
i m . . EOS

40 Mean - 1.21077225034 | Value - 1.3781244278
i m safe .
i m fat . EOS

60 Mean - 1.41043026799 | Value - 2.12527751923
he s rich .
he m . . EOS

80 Mean - 1.47973383523 | Value - 2.29331073761
i m armed .
i am lazy . EOS

0 Mean - 1.26990938187 | Value - 1.26990938187
i m .
i m lazy EOS

20 Mean - 1.03202776001 | Value - 0.79400844574
i m calm .
i m lazy . EOS

40 Mean - 0.913121565377 | Value - 0.922255420685
i m safe .
i m lazy . EOS

60 Mean - 0.957338267988 | Value - 0.948811721802
he s rich .
he s wet . EOS

80 Mean - 1.06532614476 | Value - 1.54863367081
i m armed .
i am lazy . EOS

0 Mean - 1.10468626022 | Value - 1.10468626022
i m .
i m lazy EOS

20 Mean - 0.965221554892 | Value - 0.813848876953
i m calm .
i m lazy . EOS

40 Mean - 0.880181848712 | Value - 0.95529384613
i m safe .
i m lazy . EOS

60 Mean - 0.87746598004 | Value - 0.63667593

In [236]:
def seq2seq_inference(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode `sentence` with the given encoder and decoder.

    Args:
        encoder, decoder: the models to run.
        sentence: source sentence, tokenised on single spaces with
            `input_lang`'s vocabulary.
        max_length: size of the encoder-output buffer and the maximum number
            of tokens to generate.

    Returns:
        The list of predicted words. It ends with "EOS" unless decoding was
        cut off after `max_length` tokens.
    """
    input_var = variableFromSentence(input_lang, sentence)
    input_length = input_var.size()[0]

    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    encoder_hidden = encoder.initHidden()
    for ei in range(input_length):
        enc_output, encoder_hidden = encoder(input_var[ei], encoder_hidden)
        # Collected but not passed to the plain Decoder used here.
        encoder_outputs[ei] = enc_output[0][0]

    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoder_hidden = encoder_hidden
    output_sentence = []

    # Bounded loop: a model that never predicts EOS can no longer hang the cell.
    for _ in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # The top index of the raw scores equals the top index after softmax.
        topi = decoder_output[0][0].data.topk(1)[1]
        # int() gives a plain Python key even when indexing yields a tensor.
        ni = int(topi[0])
        output_sentence.append(output_lang.index2word[ni])
        if ni == EOS_token:
            break
        decoder_input = Variable(torch.LongTensor([ni]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return output_sentence

# Score greedy translations of the first 1000 pairs with sentence-level BLEU.
all_bleu = []
for sentence_pair in pairs[0:1000]:
    input_sentence = sentence_pair[0]
    output_sentence = seq2seq_inference(encoder, decoder, input_sentence)

    # The reference gets an explicit ' EOS' suffix before scoring.
    orig_sentence = sentence_pair[1] + ' EOS'
    out_sentence = ' '.join(output_sentence)
    bleu_score = compute_bleu(orig_sentence, out_sentence)
    all_bleu.append(bleu_score)

    # The <SOS> prefix is added after scoring and only affects what is printed.
    orig_sentence = '<SOS> ' + orig_sentence
    out_sentence = '<SOS> ' + out_sentence

    # Print roughly 1% of the examples.
    show_example = random.random() < 0.01
    if show_example:
        print(orig_sentence)
        print(out_sentence)
        print(bleu_score)
        print('\n')

print('Mean - ' + str(np.mean(all_bleu)))



<SOS> i am lazy . EOS
<SOS> i m lazy . EOS
0.604275079471


<SOS> i m early . EOS
<SOS> i m lazy . EOS
0.795270728767


<SOS> i m ready ! EOS
<SOS> i m lazy . EOS
0.622332977288


<SOS> i m finicky . EOS
<SOS> i m lazy . EOS
0.795270728767


<SOS> you re free . EOS
<SOS> i m lazy . EOS
0.56234132519


<SOS> you re good . EOS
<SOS> i m sure . EOS
0.56234132519


<SOS> he is a poet . EOS
<SOS> he s rich . EOS
0.509523147161


<SOS> i m outraged . EOS
<SOS> i m fussy . EOS
0.795270728767


<SOS> we re pooped . EOS
<SOS> i m sure . EOS
0.56234132519


<SOS> he s a senior . EOS
<SOS> i m sure . EOS
0.46040613666


Mean - 0.674515222885
