In [2]:
"""
Data available from http://www.manythings.org/anki/
under eng-spa.zip link.
Inspired by:
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
Before you can use this script you need to install a unidecode package with:
pip3 install unidecode
"""

import random
import unidecode

SpipOS_token = 0
EOS_token = 1
UW_token = 2

MAX_SENTENCE_LENGTH = 10

class Vocab:
    """
    Keeping track of language vocabulary.
    """
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2:"UW"}
        self.n_words = 3  # Count SOS and EOS

    def add_sentence(self, sentence):
        """
        Add each word from sentence.
        """
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        """
        Add a new word to a vocabulary,
        update all the counters and indexes.
        """
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

def qscleaner(w):
    """
    Just remove ? character from a word.
    """
    w=w.replace('?','')
    return w

def isquestion(s, max_length=MAX_SENTENCE_LENGTH):
    """
    Return True if sentence is valid according
    to our criteria.
    Here we're interested in questions that are
    no longer than mex_legth.
    """
    return len(s.split(' ')) < max_length and len(s.split(' ')) < max_length and s.find('?') != -1

def clean(s, extra_cleaner=qscleaner):
    """
    Clean up the whole sentence:
    Include only words, make
    them lower case and
    remove any non-english characters.
    """
    include_words=[]
    for word in s.split():
        word=word.strip().lower()
        word=unidecode.unidecode(word)
        word=extra_cleaner(word)
        if word.isdigit():
            continue
        include_words.append(word)
    return ' '.join(include_words)

def process_file(ilang, olang, limit, sfilter=isquestion):
    """
    Read a language file, clean up sentences
    and based on them create a Vocab object for
    each language, return only limit sentences.
    """
    print("Reading sentences...")
    sentences = open('data/%s.txt' % olang, encoding='utf-8').read().splitlines()
    pairs = [[clean(w) for w in s.split('\t')] for s in sentences if sfilter(s)]
    pairs = [list(p) for p in pairs]
    input_lang = Vocab(ilang)
    output_lang = Vocab(olang)
    return pairs, input_lang, output_lang

def get_data(ilang, olang, limit=100, log=print):
    """
    Return a limit number of sentences of both ilang and olang.
    Sentences has to match criteria defined by sfilter and
    are processed by wclean.
    ilang - input language that we want to translate from
    olang - output language that we want to tranlate to
    limit - a number of sentences to process for each language
            choose small number if you don't have GPU processing power
    """
    pairs, input_lang, output_lang = process_file(ilang, olang, limit)
    log("Got %d sentences in both langs" % len(pairs))
    pairs = [ pair for pair in pairs if pair][:limit]
    log("Reduced to %d sentences" % len(pairs))
    log("Counting words...")
    for pair in pairs:
        input_lang.add_sentence(pair[0])
        output_lang.add_sentence(pair[1])
    log("Counted words:")
    log(input_lang.name, input_lang.n_words)
    log(output_lang.name, output_lang.n_words)
    log('Random data sample:')
    log(random.choice(pairs))
    return pairs,input_lang, output_lang

In [12]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
#from prep import get_data, MAX_SENTENCE_LENGTH, EOS_token, SOS_token, UW_token
import random
import os

In [4]:
hidden_size=256
pairs, input_lang, output_lang=get_data('en','spa', limit=100)

Reading sentences...
Got 7926 sentences in both langs
Reduced to 100 sentences
Counting words...
Counted words:
en 82
spa 118
Random data sample:
['am i right', 'tengo razon']


In [8]:
pairs[99]

['was it fun', 'era divertido']

In [34]:
pairs[5]

['why me', 'por que yo']

In [13]:
class EncoderGRU(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderGRU, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderGRU(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderGRU, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
# Building two GRUs, encoder and decoder.
encoder = EncoderGRU(input_lang.n_words, hidden_size).to(device)
decoder = DecoderGRU(hidden_size, output_lang.n_words).to(device)

In [17]:
def sentence_to_idx(lang, sentence):
    """
    Encode sentences to indexes in our Vocabulary object
    for a given langauge.
    """
    out=[]
    for word in sentence.split(' '):
        if word in lang.word2index:
            out.append(lang.word2index[word])
        else:
            out.append(UW_token)
    return out

def sentence_to_tensor(lang, sentence):
    """
    Turn a sentence into a tensor.
    Add EOS_token at the end of the new tensor
    to mark end of the sentence.
    """
    indexes = sentence_to_idx(lang, sentence)
    indexes.append(EOS_token)
    #print('Sentence->Word indexes', sentence, indexes)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def pair_to_tensor(il, ol, pair):
    """
    Turn a pair of sentences into a pair of tensors.
    """
    input_tensor = sentence_to_tensor(il, pair[0])
    output_tensor = sentence_to_tensor(ol, pair[1])
    return (input_tensor, output_tensor)

def train(input_tensor, output_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_func, max_length=MAX_SENTENCE_LENGTH):
    """
    Encode input_tensor and feed the output to decode the output_tensor.
    """
    encoder_hidden = encoder.initHidden()

    input_length = input_tensor.size(0)
    # Forward pass, process input tensor vi encoder:
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    # Prepare input values for decoder, starting with start of the sentence character.
    decoder_input = torch.tensor([[SOS_token]], device=device)

    # Make encoder output decoder's input.
    decoder_hidden = encoder_hidden

    output_length = output_tensor.size(0)
    loss = 0
    # Now processing output tensor via decoder:
    for di in range(output_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # Return the best guess for current word.
        _, topi = decoder_output.topk(1)
        # Prepare next input from cuurent output
        decoder_input = topi.squeeze().detach()

        # Calculate loss.
        loss += loss_func(decoder_output, output_tensor[di])

        # Stop if it's the end of the sentence.
        if decoder_input.item() == EOS_token:
            break
    # Clean up the "gradients" before
    # propagating changes to our network.
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Accumulate changes.
    loss.backward()

    # Propagate changes.
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / output_length

def train_all(pairs, encoder, decoder, il, ol, s_epochs, print_every=10):
    """
    Train on a s_epochs random pair of sentences using encoder
    and decoder.
    print_every - show stats on print_every sentence
    """
    loss_total = 0

    # Initialize optimizers for both networks.
    encoder_optimizer = optim.Adam(encoder.parameters())
    decoder_optimizer = optim.Adam(decoder.parameters())

    # Get a n_iters random sentences for training.
    training_pairs = [pair_to_tensor(il, ol, random.choice(pairs)) for i in range(s_epochs)]
    loss_func = nn.CrossEntropyLoss()

    # Feed each pair to both of our networks
    for se in range(s_epochs):
        # Get the next pair of sentences to train
        training_pair = training_pairs[se]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        # Do the actual training.
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_func)
        loss_total += loss

        if se % print_every == 0:
            loss_avg = loss_total / print_every
            loss_total = 0
            print('%d %d%% %.4f' % (se, se / s_epochs * 100, loss_avg))

In [29]:
def test(encoder, decoder, sentence, input_lang, output_lang, max_length=MAX_SENTENCE_LENGTH):
    """
    Generate translation of a sentence using encoder and decoder.
    """
    # This is not training, so we can
    # save up some memory.
    with torch.no_grad():
        # Prepare sentence for translation.
        input_tensor = sentence_to_tensor(input_lang, sentence)
        input_length = input_tensor.size()[0]

        # This is similar to training, but without running
        # .zero_grad(), .backward(), .set()
        encoder_hidden = encoder.initHidden()

        # First encode our sentence using the first network.
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],encoder_hidden)

        # Prepare data for the second network based on the output
        # of the first one.
        decoder_input = torch.tensor([[SOS_token]], device=device)

        decoder_hidden = encoder_hidden

        # Get the translation using the second network
        # decode it's output.
        decoded_words = []
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            # Get the best guess.
            topv, topi = decoder_output.data.topk(1)
            # We're done once we get at the end of the sentence.
            if topi.item() == EOS_token:
                break
            else:
                # otherwise just turn the encoded translation (as indexes)
                # back to their respective words.
                decoded_words.append(output_lang.index2word[topi.item()])
            # Preparing the next input for decoder.
            decoder_input = topi.squeeze().detach()

        return decoded_words

def test_random(encoder, decoder,ilang,olang, n=10):
    """
    Randomly get a pair of sentences and compare them
    with our translation.
    """
    for i in range(n):
        pair = random.choice(pairs)
        print('Question in %s: %s' % (ilang.name, pair[0].ljust(20)))
        print('Question in %s: %s' % (olang.name, pair[1].ljust(20)))
        output_words = test(encoder, decoder, pair[0], ilang, olang)
        output_sentence = ' '.join(output_words).strip()
        tick='V' if output_sentence == pair[1] else 'X'
        print('Our guess:%s %s' % (output_sentence.ljust(20), tick))
        print('')

In [19]:
SOS_token = 0
EOS_token = 1
UW_token = 2

MAX_SENTENCE_LENGTH = 10

In [22]:
train_all(pairs,encoder,decoder,input_lang,output_lang,1400,print_every=100)

0 0% 0.0002
100 7% 0.4286
200 14% 0.3810
300 21% 0.3278
400 28% 0.4159
500 35% 0.3132
600 42% 0.3503
700 50% 0.4438
800 57% 0.4513
900 64% 0.3618
1000 71% 0.5389
1100 78% 0.3212
1200 85% 0.3883
1300 92% 0.2942


In [32]:
print('Saving both models...')
torch.save(encoder.state_dict(), 'encoder.ckpt')
torch.save(decoder.state_dict(), 'decoder.ckpt')
print('Testing with random data...')
test_random(encoder, decoder, input_lang, output_lang)

Saving both models...
Testing with random data...
Question in en: is it here          
Question in spa: aqui                
Our guess:es                   X

Question in en: are you ok          
Question in spa: estas bien          
Our guess:estas bien           V

Question in en: who is he           
Question in spa: quien es el         
Our guess:quien es el          V

Question in en: really              
Question in spa: la verdad           
Our guess:en serio             X

Question in en: hi, guys.           
Question in spa: hola, que hay       
Our guess:que pasa, troncos    X

Question in en: who cares           
Question in spa: a quien le importa  
Our guess:a quien le importa   V

Question in en: what for            
Question in spa: para que            
Our guess:para que             V

Question in en: how is tom          
Question in spa: como esta tom       
Our guess:como esta tom        V

Question in en: who fell            
Question in spa: quien se callo      
Our

In [None]:
# Building two GRUs, encoder and decoder.
    encoder = EncoderGRU(input_lang.n_words, hidden_size).to(device)
    decoder = DecoderGRU(hidden_size, output_lang.n_words).to(device)
    print('Training models...')
    train_all(pairs,encoder,decoder,input_lang,output_lang,900,print_every=100)
    print('Saving both models...')
    torch.save(encoder.state_dict(), 'encoder.ckpt')
    torch.save(decoder.state_dict(), 'decoder.ckpt')
    print('Testing with random data...')
    test_random(encoder, decoder, input_lang, output_lang)

In [None]:
def pair_to_tensor(il, ol, pair):
    """
    Turn a pair of sentences into a pair of tensors.
    """
    input_tensor = sentence_to_tensor(il, pair[0])
    output_tensor = sentence_to_tensor(ol, pair[1])
    return (input_tensor, output_tensor)

In [37]:
pair_to_tensor(input_lang, output_lang, pairs[5])[1]

tensor([[11],
        [12],
        [13],
        [ 1]])

In [38]:
pair_to_tensor(input_lang, output_lang, pairs[5])[0]

tensor([[7],
        [8],
        [1]])

In [39]:
input_lang.n_words

82

In [41]:
encoder.initHidden().size()

torch.Size([1, 1, 256])

In [47]:
(146**2 + 150**2)**0.5

209.32271735289507