In [50]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Data pre-processing

In [51]:
# Vocabulary indices reserved for the start-of-sentence and end-of-sentence markers.
SOS_token = 0
EOS_token = 1

class Language:
    """Vocabulary of one language: word <-> index tables plus word frequencies."""

    def __init__(self, name):
        """Create an empty vocabulary called `name` holding only the two markers."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Indices 0 and 1 are already taken by the special markers.
        self.index2word = {0: 'SOS', 1: 'EOS'}
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every single-space-separated token of `sentence`."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count `word`; on first sight also assign it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [52]:
def unicodeToAscii(s):
    """Return `s` with combining marks (accents, diaeresis, cedilla, ...) removed.

    The string is NFD-normalized so that accented letters are decomposed into
    a base letter followed by nonspacing marks (Unicode category 'Mn'); those
    marks are then dropped and every other character is kept as is.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def normalizeString(s):
    """Normalize a sentence into lowercase, space-separated ASCII-letter tokens.

    Steps: lowercase and trim, strip accents with `unicodeToAscii`, put a space
    in front of each '.', '!' or '?' so punctuation becomes its own token, and
    collapse every run of other non-letter characters into a single space.

    Returns:
        The normalized string without leading or trailing spaces.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r'([.!?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    # The substitutions above can leave a space at either end (e.g. when the
    # text starts or ends with a quote or a digit). Without this strip,
    # `split(' ')` would yield an empty-string "word".
    return s.strip()

In [53]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated parallel corpus ``data/{lang1}-{lang2}.txt``.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: when True, swap each pair so the translation direction is
            lang2 -> lang1 instead of lang1 -> lang2.

    Returns:
        (input_lang, output_lang, pairs): two empty `Language` vocabularies
        and the list of normalized sentence pairs, input sentence first.

    Raises:
        OSError: if the data file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(f'data/{lang1}-{lang2}.txt', encoding='utf-8') as file:
        lines = file.read().strip().split('\n')  # split into lines.

    # Only the first two tab-separated columns form the pair; any extra column
    # is ignored so it cannot land in the source slot when pairs are reversed.
    pairs = [[normalizeString(s) for s in l.split('\t')[:2]] for l in lines]

    if reverse:  # translate lang2 -> lang1.
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Language(lang2)
        output_lang = Language(lang1)
    else:  # translate lang1 -> lang2.
        input_lang = Language(lang1)
        output_lang = Language(lang2)

    return input_lang, output_lang, pairs

In [54]:
# Sentences must have fewer than MAX_LENGTH space-separated tokens to be kept
# (see filterPair); it is also the default attention width of AttnDecoderRNN.
MAX_LENGTH = 10

# English sentence openers accepted by filterPair. The apostrophe-free forms
# ('i m ', 'he s ', ...) match text produced by normalizeString, which replaces
# apostrophes with a space.
eng_prefixes = (
    'i am ', 'i m ',
    'he is ', 'he s ',
    'she is ', 'she s ',
    'you are ', 'you re ',
    'we are ', 'we re ',
    'they are ', 'they re '
)

def filterPair(p):
    """Tell whether pair `p` passes the length and prefix filters.

    A pair is accepted when both sentences have fewer than MAX_LENGTH
    space-separated tokens and the second sentence begins with one of
    `eng_prefixes`.
    """
    first_word_count = len(p[0].split(' '))
    second_word_count = len(p[1].split(' '))
    has_known_prefix = p[1].startswith(eng_prefixes)  # assumes p[1] is the English side.

    first_is_short = first_word_count < MAX_LENGTH
    second_is_short = second_word_count < MAX_LENGTH
    return first_is_short and second_is_short and has_known_prefix
    
def filterPairs(pairs):
    """Return the pairs accepted by `filterPair`, in their original order."""
    return list(filter(filterPair, pairs))

In [55]:
def prepareData(lang1, lang2, reverse=False):
    """Load a corpus, filter its pairs and build both vocabularies.

    Arguments are forwarded to `readLangs`. Only the pairs kept by
    `filterPairs` contribute words to the vocabularies.

    Returns:
        (input_lang, output_lang, pairs) with the vocabularies populated and
        `pairs` reduced to the filtered sentence pairs.
    """
    source_vocab, target_vocab, all_pairs = readLangs(lang1, lang2, reverse)
    kept_pairs = filterPairs(all_pairs)

    for entry in kept_pairs:
        source_vocab.addSentence(entry[0])
        target_vocab.addSentence(entry[1])

    return source_vocab, target_vocab, kept_pairs

# Reads data/eng-fra.txt; reverse=True swaps every pair so French is the input
# side and English the output side.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

### RNN encoder-decoder model

In [56]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per forward call."""

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: number of rows of the embedding table (vocabulary size).
            hidden_size: width of both the embeddings and the GRU state.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU input width, so the embedded token
        # is fed to the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed one token and advance the GRU by a single step.

        Returns:
            (output, hidden) exactly as produced by the GRU.
        """
        token_vector = self.embedding(input)
        # Reshape to (seq_len=1, batch=1, features) as expected by the GRU.
        gru_input = token_vector.view(1, 1, -1)
        step_output, next_hidden = self.gru(gru_input, hidden)
        return step_output, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

In [57]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed-size window of encoder outputs."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """
        Args:
            hidden_size: width of the embeddings and of the GRU state.
            output_size: number of output classes (target vocabulary size).
            dropout_p: dropout probability applied to the embedded input.
            max_length: number of attention weights produced per step.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        # Scores one weight per encoder position from [input_emb, hidden].
        self.alignment_model = nn.Linear(hidden_size * 2, max_length)
        # Projects [input_emb, attn_context] back to the hidden width.
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode a single step.

        `encoder_outputs` is expected to have one row per attention position
        so that it can be multiplied with the attention weights.

        Returns:
            (output, hidden, attention_weights): log-probabilities over the
            output classes, the new GRU state and the attention weights used.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention weights are computed from the current input and state only.
        alignment_input = torch.cat((embedded[0], hidden[0]), 1)
        attention_weights = F.softmax(self.alignment_model(alignment_input), dim=1)

        # Weighted sum of the encoder outputs.
        context_vector = torch.bmm(attention_weights.unsqueeze(0),
                                   encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], context_vector[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))

        output, hidden = self.gru(gru_input, hidden)
        output = F.log_softmax(self.output_layer(output[0]), dim=1)
        return output, hidden, attention_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros((1, 1, self.hidden_size), device=device)

### Training

In [58]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated word of `sentence` to its index in `lang`.

    Raises:
        KeyError: if a word is missing from `lang.word2index`.
    """
    words = sentence.split(' ')
    vocabulary = lang.word2index
    return [vocabulary[token] for token in words]

def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a (n_tokens + 1, 1) long tensor ending with EOS_token."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    # One token per row.
    return flat.view(-1, 1)

def tensorsFromPair(pair):
    """Encode a sentence pair with the global input and output vocabularies.

    Returns:
        (input_tensor, target_tensor) built from pair[0] and pair[1].
    """
    source = tensorFromSentence(input_lang, pair[0])
    target = tensorFromSentence(output_lang, pair[1])
    return source, target

In [59]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, max_length=MAX_LENGTH):
    """Run one optimization step on a single sentence pair.

    The decoder is fed its own greedy prediction at every step, and decoding
    stops early as soon as it predicts EOS_token.

    Returns:
        The summed negative log-likelihood divided by the full target length
        (even when decoding stopped early).
    """
    n_source_tokens = input_tensor.size(0)
    n_target_tokens = target_tensor.size(0)
    criterion = nn.NLLLoss()

    # Encoder: rows beyond the sentence length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    encoder_hidden = encoder.initHidden()
    for position in range(n_source_tokens):
        encoder_output, encoder_hidden = encoder(input_tensor[position], encoder_hidden)
        encoder_outputs[position] = encoder_output[0, 0]

    # Decoder: starts from SOS with the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    total_loss = 0
    for step in range(n_target_tokens):
        log_probs, decoder_hidden, _attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
        total_loss += criterion(log_probs, target_tensor[step])

        # Feed the most likely token back in, detached from the graph.
        _best_value, best_index = log_probs.topk(1)
        decoder_input = best_index.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    # Optimization: clear old gradients, backpropagate, update both models.
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    total_loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return total_loss.item() / n_target_tokens

In [61]:
def trainIters(encoder, decoder, n_iters, learning_rate=0.01, print_every=1000, plot_every=100):
    """Train both models with SGD on `n_iters` randomly drawn sentence pairs.

    Args:
        encoder: encoder model to optimize.
        decoder: decoder model to optimize.
        n_iters: number of single-pair training steps.
        learning_rate: SGD learning rate used for both models.
        print_every: print the average loss every this many steps
            (0 or None disables printing).
        plot_every: record one averaged loss point every this many steps
            (0 or None disables recording).

    The recorded loss points are passed to `showPlot` when training ends.
    """
    plot_losses = []
    print_loss_total = 0.0  # reset after every report.
    plot_loss_total = 0.0   # reset after every recorded point.

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]

    for iteration in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[iteration - 1]

        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer)
        # Accumulate the loss so that progress can be reported and plotted.
        print_loss_total += loss
        plot_loss_total += loss

        if print_every and iteration % print_every == 0:
            average = print_loss_total / print_every
            print(f'{iteration} ({iteration / n_iters:.0%}) loss: {average:.4f}')
            print_loss_total = 0.0

        if plot_every and iteration % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0.0

    showPlot(plot_losses)

In [62]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Draw `points` as a line chart with y-axis ticks at multiples of 0.2."""
    # plt.subplots() creates the figure itself; an extra plt.figure() call
    # beforehand would only leave an empty figure behind.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [63]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` with the given encoder and decoder.

    Both models are switched to evaluation mode for the duration of the call
    (so dropout is disabled and the output is deterministic) and are restored
    to their previous mode afterwards.

    Args:
        encoder: encoder model.
        decoder: attention decoder model.
        sentence: normalized input sentence; every word must be in `input_lang`.
        max_length: maximum number of decoding steps and attention positions.

    Returns:
        (decoded_words, attentions): the predicted words ('<EOS>' is appended
        when the decoder stops by itself) and one row of attention weights per
        decoding step.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            # Rows beyond the sentence length stay zero.
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention
                topv, topi = decoder_output.topk(1)
                if topi.item() == EOS_token:
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])

                # The greedy prediction becomes the next decoder input.
                decoder_input = topi.squeeze().detach()

            # Keep only the attention rows of the steps actually decoded.
            return decoded_words, decoder_attentions[:di + 1]
    finally:
        # Restore the caller's train/eval mode even if decoding failed.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [64]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print `n` randomly chosen pairs next to the model's translation.

    For each pair, '>' marks the input sentence, '=' the reference and '<'
    the words returned by `evaluate`.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        predicted_words, _attentions = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(predicted_words))
        print('')

In [65]:
# Width of the embeddings and of the GRU hidden state in both models.
hidden_size = 256
# Vocabulary sizes come from the languages built by prepareData.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# NOTE(review): `print_every` is passed by keyword, so trainIters must accept
# that parameter — confirm the definition in use matches this call.
trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

KeyboardInterrupt: 

In [66]:
# Show 10 random pairs (the default n): '>' input, '=' reference, '<' model output.
evaluateRandomly(encoder1, attn_decoder1)

> je n en suis pas trop convaincu .
= i m not too convinced .
< i m not . . <EOS>

> je ne suis pas un genie .
= i m not a genius .
< i m not to . <EOS>

> il a honte de son attitude .
= he is ashamed of his behavior .
< i m not to . <EOS>

> je recherche une maison .
= i m looking for a house .
< i m a . <EOS>

> je suis contre ce projet de loi .
= i m against the bill .
< i m not to . <EOS>

> je vous libere .
= i m letting you go .
< i re a . <EOS>

> je suis creve .
= i m pooped .
< i re a . <EOS>

> tu as le meme age que ma copine .
= you re the same age as my girlfriend .
< i m a . <EOS>

> je suis trop gros .
= i m too fat .
< i m a . <EOS>

> il est fort malade .
= he s very ill .
< i m a . <EOS>

