In [2]:
%pylab inline

import zipfile
import unicodedata
import re

import torch
from torch import nn
from torch import optim

from tqdm import tqdm_notebook as tqdm

Populating the interactive namespace from numpy and matplotlib


In [3]:
figsize(8, 8)

## Load text examples

In [4]:
z = zipfile.ZipFile("../../../datasets/por-eng.zip")
txt = z.read(z.filelist[1]).decode()
del z
txt.splitlines()[-1].split("\t")

["We recommend adding sentences and translations in your strongest language. If you are interested primarily in having your sentences corrected, you should try a site like Lang-8.com, where that's the focus.",
 'Recomendamos acrescentar frases e traduções na língua em que você é mais forte. Se está interessado principalmente em ter suas sentenças corrigidas, você deve tentar um site como Lang-8.com, onde esse é o foco.']

## filter sentences

len(words) <= 10

In [5]:
# source: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [6]:
eng, por = [],[]

limit = 10
for line in tqdm(txt.splitlines()):
    e, p = line.split("\t")
    if len(e.split()) < limit or len(p.split()) < limit:
        eng.append(normalizeString(e))
        por.append(normalizeString(p))

HBox(children=(IntProgress(value=0, max=135671), HTML(value='')))




In [7]:
len(eng), len(por)

(129489, 129489)

In [8]:
for i, j in zip(eng[25:35], por[25:35]):
    print(f"{i} -> {j}")

hello ! -> oi .
hello ! -> alo .
hello ! -> ola !
i ran . -> eu corri .
i see . -> estou vendo .
i try . -> eu tento .
i try . -> tento .
i won ! -> ganhei !
i won . -> eu venci .
oh no ! -> ah nao !


In [9]:
eng_vocab, por_vocab = set(), set()
eng_word2id, por_word2id = {}, {}

for i in range(len(eng)):
    for word_eng in eng[i]:
        eng_vocab.add(word_eng)
    for word_por in por[i]:
        por_vocab.add(word_por)

eng_vocab, por_vocab = np.array(list(eng_vocab)), np.array(list(por_vocab))
eng_word2id = {word: id_ for id_, word in enumerate(eng_vocab)}
por_word2id = {word: id_ for id_, word in enumerate(por_vocab)}

In [10]:
len(eng_vocab), len(por_vocab)

(30, 30)

In [11]:
max_voc = 0

for i in por_vocab:
    for j in i:
        x = ord(j)
        if x > max_voc:
            max_voc = x
max_voc, chr(max_voc)

(122, 'z')

In [181]:
from torch.utils.data import Dataset, DataLoader

In [None]:
class TranslateDataset(Dataset):
    def __init__(self, lang1, lang2):
        self._len = lang1
        self.x = []
        self.y = []
        pbar = tqdm(unit=" frases", total=self._len)
        for l1, l2 in zip(lang1, lang2):
            pass

## encoder, decoder, att_decoder

In [12]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(Encoder, self).__init__()
        
        self.emb = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        
    def forward(self, X, hidden):
        seq_len = len(X)
        emb = self.emb(X).view(seq_len, 1, -1)
        out, hidden = self.gru(emb, hidden)
        return out, hidden

In [13]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, out_size, n_layers=1):
        super(Decoder, self).__init__()
        
        self.emb = nn.Embedding(out_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, out_size)
        
    def forward(self, X, hidden):
        out = self.emb(X).view(1, 1, -1)
        out, hidden = self.gru(out, hidden)
        out = self.out(out[0])
        return out, hidden

In [14]:
class AttnDecoder(nn.Module):
    def __init__(self, hidden_size, out_size, n_layers=1, dropout_p=0.1):
        super(AttnDecoder, self).__init__()
        
        self.emb = nn.Embedding(out_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size,
                          n_layers, dropout=dropout_p)
        
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size*2, out_size)
        
    def forward(self, X, last_hidden, encoder_hidden):
        out = self.emb(X).view(1, 1, -1)
        out, hidden = self.gru(out, last_hidden)
        
        attn_weights = self.get_attn_weights(
            out.squeeze(0), encoder_hidden
        )
        context = torch.bmm(attn_weights, encoder_hidden.transpose(0, 1))
        
        out = out.squeeze(0)
        context = context.squeeze(1)
        
        out = self.out(
            torch.cat((out, context), 1),
        )
        
        return out, hidden, attn_weights
    
    def get_attn_weights(self, hidden, encoder_hidden):
        seq_len = len(encoder_hidden)
        
        attn_scores = torch.zeros(seq_len)
        
        for i in range(seq_len):
            score = self.attn(encoder_hidden[i])
            attn_scores[i] = torch.dot(
                hidden.view(-1),
                score.view(-1)
            )
        try:
            return torch.softmax(attn_scores, dim=1).view(1, 1, -1)
        except:
            print(attn_scores)

In [15]:
def init_hidden(hidden_size, n_layers=1):
    return torch.zeros(n_layers, 1, hidden_size)

In [16]:
SOS_token = chr(0)
EOS_token = 1

In [17]:
N_EPOCH = 100
N_CHARS = 122  # max_voc
HIDDEN_SIZE = N_CHARS

In [103]:
enc = Encoder(N_CHARS, N_CHARS)
dec = Decoder(N_CHARS, N_CHARS)
hidden = init_hidden(N_CHARS)

word_input = torch.LongTensor([ord(i) for i in "Python"])
enc_out, hidden_enc = enc(word_input, hidden)
word_target = torch.LongTensor([ord(i) for i in "Python"])

for char in range(len(word_target)):
    dec_out, hidden = dec(word_target[char], hidden_enc)
    print(chr(dec_out.argmax()), end=".")

n.P.b.0.y.#.

In [109]:
por[4567], eng[4567]

('voces me assustam .', 'you scare me .')

In [159]:
a = tqdm(unit=" epoch", total=10)
a.write("teste")
a.update()
a.write("iuyui")

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

teste
iuyui


In [180]:
a.update()