In [1]:
import os
import tarfile

import nltk
import requests
from tqdm import tqdm

from model import Encoder, Decoder
from vocab import Vocab, VocabFull


In [2]:
# Collect our data: the Europarl v7 archive for the chosen language pair.
language = 'es'
tarfilename = "{}-en.tgz".format(language)
# This must be the *path* to the archive. Storing the result of
# os.path.exists() here would hand a bool to the check in maybe_download(),
# and os.path.exists(<bool>) tests file descriptor 0/1 instead of the file.
tarfilepath = os.path.join("data/", tarfilename)


def maybe_download():
    """Download and extract the corpus archive unless it is already on disk.

    When ``tarfilepath`` does not exist, the archive is streamed from
    statmt.org into ``data/`` and then extracted into ``data/``. When it
    already exists nothing is done (in particular, it is not re-extracted).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if os.path.exists(tarfilepath):
        return

    print('downloading {}...'.format(tarfilename))
    url = "http://www.statmt.org/europarl/v7/{}".format(tarfilename)
    os.makedirs('data/', exist_ok=True)

    # Stream into a temporary name and rename only on success, so that an
    # interrupted download is not mistaken for a complete archive next run.
    partial_path = tarfilepath + '.part'
    with requests.get(url, stream=True) as r:
        # Fail loudly instead of saving an HTML error page as the archive.
        r.raise_for_status()
        with open(partial_path, 'wb') as fd:
            # iter_content() defaults to 1-byte chunks; use a sensible size.
            for content in tqdm(r.iter_content(chunk_size=64 * 1024)):
                fd.write(content)
    os.replace(partial_path, tarfilepath)
    print('download complete! Extracting...')

    with tarfile.open(tarfilepath) as tar:
        # NOTE(security): the archive is fetched over plain HTTP, so treat it
        # as untrusted. Use the 'data' extraction filter where the running
        # Python provides it; older interpreters fall back to the unfiltered
        # behaviour the notebook had before.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path='data/', filter='data')
        else:
            tar.extractall(path='data/')
    print('done!')


maybe_download()

In [3]:
# Paths of the English and Spanish corpus files that build_full_vocabs() and
# corpora2vectors() open for reading.
# NOTE(review): the `_trunc` suffix suggests truncated copies of the extracted
# Europarl files; nothing in the cells above creates them -- confirm their origin.
englishfile = 'data/europarl-v7.es-en_trunc.en'
spanishfile = 'data/europarl-v7.es-en_trunc.es'

def build_full_vocabs():
    """Build the English and Spanish vocabularies from the corpus files.

    Opens ``englishfile`` and ``spanishfile``, creates one ``Vocab`` per
    language and feeds each open file to ``add_corpus``. A ``VocabFull``
    raised while adding is swallowed and whatever that vocabulary has
    collected so far is kept (presumably it signals that the vocabulary hit
    its size limit -- confirm in ``vocab``). Any other exception propagates
    so real failures are not hidden. ``calcify()`` is called on both
    vocabularies before they are returned.

    Returns:
        tuple: ``(en_lang, es_lang)``, the English and Spanish ``Vocab``.
    """
    with open(englishfile) as en_fd, open(spanishfile) as es_fd:
        en_lang = Vocab(name='english')
        es_lang = Vocab(name='spanish')
        # Both languages get the same treatment: only VocabFull is expected.
        for lang, corpus_fd in ((en_lang, en_fd), (es_lang, es_fd)):
            try:
                lang.add_corpus(corpus_fd)
            except VocabFull:
                pass
    en_lang.calcify()
    es_lang.calcify()
    return en_lang, es_lang

# Always-true guard, presumably kept as a manual switch for skipping this step
# when re-running the notebook. en_lang / es_lang become module-level names
# that corpora2vectors() reads.
if True:
    en_lang, es_lang = build_full_vocabs()

In [4]:
def corpora2vectors():
    """Convert every line of both corpus files into a tensor.

    Reads ``englishfile`` and ``spanishfile`` line by line. Each line is
    tokenized with the matching module-level vocabulary's ``word_tokenize``
    and the tokens are passed to that vocabulary's ``tokens2tensor``.

    Returns:
        tuple: ``(english, spanish)`` -- two lists holding one
        ``tokens2tensor`` result per line of the respective file.
    """
    def _vectorize(lang, lines):
        # One entry per line, in file order.
        vectors = []
        for sentence in lines:
            tokens = lang.word_tokenize(sentence)
            vectors.append(lang.tokens2tensor(tokens))
        return vectors

    with open(englishfile) as en_fd, open(spanishfile) as es_fd:
        english = _vectorize(en_lang, en_fd)
        spanish = _vectorize(es_lang, es_fd)
    return english, spanish

# Always-true guard, presumably kept as a manual switch for skipping this step
# when re-running the notebook. X receives the English list and y the Spanish
# list, in the order corpora2vectors() returns them.
if True:
    X, y = corpora2vectors()

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimization step on a single (input, target) pair.

    The encoder is stepped over ``input_tensor`` one position at a time and
    its per-step outputs are collected. The decoder then runs for at most
    ``target_tensor.size(0)`` steps, always feeding its own top-1 prediction
    back in as the next input (no teacher forcing), and stops early once
    that prediction equals ``EOS_token``. The criterion values of all
    executed steps are summed, back-propagated, and both optimizers are
    stepped.

    Relies on the module-level names ``torch``, ``device``, ``SOS_token``,
    ``EOS_token`` and ``MAX_LENGTH``, which are not defined in the cells
    shown above this function -- they must exist before this cell runs.

    Args:
        input_tensor: source sequence, indexed along dimension 0.
        target_tensor: target sequence, indexed along dimension 0.
        encoder: called as ``encoder(token, hidden)``; must provide
            ``init_hidden()`` and ``hidden_size``.
        decoder: called as ``decoder(input, hidden, encoder_outputs)`` and
            expected to return three values.
        encoder_optimizer: optimizer stepped for the encoder.
        decoder_optimizer: optimizer stepped for the decoder.
        criterion: loss called as ``criterion(decoder_output, target_token)``.
        max_length: number of rows in the encoder-output buffer;
            ``input_tensor.size(0)`` must not exceed it.

    Returns:
        float: the summed loss divided by the full target length (also when
        decoding stopped early at ``EOS_token``).
    """
    hidden = encoder.init_hidden()

    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    n_source = input_tensor.size(0)
    n_target = target_tensor.size(0)

    # One row per source position; rows past the input length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    for position in range(n_source):
        step_output, hidden = encoder(input_tensor[position], hidden)
        encoder_outputs[position] = step_output[0, 0]

    # Decoding starts from the SOS token and the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = hidden

    loss = 0
    for step in range(n_target):
        decoder_output, decoder_hidden, _attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        _, best_index = decoder_output.topk(1)
        # Detach so the fed-back prediction carries no gradient history.
        decoder_input = best_index.squeeze().detach()

        loss += criterion(decoder_output, target_tensor[step])
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.step()

    return loss.item() / n_target