In [1]:
!pip install torchtext --upgrade
!python -m spacy download fr
!python -m spacy download en

Collecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/79/ef/54b8da26f37787f5c670ae2199329e7dccf195c060b25628d99e587dac51/torchtext-0.5.0-py3-none-any.whl (73kB)
[K     |████▌                           | 10kB 26.6MB/s eta 0:00:01[K     |█████████                       | 20kB 31.7MB/s eta 0:00:01[K     |█████████████▍                  | 30kB 23.6MB/s eta 0:00:01[K     |██████████████████              | 40kB 12.8MB/s eta 0:00:01[K     |██████████████████████▍         | 51kB 12.5MB/s eta 0:00:01[K     |██████████████████████████▉     | 61kB 12.5MB/s eta 0:00:01[K     |███████████████████████████████▍| 71kB 12.6MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 7.5MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 21.9MB/s

In [0]:
import os
import re
import tqdm
import random
import unicodedata
import numpy as np

import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Example, Field, Dataset
from torchtext.data.iterator import BucketIterator

In [0]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

SEED = 781
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
if not os.path.exists('./data'):
    !mkdir ./data

!wget --no-check-certificate \
    http://www.statmt.org/europarl/v7/fr-en.tgz \
    -O ./data/fr-en.tgz

--2020-02-15 13:11:59--  http://www.statmt.org/europarl/v7/fr-en.tgz
Resolving www.statmt.org (www.statmt.org)... 129.215.197.184
Connecting to www.statmt.org (www.statmt.org)|129.215.197.184|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 202718517 (193M) [application/x-gzip]
Saving to: ‘./data/fr-en.tgz’


2020-02-15 13:12:44 (4.34 MB/s) - ‘./data/fr-en.tgz’ saved [202718517/202718517]



In [5]:
!tar -xzvf ./data/fr-en.tgz -C ./data/

europarl-v7.fr-en.en
europarl-v7.fr-en.fr


In [0]:
def read_file(filepath):
    try:
        with open(filepath, mode='rt', encoding='utf-8') as file:
            content = file.readlines()
        return content
    except:
        raise NotImplementedError(f'File {filepath} doesn\'t exist')

In [0]:
def unicode_to_ascii(s):
    # NFD => Normal Form Decompose
    # Mn => Non Marking Space
    return ''.join(c for c in unicodedata.normalize('NFD', s) \
                    if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r'([!.?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z1-9!.?]+', r' ', s)
    s = re.sub(r'\s+', r' ', s)
    return s.strip()

In [31]:
%%time
pairs = [*zip(read_file('./data/europarl-v7.fr-en.fr'),
             read_file('./data/europarl-v7.fr-en.en'))]
pairs = [*map(lambda x: {'fr': x[0], 'en': x[1]}, pairs)]
print('Number of examples:', len(pairs))
pairs = np.random.choice(pairs, size=30000, replace=False)
pairs = [*map(lambda pair: {k: normalize_string(v) for k, v in pair.items()},
              pairs)]
print('Number of examples after sampling:', len(pairs))
print('Example:', pairs[0])

Number of examples: 2007723
Number of examples after sampling: 30000
Example: {'fr': 'madame le president c est parce que nous ne souhaitons pas pour le maroc ce que nous refusons pour nous memes que nous n avons pas vote l accord d association avec ce grand pays .', 'en': 'madam president it is because we would not wish on morocco something we reject for ourselves that we have not voted for the association agreement with that great country .'}
CPU times: user 6.92 s, sys: 504 ms, total: 7.42 s
Wall time: 7.43 s


In [32]:
%%time
FR = Field(init_token='<sos>',
           eos_token='<eos>',
           pad_token='<pad>',
           unk_token='<unk>',
           lower=True,
           tokenize='spacy',
           tokenizer_language='fr',
           preprocessing=lambda x: x[::-1])
EN = Field(init_token='<sos>',
           eos_token='<eos>',
           pad_token='<pad>',
           unk_token='<unk>',
           lower=True,
           tokenize='spacy',
           tokenizer_language='en')

examples = [Example.fromdict(data=pair, fields={'fr': ('src', FR),
                                                'en': ('dest', EN)})
            for pair in tqdm.tqdm(pairs)]
data = Dataset(examples, fields={'src': FR, 'dest': EN})
train, valid, test = data.split(split_ratio=[0.7, 0.2, 0.1])
print('train size:', len(train.examples))
print('valid size:', len(valid.examples))
print('test size:', len(test.examples))
print(vars(train.examples[0]))

100%|██████████| 30000/30000 [00:44<00:00, 670.08it/s]

train size: 21000
valid size: 3000
test size: 6000
{'src': ['.', 'fiscale', 'harmonisation', 'l', 'non', 'et', 'fiscale', 'concurrence', 'la', 'promouvoir', 'a', 'interet', 'tout', 'a', 'ue', 'l', 'que', 'fermement', 'crois', 'je'], 'dest': ['i', 'firmly', 'believe', 'that', 'the', 'eu', 'is', 'best', 'served', 'by', 'promoting', 'tax', 'competition', 'not', 'tax', 'harmonisation', '.']}
CPU times: user 49.4 s, sys: 268 ms, total: 49.7 s
Wall time: 50.2 s





In [0]:
FR.build_vocab(train, min_freq=5,
               specials=['<sos>', '<eos>', '<unk>', '<pad>'])
EN.build_vocab(train, min_freq=5,
               specials=['<sos>', '<eos>', '<unk>', '<pad>'])

In [34]:
print('Length of FR vocabulary:', len(FR.vocab))
print('Length of EN vocabulary:', len(EN.vocab))

Length of FR vocabulary: 6820
Length of EN vocabulary: 5812


In [0]:
BATCH_SIZE = 128

train_it, valid_it, test_it = BucketIterator.splits((train, valid, test),
                                                    batch_size=BATCH_SIZE,
                                                    device=DEVICE)

In [0]:
class Encoder(nn.Module):

    def __init__(self, vocab_size, embedded_dim,
                 hidden_units, n_layers, dropout, bi=True):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=n_layers,
                            bidirectional=bi,
                            dropout=dropout)

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        embedded = self.dropout(embedded)
        outputs, (h_state, c_state) = self.lstm(embedded)
        return outputs, (h_state, c_state)

In [0]:
class Decoder(nn.Module):

    def __init__(self, vocab_size, embedded_dim,
                 hidden_units, n_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_size,
                            num_layers=n_layers,
                            dropout=dropout)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs, h_state, c_state):
        embedded = self.embedding(inputs)
        embedded = self.dropout(embedded)
        outputs, (h_state, c_state) = self.lstm(embedded, (h_state, c_state))
        logits = self.linear(outputs)
        return logits, (h_state, c_state)

In [0]:
class SeqToSeqNet(nn.Module):

    def __init__(self, encoder, decoder):
        super(SeqToSeqNet, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inputs, targets):
        _, (h_state, c_state) = self.encoder(inputs)
        target = targets[0, :]
        outputs = []
        for t in range(1, targets.size(0)):
            logits, (h_state, c_state) = self.decoder(target, h_state, c_state)
            outputs.append(logits)
            target = targets[t, :]
        return torch.stack(outputs, dim=0)

In [0]:
def loss_func():
    pass

In [0]:
def train(model, data_it, optimizer, grad_clip=1.0):
    pass

In [0]:
def evaluate(model, data_it):
    pass