In [109]:
!pip install d2l



In [95]:
import os
import tensorflow as tf
from d2l import tensorflow as d2l

d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')

In [96]:
def read_data_nmt():
    data_dir = d2l.download_extract('fra-eng')
    with open(os.path.join(data_dir, 'fra.txt'), 'r') as f:
        return f.read()

raw_text = read_data_nmt()

def preprocess_nmt(text):
    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '

    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)

text = preprocess_nmt(raw_text)

In [97]:
def tokenize_nmt(text, num_examples=None):
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target

source, target = tokenize_nmt(text, num_examples=600)

src_vocab = d2l.Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
tgt_vocab = d2l.Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])

In [98]:
def truncate_pad(line, num_steps, padding_token):
    if len(line) > num_steps:
        return line[:num_steps]
    return line + [padding_token] * (num_steps - len(line))

def build_array_nmt(lines, vocab, num_steps):
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]
    array = tf.constant([truncate_pad(
        l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = tf.reduce_sum(tf.cast(array != vocab['<pad>'], tf.int32), 1)
    return array, valid_len

def load_data_nmt(batch_size, num_steps, num_examples=600):
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = d2l.Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = d2l.Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = d2l.load_array(data_arrays, batch_size)
    return data_iter, src_vocab, tgt_vocab

batch_size, num_steps = 64, 10
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size, num_steps)

In [104]:
class Seq2SeqEncoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0.5, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn = [tf.keras.layers.LSTM(num_hiddens, return_sequences=True, return_state=True)
                    for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, X, *args):
        X = self.embedding(X)
        X = self.dropout(X)
        state = []
        for rnn in self.rnn:
            X, h, c = rnn(X, initial_state=None)
            state.append([h, c])
        return X, state

class Seq2SeqDecoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0.5, **kwargs):
        super(Seq2SeqDecoder, self).__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn = [tf.keras.layers.LSTM(num_hiddens, return_sequences=True, return_state=True)
                    for _ in range(num_layers)]
        self.dense = tf.keras.layers.Dense(vocab_size)
        self.dropout = tf.keras.layers.Dropout(dropout)

    def call(self, X, state):
        X = self.embedding(X)
        X = self.dropout(X)
        new_state = []
        for i, rnn in enumerate(self.rnn):
            X, h, c = rnn(X, initial_state=state[i])
            new_state.append([h, c])
        X = self.dense(X)
        return X, new_state

class EncoderDecoder(tf.keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def call(self, enc_X, dec_X, *args):
        enc_outputs, enc_state = self.encoder(enc_X)
        dec_outputs, dec_state = self.decoder(dec_X, enc_state)
        return dec_outputs

In [105]:
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = EncoderDecoder(encoder, decoder)

def loss_fn(y_true, y_pred):
    mask = tf.cast(y_true != 0, dtype=tf.float32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    return tf.reduce_mean(loss * mask)

def accuracy_fn(y_true, y_pred):
    mask = tf.cast(y_true != 0, dtype=tf.float32)
    y_pred = tf.argmax(y_pred, axis=2)
    return tf.reduce_mean(tf.cast(y_true == y_pred, dtype=tf.float32) * mask)

model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=[accuracy_fn])

In [108]:
for epoch in range(25):
    for X, X_valid_len, Y, Y_valid_len in train_iter:
        with tf.GradientTape() as tape:
            Y_hat = model(X, Y)
            l = loss_fn(Y, Y_hat)
        grads = tape.gradient(l, model.trainable_variables)
        model.optimizer.apply_gradients(zip(grads, model.trainable_variables))

    print(f'Epoch {epoch + 1}/10, Loss: {float(l):.3f}')

Epoch 1/10, Loss: 1.709
Epoch 2/10, Loss: 1.471
Epoch 3/10, Loss: 1.550
Epoch 4/10, Loss: 1.440
Epoch 5/10, Loss: 1.438
Epoch 6/10, Loss: 1.451
Epoch 7/10, Loss: 1.337
Epoch 8/10, Loss: 1.382
Epoch 9/10, Loss: 1.596
Epoch 10/10, Loss: 1.631
Epoch 11/10, Loss: 1.280
Epoch 12/10, Loss: 1.124
Epoch 13/10, Loss: 1.286
Epoch 14/10, Loss: 1.261
Epoch 15/10, Loss: 1.118
Epoch 16/10, Loss: 1.325
Epoch 17/10, Loss: 0.945
Epoch 18/10, Loss: 1.206
Epoch 19/10, Loss: 1.157
Epoch 20/10, Loss: 1.194
Epoch 21/10, Loss: 0.948
Epoch 22/10, Loss: 1.127
Epoch 23/10, Loss: 1.244
Epoch 24/10, Loss: 1.112
Epoch 25/10, Loss: 1.121
