# Encoder Decoder 解决翻译问题

翻译问题的解决方案 使用 EncoderDecoder attention模型, 是一种多对多RNN模型。 

./datasets/spa.txt 是一个英语到西班牙语的对应文件, 有 118964条数据

```
#下载路径

cd datasets
wget https://dikers-data.s3.cn-northwest-1.amazonaws.com.cn/dataset/spa.txt
```



```
<start>i m very sad .<end> <------> <start>estoy muy triste .<end>
<start>i m with him .<end> <------> <start>estoy con el .<end>
<start>i m worn out .<end> <------> <start>estoy exhausto .<end>
<start>i ve lost it .<end> <------> <start>lo he perdido .<end>
<start>i ve seen it .<end> <------> <start>lo he visto .<end>
<start>ice is solid .<end> <------> <start>el hielo es solido .<end>
<start>iron is hard .<end> <------> <start>el hierro es duro .<end>
<start>is monday ok ?<end> <------> <start>¿ esta bien el lunes ?<end>
<start>is tom cured ?<end> <------> <start>¿ esta tom curado ?<end>
<start>is tom drunk ?<end> <------> <start>¿ tom esta borracho ?<end>
<start>is tom lucid ?<end> <------> <start>¿ tom es lucido ?<end>
<start>is tom there ?<end> <------> <start>¿ esta ahi tom ?<end>
<start>is it a deer ?<end> <------> <start>¿ eso es un ciervo ?<end>
```

In [26]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print('tf    version: {}'.format(tf.__version__) )
print('keras version: {}'.format(keras.__version__) )
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
tf.enable_eager_execution(
    config=None,
    device_policy=None,
    execution_mode=None
)
print('GPU : ', tf.test.is_gpu_available())


tf    version: 1.14.0
keras version: 2.2.4-tf
matplotlib 3.0.3
numpy 1.16.4
pandas 0.24.2
sklearn 0.21.2
tensorflow 1.14.0
tensorflow.python.keras.api._v1.keras 2.2.4-tf
GPU :  True


In [10]:
import re
import unicodedata

en_spa_file_path = './datasets/spa.txt'

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')



![image](https://pic1.zhimg.com/80/v2-1227cc0c1099fdcbf16f6e2886db60ec_1440w.jpg)

In [11]:
# Got it?	¿Entendiste?

en_sentence = 'Got it?'
sp_sentence = '¿Entendiste?'

print(unicode_to_ascii(en_sentence))
print(unicode_to_ascii(sp_sentence))

Got it?
¿Entendiste?


In [12]:
def preprocess_setence(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " " , s)
    s = re.sub(r'[^a-zA-z?.!,¿]', " ", s)
    s = s.rstrip().strip()
    s = '<start>' + s + '<end>'
    
    return s
print(preprocess_setence(sp_sentence))

<start>¿ entendiste ?<end>


In [14]:
def parse_data(filename):
    lines = open(filename, encoding='UTF-8').read().strip().split('\n')
    sentence_pairs = [line.split('\t') for line in lines]
    preprocessed_sentence_pairs = [
        (preprocess_setence(en), preprocess_setence(sp)) for en, sp in sentence_pairs]
    
    return zip(*preprocessed_sentence_pairs) 
    
en_dataset, sp_dataset = parse_data(en_spa_file_path)

In [6]:
a = [(1, 2), (3, 4), (5, 6)]
c, d = zip(*a)
print(c, d)

(1, 3, 5) (2, 4, 6)


In [28]:
print(len(en_dataset))
for i  in range(5):
    print('{} <------> {}'.format(en_dataset[i+15000] , sp_dataset[i+15000]))

118964
<start>tom was terrified .<end> <------> <start>tom estaba aterrorizado .<end>
<start>tom was very busy .<end> <------> <start>tom estaba muy ocupado .<end>
<start>tom was with mary .<end> <------> <start>tom estaba con mary .<end>
<start>tom wasn t afraid .<end> <------> <start>tom no tenia miedo .<end>
<start>tom wasn t honest .<end> <------> <start>tom no fue honesto .<end>


In [9]:
def max_length(tensor):
    return max(len(t) for t in tensor)
def tokenizer(lang):
    lang_tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=None, filters='', split=' ')
    lang_tokenizer.fit_on_texts(lang)
    
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
    return tensor, lang_tokenizer

num_examples = 30000
input_tensor, input_tokenizer = tokenizer(sp_dataset[0:num_examples])
output_tensor, output_tokerizer = tokenizer(en_dataset[0:num_examples])

# Calculate max_length of the target tensors
max_length_targ, max_length_inp = max_length(output_tensor), max_length(input_tensor)

print(max_length_targ, max_length_inp)


9 14


In [10]:
from sklearn.model_selection import train_test_split
input_train, input_val, output_train, output_val = train_test_split(input_tensor, output_tensor, test_size=0.2)

# Show length
len(input_train), len(input_val), len(output_train), len(output_val)

(24000, 6000, 24000, 6000)

In [11]:
def conver(example, tokenizer):
    for t in example:
        if t!=0:
            print('%8d --> %s'%(t, tokenizer.index_word[t]))
            
conver(input_train[0], input_tokenizer)
print('')
conver(output_train[0], output_tokerizer)

     192 --> <start>odio
     131 --> estar
    1761 --> soltera
       1 --> .<end>

       2 --> <start>i
     135 --> hate
     505 --> being
     683 --> single
       1 --> .<end>


In [12]:
def make_dataset(input_train, output_train,batch_size, epochs, shuffle=True ):
    
    dataset = tf.data.Dataset.from_tensor_slices((input_train, output_train))
    if shuffle:
        dataset = dataset.shuffle(num_examples)
    
    dataset = dataset.repeat(epochs).batch(batch_size, drop_remainder=True)
    return dataset

In [13]:
batch_size = 64
epochs = 20
train_dataset = make_dataset(input_train, output_train, batch_size, epochs, True)
val_dataset = make_dataset(input_val, output_val, batch_size, 1, False)

In [14]:
embedding_units = 256
units = 1024
input_vocab_size = len(input_tokenizer.word_index)+1
output_vocab_size = len(output_tokerizer.word_index)+1

In [15]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([Dimension(64), Dimension(14)]),
 TensorShape([Dimension(64), Dimension(9)]))

In [16]:
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):
        super(Encoder, self).__init__()
        
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_units)
        
        self.gru = keras.layers.GRU(self.encoding_units, 
                                return_sequences=True,
                                return_state = True, 
                                recurrent_initializer='glorot_uniform')
    
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.encoding_units))
    
    
encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)

sample_hidden = encoder.initialize_hidden_state()

sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

print("sample_output", sample_output.shape)
print("sample_hidden", sample_hidden.shape)

sample_output (64, 14, 1024)
sample_hidden (64, 1024)


In [17]:
class BahdanauAttention(keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
    
    def call(self, query, values):
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights
    
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 14, 1)


In [18]:
class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoding_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.decoding_units = decoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.decoding_units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
        self.fc = keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.decoding_units)
        
    def call(self, x, hidden, encoding_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, encoding_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights
decoder = Decoder(output_vocab_size, embedding_units, units, batch_size)
sample_decoder_output, _, _ = decoder(tf.random.uniform((64, 1)),
                                      sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))    

Decoder output shape: (batch_size, vocab size) (64, 5433)


In [19]:
optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

checkpoint_dir = './output/15_checkpoints'
if not os.path.exists(checkpoint_dir):
    os.mkdir(checkpoint_dir)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [20]:
def train_step(inp, targ, encoding_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        encoding_output, encoding_hidden = encoder(inp, encoding_hidden)

        decoding_hidden = encoding_hidden

#         decoding_input = tf.expand_dims([targ.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(0, targ.shape[1] -1):
            decoding_input = tf.expand_dims(targ[:, t], 1)
            
            predictions, decoding_hidden, _ = \
            decoder(decoding_input, decoding_hidden, encoding_output)

            loss += loss_function(targ[:, t+1], predictions)

            # using teacher forcing
#             decoding_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[0]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [21]:


EPOCHS = 10
steps_per_epoch = len(input_train)//batch_size
for epoch in range(EPOCHS):
    start = time.time()

    encoding_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, encoding_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, 
                                        batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1 Batch 0 Loss 0.5522
Epoch 1 Batch 100 Loss 0.2970
Epoch 1 Batch 200 Loss 0.2707
Epoch 1 Batch 300 Loss 0.2564
Epoch 1 Loss 0.2872
Time taken for 1 epoch 174.42540216445923 sec

Epoch 2 Batch 0 Loss 0.2530
Epoch 2 Batch 100 Loss 0.2076
Epoch 2 Batch 200 Loss 0.1988
Epoch 2 Batch 300 Loss 0.1924
Epoch 2 Loss 0.2051
Time taken for 1 epoch 174.7465364933014 sec

Epoch 3 Batch 0 Loss 0.1910
Epoch 3 Batch 100 Loss 0.1566
Epoch 3 Batch 200 Loss 0.1478
Epoch 3 Batch 300 Loss 0.1420
Epoch 3 Loss 0.1539
Time taken for 1 epoch 174.30358695983887 sec

Epoch 4 Batch 0 Loss 0.1337
Epoch 4 Batch 100 Loss 0.1118
Epoch 4 Batch 200 Loss 0.1068
Epoch 4 Batch 300 Loss 0.1037
Epoch 4 Loss 0.1087
Time taken for 1 epoch 174.7017800807953 sec

Epoch 5 Batch 0 Loss 0.0840
Epoch 5 Batch 100 Loss 0.0795
Epoch 5 Batch 200 Loss 0.0683
Epoch 5 Batch 300 Loss 0.0703
Epoch 5 Loss 0.0729
Time taken for 1 epoch 173.792

In [22]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    encoding_out, encoding_hidden = encoder(inputs, hidden)

    decoding_hidden = encoding_hidden
    decoding_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, decoding_hidden, attention_weights = decoder(
            decoding_input, decoding_hidden, encoding_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + ' '

        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        decoding_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    plt.show()
    
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))
    

checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc54cd430f0>