In [1]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Model, Sequential
import tensorflow.keras.backend as K
from tensorflow.keras.layers import *
import tensorflow_datasets as tfds

In [2]:
# Load the Portuguese -> English TED talk corpus as (pt, en) sentence pairs.
examples = tfds.load('ted_hrlr_translate/pt_to_en', as_supervised=True)

train_examples = examples['train']
val_examples = examples['validation']

# Build one subword tokenizer per language from the training split only,
# each targeting a vocabulary of roughly 2**13 entries.
# NOTE(review): `tfds.features.text` may have moved in newer
# tensorflow_datasets releases — confirm against the installed version.
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (english.numpy() for _, english in train_examples),
    target_vocab_size=2**13
)

tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (portuguese.numpy() for portuguese, _ in train_examples),
    target_vocab_size=2**13
)

# Round-trip sanity check: encoding then decoding must give back the input.
sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer_en.encode(sample_string)
print('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print('The original string: {}'.format(original_string))

assert original_string == sample_string

[1mDownloading and preparing dataset ted_hrlr_translate/pt_to_en/1.0.0 (download: 124.94 MiB, generated: Unknown size, total: 124.94 MiB) to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteMCJEU7/ted_hrlr_translate-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=51785.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteMCJEU7/ted_hrlr_translate-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1193.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteMCJEU7/ted_hrlr_translate-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1803.0), HTML(value='')))

[1mDataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0. Subsequent calls will reuse this data.[0m
Tokenized string is [7915, 1248, 7946, 7194, 13, 2799, 7877]
The original string: Transformer is awesome.


In [6]:
# Size of the shuffle buffer used when building the training pipeline.
BUFFER_SIZE = 20000
# Number of sentence pairs per padded training batch.
BATCH_SIZE = 64
# Pairs with more than this many token ids on either side are filtered out of
# the training set; translate() also uses it as the cap on decoding steps.
MAX_LENGTH = 40

def filter_max_length(x, y, max_length=MAX_LENGTH):
    """Dataset filter predicate: keep a pair only if both sides are short enough.

    Args:
        x: Tensor holding the first sequence of the pair.
        y: Tensor holding the second sequence of the pair.
        max_length: Largest allowed element count for either tensor.

    Returns:
        A boolean scalar tensor, true when neither ``tf.size(x)`` nor
        ``tf.size(y)`` exceeds ``max_length``.
    """
    x_fits = tf.size(x) <= max_length
    y_fits = tf.size(y) <= max_length
    return tf.logical_and(x_fits, y_fits)

def encode(lang1, lang2):
    """Tokenize one (Portuguese, English) pair and wrap each side in markers.

    For each language the id ``vocab_size`` is prepended as the start marker
    and ``vocab_size + 1`` is appended as the end marker.

    Args:
        lang1: Portuguese sentence; must expose ``.numpy()`` (this function is
            invoked through ``tf.py_function`` by ``tf_encode``).
        lang2: English sentence, same requirements as ``lang1``.

    Returns:
        A ``(portuguese_ids, english_ids)`` tuple of Python lists of ints.
    """
    pt_start = tokenizer_pt.vocab_size
    pt_ids = [pt_start, *tokenizer_pt.encode(lang1.numpy()), pt_start + 1]

    en_start = tokenizer_en.vocab_size
    en_ids = [en_start, *tokenizer_en.encode(lang2.numpy()), en_start + 1]

    return pt_ids, en_ids

def tf_encode(pt, en):
    """Dataset-friendly wrapper that runs the Python ``encode`` on a pair.

    Args:
        pt: Portuguese sentence tensor.
        en: English sentence tensor.

    Returns:
        ``(pt_ids, en_ids)``: two int64 tensors, each declared as rank-1 with
        unknown length.
    """
    pt_ids, en_ids = tf.py_function(
        func=encode,
        inp=[pt, en],
        Tout=[tf.int64, tf.int64],
    )
    # Declare the static shape explicitly so later padded batching knows each
    # output is a 1-D sequence of variable length.
    for ids in (pt_ids, en_ids):
        ids.set_shape([None])

    return pt_ids, en_ids

# Training input pipeline:
# tokenize -> drop over-long pairs -> cache -> shuffle -> pad-batch -> prefetch.
train_dataset = (
    train_examples
    .map(tf_encode)
    .filter(filter_max_length)
    # Cache the dataset in memory to speed up reads.
    .cache()
    .shuffle(BUFFER_SIZE)
    # No explicit padded shapes/values are given, so the library defaults apply
    # (the masking code in this notebook treats id 0 as padding).
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [7]:
class MultiHeadAttention(Layer):
    """Multi-head attention built on a single Keras ``Attention`` layer.

    The query/key/value projections are split into ``head_units`` equal chunks
    along the last axis and stacked along the first (batch) axis, so one
    ``Attention`` call processes every head. The result is un-stacked, the
    heads are concatenated back on the last axis and projected to
    ``input_units``.

    Args:
        input_units: Width of the final output projection.
        head_units: Number of attention heads.
        transform_units: Per-head width of the q/k/v projections.
        **kargs: Forwarded unchanged to the ``Attention`` constructor.
    """

    def __init__(self, input_units, head_units, transform_units, **kargs):
        super().__init__()
        self.head_units = head_units
        projection_width = transform_units * head_units
        self.dense_q = TimeDistributed(Dense(projection_width))
        self.dense_k = TimeDistributed(Dense(projection_width))
        self.dense_v = TimeDistributed(Dense(projection_width))
        self.attention = Attention(**kargs)
        self.dense_output = TimeDistributed(Dense(input_units))

    def _split_and_concat(self, x):
        """Split the last axis into heads and stack the heads on axis 0."""
        per_head = tf.split(x, self.head_units, axis=-1)
        return K.concatenate(per_head, axis=0)

    def _merge_heads(self, x):
        """Inverse of ``_split_and_concat``: un-stack axis 0, join on the last axis."""
        per_head = tf.split(x, self.head_units, axis=0)
        return K.concatenate(per_head, axis=-1)

    def call(self, q, v, q_mask, v_mask):
        """Attend from ``q`` to ``v`` (``v`` also serves as the key source).

        Args:
            q: Query sequence tensor.
            v: Value sequence tensor; keys are projected from the same tensor.
            q_mask: 2-D mask for the query positions.
            v_mask: 2-D mask for the value positions.

        Returns:
            The attended tensor projected to ``input_units`` on the last axis.
        """
        k = v
        queries = self._split_and_concat(self.dense_q(q))
        values = self._split_and_concat(self.dense_v(v))
        keys = self._split_and_concat(self.dense_k(k))

        # Repeat each mask once per head along axis 0 so it lines up with the
        # head-stacked projections.
        repeats = [self.head_units, 1]
        stacked_masks = [K.tile(q_mask, repeats), K.tile(v_mask, repeats)]

        # Keras Attention expects its inputs ordered as [query, value, key].
        attended = self.attention([queries, values, keys], mask=stacked_masks)
        return self.dense_output(self._merge_heads(attended))

    
class ResNorm(Layer):
    """Residual connection followed by layer normalization.

    Args:
        sequential: Callable sub-network applied to the input; its output is
            added to the input, so it must produce a compatible shape.
    """

    def __init__(self, sequential):
        super().__init__()
        self.sequential = sequential
        self.layer_norm = LayerNormalization()

    def call(self, x):
        """Return ``LayerNorm(x + sequential(x))``."""
        residual_sum = x + self.sequential(x)
        return self.layer_norm(residual_sum)

    
class ResNormAttention(Layer):
    """Residual connection plus layer normalization around an attention layer.

    Args:
        attention_layer: Layer called as ``attention_layer(q, v, q_mask, v_mask)``;
            its output is added to ``q``.
    """

    def __init__(self, attention_layer):
        super().__init__()
        self.attention_layer = attention_layer
        self.layer_norm = LayerNormalization()

    def call(self, q, v, q_mask, v_mask):
        """Return ``LayerNorm(q + attention_layer(q, v, q_mask, v_mask))``."""
        attended = self.attention_layer(q, v, q_mask, v_mask)
        return self.layer_norm(q + attended)


class PositionalLayer(Layer):
    """Adds a fixed sinusoidal positional encoding to its input.

    The encoding for position ``p`` is the sines of ``p * base`` followed by
    the cosines of ``p * base`` (two halves concatenated, not interleaved),
    where ``base[i] = (1 / 10000) ** (2 * i / input_units)``.

    Args:
        input_units: Size of the last axis of the inputs; must be even.
    """

    def __init__(self, input_units):
        super().__init__()
        assert input_units % 2 == 0, "Input_units should be even."
        exponents = np.arange(input_units / 2) * 2 / input_units
        self.base = K.constant((1 / 10000) ** exponents)

    def call(self, x):
        """Return ``x`` plus the positional encoding; axis 1 of ``x`` is the position axis."""
        length = K.shape(x)[1]
        positions = K.arange(0, length, dtype='float32')
        # Outer product via broadcasting: row p holds p * base.
        angles = positions[:, None] * self.base[None, :]
        sin_half = K.sin(angles)
        cos_half = K.cos(angles)
        return x + K.concatenate([sin_half, cos_half], axis=1)


class EncoderLayer(Layer):
    """One encoder block: self-attention then a position-wise feed-forward net.

    Both sub-blocks are wrapped in a residual connection and layer norm.

    Args:
        input_units: Width of the block's input and output.
        head_units: Number of attention heads.
        transform_units: Per-head projection width.
        dropout: Dropout rate forwarded to the attention layer.
        ffn_units: Hidden width of the feed-forward network.
    """

    def __init__(self, input_units, head_units, transform_units, dropout, ffn_units):
        super().__init__()
        self_attention = MultiHeadAttention(
            input_units,
            head_units,
            transform_units,
            use_scale=True,
            dropout=dropout
        )
        self.attention = ResNormAttention(self_attention)

        feed_forward = Sequential([
            TimeDistributed(Dense(ffn_units, activation='relu')),
            TimeDistributed(Dense(input_units)),
        ])
        self.ffn = ResNorm(feed_forward)

    def call(self, encoding, padding):
        """Run self-attention over ``encoding`` (masked by ``padding``), then the FFN."""
        attended = self.attention(encoding, encoding, padding, padding)
        return self.ffn(attended)
    
class DecoderLayer(Layer):
    """One decoder block: causal self-attention, encoder attention, then FFN.

    Each of the three sub-blocks is wrapped in a residual connection and
    layer norm.

    Args:
        input_units: Width of the block's input and output.
        head_units: Number of attention heads.
        transform_units: Per-head projection width.
        dropout: Dropout rate forwarded to both attention layers.
        ffn_units: Hidden width of the feed-forward network.
    """

    def __init__(self, input_units, head_units, transform_units, dropout, ffn_units):
        super().__init__()
        # Self-attention over the decoder sequence, constructed with causal=True.
        causal_self_attention = MultiHeadAttention(
            input_units,
            head_units,
            transform_units,
            use_scale=True,
            causal=True,
            dropout=dropout
        )
        self.attention1 = ResNormAttention(causal_self_attention)

        # Attention from the decoder sequence onto the encoder output.
        encoder_attention = MultiHeadAttention(
            input_units,
            head_units,
            transform_units,
            use_scale=True,
            dropout=dropout
        )
        self.attention2 = ResNormAttention(encoder_attention)

        feed_forward = Sequential([
            TimeDistributed(Dense(ffn_units, activation='relu')),
            TimeDistributed(Dense(input_units)),
        ])
        self.ffn = ResNorm(feed_forward)

    def call(self, encoding, decoding, encoding_padding, decoding_padding):
        """Apply the three sub-blocks in order.

        Args:
            encoding: Encoder output attended to by the second attention layer.
            decoding: Current decoder sequence.
            encoding_padding: Mask for the encoder positions.
            decoding_padding: Mask for the decoder positions.

        Returns:
            The transformed decoder sequence.
        """
        self_attended = self.attention1(
            decoding, decoding, decoding_padding, decoding_padding
        )
        cross_attended = self.attention2(
            self_attended, encoding, decoding_padding, encoding_padding
        )
        return self.ffn(cross_attended)


class Encoder(Layer):
    """Token embedding, positional encoding and a stack of ``EncoderLayer`` blocks.

    Args:
        embedding_input_dim: Number of distinct token ids (embedding rows).
        embedding_output_dim: Embedding width, used as the width of every block.
        layer_units: Number of stacked ``EncoderLayer`` blocks.
        head_units: Number of attention heads per block.
        transform_units: Per-head projection width.
        dropout: Dropout rate forwarded to the attention layers.
        ffn_units: Hidden width of each block's feed-forward network.
    """

    def __init__(self, embedding_input_dim, embedding_output_dim, layer_units, head_units, transform_units, dropout, ffn_units):
        super().__init__()
        self.embedding_output_dim = embedding_output_dim
        self.embedding_layer = Embedding(embedding_input_dim, embedding_output_dim)
        self.pos_layer = PositionalLayer(embedding_output_dim)

        blocks = []
        for _ in range(layer_units):
            blocks.append(
                EncoderLayer(embedding_output_dim, head_units, transform_units, dropout, ffn_units)
            )
        self.encoder_layers = blocks

    def call(self, embedding_input, padding):
        """Encode a batch of token ids.

        Args:
            embedding_input: Integer token-id tensor.
            padding: Mask passed to every block's self-attention.

        Returns:
            The output of the last encoder block.
        """
        embedded = self.embedding_layer(embedding_input)
        # Scale embeddings by sqrt(embedding width) before adding positions.
        scale = K.sqrt(K.constant(self.embedding_output_dim))
        encoding = self.pos_layer(embedded * scale)
        for block in self.encoder_layers:
            encoding = block(encoding, padding)
        return encoding

        
class Decoder(Layer):
    """Token embedding, positional encoding, ``DecoderLayer`` stack and output projection.

    Args:
        embedding_input_dim: Number of distinct token ids; also the width of
            the final projection (one logit per id).
        embedding_output_dim: Embedding width, used as the width of every block.
        layer_units: Number of stacked ``DecoderLayer`` blocks.
        head_units: Number of attention heads per block.
        transform_units: Per-head projection width.
        dropout: Dropout rate forwarded to the attention layers.
        ffn_units: Hidden width of each block's feed-forward network.
    """

    def __init__(self, embedding_input_dim, embedding_output_dim, layer_units, head_units, transform_units, dropout, ffn_units):
        super().__init__()
        self.embedding_output_dim = embedding_output_dim
        self.embedding_layer = Embedding(embedding_input_dim, embedding_output_dim)
        self.pos_layer = PositionalLayer(embedding_output_dim)

        blocks = []
        for _ in range(layer_units):
            blocks.append(
                DecoderLayer(embedding_output_dim, head_units, transform_units, dropout, ffn_units)
            )
        self.decoder_layers = blocks
        self.final_layer = TimeDistributed(Dense(embedding_input_dim))

    def call(self, encoding, embedding_input, encoding_padding, decoding_padding):
        """Decode a batch of target token ids against an encoder output.

        Args:
            encoding: Encoder output attended to by every block.
            embedding_input: Integer target token-id tensor.
            encoding_padding: Mask for the encoder positions.
            decoding_padding: Mask for the decoder positions.

        Returns:
            Per-position outputs of the final dense projection (no softmax applied).
        """
        embedded = self.embedding_layer(embedding_input)
        # Scale embeddings by sqrt(embedding width) before adding positions.
        scale = K.sqrt(K.constant(self.embedding_output_dim))
        decoding = self.pos_layer(embedded * scale)
        for block in self.decoder_layers:
            decoding = block(encoding, decoding, encoding_padding, decoding_padding)
        return self.final_layer(decoding)

In [8]:
# Vocabulary sizes get two extra ids: encode() uses vocab_size as the start
# marker and vocab_size + 1 as the end marker.
ENCODER_EMBEDDING_INPUT_DIM = tokenizer_pt.vocab_size + 2
ENCODER_EMBEDDING_OUTPUT_DIM = 128
DECODER_EMBEDDING_INPUT_DIM = tokenizer_en.vocab_size + 2 
DECODER_EMBEDDING_OUTPUT_DIM = 128
# Number of stacked encoder / decoder blocks.
LAYER_UNITS = 4
# Number of attention heads.
HEAD_UNITS = 8
# Per-head projection width, derived from the encoder embedding width.
TRANSFORM_UNITS = ENCODER_EMBEDDING_OUTPUT_DIM // HEAD_UNITS
# Hidden width of the position-wise feed-forward networks.
FFN_UNITS = 512
DROPOUT = 0.1

encoder = Encoder(
    embedding_input_dim=ENCODER_EMBEDDING_INPUT_DIM,
    embedding_output_dim=ENCODER_EMBEDDING_OUTPUT_DIM,
    layer_units=LAYER_UNITS,
    head_units=HEAD_UNITS,
    transform_units=TRANSFORM_UNITS,
    dropout=DROPOUT,
    ffn_units=FFN_UNITS,
)

decoder = Decoder(
    embedding_input_dim=DECODER_EMBEDDING_INPUT_DIM,
    embedding_output_dim=DECODER_EMBEDDING_OUTPUT_DIM,
    layer_units=LAYER_UNITS,
    head_units=HEAD_UNITS,
    transform_units=TRANSFORM_UNITS,
    dropout=DROPOUT,
    ffn_units=FFN_UNITS,
)

def loss_func(decoding_real, decoding_pred):
    """Padding-masked sparse categorical cross-entropy, averaged over real tokens.

    Positions whose target id is 0 are treated as padding and contribute
    nothing. The summed loss is divided by the number of non-padding
    positions, not by the total number of positions, so the loss scale does
    not shrink as the amount of padding in a batch grows.

    Args:
        decoding_real: Target token ids; id 0 marks padding.
        decoding_pred: Predicted logits, one score per vocabulary id on the
            last axis.

    Returns:
        A scalar tensor: mean cross-entropy over the non-padding positions
        (0 when the batch contains only padding).
    """
    # from_logits=True: the predicted decoder vectors have not been through softmax.
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        decoding_real, decoding_pred, from_logits=True
    )
    mask = tf.cast(K.not_equal(decoding_real, 0), dtype=loss.dtype)
    masked_total = K.sum(loss * mask)
    # Guard against an all-padding batch, which would otherwise divide by zero.
    real_token_count = tf.maximum(K.sum(mask), 1.0)
    return masked_total / real_token_count



# Symbolic inputs: variable-length sequences of int64 token ids for the
# encoder (source) and decoder (target) sides.
encoder_embedding_input = Input([None], dtype='int64')
decoder_embedding_input = Input([None], dtype='int64')

# Mask positions whose id is 0; id 0 is the empty string in the tokenizer and
# never appears in the middle of a sentence.
encoding_padding = K.not_equal(encoder_embedding_input, 0)
decoding_padding = K.not_equal(decoder_embedding_input, 0)
encoding = encoder(encoder_embedding_input, encoding_padding)
decoding = decoder(encoding, decoder_embedding_input, encoding_padding, decoding_padding)



# Functional model mapping [source ids, target ids] to the decoder's
# per-position output logits.
transformer = Model(
    inputs=[
        encoder_embedding_input,
        decoder_embedding_input
    ],
    outputs=decoding
)

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with linear warm-up followed by 1/sqrt(step) decay.

    ``lr(step) = embedding_dim ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``

    Args:
        embedding_dim: Model width used to scale the learning rate.
        warmup_steps: Number of steps over which the rate rises linearly
            before the inverse-square-root decay takes over.
    """

    def __init__(self, embedding_dim, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        # Keep the raw value so get_config() can return something serializable.
        self._embedding_dim_value = embedding_dim
        self.embedding_dim = tf.cast(embedding_dim, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        # Optimizers may pass the step as an integer tensor; rsqrt and the
        # float multiplication below need a floating-point value.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.embedding_dim) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be re-created."""
        return {
            'embedding_dim': self._embedding_dim_value,
            'warmup_steps': self.warmup_steps,
        }

# Warm-up / decay schedule scaled by the model width.
learning_rate = CustomSchedule(ENCODER_EMBEDDING_OUTPUT_DIM)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
# NOTE(review): these two metrics are created but neither passed to compile()
# nor referenced elsewhere in the visible notebook — confirm whether they are
# still needed.
train_loss_metric = tf.keras.metrics.Mean()
train_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy()
# Compile with the padding-masked loss; no metrics are attached.
transformer.compile(optimizer=optimizer, loss=loss_func)
transformer.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
tf_op_layer_NotEqual_2 (TensorF [(None, None)]       0           input_3[0][0]                    
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
encoder_1 (Encoder)             (None, None, 128)    1844740     input_3[0][0]                    
____________________________________________________________________________________________

In [None]:
# Manual training loop over the pre-batched dataset.
EPOCHS = 20
for epoch in range(EPOCHS):
    for batch, (inp, tar) in enumerate(train_dataset):
        if batch % 50 == 0:
            # Periodically wipe the accumulated progress lines so the cell
            # output does not grow without bound.
            clear_output()
        # The decoder is fed the target without its last token and is trained
        # to predict the target shifted left by one position.
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]
        # The dataset already yields batches, so run exactly one gradient
        # update per batch instead of starting a full fit() call each step.
        loss = transformer.train_on_batch([inp, tar_inp], tar_real)
        print(f'{epoch}-{batch} loss: {float(loss):.4f}')

4-201
4-202
4-203
4-204
4-205
4-206
4-207
4-208
4-209
4-210
4-211


In [None]:
def translate(sentence):
    """Greedily translate a Portuguese sentence into English.

    The sentence is tokenized, wrapped in the Portuguese start/end markers and
    encoded once. English ids are then produced one at a time, always taking
    the highest-scoring id, until the English end marker is predicted or
    ``MAX_LENGTH`` ids have been generated.

    Args:
        sentence: Portuguese input text.

    Returns:
        The decoded English text; marker ids (>= the English vocabulary size)
        are dropped before decoding.
    """
    pt_start = tokenizer_pt.vocab_size
    pt_end = tokenizer_pt.vocab_size + 1
    en_start = tokenizer_en.vocab_size
    en_end = tokenizer_en.vocab_size + 1

    # The input sentence is Portuguese: add its start and end markers.
    sentence_ids = [pt_start] + tokenizer_pt.encode(sentence) + [pt_end]
    encoder_input = K.expand_dims(sentence_ids, 0)
    # The source mask never changes during decoding, so compute it once.
    encoder_mask = encoder_input != 0
    encoding = encoder(encoder_input, encoder_mask)

    # The target is English, so the first id fed to the decoder is the
    # English start marker.
    output = K.expand_dims([en_start], 0)
    for _ in range(MAX_LENGTH):
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions = decoder(encoding, output, encoder_mask, output != 0)

        # Keep only the last position along the seq_len axis.
        last_step = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.argmax(last_step, axis=-1, output_type=tf.int32)

        # Stop as soon as the end marker is predicted.
        if int(predicted_id[0, 0]) == en_end:
            break

        # Append the predicted id to the output, which is fed back into the
        # decoder on the next step.
        output = K.concatenate([output, predicted_id], axis=-1)

    # Hand the tokenizer plain Python ints, without the marker ids.
    generated_ids = [int(token) for token in K.squeeze(output, axis=0).numpy()]
    return tokenizer_en.decode(
        [token for token in generated_ids if token < tokenizer_en.vocab_size]
    )

In [None]:
# Compare the model's output with the reference translation.
print(
    'Predicted translation: '
    + translate("este é um problema que temos que resolver.")
)
print("Real translation: this is a problem we have to solve .")


Predicted translation: this is a problem that we have to solve america .
Real translation: this is a problem we have to solve .


In [None]:
# Compare the model's output with the reference translation.
print(
    'Predicted translation: '
    + translate("os meus vizinhos ouviram sobre esta ideia.")
)
print("Real translation: and my neighboring homes heard about this idea .")


Predicted translation: my neighbors have heard about this idea .
Real translation: and my neighboring homes heard about this idea .


In [None]:
# Compare the model's output with the reference translation.
print(
    'Predicted translation: '
    + translate("vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.")
)
# Label fixed from "Real atranslation:" so it matches the other demo cells.
print("Real translation: so i 'll just share with you some stories very quickly of some magical things that have happened .")

Predicted translation: so i 'll be very quickly to share with you some stories of some girandom stuff that happened .
Real translation: so i 'll just share with you some stories very quickly of some magical things that have happened .
