In [70]:
import jieba
import zhconv
import tensorflow as tf
import re


class Vocab:

    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        counter = Vocab.count_corpus(tokens)
        # 对词频率排序
        self.__token_freqs = sorted(counter.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
        self.index_to_token = ['<unk>'] + reserved_tokens
        self.token_to_index = {
            token: idx
            for idx, token in enumerate(self.index_to_token)
        }
        for token, freq in self.__token_freqs:
            if freq >= min_freq and token not in self.token_to_index:
                self.index_to_token.append(token)
                self.token_to_index[token] = len(self.index_to_token) - 1

    def __len__(self):
        return len(self.index_to_token)

    def get_tokens(self, indicates):
        if not isinstance(indicates, (list, tuple)):
            return self.index_to_token[indicates]
        return ''.join([self.get_tokens(index) for index in indicates])

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_index.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    @property
    def unk(self):
        return 0

    @property
    def token_freqs(self):
        return self.__token_freqs

    @staticmethod
    def count_corpus(tokens):
        if isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        from collections import Counter
        return Counter(tokens)


def truncate_and_pad(line, steps, padding_token):
    """
    \{填充||截断\}序列，使序列长度保持一致
    """
    if len(line) > steps:
        return line[:steps]
    return line + [padding_token] * (steps - len(line))


def load_datasets(steps=20, batch_size=50):
    """
    预处理数据并封装成tf.data.Dataset
    """
    with open('./en_zh.trans.txt', 'r') as data_file:
        lines = data_file.readlines()
    # print(zhconv.convert(s[1], 'zh-cn'))
    en, zh = [], []
    for line in lines:
        split = line.split('\t')
        en.append(
            truncate_and_pad(
                re.sub('[^A-Za-z]+', ' ', split[0]).strip().lower().split(' ')
                + ['<eos>'], steps, '<pad>')), zh.append(
                    truncate_and_pad(
                        list(
                            jieba.cut(zhconv.convert(split[1], 'zh-cn'),
                                      cut_all=False)) + ['<eos>'], steps,
                    '<pad>'))
    en_vocab, zh_vocab = Vocab(
        en, min_freq=2,
        reserved_tokens=['<pad>', '<bos>', '<eos>'
                         ]), Vocab(zh,
                                   min_freq=2,
                                   reserved_tokens=['<pad>', '<bos>', '<eos>'])
    en = tf.constant([en_vocab[line] for line in en], dtype='float32')
    zh = tf.constant([zh_vocab[line] for line in zh], dtype='int32')
    en_len = tf.reduce_sum(tf.cast(en != en_vocab['<pad>'], dtype='int32'), axis=1)
    zh_len = tf.reduce_sum(tf.cast(zh != zh_vocab['<pad>'], dtype='int32'), axis=1)
    ds = (
        tf.data.Dataset.from_tensor_slices(en),
        tf.data.Dataset.from_tensor_slices(en_len),
        tf.data.Dataset.from_tensor_slices(zh),
        tf.data.Dataset.from_tensor_slices(zh_len),
    )
    # en.shape=(batch_size, steps)
    train_iter = tf.data.Dataset.zip(ds).shuffle(buffer_size=len(en)).batch(batch_size=batch_size)
    return train_iter, en_vocab, zh_vocab

BATCH_SIZE, STEPS = 512, 15
train_iter, en_vocab, zh_vocab = load_datasets(batch_size=BATCH_SIZE, steps=STEPS)

In [71]:
class Seq2SeqEncoder(tf.keras.layers.Layer):
    
    def __init__(self, vocab_size, embed_size, hiddens, layers, dropout=0., **kwargs):
        super().__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn_net = tf.keras.layers.RNN(tf.keras.layers.StackedRNNCells([
                tf.keras.layers.GRUCell(hiddens, dropout=dropout) for _ in range(layers)
            ]), return_sequences=True, return_state=True)
    
    def call(self, x, *args, **kwargs):
        x = self.embedding(x)
        y = self.rnn_net(x, *args, **kwargs)
        state = y[1:]
        return y[0], state

class Seq2SeqDecoder(tf.keras.layers.Layer):
    
    def __init__(self, vocab_size, embed_size, hiddens, layers, dropout=0., **kwargs):
        super().__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn_net = tf.keras.layers.RNN(tf.keras.layers.StackedRNNCells([
            tf.keras.layers.GRUCell(hiddens, dropout=dropout) for _ in range(layers)
            ]), return_sequences=True, return_state=True)
        self.output_layer = tf.keras.layers.Dense(vocab_size)
    
    def init_state(self, encode, *args):
        return encode[1]
        
    def call(self, x, state, **kwargs):
        x = self.embedding(x)
        context = tf.repeat(tf.expand_dims(state[-1], axis=1), repeats=x.shape[1], axis=1)
        x_ctx = tf.concat((x, context), axis=2)
        y = self.rnn_net(x, state, **kwargs)
        state = y[1:]
        y = self.output_layer(y[0])
        return y, state

class EncoderDecoder(tf.keras.Model):
    
    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
    
    def call(self, encode_x, decode_x, *args, **kwargs):
        encode = self.encoder(encode_x, *args, **kwargs)
        state = self.decoder.init_state(encode, *args)
        return self.decoder(decode_x, state, **kwargs)
    
    def fit(self, train_iter, target_vocab, epochs=10, lr=1e-3):
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
        for epoch in range(epochs):
            for step, (x, xl, y, yl) in enumerate(train_iter):
                bos = tf.reshape(tf.constant([target_vocab['<bos>']] * y.shape[0]), shape=(-1, 1))
                decode_x = tf.concat([bos, tf.cast(y[:, :-1], dtype='int32')] , 1)  # 给y加入<bos> 忽略末尾的<eos>
                with tf.GradientTape() as gt:
                    y_hat, _ = self.call(x, decode_x, training=True)
                    loss = MaskedSoftmaxLoss(yl)(y, y_hat)
                grads = gt.gradient(loss, self.trainable_variables)
                grads = self.gradient_clip(grads, 1)
                optimizer.apply_gradients(zip(grads, self.trainable_variables))
            print(f'epoch[{epoch + 1}\\{epochs} loss: {tf.reduce_mean(loss)}')
    
    def gradient_clip(self, grads, theta):
        """
            梯度裁剪，防止梯度爆炸问题
        """
        theta = tf.constant(theta, dtype="float32")
        new_grads = []
        for grad in grads:
            new_grads.append(
                tf.convert_to_tensor(grad) if isinstance(
                    grad, tf.IndexedSlices) else grad)
        # L2范数
        norm = tf.math.sqrt(
            sum((tf.reduce_sum(grad**2).numpy() for grad in new_grads)))
        norm = tf.cast(norm, "float32")
        if tf.greater(norm, theta):
            for i, grad in enumerate(new_grads):
                new_grads[i] = grad * theta / norm
        return new_grads
    
    def predict(self, src, src_vocab, target_vocab, steps):
        src_tokens = src_vocab[src.lower().split(' ')] + [src_vocab['<eos>']]
        src_valid_len = tf.constant([len(src_tokens)])
        src_tokens = tf.constant(truncate_and_pad(src_tokens, steps, src_vocab['<pad>']), dtype='float32')
        encode_x = tf.expand_dims(src_tokens, axis=0)
        encode_outputs = self.encoder(encode_x, training=False)
        state = self.decoder.init_state(encode_outputs, src_valid_len)
        predict = []
        decode_x = tf.expand_dims(tf.constant([target_vocab['<bos>']]), axis=0)
        for _ in range(steps):
            y, state = net.decoder(decode_x, state, training=False)
            decode_x = tf.argmax(y, axis=2)
            pred = tf.squeeze(decode_x, axis=0)
            if pred == target_vocab['<eos>']:
                break;
            predict.append(pred.numpy())
        return ''.join([target_vocab.get_tokens(word[0]) for word in predict])
        

In [72]:
class MaskedSoftmaxLoss(tf.keras.losses.Loss):
    
    def __init__(self, valid_len):
        super().__init__(reduction='none')
        self.valid_len = valid_len
        self._loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True, reduction='none')
    
    def call(self, y, y_hat):
        mask = tf.ones_like(y, dtype='float32')
        mask = self.sequence_mask(mask)
        y = tf.one_hot(y, depth=y_hat.shape[-1])
        _loss = self._loss_fn(y, y_hat)
        return tf.reduce_mean(mask * _loss, axis=1)

    def sequence_mask(self, x, value=0):
        """
        由于做了固定长度的填充，在预测时要忽略掉
        """
        # x.shape=(batch_size, steps)
        # xl.shape=(steps, )
        # (1, 32) (32, 1)
        mask = tf.range(start=0, limit=x.shape[1],
                        dtype='float32')[None, :] < tf.cast(self.valid_len[:, None],
                                                            dtype='float32')
        if len(x.shape) == 3:
            return tf.where(tf.expand_dims(mask, axis=-1), x, value)
        else:
            return  tf.where(mask, x, value)

In [73]:
EMBED_SIZE, HIDDENTS, LAYERS, DROPOUT = 64, 64, 2, .1

encoder = Seq2SeqEncoder(len(en_vocab), EMBED_SIZE, HIDDENTS, LAYERS, DROPOUT)
decoder = Seq2SeqDecoder(len(zh_vocab), EMBED_SIZE, HIDDENTS, LAYERS, DROPOUT)

net = EncoderDecoder(encoder=encoder, decoder=decoder)
net.fit(train_iter, zh_vocab, epochs=200, lr=1e-3)

epoch[1\200 loss: 2.863422393798828
epoch[2\200 loss: 2.680830478668213
epoch[3\200 loss: 2.4347567558288574
epoch[4\200 loss: 2.3846709728240967
epoch[5\200 loss: 2.392573118209839
epoch[6\200 loss: 2.303725242614746
epoch[7\200 loss: 2.3967225551605225
epoch[8\200 loss: 2.2686619758605957
epoch[9\200 loss: 2.1332828998565674
epoch[10\200 loss: 2.171907901763916
epoch[11\200 loss: 2.1042423248291016
epoch[12\200 loss: 2.1073410511016846
epoch[13\200 loss: 2.0729575157165527
epoch[14\200 loss: 1.9905121326446533
epoch[15\200 loss: 1.9423660039901733
epoch[16\200 loss: 1.8784390687942505
epoch[17\200 loss: 1.9707272052764893
epoch[18\200 loss: 1.9936015605926514
epoch[19\200 loss: 1.8809469938278198
epoch[20\200 loss: 1.9264605045318604
epoch[21\200 loss: 1.9051804542541504
epoch[22\200 loss: 1.8375962972640991
epoch[23\200 loss: 1.849548101425171
epoch[24\200 loss: 1.8886387348175049
epoch[25\200 loss: 1.9044933319091797
epoch[26\200 loss: 1.8290139436721802
epoch[27\200 loss: 1.813459

In [109]:
en = [
    "Hello",
    "can you help me",
    "i need help",
    "please help me",
    "i am a student",
    "i love you",
    "i written some code",
    "we are friends"
]
for source in en:
    target = net.predict(source, en_vocab, zh_vocab, steps=STEPS)
    print(f"原文：{source}，翻译结果：{target}")

原文：Hello，翻译结果：你好。
原文：can you help me，翻译结果：你能帮我吗？
原文：i need help，翻译结果：我需要帮助。
原文：please help me，翻译结果：请帮我。
原文：i am a student，翻译结果：我是个学生。
原文：i love you，翻译结果：我爱你。
原文：i written some code，翻译结果：我写了一封信。
原文：we are friends，翻译结果：我们是朋友。


In [98]:
net.predict("good idea", en_vocab, zh_vocab, steps=STEPS)

'很好。'

In [76]:
net.predict("Hi", en_vocab, zh_vocab, steps=STEPS)

'你好。'

In [91]:
net.predict("i have an apple", en_vocab, zh_vocab, steps=STEPS)

'我有一个非常喜欢的苹果。'