In [None]:
import os
import numpy as np
import sys
from h5py import File
from keras.optimizers import *
from keras.callbacks import *
from keras.models import *
from keras.layers import *
from keras.initializers import *
from keras.activations import *
import tensorflow as tf
from keras_layer_normalization import LayerNormalization

# ---- Data locations --------------------------------------------------------
data_file = 'data/en2de.h5'              # training set (HDF5)
valid_data_file = 'data/en2de.valid.h5'  # validation set (HDF5)
dict_file = 'data/en2de_word.txt'        # source + target vocabulary listing

# ---- Transformer hyper-parameters ------------------------------------------
d_model = 512    # embedding dimension
d_ff = 2048      # hidden size of the position-wise feed-forward network
d_k = 64         # per-head query/key size (= d_model / head)
d_v = 64         # per-head value size (= d_model / head)
N = 6            # number of stacked encoder / decoder layers
head = 8         # number of heads in multi-head attention
len_limit = 999  # number of rows in the positional-embedding table

# ---- Training hyper-parameters ---------------------------------------------
dropout = 0.1        # dropout rate
warmup_steps = 4000  # warm-up steps, intended for the learning-rate schedule

# 1 Data Preprocessing

> We use the En2De (English-to-German) translation dataset.

In [None]:
class TokenList:
    """Two-way lookup between vocabulary tokens and integer ids.

    Four special tokens are always prepended to the supplied vocabulary, so
    ids 0-3 are reserved for '<PAD>', '<UNK>', '<S>' and '</S>' in that order.
    """

    def __init__(self, token_list):
        """Build the id->token list and the token->id dictionary.

        Args:
            token_list: list of vocabulary tokens (without the special tokens).
        """
        specials = ['<PAD>', '<UNK>', '<S>', '</S>']
        self.id2t = specials + token_list
        self.t2id = {}
        # Later duplicates overwrite earlier ones, so a repeated token maps
        # to its last position.
        for index, tok in enumerate(self.id2t):
            self.t2id[tok] = index

    def id(self, x):
        """Return the id of token `x`, or 1 ('<UNK>') if it is not known."""
        if x in self.t2id:
            return self.t2id[x]
        return 1

    def token(self, x):
        """Return the token stored at id `x`."""
        return self.id2t[x]

    def num(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.id2t)

    def start_id(self):
        """Return the id of the start-of-sequence token '<S>'."""
        return 2

    def end_id(self):
        """Return the id of the end-of-sequence token '</S>'."""
        return 3
    

def make_dict(dict_file):
    """Load the source and target vocabularies from a single text file.

    The file holds one token per line; blank lines are ignored. A line
    containing exactly '<@@@>' separates the input (English) tokens, which
    come first, from the output (German) tokens, which follow.

    Args:
        dict_file: path to the UTF-8 encoded vocabulary file.

    Returns:
        (input_tokens, output_tokens): a pair of TokenList objects.

    Raises:
        ValueError: if the '<@@@>' separator line is missing.
    """
    with open(dict_file, encoding="utf-8") as handle:
        entries = [line for line in handle.read().split('\n') if line != ""]

    separator_at = entries.index('<@@@>')  # boundary between En and De tokens
    source_vocab = entries[:separator_at]
    target_vocab = entries[separator_at + 1:]
    return TokenList(source_vocab), TokenList(target_vocab)


def make_data(data_file):
    """Read the 'X' and 'Y' datasets from an HDF5 file into memory.

    The file is opened explicitly in read-only mode. Without a mode argument,
    older h5py releases default to append mode, which can create a missing
    file or modify an existing one as a side effect of merely loading data.

    Args:
        data_file: path to an HDF5 file containing datasets named 'X' and 'Y'.

    Returns:
        (X, Y): the full contents of both datasets, copied out of the file
        (via `[:]`) so they remain usable after the file is closed.
    """
    with File(data_file, 'r') as df:
        X, Y = df['X'][:], df['Y'][:]
    return X, Y


if __name__ == '__main__':
    # Vocabularies for the source (English) and target (German) sides.
    input_tokens, output_tokens = make_dict(dict_file)

    # Training and validation arrays.
    x_train, y_train = make_data(data_file)
    x_valid, y_valid = make_data(valid_data_file)

    # Vocabulary sizes are read later as module-level names by the model code.
    input_vocab_size, output_vocab_size = input_tokens.num(), output_tokens.num()

    # Quick sanity report of what was loaded.
    print('English DictNum:', input_tokens.num())
    print('Deutsche DictNum:', output_tokens.num())
    print('Train Shapes:', x_train.shape, y_train.shape)
    print('Valid Shapes:', x_valid.shape, y_valid.shape)

# 2 Modeling

## 2.1 Positional Encoding

In [None]:
def positional_encoding(max_len, d_emb=None):
    """Build the sinusoidal positional-encoding table.

    For position `pos` and column `j`:

        PE[pos, j] = sin(pos / 10000 ** (2 * (j // 2) / d_emb))   if j is even
        PE[pos, j] = cos(pos / 10000 ** (2 * (j // 2) / d_emb))   if j is odd

    The `j // 2` is required because each (sin, cos) column pair must share
    one frequency: columns 2i and 2i + 1 both use exponent 2i / d_emb. Using
    the raw column index instead gives every column a different frequency.

    Row 0 is left as all zeros (no sin/cos applied); it is reserved for
    position index 0.

    Args:
        max_len: number of positions (rows) in the table.
        d_emb: encoding width (columns). Defaults to the module-level
            `d_model` when omitted.

    Returns:
        numpy array of shape (max_len, d_emb).
    """
    if d_emb is None:
        d_emb = d_model

    positions = np.arange(max_len, dtype=np.float64)[:, np.newaxis]
    exponents = 2 * (np.arange(d_emb) // 2) / d_emb
    PE = positions / np.power(10000.0, exponents)  # row 0 is already all zeros

    PE[1:, 0::2] = np.sin(PE[1:, 0::2])  # even columns: 2i
    PE[1:, 1::2] = np.cos(PE[1:, 1::2])  # odd columns:  2i + 1

    return PE

## 2.2 Multi-Head Attention

In [None]:
class MultiHeadAttention:
    """Multi-head scaled dot-product attention with residual + layer norm.

    Each of the `head` heads owns its own bias-free projections for queries,
    keys and values (sizes `d_k`, `d_k`, `d_v`). The per-head results are
    concatenated, projected back to `d_model`, added to the original query
    input (residual connection) and layer-normalized.
    """

    def __init__(self):
        print("Multi-Head Attention Layer Generated")
        self.Q_linear_transform_layers = []
        self.K_linear_transform_layers = []
        self.V_linear_transform_layers = []

        for _ in range(head):
            self.Q_linear_transform_layers.append(Dense(d_k, use_bias=False))
            self.K_linear_transform_layers.append(Dense(d_k, use_bias=False))
            self.V_linear_transform_layers.append(Dense(d_v, use_bias=False))

        self.normarlization_layer = LayerNormalization()
        self.output_linear_transfrom_layer = Dense(d_model)

    def __call__(self, Q, K, V, mask=None):
        """Apply multi-head attention.

        Args:
            Q, K, V: query / key / value tensors. Assumed to be
                (batch, length, d_model); the residual addition requires Q's
                last dimension to equal d_model.
            mask: optional tensor broadcastable to (batch, len_q, len_k) in
                which non-zero entries mark key positions that must NOT be
                attended to.

        Returns:
            (output, attention): the normalized layer output and the
            attention weights of all heads concatenated on the last axis.
        """
        attentions = []
        outputs = []

        for i in range(head):
            WQ = self.Q_linear_transform_layers[i](Q)
            WK = self.K_linear_transform_layers[i](K)
            WV = self.V_linear_transform_layers[i](V)
            output, attention = self.scaled_dot_product_attention(WQ, WK, WV, mask)
            attentions.append(attention)
            outputs.append(output)

        attention_result = Concatenate()(attentions)
        output_result = Concatenate()(outputs)
        output = self.output_linear_transfrom_layer(output_result)
        output = output + Q  # residual connection

        # Return the weights of every head, not just the last loop iteration.
        return self.normarlization_layer(output), attention_result

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute softmax(Q K^T / sqrt(d_k)) V for a single head.

        Args:
            Q, K, V: projected query / key / value tensors of rank 3.
            mask: optional tensor; non-zero entries are excluded from the
                softmax by adding a large negative value to their scores.

        Returns:
            (output, attention): the weighted sum of values and the attention
            weights (softmax probabilities over key positions).
        """
        # Scale by the per-head key size, not by the model width.
        scores = tf.matmul(Q, K, transpose_b=True) / float(np.sqrt(d_k))
        if mask is not None:
            scores += tf.cast(mask, scores.dtype) * -1e9
        attention = softmax(scores)
        output = tf.matmul(attention, V)

        return output, attention

## 2.3 Position-wise Feed-Forward Network

In [None]:
class PositionWiseFeedForwardNetwork:
    """Position-wise feed-forward sub-layer with residual + layer norm.

    Computes LayerNorm(x + W2 * relu(W1 * x + b1) + b2), where the inner
    (hidden) width is `d_ff` and the output width is `d_model`. The hidden
    layer uses `d_ff`, which the configuration defines as the feed-forward
    network's hidden size.
    """

    def __init__(self):
        print("Position-Wise Feed Forward Network Generated")
        self.linear_transform_layer_1 = Dense(d_ff)      # expand to hidden size
        self.relu_layer = Activation('relu')             # non-linearity
        self.linear_transform_layer_2 = Dense(d_model)   # project back
        self.normarlization_layer = LayerNormalization()

    def __call__(self, x):
        """Apply the feed-forward block to `x`.

        Args:
            x: input tensor whose last dimension must equal d_model so that
                the residual addition is valid.

        Returns:
            Layer-normalized tensor with the same last dimension as `x`.
        """
        output = self.linear_transform_layer_1(x)
        output = self.relu_layer(output)
        output = self.linear_transform_layer_2(output)
        output = output + x  # residual connection

        return self.normarlization_layer(output)

## 2.4 Encoder

In [None]:
class EncoderLayer:
    """One encoder block: self-attention followed by a feed-forward network."""

    def __init__(self):
        print("Encoder Layer Generated")
        self.multi_head_attention_layer = MultiHeadAttention()
        self.position_wise_feed_forward_network = PositionWiseFeedForwardNetwork()

    def __call__(self, encoder_input, mask=None):
        """Run the block on `encoder_input`.

        Args:
            encoder_input: tensor used as query, key and value of the
                self-attention sub-layer.
            mask: optional attention mask, forwarded unchanged.

        Returns:
            (encoder_output, attention): the block output and the attention
            value returned by the attention sub-layer.
        """
        self_attention = self.multi_head_attention_layer
        feed_forward = self.position_wise_feed_forward_network

        # Self-attention: the same tensor plays all three roles.
        attended, attention = self_attention(encoder_input, encoder_input, encoder_input, mask)
        return feed_forward(attended), attention
    
class Encoder:
    """Stack of N encoder layers on top of token + positional embeddings."""

    def __init__(self):
        self.input_embedding =  Embedding(input_vocab_size, d_model)
        # Fixed (non-trainable) sinusoidal table; row 0 is all zeros.
        self.positional_embedding = Embedding(len_limit, d_model, trainable=False,
                                              weights=[positional_encoding(len_limit)])
        self.layers = [EncoderLayer() for _ in range(N)]

    def __call__(self, encoder_input):
        """Encode a batch of source token ids.

        Args:
            encoder_input: integer tensor of token ids, assumed to be
                (batch, length) with id 0 used for padding.

        Returns:
            (encoder_output, encoder_attentions): output of the last layer
            and a list with the attention value of every layer.
        """
        # Positions must be derived from each token's place in the sequence,
        # not from the vocabulary size.
        positions = self.get_position(encoder_input)
        encoder_output = self.input_embedding(encoder_input) + self.positional_embedding(positions)
        encoder_attentions = []

        for layer in self.layers:
            encoder_output, encoder_attention = layer(encoder_output)
            encoder_attentions.append(encoder_attention)

        return encoder_output, encoder_attentions

    @staticmethod
    def get_position(x):
        """Return 1-based position indices for `x`, with 0 at padding (id 0).

        Sequences must be shorter than `len_limit` so that every index falls
        inside the positional-embedding table.
        """
        not_pad = tf.cast(tf.not_equal(x, 0), x.dtype)
        return tf.cumsum(tf.ones_like(x), axis=1) * not_pad

## 2.5 Decoder

In [None]:
class DecoderLayer:
    """One decoder block.

    Consists of masked self-attention, encoder-decoder attention and a
    position-wise feed-forward network, applied in that order. The two
    attention sub-layers are separate objects with separate weights.
    """

    def __init__(self):
        print("Decoder Layer Generated")
        self.multi_head_attention_layer = MultiHeadAttention()
        self.masked_multi_head_attention_layer = MultiHeadAttention()
        self.position_wise_feed_forward_network = PositionWiseFeedForwardNetwork()

    def __call__(self, decoder_input, encoder_output, mask=None):
        """Run the block.

        Args:
            decoder_input: tensor used as query, key and value of the masked
                self-attention sub-layer.
            encoder_output: tensor used as key and value of the
                encoder-decoder attention sub-layer.
            mask: optional mask passed to the self-attention sub-layer only.

        Returns:
            (decoder_output, decoder_attention, decoder_encoder_attention).
        """
        decoder_output, decoder_attention = self.masked_multi_head_attention_layer(decoder_input, decoder_input, decoder_input, mask)
        # Encoder-decoder attention uses its own layer, so the two attention
        # sub-layers do not share projection weights.
        decoder_output, decoder_encoder_attention = self.multi_head_attention_layer(decoder_output, encoder_output, encoder_output, None)
        decoder_output = self.position_wise_feed_forward_network(decoder_output)

        return decoder_output, decoder_attention, decoder_encoder_attention
        
class Decoder:
    """Stack of N decoder layers on top of token + positional embeddings."""

    def __init__(self):
        self.output_embedding =  Embedding(output_vocab_size, d_model)
        # Fixed (non-trainable) sinusoidal table; row 0 is all zeros.
        self.positional_embedding = Embedding(len_limit, d_model, trainable=False,
                                              weights=[positional_encoding(len_limit)])
        self.layers = [DecoderLayer() for _ in range(N)]

    def __call__(self, decoder_input, encoder_input, encoder_output):
        """Decode a batch of target token ids.

        Args:
            decoder_input: integer tensor of target token ids, assumed to be
                (batch, length) with id 0 used for padding.
            encoder_input: source token ids. Kept for interface
                compatibility; not needed for the self-attention mask.
            encoder_output: encoder output tensor attended to by each layer.

        Returns:
            (decoder_output, decoder_attentions, decoder_encoder_attentions):
            the last layer's output and per-layer lists of both attention
            values.
        """
        # Embed the *target* tokens, with positions taken from the sequence.
        positions = self.get_position(decoder_input)
        decoder_output = self.output_embedding(decoder_input) + self.positional_embedding(positions)

        # Self-attention mask: block padded keys and all future positions.
        # Entries equal to 1 mark positions that must not be attended to.
        attention_mask = tf.maximum(
            self.get_attention_mask(decoder_input, decoder_input),
            self.get_subsequent_mask(decoder_input))

        decoder_attentions, decoder_encoder_attentions = [], []

        for layer in self.layers:
            decoder_output, decoder_attention, decoder_encoder_attention = layer(decoder_output, encoder_output, attention_mask)
            decoder_attentions.append(decoder_attention)
            decoder_encoder_attentions.append(decoder_encoder_attention)

        return decoder_output, decoder_attentions, decoder_encoder_attentions

    @staticmethod
    def get_position(x):
        """Return 1-based position indices for `x`, with 0 at padding (id 0)."""
        not_pad = tf.cast(tf.not_equal(x, 0), x.dtype)
        return tf.cumsum(tf.ones_like(x), axis=1) * not_pad

    @staticmethod
    def get_attention_mask(seq_q, seq_k):
        """Build a padding mask of shape (batch, len_q, len_k).

        An entry is 1.0 where the corresponding key token in `seq_k` is
        padding (id 0) and 0.0 elsewhere; the per-key row is repeated for
        every query position of `seq_q`.
        """
        len_q = tf.shape(seq_q)[1]
        pad_attn_mask = tf.cast(tf.equal(seq_k, 0), 'float32')  # batch x len_k
        pad_attn_mask = tf.expand_dims(pad_attn_mask, 1)        # batch x 1 x len_k

        return tf.tile(pad_attn_mask, [1, len_q, 1])

    @staticmethod
    def get_subsequent_mask(seq):
        """Build a causal mask of shape (1, len, len).

        Entry [0, i, j] is 1.0 when j > i (a future position) and 0.0
        otherwise, so each position can only attend to itself and the past.
        """
        length = tf.shape(seq)[1]
        lower_triangle = tf.linalg.band_part(tf.ones((length, length)), -1, 0)

        return tf.expand_dims(1.0 - lower_triangle, 0)

## 2.6 Transformer

In [None]:
class Transformer:
    """Encoder-decoder Transformer wired into a trainable Keras model.

    After `compile`, the instance exposes:
      * `model`        - training model whose output (and added loss) is the
                         masked cross-entropy loss;
      * `output_model` - model returning the per-position vocabulary logits;
      * `ppl`, `accu`  - perplexity and token-accuracy tensors.
    """

    def __init__(self):
        self.encoder = Encoder()
        self.decoder = Decoder()
        # Final projection produces one logit per target-vocabulary entry,
        # as required by the sparse softmax cross-entropy loss.
        self.linear_transform_layer = Dense(output_vocab_size, use_bias=False)

    def get_position(self, x):
        """Return 1-based position indices for `x`, with 0 at padding (id 0)."""
        mask = K.cast(K.not_equal(x, 0), 'int32')
        pos = K.cumsum(K.ones_like(x, 'int32'), 1)

        return pos * mask

    def compile(self, optimizer='adam'):
        """Build the computation graph and compile the training model.

        Args:
            optimizer: Keras optimizer instance or name.
        """
        print(">> Start Compile")
        source_input = Input(shape=(None,), dtype='int32')
        target_input = Input(shape=(None,), dtype='int32')

        # Teacher forcing: the decoder sees tokens [0, T-1) and is trained to
        # predict tokens [1, T).
        # NOTE(review): assumes target sequences start with the '<S>' token -
        # confirm against the data files.
        target_seq = Lambda(lambda x: x[:, :-1])(target_input)
        target_true = Lambda(lambda x: x[:, 1:])(target_input)

        print(">> Set Encoder")
        # The encoder returns (output, attentions); only the output is used.
        encoder_output, _ = self.encoder(source_input)
        print(">> Set Decoder")
        # The decoder returns (output, self-attentions, enc-dec attentions).
        decoder_output, _, _ = self.decoder(target_seq, source_input, encoder_output)
        final_output = self.linear_transform_layer(decoder_output)

        def get_loss(args):
            # Cross-entropy averaged over non-padding target positions.
            y_pred, y_true = args
            y_true = tf.cast(y_true, 'int32')
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
            mask = tf.cast(tf.not_equal(y_true, 0), 'float32')
            loss = tf.reduce_sum(loss * mask, -1) / tf.reduce_sum(mask, -1)
            loss = K.mean(loss)
            return loss

        def get_accu(args):
            # Fraction of non-padding positions whose argmax equals the label.
            y_pred, y_true = args
            mask = tf.cast(tf.not_equal(y_true, 0), 'float32')
            corr = K.cast(K.equal(K.cast(y_true, 'int32'), K.cast(K.argmax(y_pred, axis=-1), 'int32')), 'float32')
            corr = K.sum(corr * mask, -1) / K.sum(mask, -1)
            return K.mean(corr)

        loss = Lambda(get_loss)([final_output, target_true])
        self.ppl = Lambda(K.exp)(loss)
        self.accu = Lambda(get_accu)([final_output, target_true])

        self.model = Model([source_input, target_input], loss)
        self.model.add_loss([loss])
        self.output_model = Model([source_input, target_input], final_output)

        # The loss is attached via add_loss, so no loss function is passed.
        self.model.compile(optimizer, None)
        self.model.metrics_names.append('ppl')
        self.model.metrics_tensors.append(self.ppl)
        self.model.metrics_names.append('accu')
        self.model.metrics_tensors.append(self.accu)

# 3 Training

In [None]:
# Instantiate the model; this constructs the encoder and decoder stacks.
transformer = Transformer()

# Build the graph and compile the training model with an Adam optimizer.
# NOTE(review): the positional arguments are presumably learning rate,
# beta_1 and beta_2 - confirm against the installed Keras version.
transformer.compile(Adam(0.001, 0.9, 0.98, epsilon=1e-9))