In [1]:
import os
import numpy as np
import sys
from h5py import File
from keras.optimizers import *
from keras.callbacks import *
from keras.models import *
from keras.layers import *
from keras.initializers import *
from keras.activations import *
import tensorflow as tf
from keras_layer_normalization import LayerNormalization

# ---- Dataset locations -------------------------------------------------
data_file = 'data/en2de.h5'              # training pairs (HDF5)
valid_data_file = 'data/en2de.valid.h5'  # validation pairs (HDF5)
dict_file = 'data/en2de_word.txt'        # source + target vocabulary listing

# ---- Transformer hyper-parameters --------------------------------------
d_model = 512    # embedding dimension
d_ff = 2048      # hidden size of the position-wise feed-forward network
head = 8         # number of attention heads
N = 6            # number of stacked encoder / decoder layers
d_k = 64         # per-head query/key size (= d_model / head)
d_v = 64         # per-head value size (= d_model / head)
len_limit = 999  # number of rows in the positional-embedding table

# ---- Regularisation / optimisation -------------------------------------
dropout = 0.1        # dropout rate
warmup_steps = 4000  # warm-up steps for the learning-rate schedule

Using TensorFlow backend.


# 1 Data Preprocessing

> We use the En2De (English-to-German) dataset.

In [2]:
class TokenList:
    """Two-way token <-> id lookup with four reserved leading entries.

    Ids 0..3 are always '<PAD>', '<UNK>', '<S>' and '</S>'; the tokens passed
    to the constructor follow in their given order.
    """

    def __init__(self, token_list):
        """Build both lookup directions from a list of vocabulary tokens."""
        reserved = ['<PAD>', '<UNK>', '<S>', '</S>']
        self.id2t = reserved + token_list
        # Filled in order, so a token that occurs twice keeps its last index.
        self.t2id = {}
        for index, token in enumerate(self.id2t):
            self.t2id[token] = index

    def id(self, x):
        """Return the id of token ``x``; unknown tokens map to 1 ('<UNK>')."""
        if x in self.t2id:
            return self.t2id[x]
        return 1

    def token(self, x):
        """Return the token stored at id ``x``."""
        return self.id2t[x]

    def num(self):
        """Return the vocabulary size, reserved entries included."""
        return len(self.id2t)

    def start_id(self):
        """Return the id of the '<S>' marker."""
        return 2

    def end_id(self):
        """Return the id of the '</S>' marker."""
        return 3
    

def make_dict(dict_file):
    """Read the vocabulary file and return ``(input_tokens, output_tokens)``.

    The file is UTF-8 text with one token per line. Empty lines are dropped,
    and the line '<@@@>' separates the source vocabulary (before it) from the
    target vocabulary (after it). Raises ValueError when the separator line
    is missing.
    """
    with open(dict_file, encoding="utf-8") as handle:
        raw_lines = handle.read().split('\n')

    entries = [line for line in raw_lines if line != ""]
    split_at = entries.index('<@@@>')  # boundary between the two vocabularies

    source_vocab = TokenList(entries[:split_at])
    target_vocab = TokenList(entries[split_at + 1:])
    return source_vocab, target_vocab


def make_data(data_file):
    """Load the full 'X' and 'Y' datasets from an HDF5 file.

    Args:
        data_file: path of the HDF5 file to read.

    Returns:
        A tuple ``(X, Y)`` holding the complete contents of the 'X' and 'Y'
        datasets, read into memory before the file is closed.
    """
    # Open read-only explicitly. Without a mode, older h5py releases fall
    # back to append mode, which silently creates an empty file when the
    # path is wrong instead of reporting the missing file.
    with File(data_file, 'r') as df:
        X, Y = df['X'][:], df['Y'][:]

    return X, Y


if __name__ == '__main__':
    # Vocabularies first, then the training and validation arrays.
    input_tokens, output_tokens = make_dict(dict_file)
    x_train, y_train = make_data(data_file)
    x_valid, y_valid = make_data(valid_data_file)

    # Vocabulary sizes are read later by the embedding layers.
    input_vocab_size, output_vocab_size = input_tokens.num(), output_tokens.num()

    # Quick sanity report of what was loaded.
    print('English DictNum:', input_vocab_size)
    print('Deutsche DictNum:', output_vocab_size)
    print('Train Shapes:', x_train.shape, y_train.shape)
    print('Valid Shapes:', x_valid.shape, y_valid.shape)

English DictNum: 3369
Deutsche DictNum: 3665
Train Shapes: (29000, 43) (29000, 47)
Valid Shapes: (1014, 34) (1014, 39)


# 2 Modeling

## 2.1 Positional Encoding

In [3]:
def positional_encoding(max_len, d_emb=None):
    """Build the sinusoidal positional-encoding table.

    Columns come in (sin, cos) pairs that share one frequency: columns ``2i``
    and ``2i + 1`` both use ``pos / 10000 ** (2i / d_emb)``. The exponent
    therefore depends on ``column // 2`` rather than on the column index
    itself, so sin and cos of the same angle sit next to each other.

    Row 0 is left as all zeros. Position ids in this notebook are 1-based and
    0 is produced for padded slots, so presumably row 0 is the padding row.

    Args:
        max_len: number of positions (rows) in the table.
        d_emb: encoding width (columns). Defaults to the global ``d_model``.

    Returns:
        A float64 array of shape ``(max_len, d_emb)``.
    """
    if d_emb is None:
        d_emb = d_model

    positions = np.arange(max_len, dtype=np.float64)[:, np.newaxis]
    # One exponent per column; each even/odd column pair shares a value.
    exponents = 2 * (np.arange(d_emb) // 2) / d_emb
    PE = positions / np.power(10000.0, exponents)

    # Row 0 is already zero (0 / x) and is skipped so that it stays zero.
    PE[1:, 0::2] = np.sin(PE[1:, 0::2]) # 2i
    PE[1:, 1::2] = np.cos(PE[1:, 1::2]) # 2i + 1

    return PE

## 2.2 Multi-Head Attention

In [4]:
class MultiHeadAttention:
    """Multi-head scaled dot-product attention with residual and layer norm.

    Each of the ``head`` heads has its own bias-free projections of Q, K and V
    (to ``d_k``, ``d_k`` and ``d_v`` units). The head outputs are concatenated,
    projected back to ``d_model``, added to the original ``Q`` and normalised.
    """

    def __init__(self):
        print("# Generated: Multi-Head Attention Layer")
        self.Q_linear_transform_layers = []
        self.K_linear_transform_layers = []
        self.V_linear_transform_layers = []

        for _ in range(head):
            self.Q_linear_transform_layers.append(Dense(d_k, use_bias=False))
            self.K_linear_transform_layers.append(Dense(d_k, use_bias=False))
            self.V_linear_transform_layers.append(Dense(d_v, use_bias=False))

        self.normarlization_layer = LayerNormalization()
        self.output_linear_transfrom_layer = Dense(d_model)

    def __call__(self, Q, K, V, mask=None):
        """Apply attention of ``Q`` over ``K``/``V``.

        Args:
            Q, K, V: query, key and value tensors. ``Q`` is also the residual
                input, so its last dimension must be ``d_model``.
            mask: optional tensor broadcastable to (batch, len_q, len_k) with
                1 where a query may attend to a key and 0 where it may not.
                ``None`` means no masking.

        Returns:
            The layer-normalised sum of ``Q`` and the projected attention.
        """
        print("# Executed: Multi-Head Attention")
        outputs = []

        for i in range(head):
            WQ = self.Q_linear_transform_layers[i](Q)
            WK = self.K_linear_transform_layers[i](K)
            WV = self.V_linear_transform_layers[i](V)
            outputs.append(self.scaled_dot_product_attention(WQ, WK, WV, mask))

        output_result = Concatenate()(outputs)
        output = self.output_linear_transfrom_layer(output_result)
        output = Add()([output, Q])  # residual connection

        return self.normarlization_layer(output)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Compute ``softmax(q . k^T / sqrt(d_k)) . v`` for a single head.

        Args:
            q: projected queries; last dimension ``d_k``.
            k: projected keys; last dimension ``d_k``.
            v: projected values.
            mask: optional 1/0 tensor broadcastable to the score shape
                (batch, len_q, len_k); positions holding 0 are excluded.
        """
        # Scores are scaled by the per-head key size, not the model width.
        scale = float(np.sqrt(d_k))

        def attention_scores(x):
            # transpose_b contracts over the feature axis, giving
            # (batch, len_q, len_k) for any pair of sequence lengths.
            scores = tf.matmul(x[0], x[1], transpose_b=True) / scale
            if len(x) > 2:
                # Push blocked positions towards -inf so softmax gives them
                # (numerically) zero weight.
                scores = scores - (1.0 - tf.cast(x[2], scores.dtype)) * 1e9
            return scores

        score_inputs = [q, k] if mask is None else [q, k, mask]
        attention = Lambda(attention_scores)(score_inputs)
        attention = Activation('softmax')(attention)
        output = Lambda(lambda x: tf.matmul(x[0], x[1]))([attention, v])

        return output

## 2.3 Position-wise Feed-Forward Network

In [5]:
class PositionWiseFeedForwardNetwork:
    """Position-wise feed-forward sub-layer with residual and layer norm.

    Computes ``LayerNorm(x + W2 . relu(W1 . x + b1) + b2)``, where the hidden
    layer has ``d_ff`` units and the output is projected back to ``d_model``
    so that it can be added to the input.
    """

    def __init__(self):
        print("# Generated: Position-Wise Feed Forward Network")
        # Expansion to the hidden width d_ff; the non-linearity is applied
        # separately by relu_layer.
        self.linear_transform_layer_1 = Dense(d_ff)
        self.relu_layer = Activation('relu')
        # Projection back to d_model, required by the residual Add below.
        self.linear_transform_layer_2 = Dense(d_model)
        self.normarlization_layer = LayerNormalization()

    def __call__(self, x):
        """Apply the feed-forward block to ``x`` (last dimension ``d_model``)."""
        output = self.linear_transform_layer_1(x)
        output = self.relu_layer(output)
        output = self.linear_transform_layer_2(output)
        output = Add()([output, x])  # residual connection

        return self.normarlization_layer(output)

## 2.4 Encoder

In [6]:
class EncoderLayer:
    """One encoder block: self-attention followed by the feed-forward network."""

    def __init__(self):
        print("# Generated: Encoder Layer")
        self.multi_head_attention_layer = MultiHeadAttention()
        self.position_wise_feed_forward_network = PositionWiseFeedForwardNetwork()

    def __call__(self, encoder_input, mask=None):
        """Run self-attention over ``encoder_input``, then the feed-forward block.

        ``mask`` is handed unchanged to the attention layer.
        """
        # Self-attention: the same tensor acts as query, key and value.
        attended = self.multi_head_attention_layer(
            encoder_input, encoder_input, encoder_input, mask
        )
        return self.position_wise_feed_forward_network(attended)
    
class Encoder:
    """Stack of ``N`` encoder layers on top of token + positional embeddings."""

    def __init__(self):
        self.input_embedding =  Embedding(input_vocab_size, d_model)
        # Frozen lookup table initialised with the sinusoidal encoding.
        self.positional_embedding = Embedding(len_limit, d_model, trainable=False, weights=[positional_encoding(len_limit)])
        self.layers = [EncoderLayer() for _ in range(N)]

    def __call__(self, encoder_input, source_position):
        """Encode a batch of source sequences.

        Args:
            encoder_input: int tensor of source token ids, (batch, length);
                id 0 is treated as padding.
            source_position: int tensor of position ids with the same shape.

        Returns:
            The output of the last encoder layer.
        """
        encoder_output = Add()([self.input_embedding(encoder_input), self.positional_embedding(source_position)])

        # Key-padding mask of shape (batch, 1, length): 1 for real tokens,
        # 0 for padding. It is handed to every layer so that padded slots
        # can be excluded as attention keys.
        padding_mask = Lambda(
            lambda x: tf.cast(tf.not_equal(x, 0), 'float32')[:, tf.newaxis, :]
        )(encoder_input)

        for layer in self.layers:
            encoder_output = layer(encoder_output, padding_mask)

        return encoder_output

## 2.5 Decoder

In [7]:
class DecoderLayer:
    """One decoder block: self-attention, encoder-decoder attention, feed-forward."""

    def __init__(self):
        print("# Generated: Decoder Layer")
        self.masked_multi_head_attention_layer = MultiHeadAttention()
        self.multi_head_attention_layer = MultiHeadAttention()
        self.position_wise_feed_forward_network = PositionWiseFeedForwardNetwork()

    def __call__(self, decoder_input, encoder_output, mask=None, encoder_mask=None):
        """Run the three sub-layers in order.

        Args:
            decoder_input: decoder-side activations.
            encoder_output: output of the encoder stack, used as keys/values
                of the second attention.
            mask: optional mask handed to the self-attention.
            encoder_mask: optional mask handed to the encoder-decoder
                attention. Defaults to ``None`` (no mask).
        """
        decoder_output = self.masked_multi_head_attention_layer(decoder_input, decoder_input, decoder_input, mask)
        decoder_output = self.multi_head_attention_layer(decoder_output, encoder_output, encoder_output, encoder_mask)
        decoder_output = self.position_wise_feed_forward_network(decoder_output)

        return decoder_output

class Decoder:
    """Stack of ``N`` decoder layers on top of token + positional embeddings."""

    def __init__(self):
        self.output_embedding =  Embedding(output_vocab_size, d_model)
        # Frozen lookup table initialised with the sinusoidal encoding.
        self.positional_embedding = Embedding(len_limit, d_model, trainable=False, weights=[positional_encoding(len_limit)])
        self.layers = [DecoderLayer() for _ in range(N)]

    def __call__(self, decoder_input, target_position, encoder_input, encoder_output):
        """Decode a batch of target sequences against the encoder output.

        Args:
            decoder_input: int tensor of target token ids, (batch, len_t);
                id 0 is treated as padding.
            target_position: int tensor of position ids, same shape.
            encoder_input: int tensor of source token ids, (batch, len_s);
                only used to locate source padding.
            encoder_output: output of the encoder stack.

        Returns:
            The output of the last decoder layer.
        """
        def causal_padding_mask(tokens):
            # (batch, len_t, len_t): query i may use key j only when j <= i
            # and key j is not padding. Without this, a position could read
            # the very tokens it is being trained to predict.
            steps = tf.range(tf.shape(tokens)[1])
            causal = tf.cast(steps[:, tf.newaxis] >= steps[tf.newaxis, :], 'float32')
            not_pad = tf.cast(tf.not_equal(tokens, 0), 'float32')
            return causal[tf.newaxis, :, :] * not_pad[:, tf.newaxis, :]

        def key_padding_mask(tokens):
            # (batch, 1, len_s): 1 for real source tokens, 0 for padding.
            return tf.cast(tf.not_equal(tokens, 0), 'float32')[:, tf.newaxis, :]

        decoder_output = Add()([self.output_embedding(decoder_input), self.positional_embedding(target_position)])

        self_attention_mask = Lambda(causal_padding_mask)(decoder_input)
        encoder_mask = Lambda(key_padding_mask)(encoder_input)

        for layer in self.layers:
            decoder_output = layer(decoder_output, encoder_output, self_attention_mask, encoder_mask)

        return decoder_output

## 2.6 Transformer

In [8]:
class Transformer:
    """Wires the encoder, decoder and output projection into Keras models.

    After ``compile`` the instance exposes:
      * ``model``        - takes [source, target] and outputs the scalar loss;
                           used for training.
      * ``output_model`` - takes [source, target] and outputs the per-position
                           logits over the target vocabulary.
      * ``ppl``, ``accu`` - tensors reported as extra metrics.
    """

    def __init__(self):
        self.encoder = Encoder()
        self.decoder = Decoder()
        # Projects every decoder position to one logit per target token.
        self.linear_transform_layer = TimeDistributed(Dense(output_tokens.num(), use_bias=False))
        
    def get_position(self, x):
        """Return 1-based position ids along axis 1, with 0 wherever ``x`` is 0.

        NOTE(review): ``compile`` defines its own identical helper
        (``get_sequence_position``) and does not call this method.
        """
        mask = K.cast(K.not_equal(x, 0), 'int32')
        pos = K.cumsum(K.ones_like(x, 'int32'), 1)
        
        return pos * mask
    
    def compile(self, optimizer='adam'):
        """Build the training graph and compile ``self.model``.

        Args:
            optimizer: optimizer name or instance passed to Keras ``compile``.

        The loss is attached with ``add_loss``, so the model is compiled with
        ``None`` as its loss argument.
        """
        print(">> Start Compile ===========================================================================")
        source_input = Input(shape=(None,), dtype='int32')
        target_input = Input(shape=(None,), dtype='int32')
        
        # Teacher forcing: the decoder is fed the target without its last
        # token and is scored against the target without its first token.
        target_sequence = Lambda(lambda x:x[:, :-1])(target_input)
        target_true = Lambda(lambda x:x[:, 1:])(target_input)
        
        def get_sequence_position(x):
            """1-based positions along axis 1; 0 where the token id is 0."""
            mask = K.cast(K.not_equal(x, 0), 'int32')
            pos = K.cumsum(K.ones_like(x, 'int32'), 1)
            return pos * mask
        
        print(">> Set Encoder =============================================================================")
        source_position = Lambda(get_sequence_position)(source_input)
        encoder_output = self.encoder(source_input, source_position)
        print(">> Set Decoder =============================================================================")
        target_position = Lambda(get_sequence_position)(target_sequence)
        decoder_output = self.decoder(target_sequence, target_position, source_input, encoder_output)
        final_output = self.linear_transform_layer(decoder_output)
        
        def get_loss(args):
            """Cross-entropy over positions whose label is non-zero.

            Averaged per sequence over those positions, then over the batch.
            NOTE(review): a sequence whose labels are all 0 would divide by
            zero here - confirm the data never contains one.
            """
            y_pred, y_true = args
            y_true = tf.cast(y_true, 'int32')
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
            mask = tf.cast(tf.not_equal(y_true, 0), 'float32')
            loss = tf.reduce_sum(loss * mask, -1) / tf.reduce_sum(mask, -1)
            loss = K.mean(loss)
            return loss

        def get_accu(args):
            """Fraction of non-zero-label positions whose argmax equals the label.

            Averaged per sequence, then over the batch.
            """
            y_pred, y_true = args
            mask = tf.cast(tf.not_equal(y_true, 0), 'float32')
            corr = K.cast(K.equal(K.cast(y_true, 'int32'), K.cast(K.argmax(y_pred, axis=-1), 'int32')), 'float32')
            corr = K.sum(corr * mask, -1) / K.sum(mask, -1)
            return K.mean(corr)
        
        print(">> Set Loss ================================================================================")

        loss = Lambda(get_loss)([final_output, target_true])
        self.ppl = Lambda(K.exp)(loss)  # perplexity = exp(loss)
        self.accu = Lambda(get_accu)([final_output, target_true])
        
        print(">> Set Model ===============================================================================")

        # Training model: its only output is the loss tensor, which is also
        # registered through add_loss.
        self.model = Model([source_input, target_input], loss)
        self.model.add_loss([loss])
        # Model sharing the same layers that exposes the logits.
        self.output_model = Model([source_input, target_input], final_output)

        self.model.compile(optimizer, None)
        # NOTE(review): appending to metrics_names / metrics_tensors relies on
        # model attributes that may not exist in every Keras release -
        # confirm against the installed version.
        self.model.metrics_names.append('ppl')
        self.model.metrics_tensors.append(self.ppl)
        self.model.metrics_names.append('accu')
        self.model.metrics_tensors.append(self.accu)

# 3 Training

In [None]:
# Build the model, attach the optimizer, then train.
transformer = Transformer()

adam = Adam(0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
transformer.compile(adam)

print(">> Start Training")

# The loss is computed inside the model, so no separate targets are passed.
transformer.model.fit(
    [x_train, y_train],
    None,
    batch_size=4,
    epochs=30,
    validation_data=([x_valid, y_valid], None),
)

# Generated: Encoder Layer
# Generated: Multi-Head Attention Layer
# Generated: Position-Wise Feed Forward Network
# Generated: Encoder Layer
# Generated: Multi-Head Attention Layer
# Generated: Position-Wise Feed Forward Network
# Generated: Encoder Layer
# Generated: Multi-Head Attention Layer
# Generated: Position-Wise Feed Forward Network
# Generated: Encoder Layer
# Generated: Multi-Head Attention Layer
# Generated: Position-Wise Feed Forward Network
# Generated: Encoder Layer
# Generated: Multi-Head Attention Layer
# Generated: Position-Wise Feed Forward Network
# Generated: Encoder Layer
# Generated: Multi-Head Attention Layer
# Generated: Position-Wise Feed Forward Network
# Generated: Decoder Layer
# Generated: Multi-Head Attention Layer
# Generated: Multi-Head Attention Layer
# Generated: Position-Wise Feed Forward Network
# Generated: Decoder Layer
# Generated: Multi-Head Attention Layer
# Generated: Multi-Head Attention Layer
# Generated: Position-Wise Feed Forward Network
