In [1]:
import os
import collections
import pandas as pd
import re
import numpy as np
import os
import numpy as np
import sys
from keras.optimizers import *
from keras.callbacks import *
from keras.models import *
from keras.layers import *
from keras.initializers import *
from keras.activations import *
import tensorflow as tf
from keras_layer_normalization import LayerNormalization
from sklearn.model_selection import train_test_split
from khaiii import KhaiiiApi
import nltk
nltk.download('punkt')
api = KhaiiiApi()

# Transformer Parameters
d_model = 512 # Embedding Demension
d_ff = 2048 # Feed-Forward Network's Hidden Size
d_k = d_v = 64 # = d_model / head
N = 6 # Num of Encoder / Decoder Layer's Stack 6
head = 8 # Num of Multi-Head Attention's Head 8
len_limit = 50

dropout = 0.1 # Dropout
warmup_steps = 4000 # Using When Evaluate Learning Rate

epochs = 30

data_directory = "./data/"

Using TensorFlow backend.
[nltk_data] Downloading package punkt to /home/moon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
class Eng2KorData:
    def __init__(self, data_dir):
        self.files = os.listdir(data_dir)
        self.files = [os.path.join(data_dir, f) for f in self.files]
        self.eng = []
        self.kor = []
        self.datas = []
        
    def data_processing(self):
        print("※ Data Processing...")
        self.make_data_to_list()
        self.split_eng_kor()
        
        return self.eng, self.kor
        
    def make_data_to_list(self):
        print(">> Read Files...")
        for file in self.files:
            try:
                with open(file, 'r', encoding='cp949') as f:
                    all_data = f.readlines()
                    
                    for data in all_data:
                        data = data.replace('\n', '')
                        data = data.replace('#', '')
                                       
                        if len(data) != 0:
                            self.datas.append(data)
            except:
                pass
        
    def split_eng_kor(self):        
        for i in range(len(self.datas) - 3):
            if self.datas[i][0] == '[':
                if self.datas[i + 3][0] == '[': # Best Case
                    self.eng.append(clean_text(self.datas[i + 1]))
                    self.kor.append(clean_text(self.datas[i + 2]))
                    

    def make_dictionary(self, data, Korean=False):
        print(">> Make Dictionary...")
        words = []
        for sentence in data:
            if not Korean:
                tokens = nltk.word_tokenize(sentence)
                            
            else:
                tokens = korean_morphing(sentence)
                
            for word in tokens:
                if has_number(word):
                    tokens.remove(word)
                        
            words.extend(tokens)

        words = collections.Counter(words)

        dictionary = {}
        dictionary['<PAD>'] = 0
        dictionary['<UNK>'] = 1
        dictionary['<EOS>'] = 2
        dictionary['<S>'] = 3
        idx = 4
        for word in words.most_common():
            if len(word[0]) > 0:
                dictionary[word[0]] = idx
                idx += 1
            
            if idx >= 20000: break;

        return dictionary
    
def korean_morphing(sentence):
    sentence = sentence.replace(' ', ' V ')
    word = api.analyze(sentence)

    morphs = []
    for tokens in word:
        for morph in tokens.morphs:
            #morphs.append(str(morph))
            morphs.append(str(morph.lex))
            
    return morphs
    
def clean_text(text):
    return re.sub('[\{\}\[\]\/,;:|\)*~`^\-_+<>@\#$%&\\\=\(\"“”◀▶【©】☎]', '', text.lower())

def has_number(word):
    return any(char.isdigit() for char in word)

In [3]:
eng2kor = Eng2KorData(data_directory)
eng, kor = eng2kor.data_processing()

eng_dict = eng2kor.make_dictionary(eng)
kor_dict = eng2kor.make_dictionary(kor, True)

input_dict = {y:x for x,y in eng_dict.items()}
output_dict = {y:x for x,y in kor_dict.items()}

eng_dict_size = len(eng_dict)
kor_dict_size = len(kor_dict)
        
eng_df = pd.DataFrame(eng, columns=['English'])
kor_df = pd.DataFrame(kor, columns=['Korean'])

eng2kor = pd.concat([eng_df, kor_df], axis=1)

print(">> Resizing Dataset...")
        
for i, eng in enumerate(eng2kor['English']):
    if len(eng) > len_limit:
        eng2kor.drop(eng2kor.index[i])

for i, kor in enumerate(eng2kor['Korean']):
    if len(kor) > len_limit:
        eng2kor.drop(eng2kor.index[i])

print("English Dictionary Size: ", eng_dict_size)
print("Korean Dictionary Size: ", kor_dict_size)

print("Done")

※ Data Processing...
>> Read Files...
>> Make Dictionary...
>> Make Dictionary...
>> Resizing Dataset...
English Dictionary Size:  20000
Korean Dictionary Size:  19979
Done


In [4]:
def vectorize_data(data, dictionary, Korean=False):
    max_len = 0
    vec_sentence = []

    for sentence in data:
        temp = []
        
        temp.append(dictionary['<S>'])
        
        if not Korean:
            for word in nltk.word_tokenize(sentence):
                if len(word) > 0:
                    if word in dictionary:
                        temp.append(dictionary[word])
                    else:
                        temp.append(dictionary['<UNK>'])
        else:
            
            for word in korean_morphing(sentence):
                if len(word) > 0:
                    if word in dictionary:
                        temp.append(dictionary[word])
                    else:
                        temp.append(dictionary['<UNK>'])
            
        temp.append(dictionary['<EOS>'])
        
        if max_len < len(temp):
            max_len = len(temp)

        vec_sentence.append(temp)
        
    return vec_sentence, max_len

def add_padding(vec_data, max_len):
    for sentence in vec_data:
        for i in range(len(sentence), max_len + 1):
            sentence.append(0)
    
    return np.array(vec_data)

eng_vec, eng_max = vectorize_data(eng2kor['English'], eng_dict)
kor_vec, kor_max = vectorize_data(eng2kor['Korean'], kor_dict, True)

x_train = add_padding(eng_vec, eng_max)
y_train = add_padding(kor_vec, kor_max)

print("Done")

Done


In [None]:
def positional_encoding(max_len):
    PE = np.array([
        [pos / np.power(10000, 2 * (i // 2) / d_model) for i in range(d_model)]
        if pos != 0 else np.zeros(d_model) for pos in range(max_len)
    ]) # np.power(10000, 2 * (j // 2) / d_emb) ??? Why?
    PE[1:, 0::2] = np.sin(PE[1:, 0::2]) # 2i
    PE[1:, 1::2] = np.cos(PE[1:, 1::2]) # 2i + 1
    
    return PE

class MultiHeadAttention:
    def __init__(self):
        self.Q_linear_transform_layers = []
        self.K_linear_transform_layers = []
        self.V_linear_transform_layers = []
        
        for _ in range(head):
            self.Q_linear_transform_layers.append(Dense(d_k, use_bias=False))
            self.K_linear_transform_layers.append(Dense(d_k, use_bias=False))
            self.V_linear_transform_layers.append(Dense(d_v, use_bias=False))
        
        self.normarlization_layer = LayerNormalization()
        self.output_linear_transfrom_layer = Dense(d_model)
        
    def __call__(self, Q, K, V, mask=None):
        outputs = []
        
        for i in range(head):
            WQ = self.Q_linear_transform_layers[i](Q)
            WK = self.K_linear_transform_layers[i](K)
            WV = self.V_linear_transform_layers[i](V)
            output = self.scaled_dot_product_attention(WQ, WK, WV, mask)
            outputs.append(output)
        
        output_result = Concatenate()(outputs)
        output = self.output_linear_transfrom_layer(output_result)
        output = Dropout(dropout)(output)
        output = Add()([output, Q])
        
        return self.normarlization_layer(output)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        K_T = Lambda(lambda x:tf.transpose(x, perm=[0, 2, 1]))(K)
        attention = Lambda(lambda x:tf.matmul(x[0], x[1]) / np.sqrt(d_model))([Q, K_T])
        
        if mask is not None:
            mask_value = Lambda(lambda x: (-1e+10) * (1 - x))(mask)
            attention = Add()([attention, mask_value])
            
        attention = Activation('softmax')(attention)
        attention = Dropout(dropout)(attention)
        output = Lambda(lambda x:tf.matmul(x[0], x[1]))([attention, V])
        
        return output
    
class PositionWiseFeedForwardNetwork:
    def __init__(self):
        self.w_1 = Conv1D(512, 1, activation='relu')
        self.w_2 = Conv1D(512, 1)
#         self.linear_transform_layer_1 = Dense(d_ff, input_shape=(d_model,))
#         self.relu_layer = Dense(d_ff, activation='relu')
#         self.linear_transform_layer_2 = Dense(d_model, input_shape=(d_ff,))
        self.normarlization_layer = LayerNormalization()
        
    def __call__(self, x):
        output = self.w_1(x)
        output = self.w_2(output)
#         output = self.linear_transform_layer_1(x)
#         output = self.relu_layer(output)
#         output = self.linear_transform_layer_2(output)
        output = Dropout(dropout)(output)
        output = Add()([output, x])
        
        return self.normarlization_layer(output)
    
    
    
def GetPadMask(q, k):
    ones = K.expand_dims(K.ones_like(q, 'float32'), -1)
    mask = K.cast(K.expand_dims(K.not_equal(k, 0), 1), 'float32')
    mask = K.batch_dot(ones, mask, axes=[2, 1])
    return mask


def GetSubMask(s):
    len_s = tf.shape(s)[1]
    bs = tf.shape(s)[:1]
    mask = K.cumsum(tf.eye(len_s, batch_shape=bs), 1)
    return mask


    
class EncoderLayer:
    def __init__(self):
        self.multi_head_attention_layer = MultiHeadAttention()
        self.position_wise_feed_forward_network = PositionWiseFeedForwardNetwork()
        
    def __call__(self, encoder_input, mask=None):
        encoder_output = self.multi_head_attention_layer(encoder_input, encoder_input, encoder_input, mask)
        encoder_output = self.position_wise_feed_forward_network(encoder_output)
        
        return encoder_output
    
class Encoder:
    def __init__(self):
        self.input_embedding =  Embedding(eng_dict_size, d_model)
        self.positional_embedding = Embedding(len_limit, d_model, trainable=False, weights=[positional_encoding(len_limit)])
        self.layers = [EncoderLayer() for _ in range(N)]
        
    def __call__(self, encoder_input, source_position):
        encoder_output = Add()([self.input_embedding(encoder_input), self.positional_embedding(source_position)])

        for layer in self.layers:
            encoder_output = layer(encoder_output)
            
        return encoder_output
    
class DecoderLayer:
    def __init__(self):
        self.masked_multi_head_attention_layer = MultiHeadAttention()
        self.multi_head_attention_layer = MultiHeadAttention()
        self.position_wise_feed_forward_network = PositionWiseFeedForwardNetwork()
        
    def __call__(self, decoder_input, encoder_output, self_mask=None, enc_mask=None):
        decoder_output = self.masked_multi_head_attention_layer(decoder_input, decoder_input, decoder_input, self_mask)
        decoder_output = self.multi_head_attention_layer(decoder_output, encoder_output, encoder_output, enc_mask)
        decoder_output = self.position_wise_feed_forward_network(decoder_output)
        
        return decoder_output
        
class Decoder:
    def __init__(self):
        self.output_embedding =  Embedding(kor_dict_size, d_model)
        self.positional_embedding = Embedding(len_limit, d_model, trainable=False, weights=[positional_encoding(len_limit)])
        self.layers = [DecoderLayer() for _ in range(N + 3)]
        
    def __call__(self, decoder_input, target_position, encoder_input, encoder_output):
        decoder_output = Add()([self.output_embedding(decoder_input), self.positional_embedding(target_position)])
        
        if target_position is not None:
            position = self.positional_embedding(target_position)
            decoder_output = Add()([decoder_output, position])
            
        self_pad_mask = Lambda(lambda x: GetPadMask(x, x))(decoder_input)
        self_sub_mask = Lambda(GetSubMask)(decoder_input)
        self_mask = Lambda(lambda x: K.minimum(x[0], x[1]))([self_pad_mask, self_sub_mask])
        enc_mask = Lambda(lambda x: GetPadMask(x[0], x[1]))([decoder_input, encoder_input])
        
        for layer in self.layers:
            decoder_output = layer(decoder_output, encoder_output, self_mask, enc_mask)
        
        return decoder_output
    
class Transformer:
    def __init__(self):
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.linear_transform_layer = Dense(kor_dict_size, use_bias=False)
        #self.softmax_layer = Dense(kor_dict_size, activation='softmax')
        #self.softmax_layer = Activation('softmax')
    
    def compile(self, optimizer='adam'):
        print(">> Start Compile ===========================================================================")
        source_input = Input(shape=(None,), dtype='int32')
        target_input = Input(shape=(None,), dtype='int32')
        
        source_sequence = source_input
        target_sequence = Lambda(lambda x:x[:, :-1])(target_input)
        target_true = Lambda(lambda x:x[:, 1:])(target_input)
        
        def get_sequence_position(x):
            mask = K.cast(K.not_equal(x, 0), 'int32')
            pos = K.cumsum(K.ones_like(x, 'int32'), 1)
            return pos * mask
        
        print(">> Set Encoder =============================================================================")
        source_position = Lambda(get_sequence_position)(source_sequence)
        encoder_output = self.encoder(source_sequence, source_position)
        print(">> Set Decoder =============================================================================")
        target_position = Lambda(get_sequence_position)(target_sequence)
        decoder_output = self.decoder(target_sequence, target_position, source_sequence, encoder_output)
        final_output = self.linear_transform_layer(decoder_output)
        #final_output = self.softmax_layer(final_output)
        
        def get_loss(args):
            y_pred, y_true = args
            y_true = tf.cast(y_true, 'int32')
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
            mask = tf.cast(tf.not_equal(y_true, 0), 'float32')
            loss = tf.reduce_sum(loss * mask, -1) / tf.reduce_sum(mask, -1)
            loss = K.mean(loss)
            return loss

        def get_accu(args):
            y_pred, y_true = args
            mask = tf.cast(tf.not_equal(y_true, 0), 'float32')
            corr = K.cast(K.equal(K.cast(y_true, 'int32'), K.cast(K.argmax(y_pred, axis=-1), 'int32')), 'float32')
            corr = K.sum(corr * mask, -1) / K.sum(mask, -1)
            return K.mean(corr)
        
        print(">> Set Loss ================================================================================")

        loss = Lambda(get_loss)([final_output, target_true])
        self.ppl = Lambda(K.exp)(loss)
        self.accu = Lambda(get_accu)([final_output, target_true])
        
        print(">> Set Model ===============================================================================")

        self.model = Model([source_input, target_input], loss)
        self.model.add_loss([loss])
        self.output_model = Model([source_input, target_input], final_output)
        
        self.model.compile(optimizer, None)
        self.model.metrics_names.append('ppl')
        self.model.metrics_tensors.append(self.ppl)
        self.model.metrics_names.append('accu')
        self.model.metrics_tensors.append(self.accu)
        
    def decode_sequence(self, input_seq, delimiter=''):
        
        def get_loss(args):
            y_pred, y_true = args
            y_true = tf.cast(y_true, 'int32')
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
            mask = tf.cast(tf.not_equal(y_true, 0), 'float32')
            loss = tf.reduce_sum(loss * mask, -1) / tf.reduce_sum(mask, -1)
            loss = K.mean(loss)
            return loss
        
        src_seq, _ = vectorize_data(input_seq, eng_dict)
        decoded_tokens = []
        target_seq = np.zeros((1, len_limit), dtype='int32')
        target_seq[0, 0] = kor_dict['<S>']
        
        for i in range(len_limit - 1):
            output = self.output_model.predict([src_seq, target_seq])
            sampled_index = np.argmax(output[0, i, :])
            sampled_token = output_dict[sampled_index]
            print(output[0, i, :10])
            print(sampled_token)
            decoded_tokens.append(sampled_token)
            
            if sampled_index == kor_dict['<EOS>']: break
                
            target_seq[0, i + 1] = sampled_index
        
        print('Final::', decoded_tokens)        
        
        return delimiter.join(decoded_tokens[:-1])

class LearningRateScheduler(Callback):
    def __init__(self, d_model, warmup=4000):
        self.basic = d_model**-0.5
        self.warm = warmup**-1.5
        self.step_num = 0

    def on_batch_begin(self, batch, logs = None):
        self.step_num += 1
        lr = self.basic * min(self.step_num**-0.5, self.step_num*self.warm)
        K.set_value(self.model.optimizer.lr, lr)
    

In [None]:
with tf.device('/gpu:0'):

    transformer = Transformer()

    transformer.compile(Adam(0.001, 0.9, 0.98, epsilon=1e-9))

    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, shuffle=True)     

    print("x_train Shape: ", x_train.shape)
    print("y_train Shape: ", y_train.shape)
    print("x_test Shape: ", x_test.shape)
    print("y_test Shape: ", y_test.shape)
    print("English Dictionary Size: ", eng_dict_size)
    print("Korean Dictionary Size: ", kor_dict_size)

    print(">> Start Training")

    learning_rate_scheduler = LearningRateScheduler(d_model, warmup_steps)

    transformer.model.fit([x_train, y_train], None, batch_size=64, epochs=epochs,
                         validation_data=([x_test, y_test], None), callbacks=[learning_rate_scheduler])

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


x_train Shape:  (54501, 30)
y_train Shape:  (54501, 91)
x_test Shape:  (6056, 30)
y_test Shape:  (6056, 91)
English Dictionary Size:  20000
Korean Dictionary Size:  19979
>> Start Training
Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Train on 54501 samples, validate on 6056 samples
Epoch 1/30

In [None]:
eng2kor.head(10)

In [None]:
num = 18

eng = []
for i in x_train[num]:
    eng.append(input_dict[i])
    
print(eng)

kor = []
for i in y_train[num]:
    kor.append(output_dict[i])
    
print(kor)

In [None]:
text = ["the field was filled with snow ."]

print(text)
print(vectorize_data(text, eng_dict))
transformer.decode_sequence(text)