# Sequence2Sequence 논문 구현
# 0.baseline: simple sequence2sequence (encoder hidden state or output -> decoder initial state)
# 1.논문: Sequence to Sequence Learning with Neural Networks 
## https://arxiv.org/abs/1409.3215

# 2. 논문: Luong Attention 
## https://arxiv.org/abs/1508.04025

# 3. 논문: Bahdanau Attention
## https://arxiv.org/abs/1409.0473

# 4. 논문: Transformer
## https://arxiv.org/abs/1706.03762

In [1]:
import keras.backend.tensorflow_backend as K
from keras.layers import Input, GRU, Dense, Activation, Lambda, LSTM, Embedding, Bidirectional
from keras.models import Model

from os.path import join
from os import listdir
from konlpy.tag import Okt
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


# 1. Data Loader
## 미리 구현된 것 복사&붙여넣기
## http://121.140.2.142:8888/notebooks/test_notes/preprocess/nlp_preprocess.ipynb

## 원본 데이터도 가지고 올 수 있도록 로직을 약간 수정했습니다.
## Decoder는 \<start>와 \<end> 태그를 문장의 시작과 끝에 자동으로 넣을 수 있도록 변경하였습니다.

In [2]:
class Dataset:
    def __init__(self):
        self.okt = Okt()
    
    """
    1. 다음의 두 함수는 소단위로 구현된 아래 함수들[2]를 응용하는 함수입니다. 필요에 따라 추후에 merge하면 좋겠지요 ^^
    """
    def get_classification_data(self, data_root_path, data_type, max_seq_length):
        # 1. Read Data
        data_path_list = listdir(data_root_path)
        data_path_list = [join(data_root_path, d) for d in data_path_list]
        all_documents, all_documents_names = self.read_data(data_path_list=data_path_list, data_type=data_type)
#         print(all_documents_names)
#         print(all_documents)

        # 2. Tokenize data
        all_tokenized_documents = list()
        for documents in all_documents:
            tokenized_documents = self.tokenize(documents=documents, token_type='word')
            all_tokenized_documents.append(tokenized_documents)
        print(all_tokenized_documents)
        
        # 3. token to index
        token2idx_dict = dict()
        token_counter_dict = dict()
        all_indiced_documents = list()

        for tokenized_documents in all_tokenized_documents:
            indiced_documents, token2idx_dict, token_counter_dict = self.token2idx(tokenized_documents=tokenized_documents, 
                                                                              token2idx_dict=token2idx_dict, 
                                                                              token_counter_dict=token_counter_dict)
            all_indiced_documents.append(indiced_documents)
        
        # 4. padd & formatting
        all_padded_documents = list()
        for indiced_documnets in all_indiced_documents:
            padded_documents = self.pad_format(indiced_documnets, max_seq_length=max_seq_length)
            all_padded_documents.append(padded_documents)
        
        # 5. Make classification dataset
        label2idx = dict()
        x_data = list()
        y_data = list()

        for padded_documents, label in zip(all_padded_documents, all_documents_names):
            _x_data, _y_data, label2idx =self. map_document_label(documents=padded_documents, label=label, label2idx=label2idx)
            x_data.append(_x_data)
            y_data.append(_y_data)

        x_data = np.concatenate(x_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        for idx, (x, y) in enumerate(zip(x_data, y_data)):
            if idx == 10:
                break
#             print('label:', y)
#             print('data:', x)
        
        return x_data, y_data
        
        
    def get_encoder_decoder_data(self, encoder_root_path, decoder_root_path, data_type, encoder_language, decoder_langeuage,
                                 max_enc_seq_length, max_dec_seq_length, 
                                 encoder_token_counter_dict=dict(), 
                                 decoder_token_counter_dict=dict()): # token 추가하는 token_counter_dict는 개발하지 않음
        # 1. Read Data
        encoder_path_list = listdir(encoder_root_path)
        encoder_path_list = [join(encoder_root_path, d) for d in encoder_path_list]
        
        decoder_path_list = listdir(decoder_root_path)
        decoder_path_list = [join(decoder_root_path, d) for d in decoder_path_list]
        
        data_path_list = np.concatenate([encoder_path_list, decoder_path_list], axis=0)
        ########## same logic below
        all_documents, all_documents_names = self.read_data(data_path_list=data_path_list, data_type=data_type)
#         print(all_documents_names)
#         print(all_documents)

        # 2. Tokenize data
        all_tokenized_documents = list()
        for documents in all_documents:
            tokenized_documents = self.tokenize(documents=documents, token_type='word')
            all_tokenized_documents.append(tokenized_documents)
        print(all_tokenized_documents)
        
        ########## same logic above
        
        # 3. token to index
        encoder_token2idx_dict = dict()
        encoder_token_counter_dict = dict()
        encoder_all_indiced_documents = list()
        encoder_documents = list()
        
        for documents, tokenized_documents in zip(all_documents[:len(encoder_path_list)], all_tokenized_documents[:len(encoder_path_list)]):
            indiced_documents, encoder_token2idx_dict, encoder_token_counter_dict = \
                    self.token2idx(tokenized_documents=tokenized_documents,
                                   add_tag=False,
                                   token2idx_dict=encoder_token2idx_dict, 
                                   token_counter_dict=encoder_token_counter_dict)
            encoder_all_indiced_documents.append(indiced_documents)
            encoder_documents.append(documents)
        
        
        decoder_token2idx_dict = dict()
        decoder_token_counter_dict = dict()
        decoder_all_indiced_documents = list()
        decoder_documents = list()

        for documents, tokenized_documents in zip(all_documents[len(encoder_path_list):], all_tokenized_documents[len(encoder_path_list):]):
            indiced_documents, decoder_token2idx_dict, decoder_token_counter_dict = \
                    self.token2idx(tokenized_documents=tokenized_documents,
                                   add_tag=True,
                                   token2idx_dict=decoder_token2idx_dict, 
                                   token_counter_dict=decoder_token_counter_dict)
            decoder_all_indiced_documents.append(indiced_documents)
            decoder_documents.append(documents)
        
        # 4. padd & formatting
        encoder_all_padded_documents = list()
        for indiced_documnets in encoder_all_indiced_documents:
            padded_documents = self.pad_format(indiced_documnets, max_seq_length=max_enc_seq_length)
            encoder_all_padded_documents.append(padded_documents)
            
        decoder_all_padded_documents = list()
        for indiced_documnets in decoder_all_indiced_documents:
            padded_documents = self.pad_format(indiced_documnets, max_seq_length=max_enc_seq_length)
            decoder_all_padded_documents.append(padded_documents)
            
        # 5. Make encoder decoder dataset
        encoder_inputs = np.concatenate(encoder_all_padded_documents, axis=0)
        decoder_inputs = list()
        decoder_outputs = list()

        for padded_documents in decoder_all_padded_documents:
            _x_data, _y_data = self.map_document_ae(documents=padded_documents)
            decoder_inputs.append(_x_data)
            decoder_outputs.append(_y_data)

        decoder_inputs = np.concatenate(decoder_inputs, axis=0)
        decoder_outputs = np.concatenate(decoder_outputs, axis=0)
        
        encoder_documents = np.concatenate(encoder_documents, axis=0)
        decoder_documents = np.concatenate(decoder_documents, axis=0)
        
        return [encoder_inputs, decoder_inputs, decoder_outputs], [encoder_documents, encoder_token2idx_dict, encoder_token_counter_dict,
                                                                  decoder_documents, decoder_token2idx_dict, decoder_token_counter_dict]
    
    
    """
    2. 아래의 함수들은 위에 구현된 함수들 복사 붙여넣기 후 파라미터에 가장 앞에 self를 추가해줍니다 ^^
    """
    def read_data(self, data_path_list, data_type):
        all_documents = list()
        all_documents_names = list()
        for data_path in data_path_list:
            if 'numpy' == data_type: # Path(data_path).suffix
                documents = np.load(data_path)
            elif 'text' == data_type:
                documents = list()
                with open(data_path, 'r', encoding='utf-8') as f:
                    new_data_lines = f.read().split('\n')
                    documents.extend(new_data_lines)
            else:
                print('Proper data_type is not presented.')
            all_documents.append(documents)
            all_documents_names.append(Path(data_path).stem)
        return all_documents, all_documents_names
    
    def tokenize(self, documents, token_type):
        if token_type == 'char':
            return [char for document in documents for char in document]
        elif token_type == 'word': # token_type == word
            return [self.okt.morphs(document, norm=False, stem=False) for document in documents]
        elif token_type == 'ngram': # 'ngram'
            tokenized_sentence = sentence.split(' ')
            if kor_tokenizer_max_word_char == -1:
                return [word for word in tokenized_sentence]
            else:
                return [word[:kor_tokenizer_max_word_char] for word in tokenized_sentence]
        else:
            print('Not implemented token type:', token_type)
    
    def token2idx(self, tokenized_documents, add_tag=False, token2idx_dict=dict(), token_counter_dict=dict()):
        """
        1. will change token to idx and 2. token2idx dictionary and 3. counted token dictionary
        """
        indiced_documents = list()

        if len(token2idx_dict) == 0:
            token2idx_dict['<pad>'] = 0
            token2idx_dict['<start>'] = 1
            token2idx_dict['<end>'] = 2
            token2idx_dict['<unk>'] = 3

        for tokenized_document in tokenized_documents:
            indiced_document = list()
            for token in tokenized_document:
                if token not in token2idx_dict:
                    token2idx_dict[token] = len(list(token2idx_dict.keys()))
                if token not in token_counter_dict:
                    token_counter_dict[token] = 0
                token_counter_dict[token] += 1

                indiced_document.append(token2idx_dict[token])
            if add_tag:
                indiced_document.insert(0, token2idx_dict['<start>'])
                indiced_document.append(token2idx_dict['<end>'])
            indiced_documents.append(indiced_document)
        return indiced_documents, token2idx_dict, token_counter_dict
    
    def pad_format(self, indiced_documents, max_seq_length):
        padded_data = np.empty(shape=(0, max_seq_length))

        for indiced_document in indiced_documents:
            np_transformed = np.zeros(shape=(max_seq_length, ))
            for idx, index in enumerate(indiced_document):
                if idx == max_seq_length:
                    break
                np_transformed[idx] = index
            padded_data = np.insert(padded_data, padded_data.shape[0], np_transformed, axis=0)

        return padded_data
    
    def map_document_label(self, documents, label, label2idx=dict()):
        x_data = documents
        y_data = list()
        idx = len(label2idx)

        if label not in label2idx:
            label2idx[label] = idx
        labels = [label2idx[label]] * documents.shape[0]
        y_data.extend(labels)

        return x_data, y_data, label2idx
    
    def map_document_ae(self, documents):
        x_data = documents
        y_data = documents[:, 1:]
        y_data = np.insert(y_data, y_data.shape[1], 0, axis=1)
        return x_data, y_data

In [3]:
max_enc_seq_length = 50
max_dec_seq_length = 50

train_data_path = '/data1/translation_data/train'
test_data_path = '/data1/translation_data/test'
val_data_path = '/data1/translation_data/dev_validation'

encoder_root_path = join(train_data_path, 'english')
decoder_root_path = join(train_data_path, 'korean')

seq2seq_dataset = Dataset()
[encoder_inputs, decoder_inputs, decoder_outputs], [encoder_documents, encoder_token2idx_dict, encoder_token_counter_dict,
                                                                  decoder_documents, decoder_token2idx_dict, decoder_token_counter_dict] = seq2seq_dataset.get_encoder_decoder_data(
    encoder_root_path=encoder_root_path, decoder_root_path=decoder_root_path, 
    data_type='text', encoder_language='english', decoder_langeuage='korean',
    max_enc_seq_length=max_enc_seq_length, max_dec_seq_length=max_dec_seq_length)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [4]:
train_encoder_inputs, test_encoder_inputs, \
train_decoder_inputs, test_decoder_inputs, \
train_decoder_outputs, test_decoder_outputs = train_test_split(encoder_inputs, decoder_inputs, decoder_outputs, test_size=0.1)

In [5]:
num_encoder_tokens = len(encoder_token_counter_dict)
num_decoder_tokens = len(decoder_token_counter_dict)

# 2. Sequence 2 Sequence model
## Embedding 참조: http://121.140.2.142:8888/notebooks/test_notes/keras/Embedding%20Layer%20%26%20Padding.ipynb
## RNN 기본 참조: http://121.140.2.142:8888/notebooks/test_notes/keras/RNN%20-%20LSTM%2C%20GRU.ipynb

# 2-0. Fit generator for each batch to transform final output as one hot
# 모든 데이터셋을 전부 one hot으로 미리 만들어 놓으면 메모리 에러 발생 가능성이 높기 때문에... 매 배치마다 output 데이터를 one hot으로 실시간으로 바꿔줍니다.

In [38]:
from keras.callbacks import Callback, ModelCheckpoint
import random

In [39]:
def convert_generator(x_data, y_data, num_decoder_tokens, batch_size=16):
    '''
    Return a random from x_data, y_data
    '''
    while True:
        samples_per_mini_epoch = x_data[0].shape[0]
        number_of_steps = np.ceil(samples_per_mini_epoch / batch_size).astype(int)
        data_idx = list(range(number_of_steps))

        while len(data_idx) > 0:
            # choose batch_size random images / labels from the data

            idx = random.choice(data_idx)
            next_idx = min(samples_per_mini_epoch, idx + batch_size)
            encoder_input = x_data[0][idx: next_idx]
            decoder_input = x_data[1][idx: next_idx]
            decoder_output = y_data[idx: next_idx]

            converted_decoder_output = np.zeros(shape=(decoder_output.shape[0], decoder_output.shape[1], num_decoder_tokens))
            for idx2, data in enumerate(decoder_output):
                for idx3, d in enumerate(data):
                    converted_decoder_output[idx2, idx3, int(d)] = 1

            data_idx.remove(idx)
            yield [encoder_input, decoder_input], converted_decoder_output

In [40]:
def train(model, train_data, validation_data=None, epochs=50, batch_size=64, verbose=1, validation_split=0.1, *args):
#         print(len(args))
        train_encoder_input, train_decoder_input, train_decoder_output = train_data
        if validation_data is not None:
            val_encoder_input, val_decoder_input, val_decoder_output = validation_data
        
#         mc = ModelCheckpoint('./save/s2s_{epoch:03d}.h5', save_weights_only=True, period=5)
        
        val_steps = 0
        val_data = None
        
        if validation_data is not None:
            val_data = ([val_encoder_input, val_decoder_input], val_decoder_output)
            val_steps = np.ceil(val_encoder_input.shape[0] / batch_size)
            val_data = convert_generator(x_data=val_data[0], y_data=val_data[1], batch_size=batch_size,
                                        num_decoder_tokens=num_decoder_tokens)

        steps_per_epoch = np.ceil(train_encoder_input.shape[0] / batch_size)
                
        model.fit_generator(generator=convert_generator([train_encoder_input, train_decoder_input],
                                                            train_decoder_output,
                                                            num_decoder_tokens=num_decoder_tokens),
                                 steps_per_epoch=steps_per_epoch,
                                 validation_data=val_data,
                                 validation_steps=val_steps,
                                 epochs=epochs,
                                 verbose=verbose,
#                                  callbacks=[mc]
                                )

# 2-1. Baseline

In [73]:
hidden_dim = 64
embedding_dim = 128


def get_seq2seq_base_model():
    encoder_inputs = Input(shape=(max_enc_seq_length, ))
    decoder_inputs = Input(shape=(max_dec_seq_length, ))
    
    encoder_embed_layer = Embedding(input_dim=num_encoder_tokens, output_dim=embedding_dim, input_length=max_enc_seq_length)
    decoder_embed_layer = Embedding(input_dim=num_decoder_tokens, output_dim=embedding_dim, input_length=max_dec_seq_length)
    
    encoder_embeded_inputs = encoder_embed_layer(encoder_inputs)
    decoder_embeded_inputs = encoder_embed_layer(decoder_inputs)
    
    encoder_layer = LSTM(hidden_dim, return_sequences=True, return_state=True)
    encoder_outputs, last_encoder_output, last_encoder_cell_state  = encoder_layer(encoder_embeded_inputs)
    encoder_states = [last_encoder_output, last_encoder_cell_state]
    
    decoder_layer = LSTM(hidden_dim, return_sequences=True, return_state=True)
    decoder_outputs, last_decoder_output, last_decoder_cell_state  = decoder_layer(decoder_embeded_inputs, initial_state=encoder_states)
    
    outputs = Dense(num_decoder_tokens, activation='softmax')(decoder_outputs)
    
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    print(model.summary())
    
    return model

In [75]:
seq2seq_model = get_seq2seq_base_model()
train(model=seq2seq_model, train_data=(train_encoder_inputs, train_decoder_inputs, train_decoder_outputs), 
      validation_data=(test_encoder_inputs, test_decoder_inputs, test_decoder_outputs), epochs=5, batch_size=32, verbose=1, validation_split=0.1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_18 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 50, 128)      7117312     input_17[0][0]                   
                                                                 input_18[0][0]                   
__________________________________________________________________________________________________
lstm_19 (LSTM)                  [(None, 50, 64), (No 49408       embedding_17[0][0]               
__________

# 2-2. Sequence 2 Sequence (encoder last output as decoder first input) 하다 말아씀...

In [68]:
def get_seq2seq_general():
    encoder_inputs = Input(shape=(max_enc_seq_length, ))
    decoder_inputs = Input(shape=(max_dec_seq_length, ))
    
    encoder_embed_layer = Embedding(input_dim=num_encoder_tokens, output_dim=embedding_dim, input_length=max_enc_seq_length)
    decoder_embed_layer = Embedding(input_dim=num_decoder_tokens, output_dim=embedding_dim, input_length=max_dec_seq_length)
    
    encoder_embeded_inputs = encoder_embed_layer(encoder_inputs)
    decoder_embeded_inputs = encoder_embed_layer(decoder_inputs)
    
    encoder_layer1 = Bidirectional(LSTM(hidden_dim, return_sequences=True, return_state=True))
    encoder_layer2 = Bidirectional(LSTM(hidden_dim, return_sequences=True, return_state=True))
    encoder_layer3 = Bidirectional(LSTM(hidden_dim, return_sequences=True, return_state=True))
    encoder_layer4 = Bidirectional(LSTM(hidden_dim, return_sequences=True, return_state=True))
    encoder_outputs, last_encoder_output, last_encoder_cell_state  = encoder_layer1(encoder_embeded_inputs)
    encoder_outputs, last_encoder_output, last_encoder_cell_state  = encoder_layer2(encoder_outputs)
    encoder_outputs, last_encoder_output, last_encoder_cell_state  = encoder_layer3(encoder_outputs)
    encoder_outputs, last_encoder_output, last_encoder_cell_state  = encoder_layer4(encoder_outputs)
    
    encoder_states = [last_encoder_output, last_encoder_cell_state]
    
    decoder_layer = LSTM(hidden_dim, return_sequences=True, return_state=True)
    decoder_outputs, last_decoder_output, last_decoder_cell_state  = decoder_layer(decoder_embeded_inputs, initial_state=encoder_states)
    
    outputs = Dense(num_decoder_tokens, activation='softmax')(decoder_outputs)
    
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    print(model.summary())
    
    return model

In [70]:
seq2seq_model = get_seq2seq_general()
train(model=seq2seq_model, train_data=(train_encoder_inputs, train_decoder_inputs, train_decoder_outputs), 
      validation_data=(test_encoder_inputs, test_decoder_inputs, test_decoder_outputs), epochs=10, batch_size=64, verbose=1, validation_split=0.1)

ValueError: too many values to unpack (expected 3)

# 2-3. Attention Luong (Encoder에서 bidirectional 사용하는 것은 적용 안 됨)

In [53]:
from keras.layers import Reshape, Concatenate, RepeatVector, Permute, Softmax, Multiply
import tensorflow as tf

In [55]:
# Sum of last dimension is 0, then that means it is padded !
def get_pad_index():
    return Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))

def get_last_outputs(inputs, outputs, dimension, seq_length):
    if dimension == 2:
        new_inputs = Reshape((seq_length, 1))(inputs)
    else:
        new_inputs = inputs
    pad_index = get_pad_index()(new_inputs)
    last_index = Lambda(lambda x: K.sum(x, axis=-2) - 1)(pad_index)

    # LAST RELEVANT OUTPUT
    # create the row index with tf.range
    row_idx = Lambda(lambda x: tf.reshape(tf.range(tf.shape(x)[0]), (-1,1)))(last_index)

    # stack with column index
    idx = Lambda(lambda x: tf.stack([row_idx, K.cast(x, 'int32')], axis=-1))(last_index)
    # extract the elements with gather_nd
    last_outputs = Lambda(lambda x: tf.gather_nd(x, idx))(outputs)
    
    last_outputs = Reshape((hidden_dim, ))(last_outputs)
    return pad_index, last_outputs

In [56]:
hidden_dim = 64
embedding_dim = 128

def get_seq2seq_luong_attention_model(gpu=0):
    with K.tf.device('/gpu:' + str(gpu)):
        encoder_inputs = Input(shape=(max_enc_seq_length, ))
        decoder_inputs = Input(shape=(max_dec_seq_length, ))

        encoder_embed_layer = Embedding(input_dim=num_encoder_tokens, output_dim=embedding_dim, input_length=max_enc_seq_length)
        decoder_embed_layer = Embedding(input_dim=num_decoder_tokens, output_dim=embedding_dim, input_length=max_dec_seq_length)

        encoder_embeded_inputs = encoder_embed_layer(encoder_inputs)
        decoder_embeded_inputs = encoder_embed_layer(decoder_inputs)

        encoder_layer = GRU(units=hidden_dim, return_sequences=True, return_state=True)
        encoder_outputs, encoder_last_output = encoder_layer(encoder_embeded_inputs)
        
        pad_index, corrected_last_outputs = get_last_outputs(encoder_inputs, encoder_outputs, dimension=2, seq_length=max_enc_seq_length)
        masked_encoder_outputs = Multiply()([encoder_outputs, pad_index])

        decoder_layer = GRU(units=hidden_dim, return_sequences=True, return_state=False)
        decoder_outputs = decoder_layer(decoder_embeded_inputs, initial_state=corrected_last_outputs)

        # Attention
        # 1. 각 decoder position에 대하여 전체 encoder 벡터 값 붙이기
        reshaped_encoder_inputs = Reshape((max_enc_seq_length * hidden_dim, ))(masked_encoder_outputs)
        reshaped_decoder_inputs = Reshape((max_dec_seq_length * hidden_dim, ))(decoder_outputs)
        enc_repeat_vector = RepeatVector(max_dec_seq_length)(reshaped_encoder_inputs)
        dec_repeat_vector = RepeatVector(max_enc_seq_length)(reshaped_decoder_inputs)
        reshape_enc_repeat_vector = Reshape((max_dec_seq_length, max_enc_seq_length, hidden_dim))(enc_repeat_vector)
        reshape_dec_repeat_vector = Reshape((max_enc_seq_length, max_dec_seq_length, hidden_dim))(dec_repeat_vector)
        reshape_dec_repeat_vector = Permute((2, 1, 3))(reshape_dec_repeat_vector)

        concat = Concatenate()([reshape_dec_repeat_vector, reshape_enc_repeat_vector])

        # 2. 벡터들을 1차원짜리 스코어로 만들어주기
        dense1_score = Dense(hidden_dim // 2, activation='tanh')(concat)
        dense2_score = Dense(1)(dense1_score) # to make softmax comparison
        dense2_score = Reshape((max_dec_seq_length, max_enc_seq_length))(dense2_score) # reshape to be 2 dims

        softmax_score_layer = Softmax(axis=-1)
        softmax_score = softmax_score_layer(dense2_score)

        # 3. Score를 Encoder Outputs와 곱해주기 (여기서부터 체크하기)
        reshaped_score = Reshape((max_dec_seq_length * max_enc_seq_length, ))(softmax_score)
        score_repeat_vector = RepeatVector(hidden_dim)(reshaped_score)
        reshape_score_repeat_vector = Reshape((hidden_dim, max_dec_seq_length, max_enc_seq_length))(score_repeat_vector)
        repeat_score = Permute((2, 1, 3))(reshape_score_repeat_vector)

        # (여기서부터 체크하기)
        permute_e = Permute((2, 1))(masked_encoder_outputs)
        reshaped_e = Reshape((hidden_dim * max_enc_seq_length, ))(permute_e)
        repeat_e_vector = RepeatVector(max_dec_seq_length)(reshaped_e)
        repeat_e = Reshape((max_dec_seq_length, hidden_dim, max_enc_seq_length))(repeat_e_vector)


        attended_mat_layer = Multiply()
        attended_mat = attended_mat_layer([repeat_score, repeat_e])

        context_layer = Lambda(lambda x: K.sum(x, axis=-1))
        context = context_layer(attended_mat)

        concat_context_layer = Concatenate(axis=-1)
        concat_context = concat_context_layer([context, decoder_outputs])

        attention_output = Dense(hidden_dim, activation='tanh')(concat_context)


        decoder_dense = Dense(num_decoder_tokens, activation='softmax')
        decoder_outputs_pred = decoder_dense(attention_output)
    
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs_pred)
    # Run training
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    print(model.summary())
    
    return model

In [58]:
seq2seq_model = get_seq2seq_luong_attention_model()
train(model=seq2seq_model, train_data=(train_encoder_inputs, train_decoder_inputs, train_decoder_outputs), 
      validation_data=(test_encoder_inputs, test_decoder_inputs, test_decoder_outputs), epochs=5, batch_size=32, verbose=1, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# 2-4. Attention Bahdanau

# 2-5. Attention Transformer