# * Data download: https://github.com/jungyeul/korean-parallel-corpora
# * This implementation is originally from: https://keras.io/examples/lstm_seq2seq/
# * And re-implemented 1. data pre-process 2. neural network

In [1]:
from __future__ import print_function

import keras.backend.tensorflow_backend as K

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Flatten, Activation
from keras.constraints import MinMaxNorm
from keras.initializers import RandomUniform
from keras import losses
from keras.callbacks import Callback, ModelCheckpoint
import numpy as np

import random
from os.path import join
from collections import Counter
from itertools import dropwhile

from konlpy.tag import Okt

Using TensorFlow backend.


In [2]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [3]:
# Path to the data txt file on disk.
# data_path = '/content/gdrive/My Drive/ai_data/kor-eng/kor.txt'
os_type = 'linux'

if os_type == 'windows':
    root_path = 'd:/IGS_Projects_data'
else: # os_type == linux
    root_path = '/data1'

input_data_path = join(root_path, 'translation_data/korean-english-park.train/korean-english-park.train.en')
output_data_path = join(root_path, 'translation_data/korean-english-park.train/korean-english-park.train.ko')

val_input_data_path = join(root_path, 'translation_data/korean-english-park.dev/korean-english-park.dev.en')
val_output_data_path = join(root_path, 'translation_data/korean-english-park.dev/korean-english-park.dev.ko')

test_input_data_path = join(root_path, 'translation_data/korean-english-park.test/korean-english-park.test.en')
test_output_data_path = join(root_path, 'translation_data/korean-english-park.test/korean-english-park.test.ko')

In [4]:
# from here is the not one hot encoded output
class Dataset:
    def __init__(self, max_seq_length=1000):
        
        # list of data path of train, validation, test
        # For translation, each has input_path, output_path
        self.input_tokens = list()
        self.target_tokens = list()
        self.input_token_index = dict()
        self.target_token_index = dict()
        self.num_encoder_tokens = 0
        self.num_decoder_tokens = 0
        self.max_seq_length = max_seq_length
        
        self.okt = Okt()
        
        # Temp: to check the distribution of the words and their counted numbers
        self._input_tokens = dict()
        self._target_tokens = dict()
        self.dictionary_freq = 0
    
    def _read_data_seq2seq(self, data_path_list, token_type='char', kor_tokenizer='custom', 
                           kor_tokenizer_max_word_char=2, num_samples=10000):
        
        # temp
        onehot_style = False
        embedding_version = True
        
        
        if len(data_path_list) > 2:
            print('too many data path:', len(data_path_list))
            return
        
        input_texts = []
        target_texts = []
        
        data_docs = list()
        
        for idx, data_path in enumerate(data_path_list):
            with open(data_path, 'r', encoding='utf-8') as f:
                new_data_lines = f.read().split('\n')
                data_docs.append(new_data_lines)
#                 print(len(new_data_lines))
            
        train_val_test_index = list()
                
        # 1. Tokenize & Count & Store all tokens even duplicates
        for idx, doc_texts in enumerate(data_docs):
            
            # 1-1. Tokenize
            # We use "tab" as the "start sequence" character
            # for the targets, and "\n" as "end sequence" character.
            if idx % 2 == 0: # input text
                data_texts = input_texts
                data_tokens = self.input_tokens
                _data_tokens = self._input_tokens
                # Tokenize for english input, char and all tokenizers but custom will be accepted
                _doc_texts = [self.tokenizer(sentence=sentence, token_type=token_type, kor_tokenizer=kor_tokenizer, 
                                            kor_tokenizer_max_word_char=-1)
                             for sentence in doc_texts]
            else: # target text; put \t and \n as for start and end marking
                data_texts = target_texts
                data_tokens = self.target_tokens
                _data_tokens = self._target_tokens
                _doc_texts = [self.tokenizer(sentence=sentence, token_type=token_type, kor_tokenizer=kor_tokenizer, 
                                            kor_tokenizer_max_word_char=kor_tokenizer_max_word_char)
                             for sentence in doc_texts]
                _doc_texts = [np.insert(tokenized_sentence, [0, len(tokenized_sentence)], ['\t', '\n']) 
                              for tokenized_sentence in _doc_texts if len(tokenized_sentence) > 0]
    
            
            data_texts.extend(_doc_texts)
            
            # 1-2. Count & Store all tokens even duplicates
            if self.num_encoder_tokens != -1:
#             if self.num_encoder_tokens == 0:
                data_tokens.append('<unk>')
                for tokenized_sentence in data_texts:
                    for token in tokenized_sentence:
                        if token not in data_tokens:
                            data_tokens.append(token)
                        if token not in _data_tokens:
                            _data_tokens[token] = 0
                        _data_tokens[token] += 1
                
                for key, count in dropwhile(lambda key_count: key_count[1] > self.dictionary_freq, Counter(_data_tokens).most_common()):
                    del _data_tokens[key]
            train_val_test_index.append(len(data_texts))
        train_val_test_index = train_val_test_index[::2]
                        
        # 2. Unique tokens only
        self.input_tokens = sorted(list(set(self.input_tokens)))
        self.target_tokens = sorted(list(set(self.target_tokens)))
        self.num_encoder_tokens = len(self.input_tokens)
        self.num_decoder_tokens = len(self.target_tokens)
        max_encoder_seq_length = max([len(txt) for txt in input_texts])
        max_decoder_seq_length = max([len(txt) for txt in target_texts])

        print('Number of samples:', len(input_texts))
        print('Number of unique input tokens:', self.num_encoder_tokens)
        print('Number of unique output tokens:', self.num_decoder_tokens)
        print('Max sequence length for inputs:', max_encoder_seq_length)
        print('Max sequence length for outputs:', max_decoder_seq_length)
        
        # 3. Replace words with frequency under freq
        

        input_token_index = dict(
            [(token, i) for i, token in enumerate(self.input_tokens)])
        target_token_index = dict(
            [(token, i) for i, token in enumerate(self.target_tokens)])

        encoder_input_data_dict = dict()
        decoder_input_data_dict = dict()
        decoder_target_data_dict = dict()
        for idx, pos in enumerate(range(0, len(input_texts), num_samples)):
            next_pos = min(pos + num_samples, len(input_texts))
            
            if onehot_style:
                encoder_input_data = np.zeros(
                    (next_pos - pos, self.max_seq_length, self.num_encoder_tokens),
                    dtype='float32')
                decoder_input_data = np.zeros(
                    (next_pos - pos, self.max_seq_length, self.num_decoder_tokens),
                    dtype='float32')
                decoder_target_data = np.zeros(
                    (next_pos - pos, self.max_seq_length, self.num_decoder_tokens),
                    dtype='float32')
            else:
                encoder_input_data = np.zeros(
                    shape=(next_pos - pos, self.max_seq_length),
                    dtype='float32')
                decoder_input_data = np.zeros(
                    (next_pos - pos, self.max_seq_length),
                    dtype='float32')
                decoder_target_data = np.zeros(
                    (next_pos - pos, self.max_seq_length),
                    dtype='float32')
            
            batch_input_texts = input_texts[pos: next_pos]
            batch_target_texts = target_texts[pos: next_pos]

            for i, (input_text, target_text) in enumerate(zip(batch_input_texts, batch_target_texts)):
                for t, token in enumerate(input_text):
                    if t >= self.max_seq_length:
                        break
#                     if char not in input_token_index:
#                         char = '<unk>'
                    if token not in input_token_index:
                        put_index = input_token_index['<unk>']
                    else:
                        put_index = input_token_index[token]
            
                    if onehot_style:
                        encoder_input_data[i, t, put_index] = 1.
                    else:
                        encoder_input_data[i, t] = put_index

                for t, token in enumerate(target_text):
                    if t >= self.max_seq_length:
                        break
                    # decoder_target_data is ahead of decoder_input_data by one timestep
                    if token not in target_token_index:
                        put_index = target_token_index['<unk>']
                    else:
                        put_index = target_token_index[token]
                            
                    if onehot_style:
                        decoder_input_data[i, t, put_index] = 1.
                        if t > 0:
                            # decoder_target_data will be ahead by one timestep
                            # and will not include the start character.
                            decoder_target_data[i, t - 1, target_token_index[token]] = 1.
                    else:
                        decoder_input_data[i, t] = put_index
                        if t > 0:
                            # decoder_target_data will be ahead by one timestep
                            # and will not include the start character.
                            decoder_target_data[i, t - 1] = put_index
            
            if not embedding_version:
                new_shape = np.append(encoder_input_data.shape, 1)
                encoder_input_data = encoder_input_data.reshape(new_shape)
                

                new_shape = np.append(decoder_input_data.shape, 1)
                decoder_input_data = decoder_input_data.reshape(new_shape)
                

                new_shape = np.append(decoder_target_data.shape, 1)
                decoder_target_data = decoder_target_data.reshape(new_shape)
                
            encoder_input_data_dict[idx] = encoder_input_data
            decoder_input_data_dict[idx] = decoder_input_data
            decoder_target_data_dict[idx] = decoder_target_data
            
        self.input_token_index = input_token_index
        self.target_token_index = target_token_index
        
        # Always dataset comes first then additional info comes next
        return [[encoder_input_data_dict, decoder_input_data_dict, decoder_target_data_dict], 
                [self.num_encoder_tokens, self.num_decoder_tokens],
               [input_texts, target_texts]]
    
    def tokenizer(self, sentence, token_type='char', kor_tokenizer='custom', kor_tokenizer_max_word_char=2):
        if token_type == 'char':
            return [s for s in sentence]
        else: # token_type == word
            if kor_tokenizer == 'okt':
                return self.okt.morphs(sentence, norm=True, stem=True)
            else: # 'custom'
                tokenized_sentence = sentence.split(' ')
                if kor_tokenizer_max_word_char == -1:
                    return [word for word in tokenized_sentence]
                else:
                    return [word[:kor_tokenizer_max_word_char] for word in tokenized_sentence]
    
    def get_data(self, data_path_list, data_type='translation', token_type='char',
                 kor_tokenizer='custom', kor_tokenizer_max_word_char=2, num_samples=10000):
        data_type_dict = {
            'translation': self._read_data_seq2seq
        }
        
        if data_type not in data_type_dict:
            print('not valid data type. given: translation')
        
        all_data = data_type_dict[data_type](data_path_list=data_path_list, token_type=token_type, 
                                             kor_tokenizer=kor_tokenizer, kor_tokenizer_max_word_char=kor_tokenizer_max_word_char,
                                             num_samples=num_samples)
        
        return all_data
        

In [5]:
# This is only for test
num_samples = 50000
dataset = Dataset(max_seq_length=50)
train_data = dataset.get_data(data_path_list=[input_data_path, output_data_path], data_type='translation', 
                              token_type='word', kor_tokenizer='okt', kor_tokenizer_max_word_char=2, num_samples=num_samples)
# print('')
val_data = dataset.get_data(data_path_list=[val_input_data_path, val_output_data_path], data_type='translation', 
                            token_type='word', kor_tokenizer='okt', kor_tokenizer_max_word_char=2, num_samples=num_samples)
print('')
test_data = dataset.get_data(data_path_list=[test_input_data_path, test_output_data_path], data_type='translation', 
                             token_type='word', kor_tokenizer='okt', kor_tokenizer_max_word_char=2, num_samples=num_samples)

Number of samples: 94124
Number of unique input tokens: 55605
Number of unique output tokens: 52084
Max sequence length for inputs: 103
Max sequence length for outputs: 120
Number of samples: 1001
Number of unique input tokens: 55934
Number of unique output tokens: 52442
Max sequence length for inputs: 77
Max sequence length for outputs: 75

Number of samples: 2001
Number of unique input tokens: 56692
Number of unique output tokens: 53115
Max sequence length for inputs: 81
Max sequence length for outputs: 97


# 1. Encoder & Decoder inputs and Decoder output as onehot (original form)

In [6]:
from keras.layers import RepeatVector, Lambda, TimeDistributed, Reshape, Concatenate, Permute, Multiply, GRU, Softmax

In [7]:
print('Embedding version: size x sequence_length, onehot batch conversion for the output')

def RepeatVectorLayer(rep, axis):
    return Lambda(lambda x: K.repeat_elements(K.expand_dims(x, axis), rep, axis),
                         lambda x: tuple((x[0],) + x[1:axis] + (rep,) + x[axis:]))

def convert_generator(x_data, y_data, num_encoder_tokens, num_decoder_tokens, batch_size=16):
    '''
    Return a random from x_data, y_data
    '''
    while True:
        for me_idx in x_data[0].keys():
            samples_per_mini_epoch = x_data[0][me_idx].shape[0]
            number_of_steps = np.ceil(samples_per_mini_epoch / batch_size).astype(int)
            data_idx = list(range(number_of_steps))
        
            while len(data_idx) > 0:
                # choose batch_size random images / labels from the data
                
                idx = random.choice(data_idx)
                next_idx = min(samples_per_mini_epoch, idx + batch_size)
                encoder_input = x_data[0][me_idx][idx: next_idx]
                decoder_input = x_data[1][me_idx][idx: next_idx]
                decoder_output = y_data[me_idx][idx: next_idx]
                
                converted_encoder_input = np.zeros(shape=(encoder_input.shape[0], encoder_input.shape[1], num_encoder_tokens))
                for idx2, data in enumerate(encoder_input):
                    for idx3, d in enumerate(data):
                        converted_encoder_input[idx2, idx3, int(d)] = 1
                        
                converted_decoder_input = np.zeros(shape=(decoder_input.shape[0], decoder_input.shape[1], num_decoder_tokens))
                for idx2, data in enumerate(decoder_input):
                    for idx3, d in enumerate(data):
                        converted_decoder_input[idx2, idx3, int(d)] = 1
                
                converted_decoder_output = np.zeros(shape=(decoder_output.shape[0], decoder_output.shape[1], num_decoder_tokens))
                for idx2, data in enumerate(decoder_output):
                    for idx3, d in enumerate(data):
                        converted_decoder_output[idx2, idx3, int(d)] = 1

                data_idx.remove(idx)
                yield [converted_encoder_input, converted_decoder_input], converted_decoder_output

class Sequence2sequence:
    def __init__(self, initial_params, gpu=0):
        
        self.max_encoder_seq_length, self.max_decoder_seq_length, self.num_encoder_tokens, self.num_decoder_tokens = initial_params
#         print('num_encoder_tokens: %i, num_decoder_tokens: %i' %( self.num_encoder_tokens, self.num_decoder_tokens))
        
        self.embedding_dim = 256
        self.latent_dim = 256  # Latent dimensionality of the encoding space.
        self.max_enc_seq_length = 50
        self.max_dec_seq_length = 50
        
        with K.tf.device('/gpu:' + str(gpu)):
            # Define an input sequence and process it.
            encoder_inputs = Input(shape=(self.max_enc_seq_length, self.num_encoder_tokens))
           
            encoder = LSTM(self.latent_dim, return_sequences=True, return_state=True)
            encoder_outputs, state_h, state_c = encoder(encoder_inputs)
            # We discard `encoder_outputs` and only keep the states.
            encoder_states = [state_h, state_c]

            # Set up the decoder, using `encoder_states` as initial state.
            decoder_inputs = Input(shape=(self.max_dec_seq_length, self.num_decoder_tokens))
            # We set up our decoder to return full output sequences,
            # and to return internal states as well. We don't use the
            # return states in the training model, but we will use them in inference.
            decoder = LSTM(self.latent_dim, return_sequences=True, return_state=True)
            decoder_outputs, _, _ = decoder(decoder_inputs,
                                                 initial_state=encoder_states)
            
#             encoder_inputs = Input(shape=(self.max_enc_seq_length, self.num_encoder_tokens))
#             encoder = GRU(self.latent_dim, return_sequences=True, return_state=True)
#             encoder_outputs, encoder_states = encoder(encoder_inputs)
            
#             decoder_inputs = Input(shape=(self.max_dec_seq_length, self.num_decoder_tokens))
#             decoder = GRU(self.latent_dim, return_sequences=True, return_state=True)
#             decoder_outputs, _ = decoder(decoder_inputs, initial_state=encoder_states)
            
            
            # 어텐션 매커니즘.
            repeat_d_layer = RepeatVectorLayer(self.max_enc_seq_length, 2)
            repeat_d = repeat_d_layer(decoder_outputs)

            repeat_e_layer = RepeatVectorLayer(self.max_dec_seq_length, 1)
            repeat_e = repeat_e_layer(encoder_outputs)

            concat_for_score_layer = Concatenate(axis=-1)
            concat_for_score = concat_for_score_layer([repeat_d, repeat_e])

            dense1_t_score_layer = Dense(self.latent_dim // 2, activation='tanh')
            dense1_score_layer = TimeDistributed(dense1_t_score_layer)
            dense1_score = dense1_score_layer(concat_for_score)
            dense2_t_score_layer = Dense(1) # to make softmax comparison
            dense2_score_layer = TimeDistributed(dense2_t_score_layer)
            dense2_score = dense2_score_layer(dense1_score)
            dense2_score = Reshape((self.max_dec_seq_length, self.max_enc_seq_length))(dense2_score) # reshape to be 2 dims

            softmax_score_layer = Softmax(axis=-1)
            softmax_score = softmax_score_layer(dense2_score)

            repeat_score_layer = RepeatVectorLayer(self.latent_dim, 2)
            repeat_score = repeat_score_layer(softmax_score)

            permute_e = Permute((2, 1))(encoder_outputs)
            repeat_e_layer = RepeatVectorLayer(self.max_dec_seq_length, 1)
            repeat_e = repeat_e_layer(permute_e)

            attended_mat_layer = Multiply()
            attended_mat = attended_mat_layer([repeat_score, repeat_e])

            context_layer = Lambda(lambda x: K.sum(x, axis=-1),
                                         lambda x: tuple(x[:-1]))
            context = context_layer(attended_mat)

            concat_context_layer = Concatenate(axis=-1)
            concat_context = concat_context_layer([context, decoder_outputs])

            attention_dense_output_layer = Dense(self.latent_dim, activation='tanh')
            attention_output_layer = TimeDistributed(attention_dense_output_layer)
            attention_output = attention_output_layer(concat_context)
            
            
            decoder_dense = Dense(self.num_decoder_tokens, activation='softmax')
            decoder_outputs_pred = decoder_dense(attention_output)
            
        # Define the model that will turn
        # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs_pred)
        self.encoder_inputs = encoder_inputs
        self.encoder_states = encoder_states
        self.encoder_outputs = encoder_outputs
        self.decoder = decoder
        self.decoder_inputs = decoder_inputs
        self.decoder_dense = decoder_dense
        self.repeat_d_layer = repeat_d_layer
        self.repeat_e_layer = repeat_e_layer
        self.attention_output = attention_output
        

        # Run training
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
        print(model.summary())
        
        self.model = model
    
    def train(self, train_data, validation_data=None, epochs=50, batch_size=64, verbose=1, validation_split=0.1, *args):
#         print(len(args))
        encinput_train_dict, decinput_train_dict, dectarget_train_dict = train_data
        if validation_data is not None:
            encinput_val_dict, decinput_val_dict, dectarget_val_dict = validation_data
        
        mc = ModelCheckpoint('./save/s2s_{epoch:03d}.h5', save_weights_only=True, period=5)
        val_steps = 0
        
        if validation_data is not None:
            val_data = ([encinput_val_dict, decinput_val_dict], dectarget_val_dict)
            for encinput_val_data in encinput_val_dict.values():
                val_steps += np.ceil(encinput_val_data.shape[0] / batch_size)
            val_data = convert_generator(x_data=val_data[0], y_data=val_data[1], batch_size=batch_size,
                                        num_decoder_tokens=self.num_decoder_tokens,
                                        num_encoder_tokens=self.num_encoder_tokens)
        else:
            val_data = None

        steps_per_epoch = 0
        for encinput_train_data in encinput_train_dict.values():
                steps_per_epoch += np.ceil(encinput_train_data.shape[0] / batch_size)
                
        self.model.fit_generator(generator=convert_generator([encinput_train_dict, decinput_train_dict],
                                                            dectarget_train_dict,
                                                            num_decoder_tokens=self.num_decoder_tokens,
                                                            num_encoder_tokens=self.num_encoder_tokens),
                                 steps_per_epoch=steps_per_epoch,
                                 validation_data=val_data,
                                 validation_steps=val_steps,
                                 epochs=epochs,
                                 verbose=verbose,
                                 callbacks=[mc]
                                )
        print('')
            
            
    def test(self, test_data, batch_size=64):
        print('')
    
    @staticmethod
    def get_initial_params(*args):
#         print(len(args))
#         print(len(args[0]))
        (encinput_train_dict, decinput_train_dict, dectarget_train_dict), (num_encoder_tokens, num_decoder_tokens), _ = args[0]
        encoder_input_data, decoder_input_data, decoder_target_data = encinput_train_dict[0], decinput_train_dict[0], dectarget_train_dict[0]
        return [encoder_input_data.shape[1], decoder_input_data.shape[1], num_encoder_tokens, num_decoder_tokens]

Embedding version: size x sequence_length, onehot batch conversion for the output


In [None]:
seq2seq_try11 = Sequence2sequence(Sequence2sequence.get_initial_params(test_data), gpu=0)
seq2seq_try11.train(train_data=train_data[0], 
              validation_data=val_data[0], 
              epochs=10, batch_size=16, verbose=1, validation_split=0.1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50, 56692)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 50, 53115)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 50, 256), (N 58315776    input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 50, 256), (N 54652928    input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

In [29]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

target_data = test_data

# Define sampling models

# added for my version
encoder_inputs = seq2seq_try11.encoder_inputs
encoder_states = seq2seq_try11.encoder_states
latent_dim = seq2seq_try11.latent_dim
max_encoder_seq_length = seq2seq_try11.max_encoder_seq_length
max_decoder_seq_length = seq2seq_try11.max_decoder_seq_length

decoder = seq2seq_try11.decoder
decoder_inputs = seq2seq_try11.decoder_inputs
decoder_dense = seq2seq_try11.decoder_dense
repeat_d_layer = seq2seq_try11.repeat_d_layer
repeat_e_layer = seq2seq_try11.repeat_e_layer
attention_output = seq2seq_try11.attention_output
encoder_outputs = seq2seq_try11.encoder_outputs

input_token_index = dict(
            [(token, i) for i, token in enumerate(dataset.input_tokens)])
target_token_index = dict(
    [(token, i) for i, token in enumerate(dataset.target_tokens)])

(encinput_train_dict, decinput_train_dict, dectarget_train_dict), (num_encoder_tokens, num_decoder_tokens), (input_texts, target_texts) = target_data
encoder_input_data, decoder_input_data, decoder_target_data = encinput_train_dict[0], decinput_train_dict[0], dectarget_train_dict[0]



encoder_model = Model(encoder_inputs, [encoder_outputs, encoder_states[0], encoder_states[1]]) 
encoder_outputs_input = Input(shape=(max_encoder_seq_length, latent_dim)) 
decoder_inputs = Input(shape=(1, num_decoder_tokens)) 
decoder_state_input_h = Input(shape=(latent_dim,)) 
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, decoder_h, decoder_c = decoder(decoder_inputs, initial_state=decoder_states_inputs) 

# repeat_d_layer = RepeatVectorLayer(max_encoder_seq_length, 2) 
repeat_d = repeat_d_layer(decoder_outputs) 
# repeat_e_layer = RepeatVectorLayer(1, axis=1) 
repeat_e = repeat_e_layer(encoder_outputs_input) 
concat_for_score_layer = Concatenate(axis=-1) 
concat_for_score = concat_for_score_layer([repeat_d, repeat_e]) 
dense1_score_layer = TimeDistributed(dense1_t_score_layer) 
dense1_score = dense1_score_layer(concat_for_score) 
dense2_score_layer = TimeDistributed(dense2_t_score_layer) 
dense2_score = dense2_score_layer(dense1_score) 
dense2_score = Reshape((1, max_encoder_seq_length))(dense2_score)
softmax_score_layer = Softmax(axis=-1) 
softmax_score = softmax_score_layer(dense2_score) 
repeat_score_layer = RepeatVectorLayer(latent_dim, 2) 
repeat_score = repeat_score_layer(softmax_score) 
permute_e = Permute((2, 1))(encoder_outputs_input) 
repeat_e_layer = RepeatVectorLayer(1, axis=1) 
repeat_e = repeat_e_layer(permute_e) 
attended_mat_layer = Multiply() 
attended_mat = attended_mat_layer([repeat_score, repeat_e]) 
context_layer = Lambda(lambda x: K.sum(x, axis=-1), lambda x: tuple(x[:-1])) 
context = context_layer(attended_mat) 
concat_context_layer = Concatenate(axis=-1) 
concat_context = concat_context_layer([context, decoder_outputs]) 
attention_output_layer = TimeDistributed(attention_dense_output_layer)
attention_output = attention_output_layer(concat_context) 
decoder_att_outputs = decoder_dense(attention_output) 
decoder_model = Model([decoder_inputs, decoder_state_input_h, decoder_state_input_c, encoder_outputs_input], 
                      [decoder_outputs, decoder_h, decoder_c, decoder_att_outputs])


# encoder_model = Model(encoder_inputs, encoder_states)
# # LSTM
# decoder_state_input_h = Input(shape=(latent_dim,))
# decoder_state_input_c = Input(shape=(latent_dim,))
# decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# decoder_outputs, state_h, state_c = decoder_lstm(
#     decoder_inputs, initial_state=decoder_states_inputs)
# decoder_states = [state_h, state_c]

# # GRU
# # decoder_state_input_h = Input(shape=(latent_dim,))
# # decoder_states_inputs = [decoder_state_input_h]
# # decoder_outputs, state_h = decoder(
# #     decoder_inputs, initial_state=decoder_states_inputs)
# # decoder_states = [state_h]

# encoder_outputs_input = Input(shape=(max_encoder_seq_length, latent_dim))
# m = Model(inputs=[encoder_outputs_input, decoder_outputs], )

# decoder_outputs = decoder_dense(decoder_outputs)
# decoder_model = Model(
#     [decoder_inputs] + decoder_states_inputs,
#     [decoder_outputs] + decoder_states)










# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    _input_seq = input_seq
    input_seq = np.zeros(shape=(input_seq.shape[0], input_seq.shape[1], seq2seq_try11.num_encoder_tokens))
    for idx2, data in enumerate(_input_seq):
        for idx3, d in enumerate(data):
            input_seq[idx2, idx3, int(d)] = 1
    
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
#         output_tokens, h, c = decoder_model.predict(
#             [target_seq] + states_value)
        output_tokens, h = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > dataset.max_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
#         states_value = [h, c]
        states_value = h

    return decoded_sentence


for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
    print('Target sentence:', target_texts[seq_index])

ValueError: A `Concatenate` layer requires inputs with matching shapes except for the concat axis. Got inputs shapes: [(None, 1, 100, 256), (None, 100, 100, 256)]

# 2. Encoder & Decoder inputs as indices and decoder output as onehot, Encoder & Decoder inputs will be embeded

In [32]:
print('Embedding version: size x sequence_length, onehot batch conversion for the output')

def convert_generator(x_data, y_data, num_decoder_tokens, batch_size=16):
    '''
    Return a random from x_data, y_data
    '''
    while True:
        for me_idx in x_data[0].keys():
            samples_per_mini_epoch = x_data[0][me_idx].shape[0]
            number_of_steps = np.ceil(samples_per_mini_epoch / batch_size).astype(int)
            data_idx = list(range(number_of_steps))
        
            while len(data_idx) > 0:
                # choose batch_size random images / labels from the data
                
                idx = random.choice(data_idx)
                next_idx = min(samples_per_mini_epoch, idx + batch_size)
                encoder_input = x_data[0][me_idx][idx: next_idx]
                decoder_input = x_data[1][me_idx][idx: next_idx]
                decoder_output = y_data[me_idx][idx: next_idx]
                converted_decoder_output = np.zeros(shape=(decoder_output.shape[0], decoder_output.shape[1], num_decoder_tokens))
                for idx2, data in enumerate(decoder_output):
                    for idx3, d in enumerate(data):
                        converted_decoder_output[idx2, idx3, int(d)] = 1

                data_idx.remove(idx)
                yield [encoder_input, decoder_input], converted_decoder_output

class Sequence2sequence:
    def __init__(self, initial_params, gpu=0):
        
        self.max_encoder_seq_length, self.max_decoder_seq_length, self.num_encoder_tokens, self.num_decoder_tokens = initial_params
        
        self.embedding_dim = 256
        self.latent_dim = 256  # Latent dimensionality of the encoding space.
        self.max_enc_seq_length = 50
        self.max_dec_seq_length = 50
        
        with K.tf.device('/gpu:' + str(gpu)):
            # Define an input sequence and process it.
            encoder_inputs = Input(shape=(self.max_enc_seq_length, ))
            embeded_encoder_inputs = Embedding(self.num_encoder_tokens, self.embedding_dim, 
                                               embeddings_constraint=MinMaxNorm(min_value=0.0, max_value=1.0, axis=1),
                                               embeddings_initializer=RandomUniform(minval=0.0, maxval=1.0), 
                                               input_length=self.max_encoder_seq_length)(encoder_inputs)
#             embeded_encoder_inputs = Activation('tanh')(embeded_encoder_inputs)
            encoder = LSTM(self.latent_dim, return_state=True)
            encoder_outputs, state_h, state_c = encoder(embeded_encoder_inputs)
            # We discard `encoder_outputs` and only keep the states.
            encoder_states = [state_h, state_c]

            # Set up the decoder, using `encoder_states` as initial state.
            decoder_inputs = Input(shape=(self.max_dec_seq_length, ))
            decoder_embed = Embedding(self.num_decoder_tokens, self.embedding_dim, 
#                                       embeddings_constraint=MinMaxNorm(min_value=0.0, max_value=1.0, axis=1),
#                                       embeddings_initializer=RandomUniform(minval=0.0, maxval=1.0), 
                                      input_length=self.max_decoder_seq_length)
            embeded_decoder_inputs = decoder_embed(decoder_inputs)
#             embeded_decoder_inputs = Activation('tanh')(embeded_decoder_inputs)
            # We set up our decoder to return full output sequences,
            # and to return internal states as well. We don't use the
            # return states in the training model, but we will use them in inference.
            decoder_lstm = LSTM(self.latent_dim, return_sequences=True, return_state=True)
            decoder_outputs, _, _ = decoder_lstm(embeded_decoder_inputs,
                                                 initial_state=encoder_states)
            decoder_dense = Dense(self.num_decoder_tokens, activation='softmax')
            decoder_outputs_pred = decoder_dense(decoder_outputs)
            

        # Define the model that will turn
        # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs_pred)
        self.encoder_inputs = encoder_inputs
        self.encoder_states = encoder_states
        self.decoder_lstm = decoder_lstm
        self.decoder_inputs = decoder_inputs
        self.decoder_dense = decoder_dense
        self.decoder_embed = decoder_embed
        self.embeded_decoder_inputs = embeded_decoder_inputs

        # Run training
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
        print(model.summary())
        
        self.model = model
    
    def train(self, train_data, validation_data=None, epochs=50, batch_size=64, verbose=1, validation_split=0.1, *args):
#         print(len(args))
        encinput_train_dict, decinput_train_dict, dectarget_train_dict = train_data
        if validation_data is not None:
            encinput_val_dict, decinput_val_dict, dectarget_val_dict = validation_data
        
        mc = ModelCheckpoint('./save/s2s_{epoch:03d}.h5', save_weights_only=True, period=5)
        val_steps = 0
        
        if validation_data is not None:
            val_data = ([encinput_val_dict, decinput_val_dict], dectarget_val_dict)
            for encinput_val_data in encinput_val_dict.values():
                val_steps += np.ceil(encinput_val_data.shape[0] / batch_size)
            val_data = convert_generator(x_data=val_data[0], y_data=val_data[1], batch_size=batch_size,
                                        num_decoder_tokens=self.num_decoder_tokens)
        else:
            val_data = None

        steps_per_epoch = 0
        for encinput_train_data in encinput_train_dict.values():
                steps_per_epoch += np.ceil(encinput_train_data.shape[0] / batch_size)
                
        self.model.fit_generator(generator=convert_generator([encinput_train_dict, decinput_train_dict],
                                                            dectarget_train_dict,
                                                            num_decoder_tokens=self.num_decoder_tokens),
                                 steps_per_epoch=steps_per_epoch,
                                 validation_data=val_data,
                                 validation_steps=val_steps,
                                 epochs=epochs,
                                 verbose=verbose,
                                 callbacks=[mc]
                                )
        print('')
            
            
    def test(self, test_data, batch_size=64):
        print('')
    
    @staticmethod
    def get_initial_params(*args):
#         print(len(args))
#         print(len(args[0]))
        (encinput_train_dict, decinput_train_dict, dectarget_train_dict), (num_encoder_tokens, num_decoder_tokens), _ = args[0]
        encoder_input_data, decoder_input_data, decoder_target_data = encinput_train_dict[0], decinput_train_dict[0], dectarget_train_dict[0]
        return [encoder_input_data.shape[1], decoder_input_data.shape[1], num_encoder_tokens, num_decoder_tokens]

Embedding version: size x sequence_length, onehot batch conversion for the output


In [33]:
seq2seq_try21 = Sequence2sequence(Sequence2sequence.get_initial_params(test_data), gpu=0)
seq2seq_try21.train(train_data=train_data[0], 
              validation_data=val_data[0], 
              epochs=50, batch_size=8, verbose=1, validation_split=0.1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
input_28 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 256)      14513152    input_27[0][0]                   
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 256)      13597440    input_28[0][0]                   
__________________________________________________________________________________________________
lstm_9 (LS

KeyboardInterrupt: 

In [34]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

target_data = test_data

# Define sampling models

# added for my version
encoder_inputs = seq2seq_try21.encoder_inputs
encoder_states = seq2seq_try21.encoder_states
latent_dim = seq2seq_try21.latent_dim
decoder_lstm = seq2seq_try21.decoder_lstm
decoder_inputs = seq2seq_try21.decoder_inputs
decoder_dense = seq2seq_try21.decoder_dense
input_token_index = dict(
            [(token, i) for i, token in enumerate(dataset.input_tokens)])
target_token_index = dict(
    [(token, i) for i, token in enumerate(dataset.target_tokens)])

(encinput_train_dict, decinput_train_dict, dectarget_train_dict), (num_encoder_tokens, num_decoder_tokens), (input_texts, target_texts) = target_data
encoder_input_data, decoder_input_data, decoder_target_data = encinput_train_dict[0], decinput_train_dict[0], dectarget_train_dict[0]


encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    _input_seq = input_seq
    input_seq = np.zeros(shape=(input_seq.shape[0], input_seq.shape[1], seq2seq_try11.num_encoder_tokens))
    for idx2, data in enumerate(_input_seq):
        for idx3, d in enumerate(data):
            input_seq[idx2, idx3, int(d)] = 1
    
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > dataset.max_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
    print('Target sentence:', target_texts[seq_index])

ValueError: Input 0 is incompatible with layer lstm_10: expected ndim=3, found ndim=2

# 1. Embedding to Embedding (All inputs of encoder & decoder, and output of decoder will be fed as indices of character tokens)

# Try 1-1. Categorical Cross Entropy

In [None]:
print('Embedding to Embedding')
def convert_generator(x_data, y_data, num_decoder_tokens, batch_size=16):
    '''
    Return a random image from x_data, y_data
    '''
    
    samples_per_epoch = x_data[0].shape[0]
    number_of_steps = np.ceil(samples_per_epoch / batch_size).astype(int)
    data_idx = list(range(number_of_steps))
#     print('number of unique decoder tokens:', num_decoder_tokens)
    
    while True:
        # choose batch_size random images / labels from the data
#         idx = np.random.randint(0, x_data.shape[0], batch_size)
        idx = random.choice(data_idx)
        next_idx = min(samples_per_epoch, idx + batch_size)
        encoder_input = x_data[0][idx: next_idx]
        decoder_input = x_data[1][idx: next_idx]
        decoder_output = y_data[idx: next_idx]
        converted_decoder_output = np.zeros(shape=(decoder_output.shape[0], decoder_output.shape[1], num_decoder_tokens))
        for idx2, data in enumerate(decoder_output):
            for idx3, d in enumerate(data):
                converted_decoder_output[idx2, idx3, int(d)] = 1
        
        data_idx.remove(idx)
        
        yield [encoder_input, decoder_input], converted_decoder_output

class Sequence2sequence:
    def __init__(self, initial_params, options=dict(), gpu=0):
        
        max_encoder_seq_length, max_decoder_seq_length, num_encoder_tokens, num_decoder_tokens = initial_params
        if 'loss' in options:
            loss_type = options['loss']
        else:
            loss_type = 'categorical_crossentropy'
            
        loss_dict = {
            'categorical_crossentropy': losses.categorical_crossentropy,
            'binary_crossentropy': losses.binary_crossentropy,
            'mse': losses.mse
        }
        
        loss = loss_dict[loss_type]
        
        
        self.embedding_dim = 256
        self.latent_dim = 256  # Latent dimensionality of the encoding space.
        
        with K.tf.device('/gpu:' + str(gpu)):
            # Define an input sequence and process it.
            encoder_inputs = Input(shape=(None, ))
            embeded_encoder_inputs = Embedding(num_encoder_tokens, self.embedding_dim, 
                                               embeddings_constraint=MinMaxNorm(min_value=0.0, max_value=1.0, axis=1),
                                               embeddings_initializer=RandomUniform(minval=0.0, maxval=1.0), 
                                               input_length=max_encoder_seq_length)(encoder_inputs)
            embeded_encoder_inputs = Activation('tanh')(embeded_encoder_inputs)
            encoder = LSTM(self.latent_dim, return_state=True)
            encoder_outputs, state_h, state_c = encoder(embeded_encoder_inputs)
            # We discard `encoder_outputs` and only keep the states.
            encoder_states = [state_h, state_c]

            # Set up the decoder, using `encoder_states` as initial state.
            decoder_inputs = Input(shape=(None, ))
            decoder_embed = Embedding(num_decoder_tokens, self.embedding_dim, 
#                                       embeddings_constraint=MinMaxNorm(min_value=0.0, max_value=1.0, axis=1),
#                                       embeddings_initializer=RandomUniform(minval=0.0, maxval=1.0), 
                                      input_length=max_decoder_seq_length)
            embeded_decoder_inputs = decoder_embed(decoder_inputs)
            embeded_decoder_inputs = Activation('tanh')(embeded_decoder_inputs)
            # We set up our decoder to return full output sequences,
            # and to return internal states as well. We don't use the
            # return states in the training model, but we will use them in inference.
            decoder_lstm = LSTM(self.latent_dim, return_sequences=True, return_state=True)
            decoder_outputs, _, _ = decoder_lstm(embeded_decoder_inputs,
                                                 initial_state=encoder_states)
            decoder_dense = Dense(self.embedding_dim)
            decoder_outputs_pred = decoder_dense(decoder_outputs)
            decoder_outputs_pred = Activation('tanh')(decoder_outputs_pred)
            
            decoder_outputs_true = Input(shape=(None, ))
            embeded_decoder_outputs = decoder_embed(decoder_outputs_true)
            embeded_decoder_outputs = Activation('tanh')(embeded_decoder_outputs)

        # Define the model that will turn
        # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
        model = Model([encoder_inputs, decoder_inputs, decoder_outputs_true], decoder_outputs_pred)
        self.encoder_inputs = encoder_inputs
        self.encoder_states = encoder_states
        self.decoder_lstm = decoder_lstm
        self.decoder_inputs = decoder_inputs
        self.decoder_dense = decoder_dense
        self.decoder_embed = decoder_embed
        self.embeded_decoder_inputs = embeded_decoder_inputs

        # Run training
        _loss = loss(y_true=embeded_decoder_outputs, y_pred=decoder_outputs_pred)
        model.add_loss(_loss) 
        model.compile(optimizer='rmsprop')
        print(model.summary())
        
        self.model = model
    
    def train(self, train_data, validation_data=None, epochs=50, batch_size=64, verbose=1, validation_split=0.1, *args):
#         print(len(args))
        encinput_train_dict, decinput_train_dict, dectarget_train_dict = train_data
        if validation_data is not None:
            encinput_val_dict, decinput_val_dict, dectarget_val_dict = validation_data
        
        decoder_predicter = Model(inputs=self.decoder_inputs, outputs=self.embeded_decoder_inputs)
        
        for epoch in range(epochs):
            print('[EPOCH %i]' % epoch)
            for idx in encinput_train_dict.keys():
                encinput_train_data = encinput_train_dict[idx]
                decinput_train_data = decinput_train_dict[idx]
                dectarget_train_data = dectarget_train_dict[idx]
                                
                if validation_data is not None:
                    val_idx = random.choice(list(encinput_val_dict.keys()))
                    encinput_val_data = encinput_val_dict[val_idx]
                    decinput_val_data = decinput_val_dict[val_idx]
                    dectarget_val_data = dectarget_val_dict[val_idx]
                    val_data = ([encinput_val_data, decinput_val_data, dectarget_val_data], None)
                else:
                    val_data = None
                
#                 print('example!!')
#                 sample = decoder_predicter.predict(dectarget_train_data[:10])
#                 print(sample[0])
#                 print(sample.shape)
                print('%s, %s, %s' % ( encinput_train_data.shape, decinput_train_data.shape, dectarget_train_data.shape))
                print('%s, %s, %s' % ( encinput_val_data.shape, decinput_val_data.shape, dectarget_val_data.shape))
                self.model.fit(x=[encinput_train_data, decinput_train_data, dectarget_train_data], 
#                                y=decoder_predicter.predict(dectarget_train_data),
                               validation_data=val_data,
                               batch_size=batch_size, 
                               epochs=1,
                               verbose=verbose,
                               validation_split=validation_split)
#                 del encinput_train_data, decinput_train_data, dectarget_train_data
                print('')
            # Save model
            self.model.save('s2s.h5')
            
            
    def test(self, test_data, batch_size=64):
        print('')
    
    @staticmethod
    def get_initial_params(*args):
#         print(len(args))
#         print(len(args[0]))
        (encinput_train_dict, decinput_train_dict, dectarget_train_dict), (num_encoder_tokens, num_decoder_tokens), _ = args[0]
        encoder_input_data, decoder_input_data, decoder_target_data = encinput_train_dict[0], decinput_train_dict[0], dectarget_train_dict[0]
        return [encoder_input_data.shape[1], decoder_input_data.shape[1], num_encoder_tokens, num_decoder_tokens]

In [None]:
seq2seq_try11 = Sequence2sequence(Sequence2sequence.get_initial_params(train_data), options={'loss': 'categorical_crossentropy'}, gpu=0)
seq2seq_try11.train(train_data=train_data[0], 
              validation_data=val_data[0], 
              epochs=20, batch_size=128, verbose=1, validation_split=0.1)

In [None]:
seq2seq_try12 = Sequence2sequence(Sequence2sequence.get_initial_params(train_data), options={'loss': 'binary_crossentropy'}, gpu=0)
seq2seq_try12.train(train_data=train_data[0], 
              validation_data=val_data[0], 
              epochs=20, batch_size=128, verbose=1, validation_split=0.1)

In [None]:
seq2seq_try13 = Sequence2sequence(Sequence2sequence.get_initial_params(train_data), options={'loss': 'mse'}, gpu=0)
seq2seq_try13.train(train_data=train_data[0], 
              validation_data=val_data[0], 
              epochs=20, batch_size=128, verbose=1, validation_split=0.1)

# 2. No embedding & only use indices to train

In [None]:
print('Not embedding version: size x sequence_length x 1')
class Sequence2sequence:
    def __init__(self, initial_params, options=dict(), gpu=0):
        
        max_encoder_seq_length, max_decoder_seq_length, num_encoder_tokens, num_decoder_tokens = initial_params
        
        self.embedding_dim = 256
        self.latent_dim = 256  # Latent dimensionality of the encoding space.
        
        loss_dict = {
            'mse': 'mse',
            'sparse_categorical_crossentropy': 'sparse_categorical_crossentropy'
        }
        
        if 'loss' in options and options['loss'] in loss_dict:
            loss = loss_dict[options['loss']]
        else:
            loss = 'mse'
        
        with K.tf.device('/gpu:' + str(gpu)):
            # Define an input sequence and process it.
            encoder_inputs = Input(shape=(None, 1))
            encoder = LSTM(self.latent_dim, return_state=True)
            encoder_outputs, state_h, state_c = encoder(encoder_inputs)
            # We discard `encoder_outputs` and only keep the states.
            encoder_states = [state_h, state_c]

            # Set up the decoder, using `encoder_states` as initial state.
            decoder_inputs = Input(shape=(None, 1))
            # We set up our decoder to return full output sequences,
            # and to return internal states as well. We don't use the
            # return states in the training model, but we will use them in inference.
            decoder_lstm = LSTM(self.latent_dim, return_sequences=True, return_state=True)
            decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                                 initial_state=encoder_states)
            decoder_dense = Dense(1, activation='relu')
            decoder_outputs_pred = decoder_dense(decoder_outputs)
            

        # Define the model that will turn
        # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs_pred)
        self.encoder_inputs = encoder_inputs
        self.encoder_states = encoder_states
        self.decoder_lstm = decoder_lstm
        self.decoder_inputs = decoder_inputs
        self.decoder_dense = decoder_dense

        # Run training
        model.compile(optimizer='rmsprop', loss=loss)
        print(model.summary())
        
        self.model = model
    
    def train(self, train_data, validation_data=None, epochs=50, batch_size=64, verbose=1, validation_split=0.1, *args):
#         print(len(args))
        encinput_train_dict, decinput_train_dict, dectarget_train_dict = train_data
        if validation_data is not None:
            encinput_val_dict, decinput_val_dict, dectarget_val_dict = validation_data
        
        
        for epoch in range(epochs):
            print('[EPOCH %i]' % epoch)
            for idx in encinput_train_dict.keys():
                encinput_train_data = encinput_train_dict[idx]
                decinput_train_data = decinput_train_dict[idx]
                dectarget_train_data = dectarget_train_dict[idx]
                                
                if validation_data is not None:
                    val_idx = random.choice(list(encinput_val_dict.keys()))
                    encinput_val_data = encinput_val_dict[val_idx]
                    decinput_val_data = decinput_val_dict[val_idx]
                    dectarget_val_data = dectarget_val_dict[val_idx]
                    val_data = ([encinput_val_data, decinput_val_data], dectarget_val_data)
                else:
                    val_data = None
                
#                 print('example!!')
#                 sample = decoder_predicter.predict(dectarget_train_data[:10])
#                 print(sample[0])
#                 print(sample.shape)
#                 print('%s, %s, %s' % ( encinput_train_data.shape, decinput_train_data.shape, dectarget_train_data.shape))
                self.model.fit(x=[encinput_train_data, decinput_train_data], 
                               y=dectarget_train_data,
                               validation_data=val_data,
                               batch_size=batch_size, 
                               epochs=1,
                               verbose=verbose,
                               validation_split=validation_split)
                del encinput_train_data, decinput_train_data, dectarget_train_data
                print('')
            # Save model
            self.model.save('s2s.h5')
            
            
    def test(self, test_data, batch_size=64):
        print('')
    
    @staticmethod
    def get_initial_params(*args):
#         print(len(args))
#         print(len(args[0]))
        (encinput_train_dict, decinput_train_dict, dectarget_train_dict), (num_encoder_tokens, num_decoder_tokens), _ = args[0]
        encoder_input_data, decoder_input_data, decoder_target_data = encinput_train_dict[0], decinput_train_dict[0], dectarget_train_dict[0]
        return [encoder_input_data.shape[1], decoder_input_data.shape[1], num_encoder_tokens, num_decoder_tokens]

In [None]:
seq2seq_try21 = Sequence2sequence(Sequence2sequence.get_initial_params(train_data), options={'loss': 'mse'}, gpu=0)
seq2seq_try21.train(train_data=train_data[0], 
              validation_data=val_data[0], 
              epochs=50, batch_size=128, verbose=1, validation_split=0.1)

In [None]:
seq2seq_try21 = Sequence2sequence(Sequence2sequence.get_initial_params(train_data), options={'loss': 'sparse_categorical_crossentropy'}, gpu=0)
seq2seq_try21.train(train_data=train_data[0], 
              validation_data=val_data[0], 
              epochs=50, batch_size=128, verbose=1, validation_split=0.1)