<span style="font-size:36px"><b>Seq2seq Model Tutorial</b></span>

Copyright &copy; 2020 Gunawan Lumban Gaol

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

# Import Packages

In [136]:
import os
import sys
import glob

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Attention, LSTM, Dense, TimeDistributed, Lambda
from tensorflow.keras.layers import Concatenate, Input, Masking, Activation, RepeatVector, Dot, Bidirectional
print("tf.__version__ = "+tf.__version__)

from gurih.data.data_generator import DataGenerator, validate_dataset_dir
from gurih.models.model import Seq2SeqModel
from gurih.models.my_keras_layers import ASREncoder, BahdanauAttention, ASRDecoder
from gurih.models.utils import CharMap, Seq2SeqCharMap, wer_and_cer

%load_ext autoreload
%autoreload 2

tf.__version__ = 2.1.0
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Set documentation directory and model weights directory.

In [43]:
model_dir = "../../models/Model020a/"  # to store model weights
doc_dir = "../../docs/Model020a/"  # to store model configuration and evaluation

# `exist_ok=True` keeps the cell idempotent on re-run and avoids the
# check-then-create race of `os.path.exists` followed by `os.makedirs`.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(doc_dir, exist_ok=True)

Also specify `train`, `validation`, and `test` dataset directories. 

These directories should contain all `.npz` files and their corresponding `.txt` transcription files, using the same naming convention. For example, if the npz filename is `0.npz`, then the transcription filename should be `0.txt`.

In [44]:
# Directories holding the `.npz` feature files and matching `.txt` transcriptions.
# NOTE(review): the data comes from `Model010a` while weights/docs are written
# under `Model020a` -- confirm this is intentional.
train_dir = "../../dataset/interim/Model010a/train/"
valid_dir = "../../dataset/interim/Model010a/valid/"

In [45]:
# Run the project's dataset check on every split directory, in order.
for dataset_dir in (train_dir, valid_dir):
    validate_dataset_dir(dataset_dir)

../../dataset/interim/Model010a/train/ checks passed.
../../dataset/interim/Model010a/valid/ checks passed.


# Model Training

Configure parameters for training.

In [46]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Masking

In [263]:
# Training-loop settings.
BATCH_SIZE = 16
MAX_EPOCHS = 100
# Fixed number of feature frames per utterance fed to the encoder.
MAX_SEQ_LENGTH = 1000
# Fixed number of decoder steps (characters) per utterance.
MAX_LABEL_LENGTH = 100
# Width of the one-hot character vectors on the decoder input / output side.
# NOTE(review): the `+ 1` presumably reserves one extra index beyond the
# character map (e.g. blank/padding) -- confirm against `Seq2SeqCharMap`.
VOCAB_INPUT_SIZE = len(Seq2SeqCharMap()) + 1
VOCAB_OUTPUT_SIZE = len(Seq2SeqCharMap()) + 1

# Units per LSTM direction, and number of features per input frame.
n_lstm = 128
n_mfcc = 39

In [264]:
# encoder = ASREncoder(vocab_len=VOCAB_INPUT_SIZE,
#                      n_lstm=128,
#                      batch_size=BATCH_SIZE)
# # decoder = ASRDecoder(vocab_len=VOCAB_OUTPUT_SIZE,
# #                      n_lstm=128*2,
# #                      n_units=512,
# #                      batch_size=BATCH_SIZE)


In [265]:
# Sample transcription wrapped in the '<' / '>' characters that the decoding
# code in this notebook uses as start and stop markers.
test = "<janganlah kamu menyangka, bahwa kitab suci tanpa alasan berkata roh yang ditempatkan allah di dalam diri kita, diingini nya dengan cemburu tetapi kasih karunia, yang dianugerahkan nya kepada kita, lebih besar dari pada itu.>"

Model with attention

In [266]:
# ---------------------------------------------------------------------------
# Encoder: bidirectional LSTM over the input frames.
# ---------------------------------------------------------------------------
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))  # batch shape=(b, MAX_SEQ_LENGTH, n_mfcc)
encoder = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
# With return_state=True the wrapper returns the output sequence followed by
# the forward (h, c) and the backward (h, c) states.
encoder_outputs, state_h, state_c, state2_h, state2_c = encoder(encoder_inputs)
# encoder_states = [tf.concat([state_h, state_c], axis=1), tf.concat([state2_h, state2_c], axis=1)]  # discard encoder_outputs and keep only states
# NOTE(review): the decoder's initial [h, c] is seeded with the forward and
# backward *cell* states; confirm this pairing is intended.
encoder_states = [state_c, state2_c]

# ---------------------------------------------------------------------------
# Shared attention layers (re-used at every decoder step).
# ---------------------------------------------------------------------------
repeator = RepeatVector(MAX_SEQ_LENGTH)
concatenator = Concatenate(axis=-1)
densor = Dense(1, activation = "relu")
# The energies produced by `densor` have shape (b, MAX_SEQ_LENGTH, 1).
# `Activation('softmax')` normalises over the LAST axis, which has size 1 and
# therefore yields a weight of exactly 1.0 for every frame. The attention
# weights must be normalised over the time axis instead.
activator = tf.keras.layers.Softmax(axis=1, name='attention_weights')
dotor = Dot(axes = 1)

# ---------------------------------------------------------------------------
# Decoder: one LSTM step per output character, with teacher forcing.
# ---------------------------------------------------------------------------
decoder_inputs = Input(shape=(None, VOCAB_INPUT_SIZE))  # batch shape=(b, MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE)
decoder_lstm = LSTM(n_lstm, return_sequences=True, return_state=True)
decoder_dense = Dense(VOCAB_OUTPUT_SIZE, activation='softmax')

# Track the running decoder state under its own name so that `encoder_states`
# still refers to the encoder's states after this cell has run.
decoder_states = encoder_states

# multi step
outputs = []
for ty in range(MAX_LABEL_LENGTH):
    # Attention: score every encoder frame against the current decoder state.
    s_prev = tf.concat(decoder_states, axis=1)
    s_prev = repeator(s_prev)
    concat = concatenator([encoder_outputs, s_prev])
    e = densor(concat)
    alphas = activator(e)
    context = dotor([alphas, encoder_outputs])  # (b, 1, 2 * n_lstm)

    # Bind `ty` as a default argument: a plain closure would look `ty` up
    # again whenever the layer is re-called and would then see its final value.
    slice_input = Lambda(lambda x, t=ty: x[:, t:t+1, :])(decoder_inputs)
    context = concatenator([slice_input, context])

    decoder_outputs, *decoder_states = decoder_lstm(context,
                                                    initial_state=decoder_states)
    decoder_outputs = decoder_dense(decoder_outputs)

    outputs.append(decoder_outputs)

# Stitch the per-step predictions back into (b, MAX_LABEL_LENGTH, VOCAB_OUTPUT_SIZE).
outputs = Concatenate(axis=1)(outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)

# Run training
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [267]:
# Print the layer-by-layer summary of the attention model.
model.summary()

Model: "model_21"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_44 (InputLayer)           [(None, 1000, 39)]   0                                            
__________________________________________________________________________________________________
bidirectional_18 (Bidirectional [(None, 1000, 256),  172032      input_44[0][0]                   
__________________________________________________________________________________________________
tf_op_layer_concat_1514 (Tensor [(None, 256)]        0           bidirectional_18[0][2]           
                                                                 bidirectional_18[0][4]           
__________________________________________________________________________________________________
repeat_vector_26 (RepeatVector) (None, 1000, 256)    0           tf_op_layer_concat_1514[0]

__________________________________________________________________________________________________
lambda_1547 (Lambda)            (None, None, 32)     0           input_45[0][0]                   
__________________________________________________________________________________________________
tf_op_layer_concat_1570 (Tensor [(None, 256)]        0           lstm_1222[55][1]                 
                                                                 lstm_1222[55][2]                 
__________________________________________________________________________________________________
lambda_1548 (Lambda)            (None, None, 32)     0           input_45[0][0]                   
__________________________________________________________________________________________________
tf_op_layer_concat_1571 (Tensor [(None, 256)]        0           lstm_1222[56][1]                 
                                                                 lstm_1222[56][2]                 
__________

Model without attention.

In [232]:
# --- Encoder ---------------------------------------------------------------
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))  # (b, MAX_SEQ_LENGTH, n_mfcc)
encoder = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
encoder_outputs, state_h, state_c, state2_h, state2_c = encoder(encoder_inputs)
# Unused alternative: concatenate h and c of each direction instead.
# encoder_states = [tf.concat([state_h, state_c], axis=1), tf.concat([state2_h, state2_c], axis=1)]
encoder_states = [state_c, state2_c]

# --- Decoder ---------------------------------------------------------------
# The decoder starts from `encoder_states`. It returns the full output
# sequence plus its internal states; the states are ignored while training
# and only consumed by the inference model.
decoder_inputs = Input(shape=(None, VOCAB_INPUT_SIZE))  # (b, MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE)
decoder_lstm = LSTM(n_lstm, return_sequences=True, return_state=True)
decoder_dense = Dense(VOCAB_OUTPUT_SIZE, activation='softmax')

decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]
decoder_outputs = decoder_dense(decoder_outputs)

# --- Training model --------------------------------------------------------
# Maps `encoder_input_data` & `decoder_input_data` to `decoder_target_data`.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [231]:
# Print the layer-by-layer summary of the model without attention.
model.summary()

Model: "model_16"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           [(None, 3000, 39)]   0                                            
__________________________________________________________________________________________________
input_32 (InputLayer)           [(None, 300, 32)]    0                                            
__________________________________________________________________________________________________
bidirectional_7 (Bidirectional) [(None, 3000, 256),  172032      input_31[0][0]                   
__________________________________________________________________________________________________
lstm_24 (LSTM)                  [(None, 300, 128), ( 82432       input_32[0][0]                   
                                                                 bidirectional_7[0][2]     

Train Model.

In [256]:
def onehottify(x, n=None, dtype=float):
    """One-hot encode an array of integer class indices.

    Parameters
    ----------
    x : array_like of int
        Class indices, any shape.
    n : int, optional
        Number of classes, i.e. the size of the added last axis. When None it
        is taken as ``max(x) + 1``.
    dtype : data-type, optional
        dtype of the returned array (default ``float``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``x.shape + (n,)`` holding a single 1 per index.
    """
    indices = np.asarray(x)
    if n is None:
        n = np.max(indices) + 1
    # Row i of the identity matrix is the one-hot vector for class i.
    identity = np.eye(n, dtype=dtype)
    return identity[indices]

In [257]:
train_generator = DataGenerator(input_dir=train_dir,
                                max_seq_length=MAX_SEQ_LENGTH,
                                max_label_length=MAX_LABEL_LENGTH-1,
                                ctc_input_length=1495,
                                char_to_idx_map=Seq2SeqCharMap.CHAR_TO_IDX_MAP,
                                batch_size=8,
                                shuffle=False)
# Fetch the first batch once and read both arrays from it, instead of
# indexing the generator (and producing the batch) twice.
first_batch_inputs = train_generator[0][0]
X = first_batch_inputs['the_input']
Y = first_batch_inputs['the_labels']
print(X.shape, Y.shape)

(8, 3000, 39) (8, 299)


In [None]:
# Synthetic stand-in batch with the same layout as the generator output:
# 8 utterances of MAX_SEQ_LENGTH frames, and MAX_LABEL_LENGTH-1 label ids each.
# A dedicated seeded generator keeps the cell reproducible across re-runs.
rng = np.random.RandomState(0)
X = rng.rand(8, MAX_SEQ_LENGTH, n_mfcc)
Y = rng.randint(0, 32, size=(8, MAX_LABEL_LENGTH-1))

In [258]:
encoder_input_data = X

# Token ids used below: 1 = start, 2 = end of sentence, 0 = blank.
# NOTE(review): these literals are assumed to match `Seq2SeqCharMap` -- confirm.
n_samples = Y.shape[0]
start_col = np.full((n_samples, 1), 1)
end_col = np.full((n_samples, 1), 2)
blank_col = np.full((n_samples, 1), 0)

# Target: drop the last label position, then append end-of-sentence + blank.
decoder_target_data = np.concatenate([Y[:, :-1], end_col, blank_col], axis=1)
# Input: the target shifted right by one step, with the start token in front.
decoder_input_data = np.concatenate([start_col, decoder_target_data[:, :-1]], axis=1)

# Pass the vocabulary size explicitly. Inferring the depth from the batch
# maximum gives a last axis that is too narrow whenever the batch lacks the
# highest character id, which no longer matches the model's input/output size.
decoder_target_data = onehottify(decoder_target_data, n=VOCAB_OUTPUT_SIZE)
decoder_input_data = onehottify(decoder_input_data, n=VOCAB_INPUT_SIZE)

print(encoder_input_data.shape)
print(decoder_target_data.shape)
print(decoder_input_data.shape)

(8, 3000, 39)
(8, 300, 32)
(8, 300, 32)


In [259]:
# Teacher-forced training: the decoder is fed the shifted targets as input.
history = model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
)

Train on 8 samples
Epoch 1/100


KeyboardInterrupt: 

In [122]:
# Encoder inference model: input frames -> states used to seed the decoder.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Decoder inference model: takes the previous (h, c) as explicit inputs and
# returns the character distribution together with the updated (h, c), so it
# can be stepped one character at a time.
decoder_state_input_h = Input(shape=(n_lstm,))
decoder_state_input_c = Input(shape=(n_lstm,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs,
    initial_state=decoder_states_inputs,
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    inputs=[decoder_inputs, decoder_state_input_h, decoder_state_input_c],
    outputs=[decoder_outputs, state_h, state_c],
)

In [125]:
def decode_sequence(input_seq):
    """Greedily decode a single input sequence into a string.

    The encoder inference model provides the initial decoder states. The
    decoder is then stepped one character at a time, always feeding back the
    arg-max character, until '>' is produced or the decoded text becomes
    longer than ``MAX_LABEL_LENGTH``. A batch size of 1 is assumed.

    Parameters
    ----------
    input_seq : array_like
        Encoder input for one utterance (leading batch axis of size 1).

    Returns
    -------
    str
        The decoded characters, including the final '>' if one was produced.
    """
    # Encoder states seed the decoder.
    decoder_state = encoder_model.predict(input_seq)

    # First decoder input: one-hot vector of the start character.
    step_input = np.zeros((1, 1, VOCAB_INPUT_SIZE))
    step_input[0, 0, Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']] = 1.

    decoded = ''
    while True:
        token_probs, h, c = decoder_model.predict([step_input] + decoder_state)

        # Greedy choice: most probable character of the last time step.
        best_index = np.argmax(token_probs[0, -1, :])
        best_char = Seq2SeqCharMap.IDX_TO_CHAR_MAP[best_index]
        decoded += best_char

        # Stop on the stop character or once the length limit is exceeded.
        if best_char == '>' or len(decoded) > MAX_LABEL_LENGTH:
            break

        # Feed the chosen character and the new states into the next step.
        step_input = np.zeros((1, 1, VOCAB_INPUT_SIZE))
        step_input[0, 0, best_index] = 1.
        decoder_state = [h, c]

    return decoded

In [126]:
# NOTE(review): `Y_true` is not defined in any visible cell of this notebook;
# it presumably holds the reference transcriptions as strings -- define it
# before this cell so the notebook survives Restart & Run All.
for seq_index in range(1):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    # Slicing (instead of indexing) keeps the leading batch axis of size 1.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', Y_true[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: inilah kepala kepala para pahlawan yang mengiringi daud, yang telah memberi dukungan yang kuat kepadanya, bersama sama seluruh israel, dalam mencapai kedudukan raja dan yang mengangkat dia sebagai raja, seperti yang difirmankan tuhan mengenai israel.                                                 
Decoded sentence: inilah kelala pala pala pala pala pang yang dii dang dang dang perang seruu dang dang dang kerang dang serang dang dang dang dang perang dang serang dang serang dang dang dang dang perang dang dang dang kerang dang dang dang perang dang serang dang serang dang dang dang perang dang serang dang serang


In [16]:
Tx = 1000  # number of encoder time steps
Ty = 100   # number of decoder (output) steps

# Shared layers used by `one_step_attention`; defined once so that their
# weights are re-used at every decoder step.
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor = Dense(1, activation = "relu")
# `densor` yields energies of shape (b, Tx, 1). `Activation('softmax')`
# normalises over the last axis, which has size 1 and so makes every weight
# exactly 1.0; normalise over the time axis (axis=1) instead.
activator = tf.keras.layers.Softmax(axis=1, name='attention_weights')
dotor = Dot(axes = 1)

# Decoder LSTM that consumes the attention context, and the output projection.
post_activation_LSTM = LSTM(128, return_state=True)
fc_output = Dense(len(Seq2SeqCharMap()) + 1, activation='softmax')

In [17]:
def one_step_attention(a, s_prev):
    """Compute the attention context for a single decoder step.

    Uses the module-level layers `repeator`, `concatenator`, `densor`,
    `activator` and `dotor`.

    Parameters
    ----------
    a : tensor
        Encoder output sequence, one vector per time step.
    s_prev : tensor
        Previous decoder hidden state, one vector per sample.

    Returns
    -------
    tensor
        Encoder outputs combined along the time axis using the weights
        produced by `activator`.
    """
    # Copy the decoder state to every time step and pair it with the encoder output.
    tiled_state = repeator(s_prev)
    energies = densor(concatenator([a, tiled_state]))
    weights = activator(energies)
    return dotor([weights, a])

In [18]:
def create_model(Tx, Ty, n_a, n_s, n_mfcc):
    """Build the attention-based encoder/decoder model.

    The decoder re-uses the module-level layers `post_activation_LSTM` and
    `fc_output`, and `one_step_attention` re-uses the module-level attention
    layers. `Tx` and `n_s` therefore have to agree with the sizes those
    layers were created with.

    Parameters
    ----------
    Tx : int
        Number of encoder time steps.
    Ty : int
        Number of decoder steps, i.e. number of model outputs.
    n_a : int
        Units per direction of the bidirectional encoder LSTM.
    n_s : int
        Size of the decoder hidden / cell state inputs.
    n_mfcc : int
        Number of features per encoder time step.

    Returns
    -------
    tf.keras.Model
        Model with inputs ``[X, dec_s0, dec_c0]`` and a list of `Ty` outputs,
        one character distribution per decoder step.
    """
    X = Input(shape=(Tx, n_mfcc), name='enc_input')
    # `shape` must be a tuple; `(n_s)` is just the integer `n_s`.
    dec_s0 = Input(shape=(n_s,))
    dec_c0 = Input(shape=(n_s,))
    s, c = dec_s0, dec_c0

    # NOTE(review): a Masking layer used to be created here but its output was
    # never connected to the encoder, so padded frames are NOT masked. Wire a
    # mask through the encoder and the attention weights if that is required.
    a = Bidirectional(LSTM(n_a, return_sequences=True))(X)

    outputs = []
    for _step in range(Ty):
        context = one_step_attention(a, s)

        # With return_state=True the LSTM returns (output, h, c); the output of
        # the last step equals h, so the middle value is redundant here.
        s, _, c = post_activation_LSTM(context, initial_state=[s, c])
        out = fc_output(s)
        outputs.append(out)

    model = Model(inputs=[X, dec_s0, dec_c0], outputs=outputs)

    return model

In [19]:
# Positional arguments after Tx, Ty are n_a=128, n_s=128, n_mfcc=39.
model = create_model(Tx, Ty, 128, 128, 39)

In [20]:
# Print the layer-by-layer summary of the model returned by `create_model`.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
enc_input (InputLayer)          [(None, 1000, 39)]   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 1000, 256)    172032      enc_input[0][0]                  
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 1000, 128)    0           input_3[0][0]                    
                                                                 lstm_3[0][0]                 

In [21]:
# The sparse variant of the loss takes integer class ids as targets, so the
# labels do not need to be one-hot encoded for this model.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy())

In [48]:
# Random dummy batch to smoke-test the model: 8 samples, 1000 frames, 39 features.
X = tf.random.uniform((8, 1000, 39))
# Zero initial decoder hidden / cell states, one row per sample.
s0 = tf.zeros((8, 128))
c0 = tf.zeros((8, 128))
# Y = [tf.random.uniform((100, 8, 1), minval=0, maxval=32, dtype=tf.dtypes.int32)]
# Integer class ids in [0, 32) for each of the 100 output steps.
Y = np.random.randint(0, 32, size=(8, 100, 1))

# The model has one output per step, so pass a list of 100 arrays of shape
# (8, 1): swap to (100, 8, 1) and split along the first axis.
model.fit([X, s0, c0], list(Y.swapaxes(0, 1)))

Train on 8 samples


<tensorflow.python.keras.callbacks.History at 0x58d8148>

In [36]:
# Experiment: encoder -> decoder LSTM -> attention over encoder outputs ->
# per-step softmax over characters.
# NOTE(review): this cell depends on hidden kernel state. `encoder` is called
# with a `hidden` argument, so it presumably is an `ASREncoder` instance (its
# construction is commented out earlier in the notebook), and `AttentionLayer`
# is not imported in the visible import cell -- confirm both.
enc_input = Input(batch_shape=(16, 150, 39), name='enc_input')
mask_enc_input = Masking(mask_value=0, name='mask_enc_input')(enc_input)
# First call only obtains an initial state; second call runs with that state.
_, init_enc_state = encoder(mask_enc_input, hidden=None)
enc_output, enc_state = encoder(mask_enc_input, init_enc_state)

# NOTE(review): assumes `enc_state` is ordered so that the first two entries
# are hidden states and the rest are cell states -- verify against the encoder.
state_h = tf.concat(enc_state[0:2], axis=1)
state_c = tf.concat(enc_state[2:], axis=1)
dec_input = Input(batch_shape=(16, 15, 1), name='dec_input')
dec_lstm = LSTM(256, return_sequences=True, return_state=True)
dec_output, *dec_state = dec_lstm(dec_input, initial_state=[state_h, state_c])

attention = AttentionLayer()
attention_out, *attention_states = attention([enc_output, dec_output])

# Concatenate the decoder output with its attention context along the feature
# axis, then apply the same softmax layer at every time step.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([dec_output, attention_out])
dense = Dense(32, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_concat_input)

model = Model(inputs=[enc_input, dec_input], outputs=decoder_pred)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [55]:
# NOTE(review): only inputs are passed here -- no target `y` is given, and the
# recorded run of this cell ended in an IndexError. Supply the targets.
model.fit([X, np.expand_dims(Y, axis=-1)])

Train on 16 samples


IndexError: list index out of range

In [None]:
def create(Ty, encoder, attention, decoder, CharMap):
    """
    Create the seq2seq model.

    NOTE(review): this function is an unfinished draft. It never builds or
    returns a model, `attention`, `X` and `outputs` are unused, and the loop
    body is incomplete (see the inline notes).
    
    Parameters
    ----------
    Ty : int
        length of the output sequence
    encoder : tf.keras.Model
        instance of ASREncoder
    attention : tf.keras.Model
        instance of Attention custom layer
    decoder : tf.keras.Model
        instance of ASRDecoder
    CharMap : object
        character map object
    """
    # Define inputs and outputs
    X = Input(shape=(None, len(CharMap())))
    outputs = []
    
    # Teacher forcing training
    enc_hidden = encoder.initialize_hidden_state()
    dec_hidden = enc_hidden
    # NOTE(review): this multiplies the start-token value by BATCH_SIZE inside a
    # one-element list; repeating the token per sample would be
    # `[CharMap['<']] * BATCH_SIZE` -- confirm the intent.
    dec_input = tf.expand_dims([CharMap['<'] * BATCH_SIZE], 1)  # start token
    
    for t in range(1, Ty):
        # NOTE(review): `enc_output` is never assigned in this function.
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        # NOTE(review): `tf.expand_dims` is called without arguments here; the
        # next decoder input still has to be filled in.
        dec_input = tf.expand_dims()

In [24]:
# NOTE(review): `the_input` is not defined in any visible cell, and `outputs`
# receives the `decoder` object itself rather than a tensor produced by calling
# it (the recorded error is "'ASRDecoder' object has no attribute 'op'").
model = Model(inputs=the_input, outputs=decoder)

AttributeError: 'ASRDecoder' object has no attribute 'op'

## Train with `tf.data.Dataset` API

You might want to read the [tutorial](https://www.tensorflow.org/guide/data) first to understand the API.

In [None]:
# Sentence boundary markers.
# NOTE(review): the decoding code earlier in this notebook uses '<' and '>'
# as start / stop characters -- confirm which convention applies here.
START_TOKEN = '<BOS>'
END_TOKEN = '<EOS>'

In [None]:
# From numpy
# NOTE(review): this only aliases the existing generator object; no
# `tf.data.Dataset` is constructed yet.
train_dataset = train_generator