<span style="font-size:36px"><b>Seq2seq Model Tutorial</b></span>

Copyright &copy; 2020 Gunawan Lumban Gaol

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language overning permissions and limitations under the License.

# Import Packages

In [1]:
import os
import sys
import glob

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Attention, LSTM, Dense, TimeDistributed, Lambda
from tensorflow.keras.layers import Concatenate, Input, Masking, Activation, RepeatVector, Dot, Bidirectional
from tensorflow.keras.models import Model
print("tf.__version__ = "+tf.__version__)

from gurih.data.data_generator import DataGenerator, validate_dataset_dir
from gurih.models.model import Seq2SeqModel
from gurih.models.my_keras_layers import ASREncoder, BahdanauAttention, ASRDecoder
from gurih.models.utils import CharMap, Seq2SeqCharMap, wer_and_cer

%load_ext autoreload
%autoreload 2

tf.__version__ = 2.1.0


In [2]:
def onehottify(x, n=None, dtype=float):
    """1-hot encode x with the max value n (computed from data if n is None)."""
    x = np.asarray(x)
    n = np.max(x) + 1 if n is None else n
    return np.eye(n, dtype=dtype)[x]

Set documentation directory and model weights directory.

In [3]:
model_dir = "../../models/Model020a/"  # to store model weights
doc_dir = "../../docs/Model020a/"  # to store model configuration and evaluation

if not os.path.exists(model_dir): os.makedirs(model_dir)
if not os.path.exists(doc_dir): os.makedirs(doc_dir)

Also specify `train`, `validation`, and `test` dataset directories. 

These directories should contain all `.npz` and its corresponding `.txt` transcriptions files with same naming conventions. For example if npz filename is `0.npz`, then the transcription txt filename should be `0.txt`.

In [4]:
train_dir = "../../dataset/interim/Model010a/train/"
valid_dir = "../../dataset/interim/Model010a/valid/"

In [5]:
validate_dataset_dir(train_dir)
validate_dataset_dir(valid_dir)

../../dataset/interim/Model010a/train/ checks passed.
../../dataset/interim/Model010a/valid/ checks passed.


# Basic Listen, Attend, and Spell

Basic LAS architecture with reduced layers.

In [7]:
BATCH_SIZE = 16
MAX_EPOCHS = 100
MAX_SEQ_LENGTH = 2500
MAX_LABEL_LENGTH = 250
VOCAB_INPUT_SIZE = len(Seq2SeqCharMap()) + 1
VOCAB_OUTPUT_SIZE = len(Seq2SeqCharMap()) + 1

n_lstm = 128
n_mfcc = 39

## Model Training

In [8]:
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))  # batch shape=(b, MAX_SEQUENCE_LENGTH, n_mfcc)
encoder = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
encoder_outputs, state_h, state_c, state2_h, state2_c = encoder(encoder_inputs)
# encoder_states = [tf.concat([state_h, state_c], axis=1), tf.concat([state2_h, state2_c], axis=1)]  # discard encoder_outputs and keep only states
encoder_states = [state_c, state2_c]

repeator = RepeatVector(MAX_SEQ_LENGTH)
concatenator = Concatenate(axis=-1)
densor = Dense(1, activation = "relu")
activator = Activation('softmax', name='attention_weights')
dotor = Dot(axes = 1)

decoder_inputs = Input(shape=(None, VOCAB_INPUT_SIZE))  # batch shape=(b, MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE)
decoder_lstm = LSTM(n_lstm, return_sequences=True, return_state=True)
decoder_dense = Dense(VOCAB_OUTPUT_SIZE, activation='softmax')
# multi step
outputs = []
for ty in range(MAX_LABEL_LENGTH):
    s_prev = tf.concat(encoder_states, axis=1)
    s_prev = repeator(s_prev)
    concat = concatenator([encoder_outputs, s_prev])
    e = densor(concat)
    alphas = activator(e)
    context = dotor([alphas, encoder_outputs])
    slice_input = Lambda(lambda x: x[:, ty:ty+1, :])(decoder_inputs)
    context = concatenator([slice_input, context])

    decoder_outputs, *encoder_states = decoder_lstm(context,
                                        initial_state=encoder_states)
    decoder_outputs = decoder_dense(decoder_outputs)

    outputs.append(decoder_outputs)

outputs = Concatenate(axis=1)(outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)

# Run training
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [9]:
model.load_weights("../../models/Model020a/weights.h5")

In [10]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2500, 39)]   0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 2500, 256),  172032      input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_concat (TensorFlowO [(None, 256)]        0           bidirectional[0][2]              
                                                                 bidirectional[0][4]              
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 2500, 256)    0           tf_op_layer_concat[0][0]     

                                                                 concatenate[338][0]              
                                                                 concatenate[340][0]              
                                                                 concatenate[342][0]              
                                                                 concatenate[344][0]              
                                                                 concatenate[346][0]              
                                                                 concatenate[348][0]              
                                                                 concatenate[350][0]              
                                                                 concatenate[352][0]              
                                                                 concatenate[354][0]              
                                                                 concatenate[356][0]              
          

                                                                 concatenate[109][0]              
                                                                 lstm_1[53][1]                    
                                                                 lstm_1[53][2]                    
                                                                 concatenate[111][0]              
                                                                 lstm_1[54][1]                    
                                                                 lstm_1[54][2]                    
                                                                 concatenate[113][0]              
                                                                 lstm_1[55][1]                    
                                                                 lstm_1[55][2]                    
                                                                 concatenate[115][0]              
          

lambda_107 (Lambda)             (None, None, 32)     0           input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_concat_108 (TensorF [(None, 256)]        0           lstm_1[107][1]                   
                                                                 lstm_1[107][2]                   
__________________________________________________________________________________________________
lambda_108 (Lambda)             (None, None, 32)     0           input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_concat_109 (TensorF [(None, 256)]        0           lstm_1[108][1]                   
                                                                 lstm_1[108][2]                   
__________________________________________________________________________________________________
lambda_109

Total params: 390,177
Trainable params: 390,177
Non-trainable params: 0
__________________________________________________________________________________________________


## Model Inferencing

In [25]:
from tqdm.auto import tqdm

In [26]:
state_h_input = Input(shape=(n_lstm))
state_c_input = Input(shape=(n_lstm))
context_input = Input(shape=(1, 2*n_lstm+VOCAB_OUTPUT_SIZE))
decoder_recurrent, decoder_state_h, decoder_state_c = decoder_lstm(context_input, initial_state=[state_h_input, state_c_input])
decoder_dense_output = decoder_dense(decoder_recurrent)

decoder_model = Model([context_input] +  [state_h_input, state_c_input], 
                      [decoder_dense_output] + [decoder_state_h, decoder_state_c])

In [27]:
encoder_model = Model(encoder_inputs, 
                      [encoder_outputs, state_c, state2_c]
)

repeator = repeator
concatenator = concatenator
densor = densor
activator = activator
dotor = dotor

decoder_lstm = decoder_lstm
decoder_dense = decoder_dense

def transcript(X):
    start_inputs = onehottify(np.expand_dims(np.array([Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']]), axis=0), n=32)
    start_inputs = tf.convert_to_tensor(start_inputs)
    
    encoder_outputs, *encoder_states = encoder_model.predict(X)

    outputs = []
    for ty in tqdm(range(MAX_LABEL_LENGTH)):
        s_prev = tf.concat(encoder_states, axis=1)
        s_prev = repeator(s_prev)
        concat = concatenator([encoder_outputs, s_prev])
        e = densor(concat)
        alphas = activator(e)
        context = dotor([alphas, encoder_outputs])
        context = concatenator([start_inputs, context])

        decoder_outputs, *encoder_states = decoder_model.predict([context, encoder_states[0], encoder_states[1]])
        
        start_inputs = decoder_outputs
        outputs.append(decoder_outputs)
    
    outputs = Concatenate(axis=1)(outputs)
        
    return outputs

In [28]:
def pad_sequence(x, max_seq_length):
    """Zero pad input features sequence"""
    out = None
    if x.shape[0] > max_seq_length:
        raise ValueError(f"Found input sequence {x.shape[0]} more than {max_seq_length}")
    elif x.shape[0] < max_seq_length:
        out = np.zeros([max_seq_length, x.shape[1]])
        out[:x.shape[0]] = x
    else:
        out = x

    return out

In [29]:
X_test = np.load("0.npz")['arr_0']
X_test = pad_sequence(X_test, 2500)
X_test = np.expand_dims(X_test, axis=0)
with open("0.txt", 'r') as f:
    y_test = f.readline()

In [30]:
print(X_test.shape)
print(y_test)

(1, 2500, 39)
telah berdiri di dekat rumah haman, lima puluh hasta tingginya.


In [31]:
predictions = transcript(X_test)

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




In [32]:
''.join([Seq2SeqCharMap.IDX_TO_CHAR_MAP[c] for c in tf.argmax(predictions[0], axis=-1).numpy() if c!=0])

'seaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'

# Base Seq2Seq

With fixed output size.

In [None]:
BATCH_SIZE = 16
MAX_EPOCHS = 100
MAX_SEQ_LENGTH = 2500
MAX_LABEL_LENGTH = 250
VOCAB_INPUT_SIZE = len(Seq2SeqCharMap()) + 1
VOCAB_OUTPUT_SIZE = len(Seq2SeqCharMap()) + 1

n_lstm = 128
n_mfcc = 39

## Model Training

In [232]:
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))  # batch shape=(b, MAX_SEQUENCE_LENGTH, n_mfcc)
encoder = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
encoder_outputs, state_h, state_c, state2_h, state2_c = encoder(encoder_inputs)
# encoder_states = [tf.concat([state_h, state_c], axis=1), tf.concat([state2_h, state2_c], axis=1)]  # discard encoder_outputs and keep only states
encoder_states = [state_c, state2_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, VOCAB_INPUT_SIZE))  # batch shape=(b, MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(n_lstm, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(VOCAB_OUTPUT_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [231]:
model.summary()

Model: "model_16"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           [(None, 3000, 39)]   0                                            
__________________________________________________________________________________________________
input_32 (InputLayer)           [(None, 300, 32)]    0                                            
__________________________________________________________________________________________________
bidirectional_7 (Bidirectional) [(None, 3000, 256),  172032      input_31[0][0]                   
__________________________________________________________________________________________________
lstm_24 (LSTM)                  [(None, 300, 128), ( 82432       input_32[0][0]                   
                                                                 bidirectional_7[0][2]     

Train Model.

In [20]:
train_generator = DataGenerator(input_dir=train_dir,
                                max_seq_length=MAX_SEQ_LENGTH,
                                max_label_length=MAX_LABEL_LENGTH-1,
                                ctc_input_length=1495,
                                char_to_idx_map=Seq2SeqCharMap.CHAR_TO_IDX_MAP,
                                batch_size=8,
                                shuffle=False)
X = train_generator[0][0]['the_input']
Y = train_generator[0][0]['the_labels']
print(X.shape, Y.shape)

In [None]:
X = np.random.rand(8, MAX_SEQ_LENGTH, 39)
Y = np.random.randint(0, 32, size=(8, MAX_LABEL_LENGTH-1))

In [81]:
encoder_input_data = X
decoder_target_data = np.apply_along_axis(lambda x: np.append(x[:-1], [2, 0]), 1, Y)  # at end of sentence + blank token
decoder_input_data = np.apply_along_axis(lambda x: np.append([1], x[:-1]), 1, decoder_target_data)  # prepend start token
decoder_target_data = onehottify(decoder_target_data)
decoder_input_data = onehottify(decoder_input_data)

print(encoder_input_data.shape)
print(decoder_target_data.shape)
print(decoder_input_data.shape)

(1, 2500, 39)
(8, 250, 32)
(8, 250, 32)


In [259]:
train_history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                          batch_size=BATCH_SIZE,
                          epochs=MAX_EPOCHS,
)

Train on 8 samples
Epoch 1/100


KeyboardInterrupt: 

## Model Inferencing

In [75]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [125]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = Seq2SeqCharMap.IDX_TO_CHAR_MAP[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '>' or
           len(decoded_sentence) > MAX_LABEL_LENGTH):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [126]:
for seq_index in range(1):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', Y_true[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: inilah kepala kepala para pahlawan yang mengiringi daud, yang telah memberi dukungan yang kuat kepadanya, bersama sama seluruh israel, dalam mencapai kedudukan raja dan yang mengangkat dia sebagai raja, seperti yang difirmankan tuhan mengenai israel.                                                 
Decoded sentence: inilah kelala pala pala pala pala pang yang dii dang dang dang perang seruu dang dang dang kerang dang serang dang dang dang dang perang dang serang dang serang dang dang dang dang perang dang dang dang kerang dang dang dang perang dang serang dang serang dang dang dang perang dang serang dang serang


# Listen, Attend, and Spell

Keras implementation of Listen Attend and Spell [paper](https://arxiv.org/abs/1508.01211).

In [7]:
from tensorflow.keras.layers import Layer
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

class PStacker(Layer):
    def __init__(self):
        super(PStacker, self).__init__()
        self.even = Lambda(lambda x: x[:, 0::2, :])
        self.odd = Lambda(lambda x: x[:, 1::2, :])
        self.concat = Concatenate(axis=-1)

    def call(self, inputs):
        even_sequence = self.even(inputs)
        odd_sequence = self.odd(inputs)
        outputs = self.concat([even_sequence, odd_sequence])
        return outputs

    
class MLP(Layer):
    def __init__(self, n_dense):
        super(MLP, self).__init__()
        self.densor_1 = Dense(n_dense)
        self.densor_2 = Dense(n_dense//2)
        self.densor_3 = Dense(n_dense//4)
        self.densor_4 = Dense(1, activation='relu')
        
    def call(self, X):
        return self.densor_4(self.densor_3(self.densor_2(self.densor_1(X))))


class LuongAttention(Model):
    def __init__(self, n_dense):
        super(LuongAttention, self).__init__()
        self.concatenator = Concatenate(axis=-1)
        self.densor = MLP(n_dense)
        self.activator = Activation('softmax', name='attention_weights')
        self.dotor = Dot(axes=1)
    
    def call(self, inputs):
        encoder_outputs, *decoder_prev_states = inputs
        Tx = K.int_shape(encoder_outputs)[1]
        
        decoder_prev_states = self.concatenator(decoder_prev_states)
        decoder_prev_states = RepeatVector(Tx)(decoder_prev_states)
        concat = self.concatenator([encoder_outputs, decoder_prev_states])
        
        e = self.densor(concat)
        alphas = self.activator(e)
        context_vector = self.dotor([alphas, encoder_outputs])
        
        return context_vector, alphas


class DecoderLSTM(Model):
    def __init__(self, n_lstm, vocab_len):
        super(DecoderLSTM, self).__init__()
        self.lstm_1 = LSTM(n_lstm, return_sequences=True, return_state=True)
        self.lstm_2 = LSTM(n_lstm, return_sequences=True, return_state=True)
        self.dense = Dense(vocab_len, activation='softmax')
    
    def call(self, inputs):
        context_vector, *initial_states = inputs
        
        lstm_1_output, *lstm_1_states = self.lstm_1(context_vector, initial_state=initial_states[0:2])
        lstm_2_output, *lstm_2_states = self.lstm_2(lstm_1_output, initial_state=initial_states[2:4])
        outputs = self.dense(lstm_2_output)
        
        return outputs, [*lstm_1_states, *lstm_2_states]
    

class EncoderLSTM(Model):
    def __init__(self, n_lstm):
        super(EncoderLSTM, self).__init__()
        self.pstack = PStacker()
        self.encoder_1 = Bidirectional(LSTM(n_lstm//4, return_sequences=True))
        self.encoder_2 = Bidirectional(LSTM(n_lstm//2, return_sequences=True))
        self.encoder_3 = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
        
    def call(self, inputs):
        stack_1 = self.pstack(self.encoder_1(inputs))
        stack_2 = self.pstack(self.encoder_2(stack_1))
        encoder_outputs, *encoder_states = self.encoder_3(stack_2)
        
        return encoder_outputs, encoder_states

Configure parameters.

In [8]:
BATCH_SIZE = 16
MAX_EPOCHS = 5000
MAX_SEQ_LENGTH = 2500
MAX_LABEL_LENGTH = 250
VOCAB_INPUT_SIZE = len(Seq2SeqCharMap()) + 1
VOCAB_OUTPUT_SIZE = len(Seq2SeqCharMap()) + 1

n_lstm = 128
n_mfcc = 39
n_dense = 128

## Model Training

In [14]:
# Inputs
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))  # batch shape=(b, MAX_SEQ_LENGTH, n_mfcc)

# Encoder Model (pStack BiLSTM)
encoder_model = EncoderLSTM(n_lstm)
encoder_outputs, encoder_states = encoder_model(encoder_inputs)

# Attention Model
attention = LuongAttention(n_dense)

# Decoder Model
decoder_inputs = Input(shape=(None, VOCAB_INPUT_SIZE))  # batch shape=(b, MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE)
decoder_model = DecoderLSTM(n_lstm, VOCAB_OUTPUT_SIZE)

# Teacher Forcing Training
decoder_targets = []
for Ty in range(MAX_LABEL_LENGTH):
    slice_input = Lambda(lambda x: x[:, Ty:Ty+1, :])(decoder_inputs)
    context_vector, _ = attention([encoder_outputs, *encoder_states])
    concat_inputs = Concatenate(axis=-1)([slice_input, context_vector])
    decoder_outputs, encoder_states = decoder_model([concat_inputs, *encoder_states])    
    decoder_targets.append(decoder_outputs)

decoder_targets = Concatenate(axis=1)(decoder_targets)
LAS_Model = Model([encoder_inputs, decoder_inputs], decoder_targets)

# Compile Model
LAS_Model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])

In [15]:
LAS_Model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 2500, 39)]   0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, None, 32)]   0                                            
__________________________________________________________________________________________________
encoder_lstm_3 (EncoderLSTM)    ((None, 625, 256), [ 511488      input_6[0][0]                    
__________________________________________________________________________________________________
lambda_423 (Lambda)             (None, None, 32)     0           input_7[0][0]                    
____________________________________________________________________________________________

                                                                 decoder_lstm_2[70][4]            
                                                                 concatenate_245[0][0]            
                                                                 decoder_lstm_2[71][1]            
                                                                 decoder_lstm_2[71][2]            
                                                                 decoder_lstm_2[71][3]            
                                                                 decoder_lstm_2[71][4]            
                                                                 concatenate_246[0][0]            
                                                                 decoder_lstm_2[72][1]            
                                                                 decoder_lstm_2[72][2]            
                                                                 decoder_lstm_2[72][3]            
          

concatenate_211 (Concatenate)   (None, 1, 288)       0           lambda_461[0][0]                 
                                                                 luong_attention_2[38][0]         
__________________________________________________________________________________________________
lambda_462 (Lambda)             (None, None, 32)     0           input_7[0][0]                    
__________________________________________________________________________________________________
concatenate_212 (Concatenate)   (None, 1, 288)       0           lambda_462[0][0]                 
                                                                 luong_attention_2[39][0]         
__________________________________________________________________________________________________
lambda_463 (Lambda)             (None, None, 32)     0           input_7[0][0]                    
__________________________________________________________________________________________________
concatenat

lambda_664 (Lambda)             (None, None, 32)     0           input_7[0][0]                    
__________________________________________________________________________________________________
concatenate_414 (Concatenate)   (None, 1, 288)       0           lambda_664[0][0]                 
                                                                 luong_attention_2[241][0]        
__________________________________________________________________________________________________
lambda_665 (Lambda)             (None, None, 32)     0           input_7[0][0]                    
__________________________________________________________________________________________________
concatenate_415 (Concatenate)   (None, 1, 288)       0           lambda_665[0][0]                 
                                                                 luong_attention_2[242][0]        
__________________________________________________________________________________________________
lambda_666

Train the model.

In [16]:
X = np.random.rand(8, MAX_SEQ_LENGTH, 39)
Y = np.random.randint(0, 32, size=(8, MAX_LABEL_LENGTH-1))

encoder_input_data = X
decoder_target_data = np.apply_along_axis(lambda x: np.append(x[:-1], [2, 0]), 1, Y)  # at end of sentence + blank token
decoder_input_data = np.apply_along_axis(lambda x: np.append([1], x[:-1]), 1, decoder_target_data)  # prepend start token
decoder_target_data = onehottify(decoder_target_data)
decoder_input_data = onehottify(decoder_input_data)

print(encoder_input_data.shape)
print(decoder_target_data.shape)
print(decoder_input_data.shape)

(8, 2500, 39)
(8, 250, 32)
(8, 250, 32)


In [30]:
LAS_Model.fit([encoder_input_data, decoder_input_data], decoder_target_data)

Train on 8 samples


KeyboardInterrupt: 

## Model Inferencing

In [22]:
def decode_sequence(X):
    encoder_outputs, encoder_states = encoder_model.predict(X)
    
    target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
    target_seq[0, 0, Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']] = 1
    
    stop_condition = False
    decoded_outputs = []
    attention_outputs = []
    while not stop_condition:
        context_vector, alphas = attention([encoder_outputs, *encoder_states])
        concat_inputs = Concatenate(axis=-1)([target_seq, context_vector])
        
        decoder_outputs, encoder_states = decoder_model.predict([concat_inputs, *encoder_states])
        decoded_outputs.append(decoder_outputs)
        attention_outputs.append(alphas)
        
        predicted_id = tf.argmax(decoded_outputs[0], axis=-1)
        if (predicted_id == Seq2SeqCharMap.CHAR_TO_IDX_MAP['>'] or len(decoded_outputs) > MAX_LABEL_LENGTH):
            stop_condition = True
        
        target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
        target_seq[0, 0, predicted_id] = 1

    return decoded_outputs, attention_outputs

In [None]:
decoded_outputs, attention_outputs = decode_sequence(X)

In [None]:
ctc_decoded, _ = K.ctc_decode(decoded_outputs,
                              [decoded_outputs.shape[1]] * decoded_outputs.shape[0])
ctc_decoded = ctc_decoded.numpy()
y_pred_string = ''.join([Seq2SeqCharMap.IDX_TO_CHAR_MAP(c) for c in ctc_decoded])