<span style="font-size:36px"><b>Seq2seq Model Tutorial</b></span>

Copyright &copy; 2020 Gunawan Lumban Gaol

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language overning permissions and limitations under the License.

# Import Packages

In [6]:
import os
import sys
import glob

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Attention, LSTM, Dense, TimeDistributed, Lambda
from tensorflow.keras.layers import Concatenate, Input, Masking, Activation, RepeatVector, Dot, Bidirectional
from tensorflow.keras.models import Model
print("tf.__version__ = "+tf.__version__)

from gurih.data.data_generator import DataGenerator, validate_dataset_dir
from gurih.models.model import Seq2SeqModel
from gurih.models.my_keras_layers import BahdanauAttention
from gurih.models.utils import CharMap, Seq2SeqCharMap, wer_and_cer

%load_ext autoreload
%autoreload 2

tf.__version__ = 2.1.0


In [7]:
def onehottify(x, n=None, dtype=float):
    """1-hot encode x with the max value n (computed from data if n is None)."""
    x = np.asarray(x)
    n = np.max(x) + 1 if n is None else n
    return np.eye(n, dtype=dtype)[x]

Set documentation directory and model weights directory.

In [8]:
model_dir = "../../models/Model020a/"  # to store model weights
doc_dir = "../../docs/Model020a/"  # to store model configuration and evaluation

if not os.path.exists(model_dir): os.makedirs(model_dir)
if not os.path.exists(doc_dir): os.makedirs(doc_dir)

Also specify `train`, `validation`, and `test` dataset directories. 

These directories should contain all `.npz` and its corresponding `.txt` transcriptions files with same naming conventions. For example if npz filename is `0.npz`, then the transcription txt filename should be `0.txt`.

In [9]:
train_dir = "../../dataset/interim/Model010a/train/"
valid_dir = "../../dataset/interim/Model010a/valid/"

In [10]:
validate_dataset_dir(train_dir)
validate_dataset_dir(valid_dir)

../../dataset/interim/Model010a/train/ checks passed.
../../dataset/interim/Model010a/valid/ checks passed.


# Basic Listen, Attend, and Spell

Basic LAS architecture with reduced layers.

In [11]:
BATCH_SIZE = 16
MAX_EPOCHS = 100
MAX_SEQ_LENGTH = 2500
MAX_LABEL_LENGTH = 250
VOCAB_INPUT_SIZE = len(Seq2SeqCharMap()) + 1
VOCAB_OUTPUT_SIZE = len(Seq2SeqCharMap()) + 1

n_lstm = 128
n_mfcc = 39

## Model Training

In [8]:
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))  # batch shape=(b, MAX_SEQUENCE_LENGTH, n_mfcc)
encoder = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
encoder_outputs, state_h, state_c, state2_h, state2_c = encoder(encoder_inputs)
# encoder_states = [tf.concat([state_h, state_c], axis=1), tf.concat([state2_h, state2_c], axis=1)]  # discard encoder_outputs and keep only states
encoder_states = [state_c, state2_c]

repeator = RepeatVector(MAX_SEQ_LENGTH)
concatenator = Concatenate(axis=-1)
densor = Dense(1, activation = "relu")
activator = Activation('softmax', name='attention_weights')
dotor = Dot(axes = 1)

decoder_inputs = Input(shape=(None, VOCAB_INPUT_SIZE))  # batch shape=(b, MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE)
decoder_lstm = LSTM(n_lstm, return_sequences=True, return_state=True)
decoder_dense = Dense(VOCAB_OUTPUT_SIZE, activation='softmax')
# multi step
outputs = []
for ty in range(MAX_LABEL_LENGTH):
    s_prev = tf.concat(encoder_states, axis=1)
    s_prev = repeator(s_prev)
    concat = concatenator([encoder_outputs, s_prev])
    e = densor(concat)
    alphas = activator(e)
    context = dotor([alphas, encoder_outputs])
    slice_input = Lambda(lambda x: x[:, ty:ty+1, :])(decoder_inputs)
    context = concatenator([slice_input, context])

    decoder_outputs, *encoder_states = decoder_lstm(context,
                                        initial_state=encoder_states)
    decoder_outputs = decoder_dense(decoder_outputs)

    outputs.append(decoder_outputs)

outputs = Concatenate(axis=1)(outputs)
model = Model([encoder_inputs, decoder_inputs], outputs)

# Run training
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [9]:
model.load_weights("../../models/Model020a/weights.h5")

In [10]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2500, 39)]   0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 2500, 256),  172032      input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_concat (TensorFlowO [(None, 256)]        0           bidirectional[0][2]              
                                                                 bidirectional[0][4]              
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 2500, 256)    0           tf_op_layer_concat[0][0]     

                                                                 concatenate[338][0]              
                                                                 concatenate[340][0]              
                                                                 concatenate[342][0]              
                                                                 concatenate[344][0]              
                                                                 concatenate[346][0]              
                                                                 concatenate[348][0]              
                                                                 concatenate[350][0]              
                                                                 concatenate[352][0]              
                                                                 concatenate[354][0]              
                                                                 concatenate[356][0]              
          

                                                                 concatenate[109][0]              
                                                                 lstm_1[53][1]                    
                                                                 lstm_1[53][2]                    
                                                                 concatenate[111][0]              
                                                                 lstm_1[54][1]                    
                                                                 lstm_1[54][2]                    
                                                                 concatenate[113][0]              
                                                                 lstm_1[55][1]                    
                                                                 lstm_1[55][2]                    
                                                                 concatenate[115][0]              
          

lambda_107 (Lambda)             (None, None, 32)     0           input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_concat_108 (TensorF [(None, 256)]        0           lstm_1[107][1]                   
                                                                 lstm_1[107][2]                   
__________________________________________________________________________________________________
lambda_108 (Lambda)             (None, None, 32)     0           input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_concat_109 (TensorF [(None, 256)]        0           lstm_1[108][1]                   
                                                                 lstm_1[108][2]                   
__________________________________________________________________________________________________
lambda_109

Total params: 390,177
Trainable params: 390,177
Non-trainable params: 0
__________________________________________________________________________________________________


## Model Inferencing

In [25]:
from tqdm.auto import tqdm

In [26]:
state_h_input = Input(shape=(n_lstm))
state_c_input = Input(shape=(n_lstm))
context_input = Input(shape=(1, 2*n_lstm+VOCAB_OUTPUT_SIZE))
decoder_recurrent, decoder_state_h, decoder_state_c = decoder_lstm(context_input, initial_state=[state_h_input, state_c_input])
decoder_dense_output = decoder_dense(decoder_recurrent)

decoder_model = Model([context_input] +  [state_h_input, state_c_input], 
                      [decoder_dense_output] + [decoder_state_h, decoder_state_c])

In [27]:
encoder_model = Model(encoder_inputs, 
                      [encoder_outputs, state_c, state2_c]
)

repeator = repeator
concatenator = concatenator
densor = densor
activator = activator
dotor = dotor

decoder_lstm = decoder_lstm
decoder_dense = decoder_dense

def transcript(X):
    start_inputs = onehottify(np.expand_dims(np.array([Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']]), axis=0), n=32)
    start_inputs = tf.convert_to_tensor(start_inputs)
    
    encoder_outputs, *encoder_states = encoder_model.predict(X)

    outputs = []
    for ty in tqdm(range(MAX_LABEL_LENGTH)):
        s_prev = tf.concat(encoder_states, axis=1)
        s_prev = repeator(s_prev)
        concat = concatenator([encoder_outputs, s_prev])
        e = densor(concat)
        alphas = activator(e)
        context = dotor([alphas, encoder_outputs])
        context = concatenator([start_inputs, context])

        decoder_outputs, *encoder_states = decoder_model.predict([context, encoder_states[0], encoder_states[1]])
        
        start_inputs = decoder_outputs
        outputs.append(decoder_outputs)
    
    outputs = Concatenate(axis=1)(outputs)
        
    return outputs

In [28]:
def pad_sequence(x, max_seq_length):
    """Zero pad input features sequence"""
    out = None
    if x.shape[0] > max_seq_length:
        raise ValueError(f"Found input sequence {x.shape[0]} more than {max_seq_length}")
    elif x.shape[0] < max_seq_length:
        out = np.zeros([max_seq_length, x.shape[1]])
        out[:x.shape[0]] = x
    else:
        out = x

    return out

In [29]:
X_test = np.load("0.npz")['arr_0']
X_test = pad_sequence(X_test, 2500)
X_test = np.expand_dims(X_test, axis=0)
with open("0.txt", 'r') as f:
    y_test = f.readline()

In [30]:
print(X_test.shape)
print(y_test)

(1, 2500, 39)
telah berdiri di dekat rumah haman, lima puluh hasta tingginya.


In [31]:
predictions = transcript(X_test)

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




In [32]:
''.join([Seq2SeqCharMap.IDX_TO_CHAR_MAP[c] for c in tf.argmax(predictions[0], axis=-1).numpy() if c!=0])

'seaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'

# Base Seq2Seq

With fixed output size.

In [None]:
BATCH_SIZE = 16
MAX_EPOCHS = 100
MAX_SEQ_LENGTH = 2500
MAX_LABEL_LENGTH = 250
VOCAB_INPUT_SIZE = len(Seq2SeqCharMap()) + 1
VOCAB_OUTPUT_SIZE = len(Seq2SeqCharMap()) + 1

n_lstm = 128
n_mfcc = 39

## Model Training

In [232]:
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))  # batch shape=(b, MAX_SEQUENCE_LENGTH, n_mfcc)
encoder = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
encoder_outputs, state_h, state_c, state2_h, state2_c = encoder(encoder_inputs)
# encoder_states = [tf.concat([state_h, state_c], axis=1), tf.concat([state2_h, state2_c], axis=1)]  # discard encoder_outputs and keep only states
encoder_states = [state_c, state2_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, VOCAB_INPUT_SIZE))  # batch shape=(b, MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(n_lstm, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(VOCAB_OUTPUT_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [231]:
model.summary()

Model: "model_16"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           [(None, 3000, 39)]   0                                            
__________________________________________________________________________________________________
input_32 (InputLayer)           [(None, 300, 32)]    0                                            
__________________________________________________________________________________________________
bidirectional_7 (Bidirectional) [(None, 3000, 256),  172032      input_31[0][0]                   
__________________________________________________________________________________________________
lstm_24 (LSTM)                  [(None, 300, 128), ( 82432       input_32[0][0]                   
                                                                 bidirectional_7[0][2]     

Train Model.

In [20]:
train_generator = DataGenerator(input_dir=train_dir,
                                max_seq_length=MAX_SEQ_LENGTH,
                                max_label_length=MAX_LABEL_LENGTH-1,
                                ctc_input_length=1495,
                                char_to_idx_map=Seq2SeqCharMap.CHAR_TO_IDX_MAP,
                                batch_size=8,
                                shuffle=False)
X = train_generator[0][0]['the_input']
Y = train_generator[0][0]['the_labels']
print(X.shape, Y.shape)

In [None]:
X = np.random.rand(8, MAX_SEQ_LENGTH, 39)
Y = np.random.randint(0, 32, size=(8, MAX_LABEL_LENGTH-1))

In [81]:
encoder_input_data = X
decoder_target_data = np.apply_along_axis(lambda x: np.append(x[:-1], [2, 0]), 1, Y)  # at end of sentence + blank token
decoder_input_data = np.apply_along_axis(lambda x: np.append([1], x[:-1]), 1, decoder_target_data)  # prepend start token
decoder_target_data = onehottify(decoder_target_data)
decoder_input_data = onehottify(decoder_input_data)

print(encoder_input_data.shape)
print(decoder_target_data.shape)
print(decoder_input_data.shape)

(1, 2500, 39)
(8, 250, 32)
(8, 250, 32)


In [259]:
train_history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                          batch_size=BATCH_SIZE,
                          epochs=MAX_EPOCHS,
)

Train on 8 samples
Epoch 1/100


KeyboardInterrupt: 

## Model Inferencing

In [75]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [125]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = Seq2SeqCharMap.IDX_TO_CHAR_MAP[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '>' or
           len(decoded_sentence) > MAX_LABEL_LENGTH):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [126]:
for seq_index in range(1):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', Y_true[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: inilah kepala kepala para pahlawan yang mengiringi daud, yang telah memberi dukungan yang kuat kepadanya, bersama sama seluruh israel, dalam mencapai kedudukan raja dan yang mengangkat dia sebagai raja, seperti yang difirmankan tuhan mengenai israel.                                                 
Decoded sentence: inilah kelala pala pala pala pala pang yang dii dang dang dang perang seruu dang dang dang kerang dang serang dang dang dang dang perang dang serang dang serang dang dang dang dang perang dang dang dang kerang dang dang dang perang dang serang dang serang dang dang dang perang dang serang dang serang


# Listen, Attend, and Spell Original

Keras implementation of Listen Attend and Spell original [paper](https://arxiv.org/abs/1508.01211).

In [6]:
from gurih.models.my_keras_layers import PStacker, MLPAttention, MLPOutput, LuongAttention, DecoderLSTM, EncoderLSTM
from gurih.models.my_keras_metrics import my_sequence_edit_distance

Configure parameters.

In [16]:
BATCH_SIZE = 16
MAX_EPOCHS = 5000
MAX_SEQ_LENGTH = 2500
n_mfcc = 39
MAX_LABEL_LENGTH = 250
VOCAB_INPUT_SIZE = len(Seq2SeqCharMap())
VOCAB_OUTPUT_SIZE = len(Seq2SeqCharMap())

# Small
n_lstm = 256
n_dense = 256

# # Medium
# n_lstm = 512
# n_dense = 512

# # Large
# n_lstm = 1024
# n_dense = 512

## Model Training

In [17]:
# Inputs
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))  # batch shape=(b, MAX_SEQ_LENGTH, n_mfcc)

# Encoder Model (pStack BiLSTM)
encoder_model = EncoderLSTM(n_lstm)
encoder_outputs, encoder_states = encoder_model(encoder_inputs)

# Attention Model
attention = LuongAttention(n_dense)

# Decoder Model
decoder_inputs = Input(shape=(MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE))  # batch shape=(b, MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE)
decoder_model = DecoderLSTM(n_lstm, n_dense, VOCAB_OUTPUT_SIZE)

# Teacher Forcing Training
decoder_targets = []
for Ty in range(MAX_LABEL_LENGTH):
    slice_input = Lambda(lambda x: x[:, Ty:Ty+1, :])(decoder_inputs)
    context_vector, _ = attention([encoder_outputs, *encoder_states])
    concat_inputs = Concatenate(axis=-1)([slice_input, context_vector])
    decoder_outputs, encoder_states = decoder_model([concat_inputs, *encoder_states])    
    decoder_targets.append(decoder_outputs)

decoder_targets = Concatenate(axis=1)(decoder_targets)
LAS_Model = Model([encoder_inputs, decoder_inputs], decoder_targets)

# Compile Model
LAS_Model.compile(optimizer='rmsprop', 
                  loss='categorical_crossentropy',
                  metrics=[my_sequence_edit_distance])

In [18]:
LAS_Model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 2500, 39)]   0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 250, 31)]    0                                            
__________________________________________________________________________________________________
encoder_lstm_1 (EncoderLSTM)    ((None, 625, 512), [ 2022400     input_3[0][0]                    
__________________________________________________________________________________________________
lambda_254 (Lambda)             (None, 1, 31)        0           input_4[0][0]                    
____________________________________________________________________________________________

                                                                 decoder_lstm_1[16][2]            
                                                                 decoder_lstm_1[16][3]            
                                                                 decoder_lstm_1[16][4]            
                                                                 concatenate_273[0][0]            
                                                                 decoder_lstm_1[17][1]            
                                                                 decoder_lstm_1[17][2]            
                                                                 decoder_lstm_1[17][3]            
                                                                 decoder_lstm_1[17][4]            
                                                                 concatenate_274[0][0]            
                                                                 decoder_lstm_1[18][1]            
          

__________________________________________________________________________________________________
concatenate_323 (Concatenate)   (None, 1, 543)       0           lambda_322[0][0]                 
                                                                 luong_attention_1[68][0]         
__________________________________________________________________________________________________
lambda_323 (Lambda)             (None, 1, 31)        0           input_4[0][0]                    
__________________________________________________________________________________________________
concatenate_324 (Concatenate)   (None, 1, 543)       0           lambda_323[0][0]                 
                                                                 luong_attention_1[69][0]         
__________________________________________________________________________________________________
lambda_324 (Lambda)             (None, 1, 31)        0           input_4[0][0]                    
__________

                                                                 decoder_lstm_1[230][0]           
                                                                 decoder_lstm_1[231][0]           
                                                                 decoder_lstm_1[232][0]           
                                                                 decoder_lstm_1[233][0]           
                                                                 decoder_lstm_1[234][0]           
                                                                 decoder_lstm_1[235][0]           
                                                                 decoder_lstm_1[236][0]           
                                                                 decoder_lstm_1[237][0]           
                                                                 decoder_lstm_1[238][0]           
                                                                 decoder_lstm_1[239][0]           
          

Train the model.

In [19]:
X = np.random.rand(8, MAX_SEQ_LENGTH, 39)
Y = np.random.randint(0, 31, size=(8, MAX_LABEL_LENGTH-1))

bos_idx = Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']
eos_idx = Seq2SeqCharMap.CHAR_TO_IDX_MAP['>']
pad_idx = Seq2SeqCharMap.CHAR_TO_IDX_MAP[' ']
encoder_input_data = X
decoder_target_data = np.apply_along_axis(lambda x: np.append(x[:-1], [eos_idx, pad_idx]), 1, Y)  # at end of sentence + blank token
decoder_input_data = np.apply_along_axis(lambda x: np.append([bos_idx], x[:-1]), 1, decoder_target_data)  # prepend start token
decoder_target_data = onehottify(decoder_target_data)
decoder_input_data = onehottify(decoder_input_data)

print(encoder_input_data.shape)
print(decoder_target_data.shape)
print(decoder_input_data.shape)

(8, 2500, 39)
(8, 250, 31)
(8, 250, 31)


In [20]:
LAS_Model.fit([encoder_input_data, decoder_input_data], decoder_target_data)

Train on 8 samples


<tensorflow.python.keras.callbacks.History at 0x17c5caf08>

## Model Inferencing

In [15]:
LAS_Model.load_weights('LAS_Model_weights.h5')

In [53]:
def decode_sequence(X):    
    encoder_outputs, encoder_states = encoder_model(X)
    
    target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
    target_seq[0, 0, Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']] = 1
    
    stop_condition = False
    decoded_outputs = []
    attention_outputs = []
    while not stop_condition:
        context_vector, alphas = attention([encoder_outputs, *encoder_states])
        concat_inputs = Concatenate(axis=-1)([target_seq, context_vector])
        
        decoder_outputs, encoder_states = decoder_model([concat_inputs, *encoder_states])
        decoded_outputs.append(decoder_outputs)
        attention_outputs.append(alphas)
        
        predicted_id = tf.argmax(decoded_outputs[0], axis=-1)
        if (predicted_id == Seq2SeqCharMap.CHAR_TO_IDX_MAP['>'] or len(decoded_outputs) > MAX_LABEL_LENGTH):
            stop_condition = True
        
        target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
        target_seq[0, 0, predicted_id] = 1

    return decoded_outputs, attention_outputs

In [55]:
decoded_outputs, attention_outputs = decode_sequence(X[0:1])
decoded_outputs = tf.concat(decoded_outputs, axis=1).numpy()

In [48]:
y_pred_string = ''.join([Seq2SeqCharMap.IDX_TO_CHAR_MAP[c] for c in np.argmax(decoded_outputs[0], axis=-1)])
y_pred_string

'uu,,,ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff'

# Listen, Attend, and Spell SpecAugment

Keras implementation of LAS SpecAugment [paper](https://arxiv.org/pdf/1904.08779.pdf)

In [6]:
from gurih.models.my_keras_layers import EncoderLSTMSpecAugment, LuongAttention, DecoderLSTMSpecAugment
from gurih.models.my_keras_metrics import CER

Configure parameters.

In [7]:
BATCH_SIZE = 16
MAX_EPOCHS = 5000
MAX_SEQ_LENGTH = 2500
n_mfcc = 39
MAX_LABEL_LENGTH = 250
VOCAB_INPUT_SIZE = len(Seq2SeqCharMap())
VOCAB_OUTPUT_SIZE = len(Seq2SeqCharMap())

# Small
# n_lstm = 256
# n_dense = 256

# Medium
# n_lstm = 512
# n_dense = 512

# Large
n_lstm = 1024
n_dense = 512

## Model Training

In [8]:
# Inputs
encoder_inputs = Input(shape=(MAX_SEQ_LENGTH, n_mfcc))

# Encoder Model
encoder_model = EncoderLSTMSpecAugment(n_lstm, n_filters=256, kernel_size=11)
encoder_outputs, encoder_states = encoder_model(encoder_inputs)
encoder_states = [encoder_states[1], encoder_states[3]]  # only get the cell state

# Attention Model
attention = LuongAttention(n_dense)

# Decoder Model
decoder_inputs = Input(shape=(MAX_LABEL_LENGTH, VOCAB_INPUT_SIZE))
decoder_model = DecoderLSTMSpecAugment(n_lstm, VOCAB_OUTPUT_SIZE)

# Teacher Forcing Training
decoder_targets = []
for Ty in range(MAX_LABEL_LENGTH):
    slice_input = Lambda(lambda x: x[:, Ty:Ty+1, :])(decoder_inputs)
    context_vector, _ = attention([encoder_outputs, *encoder_states])
    concat_inputs = Concatenate(axis=-1)([slice_input, context_vector])
    decoder_outputs, encoder_states = decoder_model([concat_inputs, *encoder_states])    
    decoder_targets.append(decoder_outputs)

decoder_targets = Concatenate(axis=1)(decoder_targets)
LAS_Model = Model([encoder_inputs, decoder_inputs], decoder_targets)

# Compile Model
LAS_Model.compile(optimizer='rmsprop', 
                  loss='categorical_crossentropy',
                  metrics=[CER])

In [9]:
LAS_Model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2500, 39)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 250, 31)]    0                                            
__________________________________________________________________________________________________
encoder_lstm_spec_augment (Enco ((None, 625, 2048),  86847232    input_1[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 1, 31)        0           input_2[0][0]                    
______________________________________________________________________________________________

lambda_83 (Lambda)              (None, 1, 31)        0           input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_84 (Concatenate)    (None, 1, 2079)      0           lambda_83[0][0]                  
                                                                 luong_attention[83][0]           
__________________________________________________________________________________________________
lambda_84 (Lambda)              (None, 1, 31)        0           input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_85 (Concatenate)    (None, 1, 2079)      0           lambda_84[0][0]                  
                                                                 luong_attention[84][0]           
__________________________________________________________________________________________________
lambda_85 

                                                                 luong_attention[185][0]          
__________________________________________________________________________________________________
lambda_186 (Lambda)             (None, 1, 31)        0           input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_187 (Concatenate)   (None, 1, 2079)      0           lambda_186[0][0]                 
                                                                 luong_attention[186][0]          
__________________________________________________________________________________________________
lambda_187 (Lambda)             (None, 1, 31)        0           input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_188 (Concatenate)   (None, 1, 2079)      0           lambda_187[0][0]                 
          

Train the model.

In [10]:
X = np.random.rand(8, MAX_SEQ_LENGTH, 39)
Y = np.random.randint(0, 31, size=(8, MAX_LABEL_LENGTH-1))

bos_idx = Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']
eos_idx = Seq2SeqCharMap.CHAR_TO_IDX_MAP['>']
pad_idx = Seq2SeqCharMap.CHAR_TO_IDX_MAP[' ']
encoder_input_data = X
decoder_target_data = np.apply_along_axis(lambda x: np.append(x[:-1], [eos_idx, pad_idx]), 1, Y)  # at end of sentence + blank token
decoder_input_data = np.apply_along_axis(lambda x: np.append([bos_idx], x[:-1]), 1, decoder_target_data)  # prepend start token
decoder_target_data = onehottify(decoder_target_data)
decoder_input_data = onehottify(decoder_input_data)

print(encoder_input_data.shape)
print(decoder_target_data.shape)
print(decoder_input_data.shape)

(8, 2500, 39)
(8, 250, 31)
(8, 250, 31)


In [11]:
LAS_Model.fit([encoder_input_data, decoder_input_data], decoder_target_data)

Train on 8 samples


<tensorflow.python.keras.callbacks.History at 0x547b208>

## Model Inferencing

In [14]:
def decode_sequence(X):    
    encoder_outputs, encoder_states = encoder_model(X)
    encoder_states = [encoder_states[1], encoder_states[3]]  # only get the cell state
    
    target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
    target_seq[0, 0, Seq2SeqCharMap.CHAR_TO_IDX_MAP['<']] = 1
    
    stop_condition = False
    decoded_outputs = []
    attention_outputs = []
    while not stop_condition:
        context_vector, alphas = attention([encoder_outputs, *encoder_states])
        concat_inputs = Concatenate(axis=-1)([target_seq, context_vector])
        
        decoder_outputs, encoder_states = decoder_model([concat_inputs, *encoder_states])
        decoded_outputs.append(decoder_outputs)
        attention_outputs.append(alphas)
        
        predicted_id = tf.argmax(decoded_outputs[0], axis=-1)
        if (predicted_id == Seq2SeqCharMap.CHAR_TO_IDX_MAP['>'] or len(decoded_outputs) > MAX_LABEL_LENGTH):
            stop_condition = True
        
        target_seq = np.zeros((1, 1, VOCAB_INPUT_SIZE))
        target_seq[0, 0, predicted_id] = 1

    return decoded_outputs, attention_outputs

In [15]:
decoded_outputs, attention_outputs = decode_sequence(X[0:1])
decoded_outputs = tf.concat(decoded_outputs, axis=1).numpy()



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float

In [16]:
y_pred_string = ''.join([Seq2SeqCharMap.IDX_TO_CHAR_MAP[c] for c in np.argmax(decoded_outputs[0], axis=-1)])
y_pred_string

'ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc'

In [60]:
from kapre.time_frequency import Melspectrogram
from tensorflow.keras.layers import Reshape

In [39]:
test = tf.convert_to_tensor("../../test/test_data/test_20kb.wav")

In [64]:
# Turn the wav file into a string tensor
input_data = tf.io.read_file(test)
# Convert the string tensor to a float32 tensor
audio, sr = tf.audio.decode_wav(input_data)

In [66]:
audio = tf.expand_dims(tf.transpose(audio, [1, 0]), axis=0)
audio

<tf.Tensor: shape=(1, 1, 7808), dtype=float32, numpy=
array([[[0.00170898, 0.00244141, 0.00219727, ..., 0.00244141,
         0.00292969, 0.00268555]]], dtype=float32)>

In [67]:
spectrogrator = Melspectrogram(n_dft=512, n_hop=256, input_shape=(1, 7808),
                              padding='same', sr=sr, n_mels=128,
                              fmin=0.0, fmax=sr/2, power_melgram=2.0,
                              return_decibel_melgram=True, trainable_fb=False,
                              trainable_kernel=False,
                              name='trainable_stft')

In [68]:
tf.squeeze(spectrogrator(audio), axis=-1)

<tf.Tensor: shape=(1, 128, 31), dtype=float32, numpy=
array([[[-53.520317, -57.2899  , -51.22443 , ..., -55.41263 ,
         -50.060318, -44.754036],
        [-50.451477, -54.22106 , -48.15559 , ..., -52.343784,
         -46.991474, -41.685196],
        [-48.228218, -52.768066, -47.68058 , ..., -43.341637,
         -61.34958 , -40.598373],
        ...,
        [-80.      , -78.64502 , -79.25717 , ..., -80.      ,
         -80.      , -80.      ],
        [-80.      , -80.      , -77.98331 , ..., -80.      ,
         -80.      , -80.      ],
        [-80.      , -80.      , -80.      , ..., -80.      ,
         -80.      , -80.      ]]], dtype=float32)>