# Deepspeech Implementation

*Implementation referenced from arXiv:1412.5567. All credits belong to original authors*

First we import the required libraries. For this implementation, we will mainly use functions from the Keras library, with the exception of a few functions in the CTC_loss method, where we will need to rely on the Tensorflow library.

In [43]:
import numpy as np
import keras
import sklearn
import tensorflow as tf
from keras import Sequential, Model
from keras import optimizers
from keras import layers

# Model Architecture

We show two implementations of the model architecture: one using the functional API (`keras.Model`) and one using `keras.Sequential`.

*TODO: ADD DESCRIPTIONS*

**Keras.Model Architecture**

In [44]:
# Default params for DeepSpeech model
def buildModel_test(input_dim, output_dim, context = 5, units = 1024, dropouts = (0.1,0.1,0)):
    """Build the DeepSpeech network with the Keras functional API.

    Args:
        input_dim: Number of features per time-step of the input.
        output_dim: Number of classes predicted for each time-step.
        context: Number of zero-padded frames added on each side of the time
            axis; the first layer's kernel covers ``2 * context + 1`` frames.
        units: Width of the convolutional, dense and recurrent layers.
        dropouts: Dropout rates applied after layers 1, 2 and 3, in that order.

    Returns:
        The assembled ``keras.Model``. Its summary is printed as a side effect.
    """
    clip_at = 20                 # upper bound of the clipped ReLU
    window = 2 * context + 1     # kernel extent along the time axis

    inputs = layers.Input(shape=[None, input_dim])

    # Conv2D needs a channel axis, so append one at the end.
    hidden = layers.Lambda(keras.backend.expand_dims, arguments={"axis": -1})(inputs)

    # -- Layer 1: convolution over the time axis only --
    # Only the time axis is padded; the kernel spans the whole feature axis,
    # so that axis collapses to size 1 and is squeezed away afterwards.
    hidden = layers.ZeroPadding2D(padding=(context, 0))(hidden)
    hidden = layers.Conv2D(filters=units, kernel_size=(window, input_dim))(hidden)
    hidden = layers.Lambda(keras.backend.squeeze, arguments={"axis": 2})(hidden)
    hidden = layers.ReLU(max_value=clip_at)(hidden)
    hidden = layers.Dropout(rate=dropouts[0])(hidden)

    # -- Layers 2 and 3: per-time-step dense + clipped ReLU + dropout --
    for rate_index in (1, 2):
        hidden = layers.TimeDistributed(layers.Dense(units))(hidden)
        hidden = layers.ReLU(max_value=clip_at)(hidden)
        hidden = layers.Dropout(rate=dropouts[rate_index])(hidden)

    # -- Layer 4: bidirectional RNN, forward and backward outputs are summed --
    recurrent = layers.SimpleRNN(units, return_sequences=True)
    hidden = layers.Bidirectional(recurrent, merge_mode='sum')(hidden)

    # -- Layer 5: per-time-step projection to the classes, then softmax --
    scores = layers.TimeDistributed(layers.Dense(output_dim))(hidden)
    outputs = layers.Softmax()(scores)

    network = keras.Model(inputs=inputs, outputs=outputs)
    network.summary()

    return network

**Keras.Sequential Architecture**

In [45]:
def buildModel(input_dim, output_dim, context = 5, units = 1024, dropouts = (0.1,0.1,0)):
    """Build the DeepSpeech network as a ``keras.Sequential`` stack.

    Args:
        input_dim: Number of features per time-step of the input.
        output_dim: Number of classes predicted for each time-step.
        context: Number of zero-padded frames added on each side of the time
            axis; the first layer's kernel covers ``2 * context + 1`` frames.
        units: Width of the convolutional, dense and recurrent layers.
        dropouts: Dropout rates applied after layers 1, 2 and 3, in that order.

    Returns:
        The assembled ``keras.Sequential`` model. Its summary is printed as a
        side effect.
    """
    clip_at = 20                 # upper bound of the clipped ReLU
    window = 2 * context + 1     # kernel extent along the time axis

    network = keras.Sequential()
    add = network.add

    add(layers.Input([None, input_dim]))

    # Conv2D needs a channel axis, so append one at the end.
    add(layers.Lambda(keras.backend.expand_dims, arguments={"axis": -1}))

    # -- Layer 1: convolution over the time axis only --
    # Only the time axis is padded; the kernel spans the whole feature axis,
    # so that axis collapses to size 1 and is squeezed away afterwards.
    add(layers.ZeroPadding2D(padding=(context, 0)))
    add(layers.Conv2D(filters=units, kernel_size=(window, input_dim)))
    add(layers.Lambda(keras.backend.squeeze, arguments={"axis": 2}))
    add(layers.ReLU(max_value=clip_at))
    add(layers.Dropout(rate=dropouts[0]))

    # -- Layers 2 and 3: per-time-step dense + clipped ReLU + dropout --
    for rate_index in (1, 2):
        add(layers.TimeDistributed(layers.Dense(units)))
        add(layers.ReLU(max_value=clip_at))
        add(layers.Dropout(rate=dropouts[rate_index]))

    # -- Layer 4: bidirectional RNN, forward and backward outputs are summed --
    recurrent = layers.SimpleRNN(units, return_sequences=True)
    add(layers.Bidirectional(recurrent, merge_mode='sum'))

    # -- Layer 5: per-time-step dense layer with a built-in softmax --
    classifier = layers.Dense(output_dim, activation='softmax')
    add(layers.TimeDistributed(classifier))

    network.summary()

    return network

In [46]:
# Smoke test: build both variants with an arbitrary input size (1000 features)
# and output size (25 classes) so their printed summaries can be compared.
# Note that `model` is rebound by the second call, so after this cell it refers
# to the functional-API model returned by buildModel_test.
model = buildModel(1000, 25)
model = buildModel_test(1000, 25)

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_26 (Lambda)           (None, None, 1000, 1)     0         
_________________________________________________________________
zero_padding2d_13 (ZeroPaddi (None, None, 1000, 1)     0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, None, 1, 1024)     11265024  
_________________________________________________________________
lambda_27 (Lambda)           (None, None, 1024)        0         
_________________________________________________________________
re_lu_39 (ReLU)              (None, None, 1024)        0         
_________________________________________________________________
dropout_39 (Dropout)         (None, None, 1024)        0         
_________________________________________________________________
time_distributed_39 (TimeDis (None, None, 1024)       

# Implementation of CTC Loss

The reference implementation uses the TensorFlow function *"tf.nn.ctc_loss"* to define CTC_loss. In our implementation, we attempt to use the *"keras.backend.ctc_batch_cost"* function. The Keras function is more streamlined in that we only need to provide 4 arguments: Y_true (ground-truth labels), Y_pred (softmax output from our model), pred_length (sequence length of each batch item in Y_pred), and true_length (sequence length of each batch item in Y_true). Both length arguments are expected to have shape (batch, 1).

In order to calculate pred_length and true_length, we reference several TensorFlow functions used in the original implementation:

> *tf.ones_like()* and *tf.math.reduce_sum()*

The first function creates a copy of any input tensor in which all values are replaced with 1's. The second function allows us to perform a summation of values along a specified axis. If we apply this function along axis 1 of a tensor of shape (batch, sequence_length) where all values are 1, and keep the reduced axis (`keepdims=True`), the result is a tensor of shape (batch, 1) that tells us the sequence length for each batch item. This allows us to obtain the true_length array from the Y_true tensor.

> *tf.reduce_max()* 

However, to obtain the pred_length array from Y_pred, we need to perform one extra step. Our Y_pred has dimensions (batch, sequence_length, num_char_classes): for every time-step, the softmax gives us a probability distribution over all possible character classes. In order to obtain the right dimensions to apply *tf.ones_like()* and *tf.math.reduce_sum()*, we need to remove the class dimension. We can do this by calling *tf.reduce_max()* on our output, which returns the maximum value (not its index) along a specific axis, thus effectively removing the chosen dimension. The actual values do not matter here, since *tf.ones_like()* replaces them with 1's anyway. Here, we call reduce_max on axis 2, which holds the per-character probabilities, to obtain a tensor of shape (batch, sequence_length) that we can then operate on similar to Y_true.



In [47]:
def ctc_loss(y_true, y_pred):
    """Compute the per-sample CTC loss with ``keras.backend.ctc_batch_cost``.

    Args:
        y_true: Label tensor; axis 1 is treated as the label sequence axis.
        y_pred: Model output; axis 1 is treated as the time axis and axis 2
            as the class axis.

    Returns:
        The tensor returned by ``tf.keras.backend.ctc_batch_cost``.

    Note:
        Both lengths are derived from the tensor shapes alone, so every
        position along axis 1 is counted, including any padding.
    """
    # ctc_batch_cost expects both length tensors with shape (batch, 1) and
    # squeezes their last axis, so the reduced axis must be kept.
    # Get length array of y_true: count the positions along axis 1.
    true_length = tf.math.reduce_sum(tf.ones_like(y_true), 1, keepdims=True)
    # Get length array of y_pred: reduce_max only serves to drop the class
    # axis, its values are replaced by ones before summing over time.
    time_steps = tf.ones_like(tf.math.reduce_max(y_pred, 2))
    pred_length = tf.math.reduce_sum(time_steps, 1, keepdims=True)
    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, pred_length, true_length)

**Optimizer**

We use the SGD optimizer with Nesterov Accelerated Gradient as per the paper. Momentum is set to 0.99.

In [48]:
# Stochastic gradient descent with Nesterov momentum (momentum 0.99) and a
# learning rate of 0.01.
optimizer = optimizers.SGD(learning_rate=0.01, momentum=0.99, nesterov=True, name="SGD")

Now that we have defined our CTC_loss and optimizer, as well as specified the model architecture, we will need to compile the model as shown below

In [49]:
# Compile the model with the custom CTC loss and the SGD optimizer defined above.
# NOTE(review): 'accuracy' compares labels and predictions position by position;
# with CTC the label and output sequences are presumably of different lengths,
# so this metric may fail or be meaningless — confirm once real data is available.
model.compile(loss=ctc_loss, optimizer=optimizer, metrics=['accuracy'])

In [52]:
# We can now fit the model:
# TODO: Make sure fit can run once processed input data and truth labels available
# model.fit()

# Resources:

+ DeepSpeech paper: https://arxiv.org/pdf/1412.5567.pdf
+ The original implementation of DeepSpeech: https://github.com/rolczynski/Automatic-Speech-Recognition
+ Keras Library: https://keras.io/api/
+ Tensorflow Library: https://www.tensorflow.org/api_docs/python/
+ https://www.tensorflow.org/api_docs/python/tf/nn/ctc_loss
+ https://stackoverflow.com/questions/57292896/understanding-ctc-loss-for-speech-recognition-in-keras
+ https://chadrick-kwag.net/tf-keras-rnn-ctc-example/