**Deep Speech 1** — Keras implementation based on arXiv:1412.5567 ("Deep Speech: Scaling up end-to-end speech recognition"). All credit goes to the original authors.

In [50]:
import numpy as np
import keras
import sklearn
import tensorflow
from keras import Sequential
from keras import layers
# print("Finished importing")

In [80]:
def buildModel_test(input_dim, output_dim, context=5, units=1024, dropouts=(0.1, 0.1, 0), print_summary=True):
    """Build the DeepSpeech-style network with the Keras functional API.

    The default arguments are the default parameters used for the DeepSpeech model.

    Args:
        input_dim: Number of features per time step; the model input is
            declared as ``[None, input_dim]`` (variable-length time axis).
        output_dim: Number of units in the final dense layer, i.e. the number
            of classes (characters) the per-time-step softmax ranges over.
        context: Number of time steps of context on each side; the first layer
            sees ``2 * context + 1`` consecutive frames.
        units: Width of the convolutional, dense and recurrent layers.
        dropouts: Dropout rates applied after layers 1, 2 and 3 respectively.
            Only the first three entries are used.
        print_summary: When true (the default, matching the previous
            behaviour) ``model.summary()`` is printed before returning.

    Returns:
        The uncompiled ``keras.Model``.

    Raises:
        ValueError: If ``dropouts`` has fewer than three entries or ``context``
            is negative.
    """
    # Validate up front so a bad argument fails with a clear message instead of
    # an IndexError (or a Keras shape error) half-way through graph construction.
    if len(dropouts) < 3:
        raise ValueError(
            "dropouts must provide three rates (layers 1-3), got %d" % len(dropouts)
        )
    if context < 0:
        raise ValueError("context must be non-negative, got %r" % (context,))

    # Upper bound of the clipped ReLU used after layers 1-3.
    relu_clip = 20

    # Input layer and preprocessing for the first layer.
    _input = layers.Input([None, input_dim])

    # Conv2D needs a channel axis, so append one (axis=-1) to the input.
    x = layers.Lambda(keras.backend.expand_dims, arguments=dict(axis=-1))(_input)

    # ** Layer 1 **
    # Zero-padded convolution over the time axis only: pad `context` frames on
    # both ends of the time axis, then use a kernel spanning 2*context+1 frames
    # and the full feature axis, which collapses the feature axis to size 1.
    x = layers.ZeroPadding2D(padding=(context, 0))(x)
    x = layers.Conv2D(filters=units, kernel_size=(context * 2 + 1, input_dim))(x)

    # Drop the collapsed feature axis again.
    x = layers.Lambda(keras.backend.squeeze, arguments=dict(axis=2))(x)

    # Clipped ReLU and dropout on the convolution output.
    x = layers.ReLU(max_value=relu_clip)(x)
    x = layers.Dropout(rate=dropouts[0])(x)

    # ** Layers 2 and 3 **
    # Two identical stages: a dense layer applied independently to every time
    # step (TimeDistributed), followed by clipped ReLU and dropout.
    for rate in dropouts[1:3]:
        x = layers.TimeDistributed(layers.Dense(units))(x)
        x = layers.ReLU(max_value=relu_clip)(x)
        x = layers.Dropout(rate=rate)(x)

    # ** Layer 4 **
    # Bidirectional RNN whose output is the sum of the forward and backward units.
    # NOTE(review): SimpleRNN is left at its default activation here, while
    # layers 1-3 use the clipped ReLU. The referenced paper appears to use the
    # clipped ReLU in the recurrent layer as well, and to have one more dense
    # layer before the softmax - confirm whether this simplification is intended.
    x = layers.Bidirectional(layers.SimpleRNN(units, return_sequences=True), merge_mode='sum')(x)

    # ** Layer 5 **
    # Final dense layer followed by softmax: per-time-step predictions over characters.
    x = layers.TimeDistributed(layers.Dense(output_dim))(x)
    _output = layers.Softmax()(x)

    model = keras.Model(_input, _output)

    if print_summary:
        model.summary()

    return model

In [83]:
def buildModel(input_dim, output_dim, context=5, units=1024, dropouts=(0.1, 0.1, 0), print_summary=True):
    """Build the DeepSpeech-style network as a ``keras.Sequential`` model.

    Args:
        input_dim: Number of features per time step; the model input is
            declared as ``[None, input_dim]`` (variable-length time axis).
        output_dim: Number of units in the final softmax dense layer, i.e. the
            number of classes (characters) predicted per time step.
        context: Number of time steps of context on each side; the first layer
            sees ``2 * context + 1`` consecutive frames.
        units: Width of the convolutional, dense and recurrent layers.
        dropouts: Dropout rates applied after layers 1, 2 and 3 respectively.
            Only the first three entries are used.
        print_summary: When true (the default, matching the previous
            behaviour) ``model.summary()`` is printed before returning.

    Returns:
        The uncompiled ``keras.Sequential`` model.

    Raises:
        ValueError: If ``dropouts`` has fewer than three entries or ``context``
            is negative.
    """
    # Validate up front so a bad argument fails with a clear message instead of
    # an IndexError (or a Keras shape error) after part of the model was built.
    if len(dropouts) < 3:
        raise ValueError(
            "dropouts must provide three rates (layers 1-3), got %d" % len(dropouts)
        )
    if context < 0:
        raise ValueError("context must be non-negative, got %r" % (context,))

    # Upper bound of the clipped ReLU used after layers 1-3.
    relu_clip = 20

    model = keras.Sequential()

    # Input layer and preprocessing for the first layer.
    model.add(layers.Input([None, input_dim]))

    # Conv2D needs a channel axis, so append one (axis=-1) to the input.
    model.add(layers.Lambda(keras.backend.expand_dims, arguments=dict(axis=-1)))

    # ** Layer 1 **
    # Zero-padded convolution over the time axis only: pad `context` frames on
    # both ends of the time axis, then use a kernel spanning 2*context+1 frames
    # and the full feature axis, which collapses the feature axis to size 1.
    model.add(layers.ZeroPadding2D(padding=(context, 0)))
    model.add(layers.Conv2D(filters=units, kernel_size=(context * 2 + 1, input_dim)))

    # Drop the collapsed feature axis again.
    model.add(layers.Lambda(keras.backend.squeeze, arguments=dict(axis=2)))

    # Clipped ReLU and dropout on the convolution output.
    model.add(layers.ReLU(max_value=relu_clip))
    model.add(layers.Dropout(rate=dropouts[0]))

    # ** Layers 2 and 3 **
    # Two identical stages: a dense layer applied independently to every time
    # step (TimeDistributed), followed by clipped ReLU and dropout.
    for rate in dropouts[1:3]:
        model.add(layers.TimeDistributed(layers.Dense(units)))
        model.add(layers.ReLU(max_value=relu_clip))
        model.add(layers.Dropout(rate=rate))

    # ** Layer 4 **
    # Bidirectional RNN whose output is the sum of the forward and backward units.
    # NOTE(review): SimpleRNN is left at its default activation here, while
    # layers 1-3 use the clipped ReLU. The referenced paper appears to use the
    # clipped ReLU in the recurrent layer as well, and to have one more dense
    # layer before the softmax - confirm whether this simplification is intended.
    model.add(layers.Bidirectional(layers.SimpleRNN(units, return_sequences=True), merge_mode='sum'))

    # ** Layer 5 **
    # Final per-time-step dense layer with softmax over the output classes.
    model.add(layers.TimeDistributed(layers.Dense(output_dim, activation='softmax')))

    if print_summary:
        model.summary()

    return model

In [86]:
# Build both variants on an arbitrary input/output size and actually check that
# the Sequential and functional definitions agree, instead of only eyeballing
# the two printed summaries.
sequential_model = buildModel(1000, 25)
model = buildModel_test(1000, 25)  # `model` stays bound to the functional variant

# Cheap sanity check: same number of parameters and same output shape.
assert sequential_model.count_params() == model.count_params(), "parameter counts differ"
assert sequential_model.output_shape == model.output_shape, "output shapes differ"

# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# TODO:
    # Implement CTC Loss
    # Check details for Nesterov Accelerated optimizer to pass into compile

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_48 (Lambda)           (None, None, 1000, 1)     0         
_________________________________________________________________
zero_padding2d_24 (ZeroPaddi (None, None, 1000, 1)     0         
_________________________________________________________________
conv2d_22 (Conv2D)           (None, None, 1, 1024)     11265024  
_________________________________________________________________
lambda_49 (Lambda)           (None, None, 1024)        0         
_________________________________________________________________
re_lu_50 (ReLU)              (None, None, 1024)        0         
_________________________________________________________________
dropout_49 (Dropout)         (None, None, 1024)        0         
_________________________________________________________________
time_distributed_43 (TimeDis (None, None, 1024)      