In [0]:
!wget https://downloads.tatoeba.org/audio/tatoeba_audio_eng.zip

--2020-01-23 11:30:47--  https://downloads.tatoeba.org/audio/tatoeba_audio_eng.zip
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4116471982 (3.8G) [application/zip]
Saving to: ‘tatoeba_audio_eng.zip’


2020-01-23 11:34:04 (20.0 MB/s) - ‘tatoeba_audio_eng.zip’ saved [4116471982/4116471982]



In [0]:
import os
import zipfile

local_zip = '/content/tatoeba_audio_eng.zip'
zip_ref = zipfile.ZipFile(local_zip, 'r')
zip_ref.extractall('/content/sample_data/tatoeba')
zip_ref.close()

In [0]:
import pandas as pd

t_data = pd.read_csv("/content/sentence_200.tsv",sep='\t')
t_data.head()

Unnamed: 0,id,username,text
0,1276,CK,Let's try something.
1,1277,CK,I have to go to sleep.
2,1284,CK,I will be back soon.
3,1287,CK,This is never going to end.
4,1288,CK,I just don't know what to say.


In [0]:
file_name = []
abs_path = []
target = []
features = []
def get_all():
  for i,d in t_data.iterrows():
    fname = str(d["id"])+".mp3"
    path = "/content/sample_data/tatoeba/tatoeba_audio_eng/audio/"+d["username"]+"/"

    fpath = path + fname

    abs_path.append(fpath)
    file_name.append(fname)
    target.append(d["text"])
    #print(fname)

    wave, sr = librosa.load(fpath, mono=True)
    mfccs = librosa.feature.mfcc(wave, sr=sr)
    mfccs=np.pad(mfccs,((0,0),(0,350-len(mfccs[0]))), mode='constant', constant_values=0)
    features.append(np.array(mfccs))

In [0]:
import librosa
import numpy as np

get_all()

## CNN+BLSTM+CTC Loss

In [0]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import librosa
from string import ascii_lowercase


class ASR(tf.keras.Model):
    '''
    Class for defining the end to end ASR model.
    This model consists of a 1D convolutional layer followed by a bidirectional LSTM
    followed by a fully connected layer applied at each timestep.
    This is a bare-bones architecture.
    Experiment with your own architectures to get a good WER
    '''
    def __init__(self, filters, kernel_size, conv_stride, conv_border, n_lstm_units, n_dense_units):
        batch_size = 10
        super(ASR, self).__init__()
        self.conv_layer = tf.keras.layers.Conv1D(filters,
                                                 kernel_size,
                                                 strides=conv_stride,
                                                 padding=conv_border,
                                                 activation='relu')
        self.lstm_layer = tf.keras.layers.LSTM(n_lstm_units,
                                               return_sequences=True,
                                               batch_input_shape=(batch_size, 450, 129),
                                               stateful=True,
                                               activation='tanh')
        self.lstm_layer_back = tf.keras.layers.LSTM(n_lstm_units,
                                                    return_sequences=True,
                                                    go_backwards=True,
                                                    batch_input_shape=(batch_size, 450, 129),
                                                    stateful =True,
                                                    activation='tanh')
        self.blstm_layer = tf.keras.layers.Bidirectional(self.lstm_layer, backward_layer=self.lstm_layer_back)
        self.dense_layer = tf.keras.layers.Dense(n_dense_units)

    def call(self, x):
        x = self.conv_layer(x)
        x = self.blstm_layer(x)
        x = self.dense_layer(x)
        return x


def compute_ctc_loss(logits, labels, logit_length, label_length):
    '''
    function to compute CTC loss.
    Note: tf.nn.ctc_loss applies log softmax to its input automatically
    :param logits: Logits from the output dense layer
    :param labels: Labels converted to array of indices
    :param logit_length: Array containing length of each input in the batch
    :param label_length: Array containing length of each label in the batch
    :return: array of ctc loss for each element in batch
    '''
    return tf.nn.ctc_loss(
        labels=labels,
        logits=logits,
        label_length=label_length,
        logit_length=logit_length,
        logits_time_major=False,
        unique=None,
        blank_index=-1,
        name=None
    )


def create_spectrogram(signals):
    '''
    function to create spectrogram from signals loaded from an audio file
    :param signals:
    :return:
    '''
    stfts = tf.signal.stft(signals, frame_length=350, frame_step=80, fft_length=256)
    spectrograms = tf.math.pow(tf.abs(stfts), 0.5)
    return spectrograms


def generate_input_from_audio_file(path_to_audio_file, resample_to=8000):
    '''
    function to create input for our neural network from an audio file.
    The function loads the audio file using librosa, resamples it, and creates spectrogram form it
    :param path_to_audio_file: path to the audio file
    :param resample_to:
    :return: spectrogram corresponding to the input file
    '''
    # load the signals and resample them
    signal, sample_rate = librosa.core.load(path_to_audio_file)
    if signal.shape[0] == 2:
        signal = np.mean(signal, axis=0)
    signal_resampled = librosa.core.resample(signal, sample_rate, resample_to)

    # create spectrogram
    X = create_spectrogram(signal_resampled)

    # normalisation
    means = tf.math.reduce_mean(X, 1, keepdims=True)
    stddevs = tf.math.reduce_std(X, 1, keepdims=True)
    X = tf.divide(tf.subtract(X, means), stddevs)
    return X


def generate_target_output_from_text(target_text):
    '''
    Target output is an array of indices for each character in your string.
    The indices comes from a mapping that will
    be used while decoding the ctc output.
    :param target_text: (str) target string
    :return: array of indices for each character in the string
    '''
    space_token = ' '
    end_token = '>'
    blank_token = '%'
    alphabet = list(ascii_lowercase) + [space_token, end_token, blank_token]
    char_to_index = {}
    for idx, char in enumerate(alphabet):
        char_to_index[char] = idx

    y = []
    for char in target_text:
        y.append(char_to_index[char])
    return y


def train_sample(x, y, optimizer, model):
    '''
    function perform forward and backpropagation on one batch
    :param x: one batch of input
    :param y: one batch of target
    :param optimizer: optimizer
    :param model: object of the ASR class
    :return: loss from this step
    '''
    with tf.GradientTape() as tape:
        logits = model(x)
        labels = y
        logits_length = [logits.shape[1]]*logits.shape[0]
        labels_length = [labels.shape[1]]*labels.shape[0]
        loss = compute_ctc_loss(logits, labels, logit_length=logits_length, label_length=labels_length)
        loss = tf.reduce_mean(loss)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss


def train(model, optimizer, X, Y, epochs):
    '''
    function to train the model for given number of epochs
    Note:
    For this example, I am passing a single batch of input to this function
    Therefore, the loop for iterating through batches is missing
    :param model: object of class ASR
    :param optimizer: optimizer
    :param X:
    :param Y:
    :param epochs:
    :return: None
    '''
    for step in range(1, epochs):
        loss = train_sample(X, Y, optimizer, model)
        print('Epoch {}, Loss: {}'.format(step, loss))



In [0]:
model = ASR(200, 11, 2, 'valid', 400, 29)
optimizer = tf.keras.optimizers.Adam()

for v in range(0,3):
  sample_call = abs_path[v]
  print("-------------------------ITERATION_NUMBER: ",(v+1),"--------------------------")
  transcript = target[v].lower().replace("'","").replace(".",">").replace("?","").replace("!","").replace(",","").replace("6","")
  X = generate_input_from_audio_file(sample_call)
  X = tf.expand_dims(X, axis=0)  # converting input into a batch of size 1
  y = generate_target_output_from_text(transcript)
  y = tf.expand_dims(tf.convert_to_tensor(y), axis=0)  # converting output to a batch of size 1
  print('Input shape: {}'.format(X.shape))
  print('Target shape: {}'.format(y.shape))

  train(model, optimizer, X, y, 100)


-------------------------ITERATION_NUMBER:  1 --------------------------
Input shape: (1, 141, 129)
Target shape: (1, 19)
Epoch 1, Loss: 171.0594940185547
Epoch 2, Loss: 59.87071228027344
Epoch 3, Loss: 105.64752197265625
Epoch 4, Loss: 77.04242706298828
Epoch 5, Loss: 46.46166229248047
Epoch 6, Loss: 64.21551513671875
Epoch 7, Loss: 56.4273681640625
Epoch 8, Loss: 43.419639587402344
Epoch 9, Loss: 39.608238220214844
Epoch 10, Loss: 40.07508087158203
Epoch 11, Loss: 39.13264465332031
Epoch 12, Loss: 35.725582122802734
Epoch 13, Loss: 31.462244033813477
Epoch 14, Loss: 28.292951583862305
Epoch 15, Loss: 26.630937576293945
Epoch 16, Loss: 25.10251235961914
Epoch 17, Loss: 22.86440658569336
Epoch 18, Loss: 20.521387100219727
Epoch 19, Loss: 18.484294891357422
Epoch 20, Loss: 16.6839656829834
Epoch 21, Loss: 14.987215042114258
Epoch 22, Loss: 13.547384262084961
Epoch 23, Loss: 12.29317569732666
Epoch 24, Loss: 11.104994773864746
Epoch 25, Loss: 10.037562370300293
Epoch 26, Loss: 9.18428707

In [0]:
# getting the ctc output
i =2
X = generate_input_from_audio_file(abs_path[i])

X = tf.expand_dims(X, axis=0)  # converting input into a batch of size 1
  
ctc_output = model(X)
ctc_output = tf.nn.log_softmax(ctc_output)

#model.save('Test_Model')
print(target[i])
# greedy decoding
space_token = ' '
end_token = '>'
blank_token = '%'
lm = None
alphabet = list(ascii_lowercase) + [space_token, end_token, blank_token]
output_text = ''
for timestep in ctc_output[0]:
    output_text += alphabet[tf.math.argmax(timestep)]
print(output_text)
#print(prefix_beam_search(ctc_output, alphabet, blank_token, end_token, space_token, lm=lm))
print('\n\nNote: Applying a good decoder on this output will give you readable output')


I will be back soon.
ii%%%%%%%%%%%  %%%%%%%%wwwiiiiiiiii%%%%%ll%%ll%  be back soo%%oo%%%%%%nn%%>>>%


Note: Applying a good decoder on this output will give you readable output


In [0]:
# getting the ctc output
i =7
X = generate_input_from_audio_file(abs_path[i])

X = tf.expand_dims(X, axis=0)  # converting input into a batch of size 1
  
ctc_output = model(X)
ctc_output = tf.nn.log_softmax(ctc_output)

#model.save('Test_Model')
print(target[i])
# greedy decoding
space_token = ' '
end_token = '>'
blank_token = '%'
lm = None
alphabet = list(ascii_lowercase) + [space_token, end_token, blank_token]
output_text = ''
for timestep in ctc_output[0]:
    output_text += alphabet[tf.math.argmax(timestep)]
print(output_text)
#print(prefix_beam_search(ctc_output, alphabet, blank_token, end_token, space_token, lm=lm))
print('\n\nNote: Applying a good decoder on this output will give you readable output')


I don't know if I have the time.
%%%%%%%%%%%w%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%wwi%%%%%%%%%%%%%%%%%%%%%%%ii%%%%%%%%%


Note: Applying a good decoder on this output will give you readable output


In [0]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import librosa
from string import ascii_lowercase

class ASR(tf.keras.Model):
    '''
    Class for defining the end to end ASR model.
    This model consists of a 1D convolutional layer followed by a bidirectional LSTM
    followed by a fully connected layer applied at each timestep.
    This is a bare-bones architecture.
    Experiment with your own architectures to get a good WER
    '''
    def __init__(self, filters, kernel_size, conv_stride, conv_border, n_lstm_units, n_dense_units):
        batch_size = 10
        super(ASR, self).__init__()
        self.conv_layer = tf.keras.layers.Conv1D(filters,
                                                 kernel_size,
                                                 strides=conv_stride,
                                                 padding=conv_border,
                                                 activation='relu')
        self.lstm_layer = tf.keras.layers.LSTM(n_lstm_units,
                                               return_sequences=True,
                                               batch_input_shape=(batch_size, 129, 450),
                                               stateful=True,
                                               activation='tanh')
        self.lstm_layer_back = tf.keras.layers.LSTM(n_lstm_units,
                                                    return_sequences=True,
                                                    go_backwards=True,
                                                    batch_input_shape=(batch_size, 129, 450),
                                                    stateful =True,
                                                    activation='tanh')
        self.blstm_layer = tf.keras.layers.Bidirectional(self.lstm_layer, backward_layer=self.lstm_layer_back)
        self.dense_layer = tf.keras.layers.Dense(n_dense_units)

    def call(self, x):
        x = self.conv_layer(x)
        x = self.blstm_layer(x)
        x = self.dense_layer(x)
        return x


def compute_ctc_loss(logits, labels, logit_length, label_length):
    '''
    function to compute CTC loss.
    Note: tf.nn.ctc_loss applies log softmax to its input automatically
    :param logits: Logits from the output dense layer
    :param labels: Labels converted to array of indices
    :param logit_length: Array containing length of each input in the batch
    :param label_length: Array containing length of each label in the batch
    :return: array of ctc loss for each element in batch
    '''
    return tf.nn.ctc_loss(
        labels=labels,
        logits=logits,
        label_length=label_length,
        logit_length=logit_length,
        logits_time_major=False,
        unique=None,
        blank_index=-1,
        name=None
    )


def create_spectrogram(signals):
    '''
    function to create spectrogram from signals loaded from an audio file
    :param signals:
    :return:
    '''
    stfts = tf.signal.stft(signals, frame_length=450, frame_step=80, fft_length=256)
    spectrograms = tf.math.pow(tf.abs(stfts), 0.5)
    #print(spectrograms)
    return spectrograms

def generate_input_from_audio_file2(path_to_audio_file, resample_to=8000):
    '''
    function to create input for our neural network from an audio file.
    The function loads the audio file using librosa, resamples it, and creates spectrogram form it
    :param path_to_audio_file: path to the audio file
    :param resample_to:
    :return: spectrogram corresponding to the input file
    '''
    # load the signals and resample them
    signal, sample_rate = librosa.core.load(path_to_audio_file)
    if signal.shape[0] == 2:
        signal = np.mean(signal, axis=0)
    signal_resampled = librosa.core.resample(signal, sample_rate, resample_to)

    # create spectrogram
    X = create_spectrogram(signal_resampled)

    # normalisation
    means = tf.math.reduce_mean(X, 1, keepdims=True)
    stddevs = tf.math.reduce_std(X, 1, keepdims=True)
    X = tf.divide(tf.subtract(X, means), stddevs)
    return X

def generate_input_from_audio_file(resample_to=8000):
    '''
    function to create input for our neural network from an audio file.
    The function loads the audio file using librosa, resamples it, and creates spectrogram form it
    :param path_to_audio_file: path to the audio file
    :param resample_to:
    :return: spectrogram corresponding to the input file
    '''
    # load the signals and resample them
    El = []
    for i in range(0,10):
      path_to_audio_file = abs_path[i]
      signal, sample_rate = librosa.core.load(path_to_audio_file)
      if signal.shape[0] == 2:
          signal = np.mean(signal, axis=0)
      signal_resampled = librosa.core.resample(signal, sample_rate, resample_to)
    # print(signal_resampled)
      # create spectrogram
      X = create_spectrogram(signal_resampled)
      means = tf.math.reduce_mean(X, 1, keepdims=True)
      stddevs = tf.math.reduce_std(X, 1, keepdims=True)
      tp = tf.divide(tf.subtract(X, means), stddevs)
      
      tq=tf.transpose(tp,[1,0])
      padding = tf.tile([[0]], tf.stack([tf.shape(tq)[0], 450 - tf.shape(tq)[1]], 0))
      #print(padding)
      tq = tf.concat([tq, tf.cast(padding,tf.float32)], axis=1)
      tp =tf.transpose(tq,[1,0])
      El.append(tq)
      
    X = tf.stack(El)  
    return X


def generate_target_output_from_text():
    '''
    Target output is an array of indices for each character in your string.
    The indices comes from a mapping that will
    be used while decoding the ctc output.
    :param target_text: (str) target string
    :return: array of indices for each character in the string
    '''
    space_token = ' '
    end_token = '>'
    blank_token = '%'
    alphabet = list(ascii_lowercase) + [space_token, end_token, blank_token]
    char_to_index = {}
    ret = []
    for idx, char in enumerate(alphabet):
      char_to_index[char] = idx
    for i in range(0,10):
      target_text = target[i].lower().replace(",","").replace(".",">").replace("!","").replace("'","").replace("?","")
      
      y = []
      for char in target_text:
          y.append(char_to_index[char])
      for i in range(len(y),121):
          y.append(0)
      tp = tf.convert_to_tensor(y)
      ret.append(tp)
    y = tf.stack(ret)
    return y


def train_sample(x, y, optimizer, model):
    '''
    function perform forward and backpropagation on one batch
    :param x: one batch of input
    :param y: one batch of target
    :param optimizer: optimizer
    :param model: object of the ASR class
    :return: loss from this step
    '''
    with tf.GradientTape() as tape:
        logits = model(x)
        labels = y
        logits_length = [logits.shape[1]]*logits.shape[0]
        labels_length = [labels.shape[1]]*labels.shape[0]
        loss = compute_ctc_loss(logits, labels, logit_length=logits_length, label_length=labels_length)
        loss = tf.reduce_mean(loss)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss


def train(model, optimizer, X, Y, epochs):
    '''
    function to train the model for given number of epochs
    Note:
    For this example, I am passing a single batch of input to this function
    Therefore, the loop for iterating through batches is missing
    :param model: object of class ASR
    :param optimizer: optimizer
    :param X:
    :param Y:
    :param epochs:
    :return: None
    '''
    for step in range(1, epochs):
        loss = train_sample(X, Y, optimizer, model)
        print('Epoch {}, Loss: {}'.format(step, loss))


if __name__ == '__main__':
    
    model = ASR(200, 11, 2, 'valid', 450, 29)
    optimizer = tf.keras.optimizers.Adam()

    
    sample_call =[]
    X = generate_input_from_audio_file()
      #X.append(tp)
    #X = tf.expand_dims(X, axis=0)  # converting input into a batch of size 1
    print('Input shape: {}'.format(X.shape))

    y = generate_target_output_from_text()
    #   y = tf.expand_dims(tf.convert_to_tensor(y), axis=0)  # converting output to a batch of size 1
    #   print('Input shape: {}'.format(X.shape))
    print('Target shape: {}'.format(y.shape))
    #print(X)
    #print(y)
    train(model, optimizer, X, y, 99)

    # # getting the ctc output
    #X = generate_input_from_audio_file()
    #X = tf.expand_dims(X, axis=2)
    ctc_output = model(X)
    ctc_output = tf.nn.log_softmax(ctc_output)

    # #model.save('Test_Model')

    # # greedy decoding
    space_token = ' '
    end_token = '>'
    blank_token = '%'
    alphabet = list(ascii_lowercase) + [space_token, end_token, blank_token]
    output_text = ''
    for timestep in ctc_output[0]:
      output_text += alphabet[tf.math.argmax(timestep)]
    print(output_text)
    print('\n\nNote: Applying a good decoder on this output will give you readable output')

Input shape: (10, 129, 450)
Target shape: (10, 121)
Epoch 1, Loss: 855.1011962890625
Epoch 2, Loss: 737.8053588867188
Epoch 3, Loss: 707.0111083984375
Epoch 4, Loss: 706.8961181640625
Epoch 5, Loss: 706.893798828125
Epoch 6, Loss: 706.8936767578125
Epoch 7, Loss: 706.8936767578125
Epoch 8, Loss: 706.8936767578125
Epoch 9, Loss: 706.8936767578125
Epoch 10, Loss: 706.8936157226562
Epoch 11, Loss: 706.8936157226562
Epoch 12, Loss: 706.8936157226562
Epoch 13, Loss: 706.8936157226562
Epoch 14, Loss: 706.8936157226562
Epoch 15, Loss: 706.8936157226562
Epoch 16, Loss: 706.8936157226562
Epoch 17, Loss: 706.8936157226562
Epoch 18, Loss: 706.8936157226562
Epoch 19, Loss: 706.8936157226562
Epoch 20, Loss: 706.8936157226562
Epoch 21, Loss: 706.8936157226562
Epoch 22, Loss: 706.8936157226562
Epoch 23, Loss: 706.8936157226562
Epoch 24, Loss: 706.8936157226562
Epoch 25, Loss: 706.8936157226562
Epoch 26, Loss: 706.8936157226562
Epoch 27, Loss: 706.8936157226562
Epoch 28, Loss: 706.8936157226562
Epoch 

## BLSTM+CTC

In [0]:
!wget https://www.dropbox.com/s/xecprghgwbbuk3m/vctk-pc225.tar.gz?dl=1

--2020-01-23 17:33:50--  https://www.dropbox.com/s/xecprghgwbbuk3m/vctk-pc225.tar.gz?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.9.1, 2620:100:601f:1::a27d:901
Connecting to www.dropbox.com (www.dropbox.com)|162.125.9.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/dl/xecprghgwbbuk3m/vctk-pc225.tar.gz [following]
--2020-01-23 17:33:50--  https://www.dropbox.com/s/dl/xecprghgwbbuk3m/vctk-pc225.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc39524a90f34f96b9a9f68949ba.dl.dropboxusercontent.com/cd/0/get/AwsRKIK2FgiqMdKho3YQU2nVp8qf6laFi66lFIxWZaMw4Zvp89DSUTZAoGqn9i4OG_3G8Hdt1YP5sDl0QCmfTyeX8nPgNTBSxEimvVkVCGivpg/file?dl=1# [following]
--2020-01-23 17:33:51--  https://uc39524a90f34f96b9a9f68949ba.dl.dropboxusercontent.com/cd/0/get/AwsRKIK2FgiqMdKho3YQU2nVp8qf6laFi66lFIxWZaMw4Zvp89DSUTZAoGqn9i4OG_3G8Hdt1YP5sDl0QCmfTyeX8nPgNTBSxEimvVkVCGivpg/file?dl

In [0]:
%tensorflow_version 1.x
import tensorflow as tf

In [0]:
!tar xvzf vctk-pc225.tar.gz?dl=1 && rm -rf vctk-pc225.tar.gz?dl=1.1

vctk-p225/
vctk-p225/txt/
vctk-p225/txt/p225/
vctk-p225/txt/p225/p225_026.txt
vctk-p225/txt/p225/p225_248.txt
vctk-p225/txt/p225/p225_030.txt
vctk-p225/txt/p225/p225_274.txt
vctk-p225/txt/p225/p225_258.txt
vctk-p225/txt/p225/p225_264.txt
vctk-p225/txt/p225/p225_201.txt
vctk-p225/txt/p225/p225_320.txt
vctk-p225/txt/p225/p225_109.txt
vctk-p225/txt/p225/p225_281.txt
vctk-p225/txt/p225/p225_212.txt
vctk-p225/txt/p225/p225_355.txt
vctk-p225/txt/p225/p225_021.txt
vctk-p225/txt/p225/p225_318.txt
vctk-p225/txt/p225/p225_202.txt
vctk-p225/txt/p225/p225_322.txt
vctk-p225/txt/p225/p225_275.txt
vctk-p225/txt/p225/p225_291.txt
vctk-p225/txt/p225/p225_356.txt
vctk-p225/txt/p225/p225_014.txt
vctk-p225/txt/p225/p225_013.txt
vctk-p225/txt/p225/p225_103.txt
vctk-p225/txt/p225/p225_157.txt
vctk-p225/txt/p225/p225_282.txt
vctk-p225/txt/p225/p225_028.txt
vctk-p225/txt/p225/p225_337.txt
vctk-p225/txt/p225/p225_135.txt
vctk-p225/txt/p225/p225_037.txt
vctk-p225/txt/p225/p225_133.txt
vctk-p225/txt/p225/p225_17

In [0]:
!pip install python_speech_features

Collecting python_speech_features
  Downloading https://files.pythonhosted.org/packages/ff/d1/94c59e20a2631985fbd2124c45177abaa9e0a4eee8ba8a305aa26fc02a8e/python_speech_features-0.6.tar.gz
Building wheels for collected packages: python-speech-features
  Building wheel for python-speech-features (setup.py) ... [?25l[?25hdone
  Created wheel for python-speech-features: filename=python_speech_features-0.6-cp36-none-any.whl size=5889 sha256=9c505ca78fb2e2177e5f8bea81c5968f985822c908eafe18db3986b961cc0293
  Stored in directory: /root/.cache/pip/wheels/3c/42/7c/f60e9d1b40015cd69b213ad90f7c18a9264cd745b9888134be
Successfully built python-speech-features
Installing collected packages: python-speech-features
Successfully installed python-speech-features-0.6


In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from six.moves.urllib.request import urlretrieve
from six.moves import xrange as range

import os
import sys
import numpy as np

url = 'https://catalog.ldc.upenn.edu/desc/addenda/'
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
    """A hook to report the progress of a download. This is mostly intended for
    users with slow internet connections. Reports every 1% change in download
    progress.
    """
    global last_percent_reported
    percent = int(count * blockSize * 100 / totalSize)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()

        last_percent_reported = percent


def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, filename,
                                  reporthook=download_progress_hook)
        print('\nDownload Complete!')
    statinfo = os.stat(filename)

    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        raise Exception(
                        'Failed to verify ' + filename + \
                        '. Can you get to it with a browser?')
    return filename

def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representention of x.
    Args:
        sequences: a list of lists of type dtype where each element is a sequence
    Returns:
        A tuple with (indices, values, shape)
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n]*len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1]+1], dtype=np.int64)

    return indices, values, shape

def pad_sequences(sequences, maxlen=None, dtype=np.float32,
                  padding='post', truncating='post', value=0.):
    '''Pads each sequence to the same length: the length of the longest
    sequence.
        If maxlen is provided, any sequence longer than maxlen is truncated to
        maxlen. Truncation happens off either the beginning or the end
        (default) of the sequence. Supports post-padding (default) and
        pre-padding.
        Args:
            sequences: list of lists where each element is a sequence
            maxlen: int, maximum length
            dtype: type to cast the resulting sequence.
            padding: 'pre' or 'post', pad either before or after each sequence.
            truncating: 'pre' or 'post', remove values from sequences larger
            than maxlen either in the beginning or in the end of the sequence
            value: float, value to pad the sequences to the desired value.
        Returns
            x: numpy array with dimensions (number_of_sequences, maxlen)
            lengths: numpy array with the original sequence lengths
    '''
    lengths = np.asarray([len(s) for s in sequences], dtype=np.int64)

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x, lengths

In [0]:
class FileLogger(object):
    def __init__(self, full_filename, headers):
        self._headers = headers
        self._out_fp = open(full_filename, 'w')
        self._write(headers)

    def write(self, line):
        assert len(line) == len(self._headers)
        self._write(line)

    def close(self):
        self._out_fp.close()

    def _write(self, arr):
        arr = [str(e) for e in arr]
        self._out_fp.write(' '.join(arr) + '\n')
        self._out_fp.flush()

In [0]:
import os
from glob import glob
from random import shuffle
from time import time

import dill
import librosa

SENTENCE_ID = 'sentence_id'
SPEAKER_ID = 'speaker_id'
FILENAME = 'filename'


def find_files(directory, pattern='**/*.wav'):
    """Recursively finds all files matching the pattern."""
    return sorted(glob(os.path.join(directory, pattern), recursive=True))


def read_audio_from_filename(filename, sample_rate):
    # import scipy.io.wavfile as wav
    # fs, audio = wav.read(filename)
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.reshape(-1, 1)
    return audio


def extract_speaker_id(filename):
    return filename.split('/')[-2]


def extract_sentence_id(filename):
    return filename.split('/')[-1].split('_')[1].split('.')[0]


class AudioReader(object):
    def __init__(self,
                 audio_dir,
                 sample_rate,
                 cache_dir='cache',
                 speakers_sub_list=None):
        print('Initializing AudioReader()')
        print('audio_dir = {}'.format(audio_dir))
        print('cache_dir = {}'.format(cache_dir))
        print('sample_rate = {}'.format(sample_rate))
        print('speakers_sub_list = {}'.format(speakers_sub_list))
        self.audio_dir = audio_dir
        self.cache_dir = cache_dir
        self.sample_rate = sample_rate
        self.metadata = dict()  # small cache <SPEAKER_ID -> SENTENCE_ID, filename>
        self.cache = dict()  # big cache <filename, data:audio librosa, text.>

        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

        #st = time()
        if len(find_files(cache_dir, pattern='*.pkl')) == 0:  # generate all the pickle files.
            print('Nothing found at {}. Generating all the caches now.'.format(cache_dir))
            files = find_files(audio_dir)
            assert len(files) != 0, 'Cannot find any VCTK files there. Are you sure audio_dir is correct?'
            print('Found {} files in total in {}.'.format(len(files), audio_dir))
            if speakers_sub_list is not None:
                files = list(
                    filter(lambda x: any(word in extract_speaker_id(x) for word in speakers_sub_list), files))
                print('{} files correspond to the speaker list {}.'.format(len(files), speakers_sub_list))
            assert len(files) != 0

            for filename in files:
                try:
                    target_text = open(filename.replace('wav48', 'txt').replace('.wav', '.txt'), 'r').read().strip()
                    speaker_id = extract_speaker_id(filename)
                    audio = read_audio_from_filename(filename, self.sample_rate)
                    obj = {'audio': audio,
                           'target': target_text,
                           FILENAME: filename}
                    cache_filename = filename.split('/')[-1].split('.')[0] + '_cache'
                    tmp_filename = os.path.join(cache_dir, cache_filename) + '.pkl'
                    with open(tmp_filename, 'wb') as f:
                        dill.dump(obj, f)
                        print('[DUMP AUDIO] {}'.format(tmp_filename))
                    if speaker_id not in self.metadata:
                        self.metadata[speaker_id] = {}
                    sentence_id = extract_sentence_id(filename)
                    if sentence_id not in self.metadata[speaker_id]:
                        self.metadata[speaker_id][sentence_id] = []
                    self.metadata[speaker_id][sentence_id] = {SPEAKER_ID: speaker_id,
                                                              SENTENCE_ID: sentence_id,
                                                              FILENAME: filename}
                except librosa.util.exceptions.ParameterError as e:
                    print(e)
                    print('[DUMP AUDIO ERROR SKIPPING FILENAME] {}'.format(filename))
            dill.dump(self.metadata, open(os.path.join(cache_dir, 'metadata.pkl'), 'wb'))

        print('Using the generated files at {}. Using them to load the cache. '
              'Be sure to have enough memory.'.format(cache_dir))
        self.metadata = dill.load(open(os.path.join(cache_dir, 'metadata.pkl'), 'rb'))

        pickle_files = find_files(cache_dir, pattern='*.pkl')
        for pkl_file in pickle_files:
            if 'metadata' not in pkl_file:
                with open(pkl_file, 'rb') as f:
                    obj = dill.load(f)
                    self.cache[obj[FILENAME]] = obj
        #print('Cache took {0:.2f} seconds to load. {1:} keys.'.format(time() - st, len(self.cache)))

    def get_speaker_list(self):
        return sorted(list(self.metadata.keys()))

    def sample_speakers(self, speaker_list, num_speakers):
        if speaker_list is None:
            speaker_list = self.get_speaker_list()
        all_speakers = list(speaker_list)
        shuffle(all_speakers)
        speaker_list = all_speakers[0:num_speakers]
        return speaker_list

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from python_speech_features import mfcc

SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # 0 is reserved to space


def convert_inputs_to_ctc_format(audio, fs, target_text, num_features):
    # print(target_text)
    inputs = mfcc(audio, samplerate=fs, numcep=num_features)
    # Transform in 3D array
    train_inputs = np.asarray(inputs[np.newaxis, :])
    train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)
    train_seq_len = [train_inputs.shape[1]]

    # Get only the words between [a-z] and replace period for none
    original = ' '.join(target_text.strip().lower().split(' ')).replace('.', '').replace('?', '').replace(',',
                                                                                                          '').replace(
        "'", '').replace('!', '').replace('-', '')
    # print(original)
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')

    # Adding blank label
    targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])

    # Transform char into index
    targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                          for x in targets])

    return train_inputs, targets, train_seq_len, original


def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representention of x.
    Args:
        sequences: a list of lists of type dtype where each element is a sequence
    Returns:
        A tuple with (indices, values, shape)
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)

    return indices, values, shape


def pad_sequences(sequences, maxlen=None, dtype=np.float32,
                  padding='post', truncating='post', value=0.):
    '''Pads each sequence to the same length: the length of the longest
    sequence.
        If maxlen is provided, any sequence longer than maxlen is truncated to
        maxlen. Truncation happens off either the beginning or the end
        (default) of the sequence. Supports post-padding (default) and
        pre-padding.
        Args:
            sequences: list of lists where each element is a sequence
            maxlen: int, maximum length
            dtype: type to cast the resulting sequence.
            padding: 'pre' or 'post', pad either before or after each sequence.
            truncating: 'pre' or 'post', remove values from sequences larger
            than maxlen either in the beginning or in the end of the sequence
            value: float, value to pad the sequences to the desired value.
        Returns
            x: numpy array with dimensions (number_of_sequences, maxlen)
            lengths: numpy array with the original sequence lengths
    '''
    lengths = np.asarray([len(s) for s in sequences], dtype=np.int64)

    nb_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x, lengths

### num_epochs = 100000
num_hidden = 256
batch_size = 16

num_examples = 1
num_batches_per_epoch = 10

In [0]:
import operator
import random
import time

import numpy as np
import tensorflow as tf


sample_rate = 8000
# Some configs
num_features = 13  # log filter bank or MFCC features
# Accounting the 0th index +  space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 100000
num_hidden = 256
batch_size = 16

num_examples = 1
num_batches_per_epoch = 10

# make sure the values match the ones in generate_audio_cache.py
audio = AudioReader(audio_dir="vctk-p225",
                    cache_dir='cache',
                    sample_rate=sample_rate)

file_logger = FileLogger('out.tsv', ['curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 15
        if train:
            utterances = [a[0] for a in utterances[test_index:]]
        else:
            utterances = [a[0] for a in utterances[:test_index]]
        random_utterance = random.choice(utterances)
        training_element = audio.cache[random_utterance]
        target_text = training_element['target']
        if train:
            l_shift = np.random.randint(low=1, high=1000)
            audio_buffer = training_element['audio'][l_shift:]
        else:
            audio_buffer = training_element['audio']
        x, y, seq_len, original = convert_inputs_to_ctc_format(audio_buffer,
                                                               sample_rate,
                                                               target_text,
                                                               num_features)
        x_batch.append(x)
        y_batch.append(y)
        seq_len_batch.append(seq_len)
        original_batch.append(original)

    # Creating sparse representation to feed the placeholder
    # inputs = np.concatenate(x_batch, axis=0)
    y_batch = sparse_tuple_from(y_batch)
    seq_len_batch = np.array(seq_len_batch)[:, 0]
    for i, pad in enumerate(np.max(seq_len_batch) - seq_len_batch):
        x_batch[i] = np.pad(x_batch[i], ((0, 0), (0, pad), (0, 0)), mode='constant', constant_values=0)

    x_batch = np.concatenate(x_batch, axis=0)
    # return np.array(list(x_batch[0]) * batch_size), y_batch, np.array(seq_len_batch[0] * batch_size), original_batch
    # np.pad(x_batch[0], ((0, 0), (10, 0), (0, 0)), mode='constant', constant_values=0)

    return x_batch, y_batch, seq_len_batch, original_batch


def decode_batch(d, original, phase='training'):
    aligned_original_string = ''
    aligned_decoded_string = ''
    for jj in range(batch_size)[0:2]:  # just for visualisation purposes. we display only 2.
        values = d.values[np.where(d.indices[:, 0] == jj)[0]]
        str_decoded = ''.join([chr(x) for x in np.asarray(values) + FIRST_INDEX])
        # Replacing blank label to none
        str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
        # Replacing space label to space
        str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')
        maxlen = max(len(original[jj]), len(str_decoded))
        aligned_original_string += str(original[jj]).ljust(maxlen) + ' | '
        aligned_decoded_string += str(str_decoded).ljust(maxlen) + ' | '
    print('- Original (%s) : %s ...' % (phase, aligned_original_string))
    print('- Decoded  (%s) : %s ...' % (phase, aligned_decoded_string))


def run_ctc():
    graph = tf.Graph()
    with graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_step_size, num_features], but the
        # batch_size and max_step_size can vary along each step
        inputs = tf.placeholder(tf.float32, [None, None, num_features], name='inputs')

        # Here we use sparse_placeholder that will generate a
        # SparseTensor required by ctc_loss op.
        # https://www.tensorflow.org/api_docs/python/tf/sparse/SparseTensor
        # https://www.tensorflow.org/api_docs/python/tf/nn/ctc_loss
        targets = tf.sparse_placeholder(tf.int32, name='targets')

        # 1d array of size [batch_size]
        seq_len = tf.placeholder(tf.int32, [None], name='seq_len')

        # Defining the cell
        # Can be:
        #   tf.nn.rnn_cell.RNNCell
        #   tf.nn.rnn_cell.GRUCell
        cell = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)

        # Stacking rnn cells
        stack = tf.contrib.rnn.MultiRNNCell([cell], state_is_tuple=True)

        # The second output is the last state and we will no use that
        outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

        shape = tf.shape(inputs)
        batch_s, max_time_steps = shape[0], shape[1]

        # Reshaping to apply the same weights over the timesteps
        outputs = tf.reshape(outputs, [-1, num_hidden])

        # Truncated normal with mean 0 and stdev=0.1
        # Tip: Try another initialization
        # see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
        W = tf.Variable(tf.truncated_normal([num_hidden,
                                             num_classes],
                                            stddev=0.1))
        # Zero initialization
        # Tip: Is tf.zeros_initializer the same?
        b = tf.Variable(tf.constant(0., shape=[num_classes]))

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [batch_s, -1, num_classes])

        # Time major
        logits = tf.transpose(logits, (1, 0, 2))

        loss = tf.nn.ctc_loss(targets, logits, seq_len)
        cost = tf.reduce_mean(loss)

        # optimizer = tf.train.AdamOptimizer().minimize(cost)
        # optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9).minimize(cost)
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

        # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
        # (it's slower but you'll get better results)
        decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)

        # Inaccuracy: label error rate
        ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                              targets))

    with tf.Session(graph=graph) as session:

        tf.global_variables_initializer().run()

        for curr_epoch in range(num_epochs):
            train_cost = train_ler = 0
            start = time.time()

            for batch in range(num_batches_per_epoch):
                train_inputs, train_targets, train_seq_len, original = next_batch(train=True)
                feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}

                batch_cost, _, train_ler_p, d = session.run([cost, optimizer, ler, decoded[0]], feed)
                train_cost += batch_cost / num_batches_per_epoch
                train_ler += train_ler_p / num_batches_per_epoch
                decode_batch(d, original, phase='training')

            val_inputs, val_targets, val_seq_len, val_original = next_batch(train=False)
            val_feed = {inputs: val_inputs,
                        targets: val_targets,
                        seq_len: val_seq_len}

            val_cost, val_ler = session.run([cost, ler], feed_dict=val_feed)

            # Decoding
            # np.where(np.diff(d.indices[:, 0]) == 1)
            d = session.run(decoded[0], feed_dict=val_feed)
            decode_batch(d, val_original, phase='validation')

            print('-' * 80)
            log = "Epoch {}/{}, train_cost = {:.3f}, train_ler = {:.3f}, " \
                  "val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}"

            file_logger.write([curr_epoch + 1,
                               train_cost,
                               train_ler,
                               val_cost,
                               val_ler])

            print(log.format(curr_epoch + 1, num_epochs, train_cost, train_ler,
                             val_cost, val_ler, time.time() - start))

  

Initializing AudioReader()
audio_dir = vctk-p225
cache_dir = cache
sample_rate = 8000
speakers_sub_list = None
Using the generated files at cache. Using them to load the cache. Be sure to have enough memory.


In [0]:
run_ctc()

- Original (training) : three hours later the man was free           | the occasion was the new labour conference in brighton last year                                        |  ...
- Decoded  (training) : bdkbdtyjhgjtstogcqbkbkiwgjyhngieujrutisdckcb | bkbkb qbkqcqcqckbcbkbkbkbkckbobkwcwgisisssusvgv vkbgvigsigiehey jhvieauhgvibkboghietgogv avakrtustogceb |  ...
- Original (training) : the event was held in the councils headquarters in hamilton | already he has been a tremendous influence in the dressing room |  ...
- Decoded  (training) : bt ggyho yauavgogynvgvuog                                   | bav a jtssbsgotib                                               |  ...
- Original (training) : but the story of the play is worth a play in itself | they did not attack the themes of the book |  ...
- Decoded  (training) : beb bbo rbegat b                                    | begigjtrb                                  |  ...
- Original (training) : why do you want to come to edinburgh  | i 

KeyboardInterrupt: ignored

### New Section

In [0]:
import operator
import random
import time

import numpy as np
import tensorflow as tf


sample_rate = 8000
# Some configs
num_features = 20  # log filter bank or MFCC features
# Accounting the 0th index +  space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 250
num_hidden = 256
batch_size = 16
dropout = 0.8
num_examples = 1
num_batches_per_epoch = 10

# make sure the values match the ones in generate_audio_cache.py
audio = AudioReader(audio_dir="vctk-p225",
                    cache_dir='cache',
                    sample_rate=sample_rate)

file_logger = FileLogger('out.tsv', ['curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 15
        if train:
            utterances = [a[0] for a in utterances[test_index:]]
        else:
            utterances = [a[0] for a in utterances[:test_index]]
        random_utterance = random.choice(utterances)
        training_element = audio.cache[random_utterance]
        target_text = training_element['target']
        if train:
            l_shift = np.random.randint(low=1, high=1000)
            audio_buffer = training_element['audio'][l_shift:]
        else:
            audio_buffer = training_element['audio']
        x, y, seq_len, original = convert_inputs_to_ctc_format(audio_buffer,
                                                               sample_rate,
                                                               target_text,
                                                               num_features)
        x_batch.append(x)
        y_batch.append(y)
        seq_len_batch.append(seq_len)
        original_batch.append(original)

    # Creating sparse representation to feed the placeholder
    # inputs = np.concatenate(x_batch, axis=0)
    y_batch = sparse_tuple_from(y_batch)
    seq_len_batch = np.array(seq_len_batch)[:, 0]
    for i, pad in enumerate(np.max(seq_len_batch) - seq_len_batch):
        x_batch[i] = np.pad(x_batch[i], ((0, 0), (0, pad), (0, 0)), mode='constant', constant_values=0)

    x_batch = np.concatenate(x_batch, axis=0)
    # return np.array(list(x_batch[0]) * batch_size), y_batch, np.array(seq_len_batch[0] * batch_size), original_batch
    # np.pad(x_batch[0], ((0, 0), (10, 0), (0, 0)), mode='constant', constant_values=0)

    return x_batch, y_batch, seq_len_batch, original_batch


def decode_batch(d, original, phase='training'):
    aligned_original_string = ''
    aligned_decoded_string = ''
    for jj in range(batch_size)[0:2]:  # just for visualisation purposes. we display only 2.
        values = d.values[np.where(d.indices[:, 0] == jj)[0]]
        str_decoded = ''.join([chr(x) for x in np.asarray(values) + FIRST_INDEX])
        # Replacing blank label to none
        str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
        # Replacing space label to space
        str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')
        maxlen = max(len(original[jj]), len(str_decoded))
        aligned_original_string += str(original[jj]).ljust(maxlen) + ' | '
        aligned_decoded_string += str(str_decoded).ljust(maxlen) + ' | '
    print('- Original (%s) : %s ...' % (phase, aligned_original_string))
    print('- Decoded  (%s) : %s ...' % (phase, aligned_decoded_string))


def run_ctc():
    graph = tf.Graph()
    with graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_step_size, num_features], but the
        # batch_size and max_step_size can vary along each step
        inputs = tf.placeholder(tf.float32, [None, None, num_features], name='inputs')

        # Here we use sparse_placeholder that will generate a
        # SparseTensor required by ctc_loss op.
        # https://www.tensorflow.org/api_docs/python/tf/sparse/SparseTensor
        # https://www.tensorflow.org/api_docs/python/tf/nn/ctc_loss
        targets = tf.sparse_placeholder(tf.int32, name='targets')

        # 1d array of size [batch_size]
        seq_len = tf.placeholder(tf.int32, [None], name='seq_len')

        # Defining the cell
        # Can be:
        #   tf.nn.rnn_cell.RNNCell
        #   tf.nn.rnn_cell.GRUCell
        cell = tf.contrib.rnn.LSTMCell(num_hidden,forget_bias=1.0, state_is_tuple=True)

        # Stacking rnn cells
        stack = tf.contrib.rnn.MultiRNNCell([cell], state_is_tuple=True)

        # The second output is the last state and we will no use that
        outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

        shape = tf.shape(inputs)
        batch_s, max_time_steps = shape[0], shape[1]

        # Reshaping to apply the same weights over the timesteps
        outputs = tf.reshape(outputs, [-1, num_hidden])

        # Truncated normal with mean 0 and stdev=0.1
        # Tip: Try another initialization
        # see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
        W = tf.Variable(tf.truncated_normal([num_hidden,
                                             num_classes],
                                            stddev=0.1))
        # Zero initialization
        # Tip: Is tf.zeros_initializer the same?
        b = tf.Variable(tf.constant(0., shape=[num_classes]))

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [batch_s, -1, num_classes])

        # Time major
        logits = tf.transpose(logits, (1, 0, 2))

        loss = tf.nn.ctc_loss(targets, logits, seq_len)
        cost = tf.reduce_mean(loss)

        # optimizer = tf.train.AdamOptimizer().minimize(cost)
        # optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9).minimize(cost)
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

        # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
        # (it's slower but you'll get better results)
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)

        # Inaccuracy: label error rate
        ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                              targets))

    with tf.Session(graph=graph) as session:

        tf.global_variables_initializer().run()

        for curr_epoch in range(num_epochs):
            train_cost = train_ler = 0
            start = time.time()

            for batch in range(num_batches_per_epoch):
                train_inputs, train_targets, train_seq_len, original = next_batch(train=True)
                feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}

                batch_cost, _, train_ler_p, d = session.run([cost, optimizer, ler, decoded[0]], feed)
                train_cost += batch_cost / num_batches_per_epoch
                train_ler += train_ler_p / num_batches_per_epoch
                decode_batch(d, original, phase='training')

            val_inputs, val_targets, val_seq_len, val_original = next_batch(train=False)
            val_feed = {inputs: val_inputs,
                        targets: val_targets,
                        seq_len: val_seq_len}

            val_cost, val_ler = session.run([cost, ler], feed_dict=val_feed)

            # Decoding
            # np.where(np.diff(d.indices[:, 0]) == 1)
            d = session.run(decoded[0], feed_dict=val_feed)
            decode_batch(d, val_original, phase='validation')

            print('-' * 80)
            log = "Epoch {}/{}, train_cost = {:.3f}, train_ler = {:.3f}, " \
                  "val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}"

            file_logger.write([curr_epoch + 1,
                               train_cost,
                               train_ler,
                               val_cost,
                               val_ler])

            print(log.format(curr_epoch + 1, num_epochs, train_cost, train_ler,
                             val_cost, val_ler, time.time() - start))

  

Initializing AudioReader()
audio_dir = vctk-p225
cache_dir = cache
sample_rate = 8000
speakers_sub_list = None
Using the generated files at cache. Using them to load the cache. Be sure to have enough memory.


In [0]:
run_ctc()

- Original (training) : and it had been a marvellous occasion                                                                                                                                                                                                       | it is a common problem                                               |  ...
- Decoded  (training) : nwnwnwnwnwnwnwnwnwnwnwnqnwnwnqnqnqnqnqnqnqnqnqnqnqnqnwnwnqnwngnwnwnwnoxoxsxnxncxcypxsyaypynxngnhnwnwnwnwhwhwhnhqhohohohohowhewhohohoeoehewewnwnynwnkdudrdrdrdriririrjrjrmriririrgridvdtbtbtbtbdbagji i itdrdrdrirdrprfrfrirmrtrfrghohxqxqxq | nwnxexexexcmsmruririririt tbtatatirirzfbtmatfriv ititmtxtxwxnxyxnxnx |  ...
- Original (training) : i thought his speech was a disgrace                                                                            | already he has been a tremendous influence in the dressing room                                                                                                                            

# Converting Above Model in Keras and Tensorflow 2.0

In [0]:
import operator
import random
import time

import numpy as np
import tensorflow as tf


sample_rate = 8000
# Some configs
num_features = 13  # log filter bank or MFCC features
# Accounting the 0th index +  space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 100000
num_hidden = 256
batch_size = 16

num_examples = 1
num_batches_per_epoch = 10

# make sure the values match the ones in generate_audio_cache.py
audio = AudioReader(audio_dir="vctk-p225",
                    cache_dir='cache',
                    sample_rate=sample_rate)

file_logger = FileLogger('out.tsv', ['curr_epoch', 'train_cost', 'train_ler', 'val_cost', 'val_ler'])


def next_batch(bs=batch_size, train=True):
    x_batch = []
    y_batch = []
    seq_len_batch = []
    original_batch = []
    for k in range(bs):
        ut_length_dict = dict([(k, len(v['target'])) for (k, v) in audio.cache.items()])
        utterances = sorted(ut_length_dict.items(), key=operator.itemgetter(1))
        test_index = 15
        if train:
            utterances = [a[0] for a in utterances[test_index:]]
        else:
            utterances = [a[0] for a in utterances[:test_index]]
        random_utterance = random.choice(utterances)
        training_element = audio.cache[random_utterance]
        target_text = training_element['target']
        if train:
            l_shift = np.random.randint(low=1, high=1000)
            audio_buffer = training_element['audio'][l_shift:]
        else:
            audio_buffer = training_element['audio']
        x, y, seq_len, original = convert_inputs_to_ctc_format(audio_buffer,
                                                               sample_rate,
                                                               target_text,
                                                               num_features)
        x_batch.append(x)
        y_batch.append(y)
        seq_len_batch.append(seq_len)
        original_batch.append(original)

    # Creating sparse representation to feed the placeholder
    # inputs = np.concatenate(x_batch, axis=0)
    y_batch = sparse_tuple_from(y_batch)
    seq_len_batch = np.array(seq_len_batch)[:, 0]
    for i, pad in enumerate(np.max(seq_len_batch) - seq_len_batch):
        x_batch[i] = np.pad(x_batch[i], ((0, 0), (0, pad), (0, 0)), mode='constant', constant_values=0)

    x_batch = np.concatenate(x_batch, axis=0)
    # return np.array(list(x_batch[0]) * batch_size), y_batch, np.array(seq_len_batch[0] * batch_size), original_batch
    # np.pad(x_batch[0], ((0, 0), (10, 0), (0, 0)), mode='constant', constant_values=0)

    return x_batch, y_batch, seq_len_batch, original_batch


def decode_batch(d, original, phase='training'):
    aligned_original_string = ''
    aligned_decoded_string = ''
    for jj in range(batch_size)[0:2]:  # just for visualisation purposes. we display only 2.
        values = d.values[np.where(d.indices[:, 0] == jj)[0]]
        str_decoded = ''.join([chr(x) for x in np.asarray(values) + FIRST_INDEX])
        # Replacing blank label to none
        str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
        # Replacing space label to space
        str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')
        maxlen = max(len(original[jj]), len(str_decoded))
        aligned_original_string += str(original[jj]).ljust(maxlen) + ' | '
        aligned_decoded_string += str(str_decoded).ljust(maxlen) + ' | '
    print('- Original (%s) : %s ...' % (phase, aligned_original_string))
    print('- Decoded  (%s) : %s ...' % (phase, aligned_decoded_string))


def run_ctc():
    graph = tf.Graph()
    with graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_step_size, num_features], but the
        # batch_size and max_step_size can vary along each step
        inputs = tf.compat.v1.placeholder(tf.float32, [None, None, num_features], name='inputs')

        # Here we use sparse_placeholder that will generate a
        # SparseTensor required by ctc_loss op.
        # https://www.tensorflow.org/api_docs/python/tf/sparse/SparseTensor
        # https://www.tensorflow.org/api_docs/python/tf/nn/ctc_loss
        targets = tf.compat.v1.sparse_placeholder(tf.int32, name='targets')

        # 1d array of size [batch_size]
        seq_len = tf.compat.v1.placeholder(tf.int32, [None], name='seq_len')

        # Defining the cell
        # Can be:
        #   tf.nn.rnn_cell.RNNCell
        #   tf.nn.rnn_cell.GRUCell
        #cell = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)

        # Stacking rnn cells
        #stack = tf.contrib.rnn.MultiRNNCell([cell], state_is_tuple=True)

        # The second output is the last state and we will no use that
        #outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)
        model = tf.keras.models.Sequential([
          tf.keras.layers.LSTM(num_hidden),
          tf.keras.layers.Dense(29, activation='softmax')
        ])

        model.compile(
          optimizer='adam',
          loss='sparse_categorical_crossentropy',
          metrics=['accuracy'])

        shape = tf.shape(inputs)
        batch_s, max_time_steps = shape[0], shape[1]

        # Reshaping to apply the same weights over the timesteps
        #outputs = tf.reshape(outputs, [-1, num_hidden])

        # Truncated normal with mean 0 and stdev=0.1
        # Tip: Try another initialization
        # see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
        W = tf.Variable(tf.compat.v1.truncated_normal([num_hidden,
                                             num_classes],
                                            stddev=0.1))
        # Zero initialization
        # Tip: Is tf.zeros_initializer the same?
        b = tf.Variable(tf.constant(0., shape=[num_classes]))

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [batch_s, -1, num_classes])

        # Time major
        logits = tf.transpose(logits, (1, 0, 2))

        loss = tf.nn.ctc_loss(targets, logits, seq_len)
        cost = tf.reduce_mean(loss)

        # optimizer = tf.train.AdamOptimizer().minimize(cost)
        # optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9).minimize(cost)
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

        # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
        # (it's slower but you'll get better results)
        decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)

        # Inaccuracy: label error rate
        ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                              targets))

    with tf.Session(graph=graph) as session:

        tf.global_variables_initializer().run()

        for curr_epoch in range(num_epochs):
            train_cost = train_ler = 0
            start = time.time()

            for batch in range(num_batches_per_epoch):
                train_inputs, train_targets, train_seq_len, original = next_batch(train=True)
                feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}

                batch_cost, _, train_ler_p, d = session.run([cost, optimizer, ler, decoded[0]], feed)
                train_cost += batch_cost / num_batches_per_epoch
                train_ler += train_ler_p / num_batches_per_epoch
                decode_batch(d, original, phase='training')

            val_inputs, val_targets, val_seq_len, val_original = next_batch(train=False)
            val_feed = {inputs: val_inputs,
                        targets: val_targets,
                        seq_len: val_seq_len}

            val_cost, val_ler = session.run([cost, ler], feed_dict=val_feed)

            # Decoding
            # np.where(np.diff(d.indices[:, 0]) == 1)
            d = session.run(decoded[0], feed_dict=val_feed)
            decode_batch(d, val_original, phase='validation')

            print('-' * 80)
            log = "Epoch {}/{}, train_cost = {:.3f}, train_ler = {:.3f}, " \
                  "val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}"

            file_logger.write([curr_epoch + 1,
                               train_cost,
                               train_ler,
                               val_cost,
                               val_ler])

            print(log.format(curr_epoch + 1, num_epochs, train_cost, train_ler,
                             val_cost, val_ler, time.time() - start))

  

Initializing AudioReader()
audio_dir = vctk-p225
cache_dir = cache
sample_rate = 8000
speakers_sub_list = None
Using the generated files at cache. Using them to load the cache. Be sure to have enough memory.


In [0]:
run_ctc()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


NameError: ignored