[View in Colaboratory](https://colab.research.google.com/github/daveshap/keras_asr/blob/master/keras_asr.ipynb)

# Keras ASR Experiment

## Step 1 - Setup Environment

Your output should look something like this:

```
Redirecting output to ‘wget-log’.
total 32
drwxr-xr-x 1 root root 4096 Sep 21 17:57 .
drwxr-xr-x 1 root root 4096 Sep 21 17:54 ..
drwxr-xr-x 4 root root 4096 Sep 19 23:48 .config
drwxr-xr-x 2 root root 4096 Sep 21 17:57 data
drwxr-xr-x 3 root root 4096 Sep 21 17:56 LibriSpeech
drwxr-xr-x 2 root root 4096 Sep 20 00:09 sample_data
-rw-r--r-- 1 root root 6094 Sep 21 17:57 wget-log
 ```

In [1]:
!wget -O - http://www.openslr.org/resources/12/dev-clean.tar.gz | tar xfz -
!mkdir speech_data
!ls -al
!apt-get -qq update
!apt-get -qq install -y libsndfile-dev
!pip install SoundFile -q
!pip install librosa -q


Redirecting output to ‘wget-log.2’.
mkdir: cannot create directory ‘speech_data’: File exists
total 164
drwxr-xr-x 1 root root   4096 Sep 21 21:20 .
drwxr-xr-x 1 root root   4096 Sep 21 18:19 ..
drwxr-xr-x 4 root root   4096 Sep 19 23:48 .config
drwxr-xr-x 3 root root   4096 Sep 21 21:20 LibriSpeech
drwxr-xr-x 2 root root   4096 Sep 20 00:09 sample_data
drwxr-xr-x 2 root root 135168 Sep 21 21:07 speech_data
-rw-r--r-- 1 root root   2174 Sep 21 21:04 wget-log
-rw-r--r-- 1 root root   2254 Sep 21 21:05 wget-log.1
-rw-r--r-- 1 root root   2174 Sep 21 21:20 wget-log.2


## Step 2 - Import Modules

Here we import modules and define a few things

In [3]:
import soundfile as sf
import librosa
import pickle
import os
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

libri_dir = './LibriSpeech/dev-clean'
output_dir = './speech_data'
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"


def get_file_paths(root_dir):
    layer1 = os.listdir(root_dir)
    results = []
    for folder1 in layer1:
        layer2 = os.listdir(root_dir + '/' + folder1)
        for folder2 in layer2:
            layer3 = os.listdir(root_dir + '/' + folder1 + '/' + folder2)
            #print(layer3)
            result = {'path': root_dir + '/' + folder1 + '/' + folder2, 'files': layer3}
            results.append(result)
    return results

  
data_dirs = get_file_paths(libri_dir)
print(len(data_dirs))

97


## Step 3 - Process Datasets
This generates a bunch of pickle files where each contains 3 pieces of information
* label (sentence string)
* sample (MFCC as numpy ndarray)
* encoded (result from Google Universal Encoder)

In [0]:
for folder in data_dirs:
    samples = []
    labels = []
    print('FOLDER:', folder)
    for file in folder['files']:
        if 'flac' in file:
            with open(folder['path'] + '/' + file, 'rb') as f:
                data, samplerate = sf.read(f)
                sample = librosa.feature.melspectrogram(y=data, sr=samplerate)
                #sample = librosa.feature.mfcc(y=data, sr=samplerate)
                samples.append({'data': sample, 'file': file})
        if 'txt' in file:
            with open(folder['path'] + '/' + file, 'r') as f:
                lines = f.readlines()
                labels = lines
    #print(samples)
    #print(labels)
    for entry in labels:
        file = entry.split(' ')[0]
        label = entry.replace(file, '').replace('\\n', '').strip().lower()
        for record in samples:
            if file in record['file']:
                final = {'label': label, 'sample': record['data']}
                with open(output_dir + '/' + file + '.pickle', 'wb') as outfile:
                    pickle.dump(final, outfile)


## Step 4 - Generate Embeddings

This step uses Google's Universal Sentence Encoder to generate **semantic vectors** from the sentence labels. Semantic vectors are consistently of size 512 and can represent any chunk of text.

In [3]:
embed = hub.Module(module_url)
tf.logging.set_verbosity(tf.logging.ERROR)
files = os.listdir(output_dir)
sentences = []
max_len = 0

print('loading all samples')
for file in files:
  with open(output_dir + '/' + file, 'rb') as infile:
    d = pickle.load(infile)
  sentences.append(d['label'])
  if d['sample'].shape[1] > max_len:
    max_len = d['sample'].shape[1]

print('max length is:', max_len)
print('getting encodings')
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  encoded = session.run(embed(sentences))
  
print('encoded', len(encoded), 'sentences')

print('saving samples with embeddings')
for i in range(len(files)):
  with open(output_dir + '/' + files[i], 'rb') as infile:
    d = pickle.load(infile)
  d['embedding'] = encoded[i]
  with open(output_dir + '/' + files[i], 'wb') as outfile:
    pickle.dump(d, outfile)

print('completed data prep!')

with open(output_dir + '/' + files[10], 'rb') as testfile:
  d = pickle.load(testfile)
print('EXAMPLE FILE:')
print('LABEL:     ', type(d['label']), d['label'])
print('SAMPLE:    ', type(d['sample']), d['sample'].shape)
print('EMBEDDING: ', type(d['embedding']), d['embedding'].shape)


INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
loading all samples
max length is: 1021
getting encodings
encoded 2703 sentences
saving samples with embeddings
completed data prep!
EXAMPLE FILE:
LABEL:      <class 'str'> and if you have any desire to shorten the journey and put yourself easily in the way of salvation come with me and i will show you how to become a knight errant a calling wherein so many hardships and mishaps are encountered that if they be taken as penances they will lodge you in heaven in a trice
SAMPLE:     <class 'numpy.ndarray'> (128, 648)
EMBEDDING:  <class 'numpy.ndarray'> (512,)


In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import MaxPooling2D
from keras.layers import Conv2D
from keras.layers import Dropout
from keras.layers import Flatten

mfcc_features = 128
max_len = 1021

print('composing model')
encoder = Sequential()

encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=(128, 1021, 1)))
encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(MaxPooling2D(pool_size=(2, 2)))
encoder.add(Dropout(0.1))

encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(MaxPooling2D(pool_size=(2, 2)))
encoder.add(Dropout(0.1))

encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(MaxPooling2D(pool_size=(2, 2)))
encoder.add(Dropout(0.1))

encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(MaxPooling2D(pool_size=(2, 2)))
encoder.add(Dropout(0.1))

encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
encoder.add(MaxPooling2D(pool_size=(2, 2)))
encoder.add(Dropout(0.1))

encoder.add(Flatten())
encoder.add(Dense(512, activation='softmax'))
encoder.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print(encoder.summary())


Using TensorFlow backend.


composing model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 127, 1020, 32)     160       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 126, 1019, 32)     4128      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 63, 509, 32)       0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 63, 509, 32)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 62, 508, 32)       4128      
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 61, 507, 32)       4128      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 253, 32)       0    

## Step 5 - Train Encoder
The encoder is the heavy lifter. It looks at raw audio data in the form of an MFCC and encodes it to a semantic vector.

In [0]:
print('compiling data')
files = os.listdir(output_dir)
data_x = []
data_y = []
for file in files:
  with open(output_dir + '/' + file, 'rb') as infile:
    dic = pickle.load(infile)
  pad_width = max_len - dic['sample'].shape[1]
  mfcc = np.pad(dic['sample'], pad_width=((0, 0), (0, pad_width)), mode='constant')
  data_x.append(mfcc)
  data_y.append(dic['embedding'])

data_x = np.asarray(data_x)
data_y = np.asarray(data_y)

data_x = np.expand_dims(data_x, axis=3)

print('training model')
encoder.fit(data_x, data_y, epochs=10, batch_size=32)


compiling data
training model
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 320/2703 [==>...........................] - ETA: 28s - loss: -50.0856 - acc: 0.0813