# Keras ASR Experiment

## Install Prerequisites
Install the required system packages and Python libraries, and download the data.

In [4]:
# One-time environment setup. The install commands below are commented out
# (presumably because the environment is already provisioned -- uncomment
# them on a fresh machine).
# System packages: libsndfile (audio decoding backend), wget and 7zip.
#!apt-get -qq update
#!apt-get -qq install -y libsndfile-dev wget p7zip-full
# Python packages used by the cells below.
#!pip install SoundFile -q
#!pip install librosa -q
#!pip install tensorflow-hub -q
#!pip install seaborn -q
#!pip install keras -q
# List the working directory contents.
!ls -al

## Import Modules
Import the required modules and define the shared constants (feature dimensions, paths, and the sentence-encoder module URL).

In [1]:
import soundfile as sf
import librosa
import pickle
import os
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import random
import glob
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import MaxPooling2D
from keras.layers import Conv2D
from keras.layers import Dropout
from keras.layers import Flatten

# --- Shared configuration used by the cells below ---------------------------

# Filesystem locations.
root_dir = './'
output_dir = './speech_data'  # folder holding the per-utterance pickle files

# TF-Hub handle passed to hub.Module when generating sentence embeddings.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"

# Spectrogram dimensions; these match the (128, 1021, 1) input shape used
# when the model is composed. max_len is also the width samples are padded to
# in the training cell.
mfcc_features = 128
max_len = 1021

Using TensorFlow backend.


## Generate MFCC
Produces pickled dictionaries with the following keys:
* label (sentence string)
* sample (mel spectrogram as a numpy ndarray; the MFCC variant is currently commented out in the code)

In [None]:
# Collect every directory under root_dir that may hold audio + transcript files.
# print() with a single argument behaves the same on Python 2 and Python 3
# (the original `print '...'` statement is a syntax error on Python 3).
print('walking all directories')

# The pickle output folder lives under root_dir as well; skip it so that
# already-generated pickles are not scanned as if they were input data.
output_abs = os.path.abspath(output_dir)

# Each entry records a directory path and the file names directly inside it.
# Only directories holding more than two files are kept (presumably to skip
# folders that contain no utterances -- verify against the dataset layout).
data_dirs = [
    {'path': dir_path, 'files': file_names}
    for dir_path, _sub_dirs, file_names in os.walk(root_dir)
    if len(file_names) > 2 and os.path.abspath(dir_path) != output_abs
]
print(len(data_dirs))
    
# TODO remove data_dirs that have already been processed into the output folder

    
# Write one pickle per utterance: {'label': <lower-cased sentence>,
# 'sample': <mel spectrogram ndarray>}.

# The output folder must exist before the first pickle is written.
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

for folder in data_dirs:
    print('FOLDER: ' + folder['path'])

    # utterance id (file name without extension) -> mel spectrogram
    spectrograms = {}
    transcript_lines = []

    for file in folder['files']:
        file_path = os.path.join(folder['path'], file)
        # Match on the real extension; a substring test such as
        # `'txt' in file` also fires on unrelated file names.
        if file.endswith('.flac'):
            data, samplerate = sf.read(file_path)
            utterance_id = os.path.splitext(file)[0]
            spectrograms[utterance_id] = librosa.feature.melspectrogram(y=data, sr=samplerate)
            #spectrograms[utterance_id] = librosa.feature.mfcc(y=data, sr=samplerate)
        elif file.endswith('.txt'):
            with open(file_path, 'r') as f:
                transcript_lines.extend(f.readlines())

    for entry in transcript_lines:
        # Each transcript line is "<utterance id> <sentence>". Splitting once
        # on whitespace keeps the sentence intact even if it happens to
        # contain the id text, and blank lines yield no parts at all.
        parts = entry.strip().split(None, 1)
        if not parts:
            continue
        utterance_id = parts[0]
        label = parts[1].lower() if len(parts) > 1 else ''

        # Exact id lookup instead of scanning every sample for a substring.
        sample = spectrograms.get(utterance_id)
        if sample is None:
            continue

        final = {'label': label, 'sample': sample}
        with open(os.path.join(output_dir, utterance_id + '.pickle'), 'wb') as outfile:
            pickle.dump(final, outfile)


walking all directories
5831
('FOLDER:', './train-clean-100/4406/16882')
('FOLDER:', './train-clean-100/4406/16883')
('FOLDER:', './train-clean-100/7447/91187')
('FOLDER:', './train-clean-100/7447/91186')
('FOLDER:', './train-clean-100/3699/19402')
('FOLDER:', './train-clean-100/3699/47246')
('FOLDER:', './train-clean-100/3699/175950')
('FOLDER:', './train-clean-100/3699/19401')
('FOLDER:', './train-clean-100/4397/15666')
('FOLDER:', './train-clean-100/4397/15668')
('FOLDER:', './train-clean-100/4397/15678')
('FOLDER:', './train-clean-100/3857/182317')
('FOLDER:', './train-clean-100/3857/182315')
('FOLDER:', './train-clean-100/3857/180923')
('FOLDER:', './train-clean-100/8098/275181')
('FOLDER:', './train-clean-100/8098/278252')
('FOLDER:', './train-clean-100/8098/278278')
('FOLDER:', './train-clean-100/5463/39174')
('FOLDER:', './train-clean-100/5463/39173')
('FOLDER:', './train-clean-100/8123/275216')
('FOLDER:', './train-clean-100/8123/275193')
('FOLDER:', './train-clean-100/8123/27

## Generate Embeddings
This step uses Google's Universal Sentence Encoder to generate **semantic vectors** from the sentence labels. Every semantic vector has a fixed size of 512, regardless of the length of the text it represents.

In [2]:
# Load the sentence encoder first, then quieten TensorFlow logging.
embed = hub.Module(module_url)
tf.logging.set_verbosity(tf.logging.ERROR)

# Every file in the output folder is a candidate for embedding.
files = os.listdir(output_dir)

# Bookkeeping for the processing loop.
chunk_size = 100  # intended number of files per encoding batch
idx = 0           # position within the current batch of encoded sentences
max_len = 0       # widest sample (shape[1]) seen so far; recomputed from the data

def chunks(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.
    """
    offsets = range(0, len(l), n)
    for offset in offsets:
        stop = offset + n
        yield l[offset:stop]

# Build the encoding op once and feed sentences through a placeholder.
# Calling embed(...) inside the loop would add new ops to the graph for every
# chunk, making each iteration slower than the last.
sentence_input = tf.placeholder(tf.string, shape=[None])
sentence_embeddings = embed(sentence_input)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    for chunk in chunks(files, chunk_size):
        print('loading new chunk of files')
        print('max length is: {}'.format(max_len))

        # (path, record) pairs that still need an embedding. Keeping the
        # record next to its path guarantees each embedding is written back
        # to the file its sentence came from, even when some files in the
        # chunk are skipped.
        pending = []
        for file in chunk:
            path = output_dir + '/' + file
            with open(path, 'rb') as infile:
                d = pickle.load(infile)
            # Track the widest sample over ALL files, including those that
            # already carry an embedding.
            max_len = max(max_len, d['sample'].shape[1])
            # .get() avoids a KeyError on files that were never embedded, and
            # `is not None` avoids truth-testing a numpy array.
            if d.get('embedding') is not None:
                continue
            pending.append((path, d))

        if not pending:
            # Everything in this chunk was embedded on an earlier run.
            continue

        sentences = [record['label'] for _, record in pending]
        encoded = session.run(sentence_embeddings, feed_dict={sentence_input: sentences})
        print('encoded {} sentences'.format(len(encoded)))
        print('saving samples with embeddings')
        for (path, record), embedding in zip(pending, encoded):
            record['embedding'] = embedding
            with open(path, 'wb') as outfile:
                pickle.dump(record, outfile)

print('completed data prep!')

# Sanity check: load one stored record and show the type/shape of its fields.
# The index is clamped so the check also works when the output folder holds
# fewer than 11 files, and is skipped entirely when the folder is empty.
if files:
    example_file = files[min(10, len(files) - 1)]
    with open(output_dir + '/' + example_file, 'rb') as testfile:
        d = pickle.load(testfile)
    print('EXAMPLE FILE:')
    print('LABEL:     ', type(d['label']), d['label'])
    print('SAMPLE:    ', type(d['sample']), d['sample'].shape)
    print('EMBEDDING: ', type(d['embedding']), d['embedding'].shape)


INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.


KeyboardInterrupt: 

## Compile Model

In [None]:
# Compose the convolutional encoder that maps a spectrogram to a 512-value
# sentence embedding.
print('composing model')
encoder = Sequential()

# Five identical stages: two 2x2 convolutions, 2x2 max pooling, light dropout.
# Only the very first layer declares the input shape, which is taken from the
# shared constants so the network agrees with the padding width (max_len)
# used when batches are built for training.
conv_stages = 5
for stage in range(conv_stages):
    if stage == 0:
        encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu',
                           input_shape=(mfcc_features, max_len, 1)))
    else:
        encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
    encoder.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
    encoder.add(MaxPooling2D(pool_size=(2, 2)))
    encoder.add(Dropout(0.1))

encoder.add(Flatten())
# The targets are real-valued embedding vectors, not class probabilities, so
# this is a regression: a softmax output (non-negative, sums to 1) cannot
# reproduce them and categorical cross-entropy is not a meaningful loss here.
# Use a linear output with mean squared error instead.
encoder.add(Dense(512, activation='linear'))
encoder.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
encoder.summary()


## Train Encoder
The encoder is the heavy lifter. It looks at raw audio data in the form of an MFCC and encodes it to a semantic vector. Because there is so much data, it won't all fit into GPU memory at once, so we generate smaller batches instead.

In [None]:
print('compiling data')
# Only the generated pickles are training candidates; anything else that ends
# up in the folder (e.g. notebook checkpoints) is ignored.
files = [name for name in os.listdir(output_dir) if name.endswith('.pickle')]
batch_size = 128
# Integer division, with a floor of one step so that a dataset smaller than
# one batch still trains instead of requesting zero steps per epoch.
steps_per_epoch = max(1, len(files) // batch_size)
epochs = 10

# Seed the sampler once from system entropy rather than once per batch.
random.seed()


def data_generator(sample_count):
    """Build one batch of randomly chosen training samples.

    Args:
        sample_count: number of samples to draw (with replacement) from
            ``files``.

    Returns:
        A ``(data_x, data_y)`` tuple. ``data_x`` stacks the samples, each
        brought to a width of ``max_len`` columns, with a trailing channel
        axis added; ``data_y`` stacks the matching embeddings.

    Raises:
        KeyError: if a drawn file has no ``'embedding'`` entry yet.
    """
    data_x = []
    data_y = []
    for _ in range(sample_count):
        with open(os.path.join(output_dir, random.choice(files)), 'rb') as infile:
            dic = pickle.load(infile)
        # Clip samples wider than max_len first; otherwise pad_width would be
        # negative and np.pad would raise.
        sample = dic['sample'][:, :max_len]
        pad_width = max_len - sample.shape[1]
        mfcc = np.pad(sample, pad_width=((0, 0), (0, pad_width)), mode='constant')
        data_x.append(mfcc)
        data_y.append(dic['embedding'])
    data_x = np.expand_dims(np.asarray(data_x), axis=3)
    data_y = np.asarray(data_y)
    return data_x, data_y


def batch_stream(sample_count):
    """Yield freshly sampled batches forever, as fit_generator expects."""
    while True:
        yield data_generator(sample_count)


print('training model')
# fit_generator needs a generator that keeps yielding batches; passing the
# single tuple returned by data_generator() does not work. The network built
# in the model cell is named `encoder` (there is no `model` variable).
encoder.fit_generator(batch_stream(batch_size), steps_per_epoch=steps_per_epoch, epochs=epochs)
encoder.save('keras_asr.h5')
