# Speech Recognition with AI
Brought to you by Daniel Sikar - daniel.sikar@city.ac.uk
and
City Data Science Society - https://www.datasciencesociety.city/

## Natural Language Processing with Convolutional Neural Networks

Notebook: https://github.com/dsikar/natural-language-processing/blob/master/NaturalLanguageProcessing.ipynb

TensorFlow's Speech Commands Dataset: http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz

Consisting of:
* 65,000 one-second long utterances
* 30 short words plus a background noise set
* Thousands of different people

This workshop uses a two-word subset ("yes" and "no") of TensorFlow's Speech Commands Dataset. The full set consists of 30 words plus a background noise set: `_background_noise_`, bed, bird, cat, dog, down, eight, five, four, go, happy, house, left, marvin, no, nine, off, on, one, right, seven, sheila, six, stop, three, tree, two, up, wow, yes, zero.

Note: In this workshop, we will **not** use the full dataset.

# 1. Getting to know the environment

In [None]:
# How much space have we got?
# !df -h
# Command help
# !man df
# What is on the filesystem?
#!ls
# Where are we?
# !pwd
# What files are on the top level / ?
# !ls /

# 2. Get the data

In [None]:
!wget http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
# !ls
!mkdir full_dataset
# !ls
!tar xvf speech_commands_v0.01.tar.gz -C full_dataset/ 
# !ls full_dataset
!mkdir dataset
!mv full_dataset/yes dataset/
!mv full_dataset/no dataset/

In [None]:
# Count the entries in each of the two word folders
!ls dataset/yes | wc -l
!ls dataset/no | wc -l
# List the folders that were moved into dataset/
!ls dataset
# !ls dataset/yes


# 3. Import modules

In [None]:
import os
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
import warnings
warnings.filterwarnings("ignore")

# 4. Visualise data

In [None]:
# Dataset original sample rate 16 kHz
# Humans can detect sounds in a frequency range from about 20 Hz to 20 kHz.
# We will convert data to 8 kHz given our network architecture (more later)
samples_yes, sample_rate = librosa.load('dataset/yes/8d8d9855_nohash_0.wav', sr = 8000)
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('"Yes" - waveform for file ' + 'dataset/yes/8d8d9855_nohash_0.wav')
ax1.set_xlabel('Sample')
ax1.set_ylabel('Amplitude')
# Use one x value per sample so the axis matches its "Sample" label and the
# plot also works for clips that are not exactly `sample_rate` samples long.
ax1.plot(np.arange(len(samples_yes)), samples_yes)
# Last expression of the cell: renders an audio player for the clip
ipd.Audio(samples_yes, rate=sample_rate)

In [None]:
# Load one "no" recording, resampled to 8 kHz, and plot its waveform
samples_no, sample_rate = librosa.load('dataset/no/8a194ee6_nohash_0.wav', sr = 8000)
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('"NO" - Waveform for file ' + 'dataset/no/8a194ee6_nohash_0.wav')
ax1.set_xlabel('Sample')
ax1.set_ylabel('Amplitude')
# Use one x value per sample so the axis matches its "Sample" label and the
# plot also works for clips that are not exactly `sample_rate` samples long.
ax1.plot(np.arange(len(samples_no)), samples_no)
# Last expression of the cell: renders an audio player for the clip
ipd.Audio(samples_no, rate=sample_rate)

In [None]:
# Inspect one of the clips loaded above (no variable called `samples` exists
# yet at this point in the notebook, so use `samples_yes`).
# Data type
print(type(samples_yes))
# Size
print(samples_yes.shape)

In [None]:
# Statistical analysis - can the word be inferred?
# Compute mean and standard deviation of each clip first, then report them.
no_mean, no_std = np.mean(samples_no), np.std(samples_no)
yes_mean, yes_std = np.mean(samples_yes), np.std(samples_yes)

print("No, sample mean:", no_mean)
print("No, sample std:", no_std)
print("Yes, sample mean:", yes_mean)
print("Yes, sample std:", yes_std)

In [None]:
# Labels
# Root folder holding one sub-folder of recordings per word. It must be defined
# here: this cell and the next two use it before the preprocessing section.
train_audio_path = 'dataset'
# Each entry of the root folder is used as a label name
labels=os.listdir(train_audio_path)
print("Audio labels: ", labels)

In [None]:
# Count the .wav files found under each label folder, then draw a bar chart
no_of_recordings = [
    sum(name.endswith('.wav') for name in os.listdir(train_audio_path + '/'+ label))
    for label in labels
]

# plot: one bar per label, tick labels rotated so they do not overlap
index = np.arange(len(labels))
plt.figure()
plt.bar(index, no_of_recordings)
plt.title('No. of recordings for each command')
plt.xlabel('Commands', fontsize=12)
plt.ylabel('No of recordings', fontsize=12)
plt.xticks(index, labels, fontsize=15, rotation=60)
plt.show()

In [None]:
# Duration: length of every recording in seconds (samples / sample rate),
# collected over all labels and shown as a histogram
duration_of_recordings = []
for label in labels:
    label_dir = train_audio_path + '/' + label
    for wav in os.listdir(label_dir):
        if not wav.endswith('.wav'):
            continue
        # wavfile.read returns (rate, data)
        sample_rate, samples = wavfile.read(label_dir + '/' + wav)
        duration_of_recordings.append(float(len(samples) / sample_rate))

plt.hist(np.array(duration_of_recordings))

# 5. Preprocess data

In [None]:
train_audio_path = 'dataset'

# Load every recording at 8 kHz and keep the waveform together with its label
all_wave, all_label = [], []
for label in labels:
    print(label)
    label_dir = train_audio_path + '/' + label
    wav_names = [name for name in os.listdir(label_dir) if name.endswith('.wav')]
    for wav in wav_names:
        # resample
        samples, sample_rate = librosa.load(label_dir + '/' + wav, sr=8000)
        if len(samples) != 8000:
            # only use 1 second long recordings
            continue
        all_wave.append(samples)
        all_label.append(label)

In [None]:
# Print how many recordings were kept by the preprocessing loop (only clips that
# are exactly 8000 samples long after resampling). This is the size of the whole
# dataset; the train/validation split happens later.
print(len(all_wave))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct label strings, then map every label to its integer id
le = LabelEncoder()
le.fit(all_label)
arr_labels = le.transform(all_label)

# Class names as a plain Python list, in the encoder's order
classes = list(le.classes_)
# print datatype
print(type(classes))
# print classes
print(classes)

In [None]:
# `keras.utils.np_utils` is no longer available in current Keras releases;
# `to_categorical` is exported from `keras.utils` directly in old and new versions.
from keras.utils import to_categorical
# One-hot encode the integer class ids, one column per label
arr_labels=to_categorical(arr_labels, num_classes=len(labels))

In [None]:
# Inspect the encoded labels
# container type
print(type(arr_labels))
# dimensions of the label array
print(arr_labels.shape)

# Encoded label of the first and of the last recording. Which word each one
# stands for depends on the order in which the label folders were read.
first_label, last_label = arr_labels[0], arr_labels[-1]
print(first_label)
print(last_label)

In [None]:
# Let's look at data again, this time loading from array
samples_no = all_wave[0].ravel()
sample_rate = 8000
# Decode the word from the one-hot label instead of hard-coding it in the title:
# which word comes first depends on the order the label folders were read.
first_word = classes[np.argmax(arr_labels[0])]
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('"{}" all_wave[0].ravel()'.format(first_word))
ax1.set_xlabel('Sample')
ax1.set_ylabel('Amplitude')
# One x value per sample so the axis matches its "Sample" label
ax1.plot(np.arange(len(samples_no)), samples_no)
print("labels: ", arr_labels[0])
# Last expression of the cell: renders an audio player for the clip
ipd.Audio(samples_no, rate=sample_rate)


In [None]:
print()
# Same view for the last recording in the array
samples_no = all_wave[-1].ravel()
sample_rate = 8000
# Decode the word from the one-hot label instead of hard-coding it in the title
last_word = classes[np.argmax(arr_labels[-1])]
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('"{}" all_wave[-1].ravel()'.format(last_word))
ax1.set_xlabel('Sample')
ax1.set_ylabel('Amplitude')
# One x value per sample so the axis matches its "Sample" label
ax1.plot(np.arange(len(samples_no)), samples_no)
print("labels: ", arr_labels[-1])
# Last expression of the cell: renders an audio player for the clip
ipd.Audio(samples_no, rate=sample_rate)

In [None]:
# Conv1D needs 3D input, so stack the clips into one array and give each
# 8000-sample clip a trailing axis of length 1
wave_matrix = np.array(all_wave)
all_wave = wave_matrix.reshape(-1, 8000, 1)

# 6. Split data

In [None]:
# Split into training and validation sets 70/30
from sklearn.model_selection import train_test_split
# The targets are the one-hot labels in `arr_labels` (no variable `y` exists in
# this notebook). Stratifying keeps the class proportions equal in both sets,
# and the fixed random_state makes the split reproducible.
x_tr, x_val, y_tr, y_val = train_test_split(
    np.array(all_wave),
    np.array(arr_labels),
    stratify=arr_labels,
    test_size=0.3,
    random_state=777,
    shuffle=True,
)

# 7. Create model

In [None]:
from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
K.clear_session()

# One input per clip: 8000 samples, 1 channel
inputs = Input(shape=(8000,1))

# Four convolution stages, each Conv1D -> MaxPooling1D(3) -> Dropout(0.3).
# Every tuple is (number of filters, kernel size) for one stage.
conv_stages = [(8, 13), (16, 11), (32, 9), (64, 7)]
conv = inputs
for n_filters, kernel_width in conv_stages:
    conv = Conv1D(n_filters, kernel_width, padding='valid', activation='relu', strides=1)(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.3)(conv)

# Flatten the feature maps before the fully connected part
conv = Flatten()(conv)

# Two dense layers, each followed by dropout
for n_units in (256, 128):
    conv = Dense(n_units, activation='relu')(conv)
    conv = Dropout(0.3)(conv)

# One softmax output per label
outputs = Dense(len(labels), activation='softmax')(conv)

model = Model(inputs, outputs)
model.summary()

In [None]:
# Compile: categorical cross-entropy loss (labels are one-hot), Adam optimiser,
# and accuracy reported as the metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Set early stopping and check pointing
# Stop when the validation loss has not improved by at least 0.0001 for 5 epochs
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5, min_delta=0.0001) 
# Keep the weights of the epoch with the best validation accuracy. The model is
# compiled with metrics=['accuracy'], so the logged key is 'val_accuracy' (the
# same key read from history.history in the plotting cell), not 'val_acc'.
mc = ModelCheckpoint('nlp-model.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

# 8. Train model

In [None]:
# Fit the model (find best parameters): at most 10 epochs in batches of 32,
# evaluated on the validation split after every epoch, with the early-stopping
# and checkpoint callbacks attached
history = model.fit(
    x_tr,
    y_tr,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[es, mc],
)

In [None]:
# Save the model as it is after training to 'nlp-model.h5'.
# NOTE: this is a different file from the 'nlp-model.hdf5' checkpoint that the
# ModelCheckpoint callback writes during training.
model.save('nlp-model.h5')

In [None]:
# Verify that the saved model file exists and show its size
!ls -lh nlp-model.h5

In [None]:
# Plot training history
from matplotlib import pyplot
# One curve per logged quantity. The 'val_*' series come from the validation
# data passed to fit(), so they are labelled "val", not "test".
pyplot.plot(history.history['loss'], label='train_loss')
pyplot.plot(history.history['val_loss'], label='val_loss')
pyplot.plot(history.history['accuracy'], label='train_acc')
pyplot.plot(history.history['val_accuracy'], label='val_acc')
pyplot.xlabel('Epoch')
pyplot.legend()
pyplot.show()

In [None]:
# Load model
# Reads 'nlp-model.h5' (the file written by model.save) back from disk and
# rebinds the global `model` to the loaded copy.
from keras.models import load_model
model=load_model('nlp-model.h5')

# 9. Predict

In [None]:
def predict(audio):
    """
    Classify a single audio clip with the global ``model``.

    Input
      audio: array representing audio file; it is reshaped to (1, 8000, 1),
             so it must hold exactly 8000 values
    Output
      prob: the output of ``model.predict`` for this clip
      classes: the entry of the global ``classes`` list at the position of the
               largest value in ``prob[0]``
    """
    # The model is fed a batch containing just this one clip
    batch = audio.reshape(1, 8000, 1)
    prob = model.predict(batch)
    best = np.argmax(prob[0])
    return prob, classes[best]

In [None]:
import random
# Pick one validation clip at random, classify it, then plot and play it
# print("Number of testing examples - len(x_val):", len(x_val))
index=random.randint(0,len(x_val)-1)
print("Random index selected:", index)
samples=x_val[index].ravel() # x_val[index] shape: (8000, 1), "samples" shape: (8000,)
# The true word is the class at the position of the 1 in the one-hot label
print("Randomly selected audio:",classes[np.argmax(y_val[index])])
pred, predClass = predict(samples)
print("Prediction output: ", pred)
print("Predicted class:", predClass)
sample_rate = 8000
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title(predClass)
ax1.set_xlabel('Sample')
ax1.set_ylabel('Amplitude')
# One x value per sample so the axis matches its "Sample" label
ax1.plot(np.arange(len(samples)), samples)
# Last expression of the cell: renders an audio player for the clip
ipd.Audio(samples, rate=sample_rate)

# 10. Practical Applications

In [None]:
# TODO Audio capture
# TODO Audio preprocessing
# Turn the predicted word into a door action: "yes" opens, anything else closes
door_is_opening = predClass == "yes"
# TODO hardware (drive the door for both outcomes)
door_message = "Doors opening" if door_is_opening else "Doors closing"
print(door_message)

# 11. Downloading the model

In [None]:
# Download the saved model file 'nlp-model.h5' through the Colab `files` helper
from google.colab import files
files.download('nlp-model.h5') 

# 12. Practical considerations - software versions

In [None]:
# When running the model on other machines, software versions should at least match,
# not be lower than on machine where model was trained

import keras
!python --version
print("TensorFlow version:", tensorflow.__version__)
print("Keras version:", keras.__version__)