In [1]:
import gc
import os
import re
import math
import random
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm as tqdm
from itertools import compress

import librosa
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from scipy.io import wavfile

import keras
from keras import optimizers, losses, activations, models
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Bidirectional, Conv1D, GRU
from tensorflow.keras.layers import Input, Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# Seed the stdlib generator used by the random.choices calls further down, and
# NumPy's global generator for any library code that draws from it.
# NOTE(review): TensorFlow's own RNG (weight init, dropout) is not seeded here.
random.seed(0)
np.random.seed(0)
sns.set_style('whitegrid')
warnings.filterwarnings('ignore')

In [2]:
# parameters settings
data_path = '../../data/train/audio'
# One sub-directory per label. os.listdir returns entries in arbitrary order
# and may include stray files, so keep directories only and sort them to make
# the loading order (and therefore the seeded sampling) repeatable.
labels = sorted(
    entry for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
)
CLASSES = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
SAMPLING_RATE = 16000

EPOCHS = 30
PATIENCE = EPOCHS // 3  # early-stopping patience: a third of the epoch budget
BATCH_SIZE = 128
MIN_DELTA = 1e-4
LR = 1e-4
DROPOUT = 0.2
NOISE_RATIO = 0.1

In [3]:
def pad_audio(samples):
    """Left-pad `samples` with zeros up to `SAMPLING_RATE` samples.

    Args:
        samples: Array of audio samples (assumed 1-D — TODO confirm).

    Returns:
        A new array with zeros prepended so that its length is
        `SAMPLING_RATE`. Input that is already at least that long is returned
        as an unpadded copy instead of raising on a negative pad width.
    """
    missing = max(SAMPLING_RATE - len(samples), 0)
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))


def chop_audio(samples):
    """Return the leading `SAMPLING_RATE` samples of `samples` (fewer if it is shorter)."""
    head = samples[0:SAMPLING_RATE]
    return head


def split_audio(sample_rate, samples):
    """Split an audio signal into near-equal chunks of at most one second each.

    Args:
        sample_rate: Number of samples per second.
        samples: Array of audio samples.

    Returns:
        List of arrays produced by `np.array_split`, one per started second of
        audio. An empty input yields an empty list (zero sections would make
        `np.array_split` raise).
    """
    if len(samples) == 0:
        return []
    n_chunks = math.ceil(len(samples) / sample_rate)
    return np.array_split(samples, n_chunks)

In [4]:
def assign_unknown_label(labels, classes=CLASSES):
    """Replace every label that is not listed in `classes` with 'unknown'.

    Args:
        labels: Iterable of label values.
        classes: Labels to keep unchanged (defaults to `CLASSES`).

    Returns:
        New list with the same length and order as `labels`.
    """
    return [label if label in classes else 'unknown' for label in labels]

def encode_labels(labels, classes=None):
    """One-hot encode a sequence of labels.

    Args:
        labels: Sequence of label values.
        classes: Optional ordered list of all possible labels. When given, the
            result has exactly one column per entry, in that order, even for
            classes that do not occur in `labels`, so separately encoded
            splits stay column-aligned. When omitted, columns are derived from
            the labels that are present (original behaviour).

    Returns:
        pandas DataFrame of indicator columns, one row per label.
    """
    series = pd.Series(labels)
    if classes is not None:
        series = series.astype(pd.CategoricalDtype(categories=list(classes)))
    return pd.get_dummies(series)

In [5]:
def specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    """Compute a log-spectrogram of `audio`.

    Args:
        audio: Array of audio samples.
        sample_rate: Sampling frequency in Hz.
        window_size: Length of each analysis window in milliseconds.
        step_size: Hop between consecutive windows in milliseconds.
        eps: Small constant added before the logarithm to avoid log(0).

    Returns:
        float32 array with windows along the first axis and frequency bins
        along the second (the transposed scipy spectrogram), log-scaled.

    Raises:
        ValueError: If the step is not positive or is longer than the window.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    if not 0 < step <= nperseg:
        raise ValueError("step_size must be positive and not exceed window_size")
    # scipy expects the overlap, not the hop; the two coincide only when the
    # window is exactly twice the step (as with the 20 ms / 10 ms defaults).
    noverlap = nperseg - step
    _, _, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return np.log(spec.T.astype(np.float32) + eps)


def plot_model_history(history, title: str) -> None:
    """
    Draw the learning curves of a Keras model: accuracy on the left panel and loss on the
    right one, each with a training line and a validation line.

    Args:
        history : Object returned by the .fit method of Keras model.
        title (str): Title of the plots.
    """
    # (history key, panel title, y-axis label) for each of the two panels
    panels = (
        ('accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'Model Loss', 'Loss'),
    )
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))

    for ax, (metric, panel_title, y_label) in zip(axs, panels):
        ax.plot(history.history[metric])
        ax.plot(history.history['val_' + metric])
        ax.set_title(panel_title)
        ax.set_ylabel(y_label)
        ax.set_xlabel('Epoch')
        ax.legend(['train', 'validate'], loc='upper left')

    fig.suptitle(title, size=15)
    plt.show()

In [6]:
def rnn_network(input_dim, output_dim, rnn_layer):
    """
    Build and compile a Conv1D + two-layer recurrent classifier.

    The network is: Conv1D (256 filters, kernel 10, stride 4) -> batch norm -> ReLU ->
    dropout -> two `rnn_layer` layers of 128 units -> Dense(64, relu) -> dropout -> softmax.

    Args:
        input_dim: Shape of a single input sample, passed to `Input(shape=...)`.
        output_dim: Number of units of the final softmax layer.
        rnn_layer: Recurrent layer class (e.g. SimpleRNN, LSTM, GRU) used for both
            recurrent layers; the model is named after it.

    Returns:
        Compiled Keras `Model` (categorical cross-entropy loss, accuracy metric).
    """
    input_data = Input(name='input', shape=input_dim, dtype='float32')

    x = Conv1D(filters=256, kernel_size=10, strides=4, name='conv1d')(input_data)
    x = BatchNormalization(name='b_norm')(x)
    x = Activation('relu', name='activation')(x)
    x = Dropout(DROPOUT, name='dropout_1')(x)
    # Only the last time step of the second recurrent layer feeds the dense head
    x = rnn_layer(128, activation='relu', return_sequences=True, dropout=DROPOUT, name='rnn_1')(x)
    x = rnn_layer(128, activation='relu', return_sequences=False, dropout=DROPOUT, name='rnn_2')(x)
    x = Dense(units=64, activation='relu', name='dense')(x)
    x = Dropout(DROPOUT, name='dropout_2')(x)

    output_data = Dense(units=output_dim, activation='softmax', name='softmax')(x)

    model = Model(inputs=input_data, outputs=output_data, name=rnn_layer.__name__)

    # Compile with a fresh optimizer cloned from the shared OPT configuration:
    # an optimizer instance keeps per-variable state, so reusing the same
    # object across several models is not safe.
    optimizer = type(OPT).from_config(OPT.get_config())
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

In [7]:
def plot_heatmaps(models_results: list, title: str=None) -> None:
    """
    Plot heatmaps based on the confusion matrices of the given models.

    Args:
        models_results (list): List of confusion matrices of the evaluated models,
            in the order SimpleRNN, LSTM, GRU.
        title (str): Optional title drawn above the three panels.
    """
    model_names = ['SimpleRNN', 'LSTM', 'GRU']
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    for ax, model_name, matrix in zip(axes, model_names, models_results):
        sns.heatmap(matrix, annot=True, cmap="Blues", ax=ax, fmt='d', annot_kws={"size": 12})
        ax.set_title(model_name, size=15)
        ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=12)
        # y labels must come from the y axis (they were copied from the x axis before)
        ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize=12)
        # the colorbar of the heatmap just drawn is the last axes of the figure
        cax = fig.axes[-1]
        cax.tick_params(labelsize=12)

    if title is not None:
        fig.suptitle(title, size=15)

    plt.tight_layout()
    plt.show()

# <span style='font-family:Georgia'> 1. Data loading & preparation

### <span style='font-family:Georgia'> 1.1. Noisy data loading

In [13]:
label = '_background_noise_'
noise_dir = os.path.join(data_path, label)
# Sorted so that the order of the noise segments (and the seeded split that
# is drawn over them later) does not depend on the file-system listing order.
files = sorted(f for f in os.listdir(noise_dir) if f.endswith('.wav'))
one_sec_background_noise_specgrams = []

for file in tqdm(files):
    sample_rate, samples = wavfile.read(os.path.join(noise_dir, file))
    # Everything below (splitting, padding, spectrogram) assumes SAMPLING_RATE;
    # a different rate would silently yield spectrograms on a different scale.
    if sample_rate != SAMPLING_RATE:
        raise ValueError(f'{file}: expected {SAMPLING_RATE} Hz audio, got {sample_rate} Hz')

    # Do not distinguish between noise classes
    one_sec_background_noise = split_audio(
            sample_rate=SAMPLING_RATE, samples=samples
        )

    for item in one_sec_background_noise:
        # Chunks shorter than one second are zero-padded to the common length
        if len(item) < SAMPLING_RATE:
            item = pad_audio(item)

        one_sec_background_noise_specgrams.append(
            specgram(item, SAMPLING_RATE)
        )

  0%|          | 0/6 [00:00<?, ?it/s]

### <span style='font-family:Georgia'> 1.2. Noise-free data loading

In [14]:
def _read_file_list(path):
    """Return the first tab-separated column of a header-less text file as a list."""
    return pd.read_csv(path, sep="\t", header=None)[0].tolist()


validation_list = _read_file_list('../../data/train/validation_list.txt')
testing_list = _read_file_list('../../data/train/testing_list.txt')

In [15]:
# Sizes of the three splits; the training size is whatever is left of the
# 64721 clips after removing the listed validation and testing files.
n_validation = len(validation_list)
n_testing = len(testing_list)
print('Training: ', 64721 - n_validation - n_testing)
print('Validation: ', n_validation)
print('Testing: ', n_testing)

Training:  51088
Validation:  6798
Testing:  6835


In [None]:
train_labels = []
train_specgrams = []
val_labels = []
val_specgrams = []
test_labels = []
test_specgrams = []

# Sets give constant-time membership tests; the lists would be scanned once
# per audio file.
validation_files = set(validation_list)
testing_files = set(testing_list)

for label in tqdm([l for l in labels if l != '_background_noise_']):

    # Sorted so the order of the collected samples is repeatable
    files = sorted(f for f in os.listdir(data_path + '/' + label) if f.endswith('.wav'))

    for file in tqdm(files):
        sample_rate, samples = wavfile.read(data_path + "/" + label + "/" + file)
        if sample_rate != SAMPLING_RATE:
            raise ValueError(f'{label}/{file}: expected {SAMPLING_RATE} Hz audio, got {sample_rate} Hz')

        # Bring every clip to exactly one second so all spectrograms share a
        # shape and can be stacked into a single array later.
        if len(samples) < SAMPLING_RATE:
            samples = pad_audio(samples)
        elif len(samples) > SAMPLING_RATE:
            samples = chop_audio(samples)

        spectrogram = specgram(samples, SAMPLING_RATE)
        relative_path = label + "/" + file

        if relative_path in validation_files:
            val_labels.append(label)
            val_specgrams.append(spectrogram)
        elif relative_path in testing_files:
            test_labels.append(label)
            test_specgrams.append(spectrogram)
        else:
            train_labels.append(label)
            train_specgrams.append(spectrogram)

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1713 [00:00<?, ?it/s]

  0%|          | 0/1731 [00:00<?, ?it/s]

  0%|          | 0/1733 [00:00<?, ?it/s]

  0%|          | 0/1746 [00:00<?, ?it/s]

  0%|          | 0/2359 [00:00<?, ?it/s]

  0%|          | 0/2352 [00:00<?, ?it/s]

  0%|          | 0/2357 [00:00<?, ?it/s]

  0%|          | 0/2372 [00:00<?, ?it/s]

  0%|          | 0/2372 [00:00<?, ?it/s]

  0%|          | 0/1742 [00:00<?, ?it/s]

  0%|          | 0/1750 [00:00<?, ?it/s]

  0%|          | 0/2353 [00:00<?, ?it/s]

  0%|          | 0/1746 [00:00<?, ?it/s]

  0%|          | 0/2364 [00:00<?, ?it/s]

  0%|          | 0/2375 [00:00<?, ?it/s]

  0%|          | 0/2357 [00:00<?, ?it/s]

  0%|          | 0/2367 [00:00<?, ?it/s]

  0%|          | 0/2370 [00:00<?, ?it/s]

  0%|          | 0/2367 [00:00<?, ?it/s]

  0%|          | 0/2377 [00:00<?, ?it/s]

  0%|          | 0/1734 [00:00<?, ?it/s]

  0%|          | 0/2369 [00:00<?, ?it/s]

  0%|          | 0/2380 [00:00<?, ?it/s]

  0%|          | 0/2356 [00:00<?, ?it/s]

  0%|          | 0/1733 [00:00<?, ?it/s]

  0%|          | 0/2373 [00:00<?, ?it/s]

  0%|          | 0/2375 [00:00<?, ?it/s]

### <span style='font-family:Georgia'> 1.3. Data labeling

### <span style='font-family:Georgia'> 1.3.1. Approach 1 - taking `background_noise` observations as a separate `silence` class</span>

In [None]:
# Split noise into 3 disjoint subsets (train, valid & test sets)
N = 64721  # dataset size, the same figure used when printing the split sizes
valid_ratio = np.round(len(validation_list)/N, 2)
test_ratio = np.round(len(testing_list)/N, 2)
train_ratio = 1 - (valid_ratio+test_ratio)

sets_list = ["train", "val", "test"]
# Draw exactly one assignment per noise segment so that the masks line up
# one-to-one with one_sec_background_noise_specgrams.
n_noise = len(one_sec_background_noise_specgrams)
obs_split = np.array(random.choices(sets_list, weights=(train_ratio, valid_ratio, test_ratio), k=n_noise))
train_mask, val_mask, test_mask = obs_split == "train", obs_split == "val", obs_split == "test"

In [None]:
# Keep the noise spectrograms whose mask entry is set, one split at a time
train_noise = [spec for spec, keep in zip(one_sec_background_noise_specgrams, train_mask) if keep]
print(f'Number of observations assigned to train set: {len(train_noise)}')

val_noise = [spec for spec, keep in zip(one_sec_background_noise_specgrams, val_mask) if keep]
print(f'Number of observations assigned to validation set: {len(val_noise)}')

test_noise = [spec for spec, keep in zip(one_sec_background_noise_specgrams, test_mask) if keep]
print(f'Number of observations assigned to test set: {len(test_noise)}')

In [None]:
# Append the noise spectrograms to every split; each of them is labelled 'silence'
train_specgrams_1 = [*train_specgrams, *train_noise]
train_labels_1 = train_labels + ['silence'] * len(train_noise)

val_specgrams_1 = [*val_specgrams, *val_noise]
val_labels_1 = val_labels + ['silence'] * len(val_noise)

test_specgrams_1 = [*test_specgrams, *test_noise]
test_labels_1 = test_labels + ['silence'] * len(test_noise)

In [None]:
# Turn each list of spectrograms into one array and report its shape
x_train_1 = np.array(train_specgrams_1)
print(np.shape(x_train_1))

x_val_1 = np.array(val_specgrams_1)
print(np.shape(x_val_1))

x_test_1 = np.array(test_specgrams_1)
print(np.shape(x_test_1))

In [None]:
# Target words plus the extra 'silence' class; every other label becomes 'unknown'
classes = [*CLASSES, 'silence']

y_train_1 = assign_unknown_label(train_labels_1, classes=classes)
train_label_values = np.unique(y_train_1)
print(train_label_values)

y_val_1 = assign_unknown_label(val_labels_1, classes=classes)
val_label_values = np.unique(y_val_1)
print(val_label_values)

y_test_1 = assign_unknown_label(test_labels_1, classes=classes)
test_label_values = np.unique(y_test_1)
print(test_label_values)

In [None]:
# Drop the intermediate lists now that arrays / relabelled lists exist
del (train_specgrams_1, val_specgrams_1, test_specgrams_1,
     train_labels_1, val_labels_1, test_labels_1)
gc.collect()

In [None]:
# Labels encoding
# Encode every split against one fixed, sorted category list so that column i
# denotes the same class in train, validation and test even if a class happens
# to be absent from one of the splits.
label_categories = sorted(set(classes) | {'unknown'})
y_train_1 = np.array(pd.get_dummies(pd.Series(pd.Categorical(y_train_1, categories=label_categories))).values)
y_val_1 = np.array(pd.get_dummies(pd.Series(pd.Categorical(y_val_1, categories=label_categories))).values)
y_test_1 = np.array(pd.get_dummies(pd.Series(pd.Categorical(y_test_1, categories=label_categories))).values)

### <span style='font-family:Georgia'> 1.3.2. Approach 2 - noising the training subset(s) with the 'background_noise' class observations

In [None]:
train_labels_summary = pd.Series(train_labels).value_counts()
# Number of noise observations to add: NOISE_RATIO of the training set size
n = math.floor(NOISE_RATIO*train_labels_summary.sum())
noised_labels = random.choices(train_labels, k=n)

# to_frame keeps the counts regardless of the name value_counts gave the
# Series; DataFrame(series, columns=[...]) selects by name and can yield an
# all-NaN column when the names differ.
summary = train_labels_summary.rename("obs_cnt").to_frame()
# Labels that received no noise observation count as 0 rather than NaN
summary["noise_cnt"] = pd.Series(noised_labels).value_counts().reindex(summary.index, fill_value=0)
summary["noise_ratio"] = summary["noise_cnt"] / summary["obs_cnt"]
summary

In [None]:
# Draw n noise spectrograms (with replacement) and pair them with the sampled labels
sampled_noise = random.choices(one_sec_background_noise_specgrams, k=n)
train_specgrams_2 = [*train_specgrams, *sampled_noise]
train_labels_2 = [*train_labels, *noised_labels]

In [None]:
# Stack the approach-2 training spectrograms into one array and report its shape
x_train_2 = np.array(train_specgrams_2)
print(np.shape(x_train_2))

In [None]:
# Labels outside the default class list become 'unknown'
y_train_2 = assign_unknown_label(train_labels_2)
train_2_label_values = np.unique(y_train_2)
print(train_2_label_values)

In [None]:
# Labels encoding
# The approach-2 training labels contain no 'silence', but the model is later
# validated and tested against targets that do. Encoding against the full
# sorted class list keeps the column count and order of y_train_2 aligned with
# those targets (the 'silence' column is simply all zeros here).
label_categories = sorted(set(CLASSES) | {'silence', 'unknown'})
y_train_2 = np.array(pd.get_dummies(pd.Series(pd.Categorical(y_train_2, categories=label_categories))).values)

In [None]:
# Release the Python lists; only the arrays built from them are used from here on
del (train_specgrams_2, train_labels_2,
     train_specgrams, val_specgrams, test_specgrams,
     train_labels, val_labels, test_labels)

gc.collect()

# <span style='font-family:Georgia'> 2. Modeling

## <span style='font-family:Georgia'> 2.1. Approach 1 - taking `background_noise` observations as a separate `silence` class</span>

In [None]:
# Point the generic names used by the training cells at the approach-1 data
x_train = x_train_1
x_val = x_val_1
x_test = x_test_1

y_train = y_train_1
y_val = y_val_1
y_test = y_test_1

In [None]:
# hyperparameters set-up
# Per-sample input shape: axes 1 and 2 of the training array
n_steps, n_features = x_train.shape[1], x_train.shape[2]
INPUT_DIM = (n_steps, n_features)
# One output unit per one-hot column
OUTPUT_DIM = y_train.shape[1]

early_stop = EarlyStopping(monitor='val_accuracy', patience=PATIENCE, min_delta=MIN_DELTA)
OPT = Adam(learning_rate=LR, clipnorm=1.0)

### <span style='font-family:Georgia'> 2.1.1. SimpleRNN</span>

In [None]:
model1_stats = []
model1_acc = []

# y_test is one-hot encoded; accuracy_score needs class ids on both sides
y_test_classes = np.argmax(y_test, axis=-1)

for i in tqdm(range(5)):
    print('iteration: '+str(i))
    # Reset the Keras global state before building the next model, not after
    K.clear_session()
    model1 = rnn_network(INPUT_DIM, OUTPUT_DIM, SimpleRNN)
    model1_history = model1.fit(x_train, y_train,
                                batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1,
                                validation_data=(x_val, y_val),
                                callbacks=[early_stop]
                               )
    model1_stats.append(model1_history)
    y_pred_1 = model1.predict(x_test)
    y_pred_classes_1 = np.argmax(y_pred_1, axis=-1)
    model1_acc.append(accuracy_score(y_test_classes, y_pred_classes_1))

### <span style='font-family:Georgia'> 2.1.2. LSTM

In [None]:
model2_stats = []
model2_acc = []

# y_test is one-hot encoded; accuracy_score needs class ids on both sides
y_test_classes = np.argmax(y_test, axis=-1)

for i in tqdm(range(5)):
    print('iteration: '+str(i))
    # Reset the Keras global state before building the next model, not after
    K.clear_session()
    model2 = rnn_network(INPUT_DIM, OUTPUT_DIM, LSTM)

    model2_history = model2.fit(x_train, y_train,
                                batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1,
                                validation_data=(x_val, y_val),
                                callbacks=[early_stop]
                               )

    model2_stats.append(model2_history)
    y_pred_2 = model2.predict(x_test)
    y_pred_classes_2 = np.argmax(y_pred_2, axis=-1)
    model2_acc.append(accuracy_score(y_test_classes, y_pred_classes_2))

### <span style='font-family:Georgia'> 2.1.3. GRU

In [None]:
model3_stats = []
model3_acc = []

# y_test is one-hot encoded; accuracy_score needs class ids on both sides
y_test_classes = np.argmax(y_test, axis=-1)

for i in tqdm(range(5)):
    print('iteration: '+str(i))
    # Reset the Keras global state before building the next model, not after
    K.clear_session()
    model3 = rnn_network(INPUT_DIM, OUTPUT_DIM, GRU)
    model3_history = model3.fit(x_train, y_train,
                                batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1,
                                validation_data=(x_val, y_val),
                                callbacks=[early_stop]
                               )
    model3_stats.append(model3_history)
    y_pred_3 = model3.predict(x_test)
    y_pred_classes_3 = np.argmax(y_pred_3, axis=-1)
    model3_acc.append(accuracy_score(y_test_classes, y_pred_classes_3))

### <span style='font-family:Georgia'> 2.1.4. Models summary

In [None]:
# One column per architecture, one row per training repetition
approach1_accuracy_rows = [model1_acc, model2_acc, model3_acc]
approach1_acc_results = pd.DataFrame(approach1_accuracy_rows).transpose()
approach1_acc_results.columns = ['SimpleRNN', 'LSTM', 'GRU']

# Keep the headline statistics only, with the architectures as rows
approach1_acc_results_stats = approach1_acc_results.describe().loc[['mean', 'std', 'min', 'max']].T

In [None]:
# Number of epochs actually run in each of the 5 repetitions (length of the loss history)
model1_epochs = [len(model1_stats[i].history['loss']) for i in range(5)]
model2_epochs = [len(model2_stats[i].history['loss']) for i in range(5)]
model3_epochs = [len(model3_stats[i].history['loss']) for i in range(5)]

approach1_acc_results_stats['epochs'] = [
    np.mean(model1_epochs),
    np.mean(model2_epochs),
    np.mean(model3_epochs),
]
approach1_acc_results_stats

## <span style='font-family:Georgia'> 2.2. Approach 2 - noising the training subset(s) with the 'background_noise' class observations

In [None]:
# Approach 2 trains on the noised training set but reuses the approach-1
# validation and test data
x_train = x_train_2
x_val = x_val_1
x_test = x_test_1

y_train = y_train_2
y_val = y_val_1
y_test = y_test_1

In [None]:
# hyperparameters set-up
# Per-sample input shape: axes 1 and 2 of the training array
n_steps, n_features = x_train.shape[1], x_train.shape[2]
INPUT_DIM = (n_steps, n_features)
# One output unit per one-hot column
OUTPUT_DIM = y_train.shape[1]

OPT = Adam(learning_rate=LR, clipnorm=1.0)
early_stop = EarlyStopping(monitor='val_accuracy', patience=PATIENCE, min_delta=MIN_DELTA)

### <span style='font-family:Georgia'> 2.2.1. SimpleRNN</span>

In [None]:
model4_stats = []
model4_acc = []

# y_test is one-hot encoded; accuracy_score needs class ids on both sides
y_test_classes = np.argmax(y_test, axis=-1)

for i in tqdm(range(5)):
    print('iteration: '+str(i))
    # Reset the Keras global state before building the next model, not after
    K.clear_session()
    model4 = rnn_network(INPUT_DIM, OUTPUT_DIM, SimpleRNN)
    model4_history = model4.fit(x_train, y_train,
                                batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1,
                                validation_data=(x_val, y_val),
                                callbacks=[early_stop]
                               )
    model4_stats.append(model4_history)
    y_pred_4 = model4.predict(x_test)
    y_pred_classes_4 = np.argmax(y_pred_4, axis=-1)
    model4_acc.append(accuracy_score(y_test_classes, y_pred_classes_4))

### <span style='font-family:Georgia'> 2.2.2. LSTM

In [None]:
model5_stats = []
model5_acc = []

# y_test is one-hot encoded; accuracy_score needs class ids on both sides
y_test_classes = np.argmax(y_test, axis=-1)

for i in tqdm(range(5)):
    print('iteration: '+str(i))
    # Reset the Keras global state before building the next model, not after
    K.clear_session()
    model5 = rnn_network(INPUT_DIM, OUTPUT_DIM, LSTM)

    model5_history = model5.fit(x_train, y_train,
                                batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1,
                                validation_data=(x_val, y_val),
                                callbacks=[early_stop]
                               )

    model5_stats.append(model5_history)
    y_pred_5 = model5.predict(x_test)
    y_pred_classes_5 = np.argmax(y_pred_5, axis=-1)
    model5_acc.append(accuracy_score(y_test_classes, y_pred_classes_5))

### <span style='font-family:Georgia'> 2.2.3. GRU

In [None]:
model6_stats = []
model6_acc = []

# y_test is one-hot encoded; accuracy_score needs class ids on both sides
y_test_classes = np.argmax(y_test, axis=-1)

for i in tqdm(range(5)):
    print('iteration: '+str(i))
    # Reset the Keras global state before building the next model, not after
    K.clear_session()
    model6 = rnn_network(INPUT_DIM, OUTPUT_DIM, GRU)
    model6_history = model6.fit(x_train, y_train,
                                batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1,
                                validation_data=(x_val, y_val),
                                callbacks=[early_stop]
                               )
    model6_stats.append(model6_history)
    y_pred_6 = model6.predict(x_test)
    y_pred_classes_6 = np.argmax(y_pred_6, axis=-1)
    model6_acc.append(accuracy_score(y_test_classes, y_pred_classes_6))

### <span style='font-family:Georgia'> 2.2.4. Models summary

In [None]:
# One column per architecture, one row per training repetition
approach2_accuracy_rows = [model4_acc, model5_acc, model6_acc]
approach2_acc_results = pd.DataFrame(approach2_accuracy_rows).transpose()
approach2_acc_results.columns = ['SimpleRNN', 'LSTM', 'GRU']

# Keep the headline statistics only, with the architectures as rows
approach2_acc_results_stats = approach2_acc_results.describe().loc[['mean', 'std', 'min', 'max']].T

In [None]:
# Number of epochs actually run in each of the 5 repetitions (length of the loss history)
model4_epochs = [len(model4_stats[i].history['loss']) for i in range(5)]
model5_epochs = [len(model5_stats[i].history['loss']) for i in range(5)]
model6_epochs = [len(model6_stats[i].history['loss']) for i in range(5)]

approach2_acc_results_stats['epochs'] = [
    np.mean(model4_epochs),
    np.mean(model5_epochs),
    np.mean(model6_epochs),
]
approach2_acc_results_stats