# Audio embeddings using deep LSTM autoencoder

We read the UrbanSound8K metadata from which we will get our training and validation data.

In [None]:
import pandas as pd
import numpy as np
import os
from helpers import wav_to_floats

# Locations of the UrbanSound8K audio folds and of its metadata table,
# relative to the working directory. They are built with os.path.join so the
# separator matches the host OS (on Windows this yields the same
# "..\\Data\\..." strings as a hard-coded literal would).
audio_path = os.path.join("..", "Data", "UrbanSound8K", "audio")
metadata_path = os.path.join("..", "Data", "UrbanSound8K", "metadata", "UrbanSound8K.csv")

# One row per audio slice; later cells read the slice_file_name, fold,
# classID and class columns of this table.
metadata = pd.read_csv(metadata_path)

# For prototyping, we will select a small sample of sounds
# metadata = metadata.sample(3000)

# Display the data format
metadata.head(5)

We define a function that loads an UrbanSound8K slice file and returns it as a float array.

In [None]:
def get_audio_from_dataframe(index, fold, slice_file_name):
    """Load one UrbanSound8K slice as a NumPy array.

    The clip is looked up below ``audio_path`` in the ``fold<fold>``
    sub-directory and decoded with ``wav_to_floats``.

    Args:
        index: Row index of the clip in the metadata table (not used here).
        fold: Fold number; it is appended to ``"fold"`` to form the
            sub-directory name.
        slice_file_name: File name of the clip inside its fold directory.

    Returns:
        ``np.ndarray`` built from whatever ``wav_to_floats`` returns.
    """
    fold_directory = "fold" + str(fold)
    wav_file = os.path.join(audio_path, fold_directory, slice_file_name)
    return np.array(wav_to_floats(wav_file))

Now we perform dilated gammatone analysis on each clip, store the values in a Pandas DataFrame, and save them as CSV files. We also build a DataFrame that maps each CSV file name to its associated UrbanSound class.

In [None]:
# Analysis parameters shared by the cells below
sample_rate = 44100
window_size_seconds = 0.01
num_filters = 100
cutoff_low_hz = 30
# Passed to make_dilated_spectral_frames and to the autoencoder builder
frames_in_memory = 8

# Prepare dilated gammatone processing
from gammatone_filterbank import GammatoneFilterbank
# NOTE(review): the third argument (half a window) is presumably the hop
# between successive windows — confirm against GammatoneFilterbank.
filterbank = GammatoneFilterbank(sample_rate, window_size_seconds, window_size_seconds / 2, num_filters, cutoff_low_hz)

# Data paths: built with os.path.join so the separator matches the host OS
# (identical to "..\\Data\\SpectralFrames" on Windows).
save_path = os.path.join("..", "Data", "SpectralFrames")
references_path = os.path.join(save_path, "references.csv")

In [None]:
# Create the output directory; no error if it already exists
os.makedirs(save_path, exist_ok=True)

# One dict per processed sound; turned into the references table afterwards
refs_list = []
total_sounds = len(metadata)
sound_num = 1

for index, row in metadata.iterrows():
    # Report progress against the real number of rows, not a fixed figure
    print("Processing sound " + str(sound_num) + " of " + str(total_sounds))
    sound_num += 1

    # Hard cap on how many sounds are processed; rows past it are skipped.
    # NOTE(review): the counter is incremented before this check, so the cap
    # takes effect one row earlier than the literal suggests — confirm intent.
    if sound_num > 8100:
        continue

    # Swap only the file extension, so a "wav" elsewhere in the name is kept
    csv_file_name = os.path.splitext(row["slice_file_name"])[0] + ".csv"

    try:
        path = os.path.join(save_path, csv_file_name)

        # Only analyse sounds that have not been exported yet
        if not os.path.exists(path):
            audio = get_audio_from_dataframe(index, row["fold"], row["slice_file_name"])
            frames = filterbank.make_dilated_spectral_frames(audio, frames_in_memory, 2)

            # Scale by the peak value. An all-zero clip is left untouched:
            # dividing by zero would fill the CSV with NaNs.
            peak = frames.max()
            if peak != 0:
                frames = frames / peak

            # Flatten the 3D array made by dilated gammatone filter and use multi-index for dataframe
            multi_index = pd.MultiIndex.from_product([range(s) for s in frames.shape])
            frames = pd.DataFrame(frames.flatten(), index=multi_index).reset_index()

            frames.to_csv(path)

        ref_dict = {
            "original_file_name": row["slice_file_name"],
            "new_file_name": csv_file_name,
            "class_id": row["classID"],
            "class_name": row["class"],
        }
        refs_list.append(ref_dict)
    except FileNotFoundError:
        # The sound gets no reference row, so later cells never look for it
        print("Error reading audio file, moving to next")

references = pd.DataFrame(refs_list)
references.to_csv(references_path)

references.head(5)

We define a function for getting spectral data back from CSV, restoring its dimensionality.

In [None]:
def get_spectral_frames_from_csv(path):
    """Read a spectral-frame CSV and rebuild the 3D array stored in it.

    The first four CSV columns are used as a row index. Index levels 1 to 3
    hold the position of each value along the three array axes, so the number
    of distinct values in a level is the size of the matching axis.

    Args:
        path: Path of the CSV file to read.

    Returns:
        ``np.float32`` array of shape ``(dim1, dim2, dim3)``.
    """
    table = pd.read_csv(path, index_col=[0, 1, 2, 3], dtype=np.float32)

    # Level 0 is the running row number and carries no shape information
    axis_sizes = tuple(
        len(table.index.get_level_values(level).unique())
        for level in (1, 2, 3)
    )

    return table.values.reshape(axis_sizes)

Let's plot some examples from the training data to visualize the dilated spectral buffer. The horizontal axis represents frequency content and the vertical axis represents time, with the bottom being the most recent timestep and the top being the least recent.

In [None]:
import matplotlib.pyplot as plt

# Load the two clips shown in the example panels
example_train_path = os.path.join(save_path, "61626-9-0-6.csv")
example_val_path = os.path.join(save_path, "189989-0-0-0.csv")
example_train = get_spectral_frames_from_csv(example_train_path)
example_val = get_spectral_frames_from_csv(example_val_path)

# Draw entry 100 of each clip side by side, both with a symlog x axis
plt.figure(figsize=(18, 3))
for panel, clip in enumerate((example_train, example_val), start=1):
    plt.subplot(1, 2, panel)
    plt.xscale('symlog')
    plt.pcolormesh(clip[100])
plt.show()

We want to use these frames of dilated frequencies to create temporal audio embeddings that can be used in neural networks. Ideally, the temporal context of the frequencies at a given moment should be compressed into a smaller, one-dimensional representation. We prepare an LSTM autoencoder for this purpose.

In [None]:
from model import prepare_autoencoder
from keras import optimizers
# Build the models with the project's helper, which returns two objects that
# are kept as `autoencoder` and `encoder`.
# NOTE(review): 250 is presumably the size of the encoded representation, and
# the two strings the optimizer and loss names — confirm against
# model.prepare_autoencoder.
autoencoder_args = (
    frames_in_memory,
    num_filters,
    250,
    "adagrad",
    "mean_squared_error",
)
autoencoder, encoder = prepare_autoencoder(*autoencoder_args)

In [None]:
def spectral_generator(reference_csv_path, spectral_csv_directory, batch_size):
    """Yield an endless stream of autoencoder training batches.

    The spectral CSV files listed in the references table are read one after
    another with ``get_spectral_frames_from_csv``. Their entries along the
    first axis are packed, in order, into batches of ``batch_size``; a batch
    may span the boundary between two files. After the last referenced file
    the generator starts again from the first one.

    Args:
        reference_csv_path: CSV whose ``new_file_name`` column lists the
            spectral CSV files to read (first column is used as the index).
        spectral_csv_directory: Directory containing those spectral files.
        batch_size: Number of entries per yielded batch.

    Yields:
        Tuple ``(batch, batch)``: the same array as input and as target. The
        array has shape ``(batch_size,) + <shape of one entry>`` and is newly
        allocated for every batch.

    Raises:
        IndexError: If the references table has no rows.
    """
    refs = pd.read_csv(reference_csv_path, index_col=0)
    # Wrap around at the real number of references rather than a fixed count,
    # so shorter tables do not overrun and longer ones are used in full.
    num_refs = len(refs)

    def load_frames(position):
        # Read the spectral file named by the reference row at `position`.
        file_name = refs.iloc[position]["new_file_name"]
        return get_spectral_frames_from_csv(os.path.join(spectral_csv_directory, file_name))

    ref_index = 0
    spectral_frames = load_frames(ref_index)
    frame_index = 0

    # The batch layout is taken from the first file; later files must match it
    entry_shape = spectral_frames.shape[1:]

    while True:
        # Allocate a fresh buffer per batch. Re-yielding one shared array lets
        # a consumer that holds several batches at once (for example a
        # prefetch queue) see earlier batches overwritten by later ones.
        batch_features = np.zeros((batch_size,) + entry_shape)

        for i in range(batch_size):
            if frame_index >= spectral_frames.shape[0]:
                # Current file exhausted: continue with the next one
                frame_index = 0
                ref_index = (ref_index + 1) % num_refs
                spectral_frames = load_frames(ref_index)

            batch_features[i] = spectral_frames[frame_index]
            frame_index += 1

        yield batch_features, batch_features

Now that the autoencoder model has been prepared, let's fit it and plot the results!

In [None]:
# Resume from previously saved weights when they exist; on a first run there
# is nothing to load, so training starts from the freshly built model.
weights_path = "C:/Users/Rothmann/Documents/PROJECTS/Development/Thesis/Keras/Models/LSTM_Autoencoder/autoencoder_weights.h5"
if os.path.exists(weights_path):
    autoencoder.load_weights(weights_path)
else:
    print("No saved weights found at " + weights_path + ", training from scratch")

gen_batch_size = 1000
# num sounds * average num frames per sound / batch size.
# Floor division keeps the step count an integer.
steps_to_take = 8000 * 700 // gen_batch_size
spectral_gen = spectral_generator(references_path, save_path, gen_batch_size)

history = autoencoder.fit_generator(spectral_gen,
                                    steps_per_epoch=steps_to_take,
                                    epochs=10)

In [None]:
print(history.history.keys())

# The accuracy metric is stored under 'acc' or 'accuracy' depending on the
# Keras version and compile() arguments, and is absent when the model was
# compiled without it. Look it up instead of assuming one spelling.
accuracy_key = next(
    (key for key in ('acc', 'accuracy') if key in history.history),
    None,
)

plt.figure(figsize=(15,5))

# "Accuracy"
plt.subplot(1, 2, 1)
if accuracy_key is not None:
    plt.plot(history.history[accuracy_key])
    # Only the training curve is drawn, so the legend has a single entry
    plt.legend(['train'], loc='upper left')
# plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')

# "Loss"
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

After the model has been fit, let's save its weights.

In [None]:
# Write the current autoencoder weights to a fixed file on disk
weights_file = "C:/Users/Rothmann/Documents/PROJECTS/Development/Thesis/Keras/Models/LSTM_Autoencoder/autoencoder_weights.h5"
autoencoder.save_weights(weights_file)

Let's run the autoencoder on an example clip so we can compare its reconstructions with the original data:

In [None]:
# Feed every entry of the example clip through the full autoencoder; the
# output is compared with the input in the plots below.
prediction = autoencoder.predict(
    example_train,
    batch_size=10,
    verbose=True,
)

We'll plot a few of these reconstructions next to the originals to get an idea of how well it worked. It looks like the temporal context is reconstructed quite well!

In [None]:
from random import *

plt.figure(figsize=(18,8))

# Only indices present in both arrays can be shown side by side. Deriving the
# range from the data avoids an IndexError on clips with 100 entries or fewer.
num_examples = min(len(example_train), len(prediction))

for i in range(3):
    # randint includes both end points, so the upper bound is the last valid
    # index; starting at 0 makes the first entry selectable too.
    rand_example = randint(0, num_examples - 1)

    # Left column: original input
    plt.subplot(3, 2, i * 2 + 1)
    plt.xscale('symlog')
    if i == 0:
        plt.title('original')
    plt.pcolormesh(example_train[rand_example])

    # Right column: autoencoder output for the same entry
    plt.subplot(3, 2, i * 2 + 2)
    plt.xscale('symlog')
    if i == 0:
        plt.title('predicted')
    plt.pcolormesh(prediction[rand_example])

plt.show()