# Exercise 2.05: Loading Audio Data for TensorFlow Models

In [None]:
import tensorflow as tf
import os

In [None]:
def load_audio(file_path, sample_rate=44100):
    """Read a WAV file and return its decoded samples, transposed.

    Args:
        file_path: Path of the WAV file to read.
        sample_rate: Passed to ``tf.audio.decode_wav`` as ``desired_samples``,
            so it sets the requested length of the decoded audio in samples.
            The audio is not resampled by this function.

    Returns:
        The transpose of the decoded audio tensor.
    """
    raw_bytes = tf.io.read_file(file_path)
    # desired_channels=-1 keeps whatever channel count the file has.
    # The sample rate reported by the decoder is discarded.
    waveform, _ = tf.audio.decode_wav(
        raw_bytes,
        desired_channels=-1,
        desired_samples=sample_rate,
    )
    return tf.transpose(waveform)

In [None]:
# Directory whose entries are used as the audio clips for this exercise.
prefix = "../Datasets/data_speech_commands_v0.02/zero/"
# Sort the listing: os.listdir returns entries in arbitrary order, and a stable
# order keeps paths[0] (and any figure derived from it) reproducible.
paths = [os.path.join(prefix, file_name) for file_name in sorted(os.listdir(prefix))]

In [None]:
import matplotlib.pyplot as plt
# Decode the first listed clip; load_audio returns the transposed decode output.
audio = load_audio(paths[0])
# Transpose back with .T before plotting the waveform in gray.
plt.plot(audio.numpy().T, color='gray')
plt.xlabel('Sample')
plt.ylabel('Value')
# Write the waveform plot to disk at 200 dpi.
plt.savefig("Figure2.16.png", dpi=200)

In [None]:
def apply_mfccs(audio, sample_rate=44100, num_mfccs=13, *,
                frame_length=1024, frame_step=256, fft_length=1024,
                lower_edge_hertz=80.0, upper_edge_hertz=7600.0,
                num_mel_bins=80):
    """Compute MFCCs from audio via STFT -> mel spectrogram -> log -> DCT.

    Args:
        audio: Tensor of audio samples; the STFT is taken over its last axis.
        sample_rate: Sample rate used to build the mel filter bank.
            NOTE(review): this should match the true rate of ``audio``; the
            44100 default is kept for backward compatibility — confirm it
            fits the dataset.
        num_mfccs: Number of leading MFCC coefficients to keep.
        frame_length: STFT window length in samples.
        frame_step: STFT hop size in samples.
        fft_length: FFT size used by the STFT.
        lower_edge_hertz: Lowest frequency covered by the mel filter bank.
        upper_edge_hertz: Highest frequency covered by the mel filter bank.
        num_mel_bins: Number of mel bands.

    Returns:
        Tensor holding the first ``num_mfccs`` MFCCs along its last axis.
    """
    stfts = tf.signal.stft(audio, frame_length=frame_length,
                           frame_step=frame_step, fft_length=fft_length)
    # Magnitude spectrogram.
    spectrograms = tf.abs(stfts)
    num_spectrogram_bins = stfts.shape[-1]

    # Matrix that warps the linear-frequency bins onto the mel scale.
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, sample_rate,
        lower_edge_hertz, upper_edge_hertz)
    # Contract the last spectrogram axis with the first weight-matrix axis.
    mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel_weight_matrix, 1)
    # Record the static shape (this does not reshape any data): leading
    # spectrogram dimensions followed by the weight matrix's last dimension.
    mel_spectrograms.set_shape(
        spectrograms.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))

    # Add 1e-6 before the log so zero-energy bins do not produce -inf.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

    # Compute MFCCs and keep only the first num_mfccs coefficients.
    mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)
    return mfccs[..., :num_mfccs]

In [None]:
# Compute MFCCs for the clip loaded above, using apply_mfccs' default settings.
mfcc = apply_mfccs(audio)
# Plot the first slice along the leading axis as a grayscale heat map
# (presumably the first audio channel — confirm against load_audio's output layout).
plt.pcolor(mfcc.numpy()[0], cmap='Greys_r')
plt.xlabel('MFCC log coefficient')
plt.ylabel('Sample Value')
# Write the MFCC plot to disk at 200 dpi.
plt.savefig("Figure2.17.png", dpi=200)

In [None]:
# Let tf.data choose parallelism and prefetch buffer sizes at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE


def prep_ds(ds, shuffle_buffer_size=1024, batch_size=64):
    """Turn a dataset of audio file paths into endless batches of MFCCs.

    Args:
        ds: Dataset whose elements are passed to ``load_audio``.
        shuffle_buffer_size: Buffer size used when shuffling the elements.
        batch_size: Number of examples per batch.

    Returns:
        A dataset that shuffles, decodes, converts to MFCCs, repeats
        indefinitely, batches and prefetches.
    """
    # Randomly shuffle the incoming elements (cheap: done before decoding).
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    # Load and decode audio from file paths.
    ds = ds.map(load_audio, num_parallel_calls=AUTOTUNE)
    # Generate MFCCs from the audio data; parallelised like the decode step.
    ds = ds.map(apply_mfccs, num_parallel_calls=AUTOTUNE)
    # Repeat dataset forever.
    ds = ds.repeat()
    # Prepare batches.
    ds = ds.batch(batch_size)
    # Prefetch so batch preparation overlaps with consumption.
    ds = ds.prefetch(buffer_size=AUTOTUNE)

    return ds

In [None]:
# Build a dataset with one element per audio file path.
ds = tf.data.Dataset.from_tensor_slices(paths)
# Run it through the prep_ds pipeline with the default buffer and batch sizes.
train_ds = prep_ds(ds)

Take 1 batch and print it out

In [None]:
# take(1) limits iteration to a single batch; prep_ds repeats the data forever.
for x in train_ds.take(1):
    print(x)