In [3]:
import numpy
import scipy.io.wavfile
from scipy.fftpack import dct
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import os
import tensorflow as tf
import plaidml.keras

plaidml.keras.install_backend()
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

<h1>1. Define Pre Processing Function</h1>

This function will apply all of the necessary pre-processsing to the speech signal. It assumes that we've already cut the speech signal down to 3 seconds. Then:

- Apply Pre-emphasis
- Framing
- Framing
- Hamming Window
- Fourier Spectrum
- Log-Mel Filter Banks
- MFCC scale
- Normalization

The function will happily accept larger segments but it will still spit out the same length vector which will confuse the network.

I think a lot of this can be done with Tensorflow function which are probably a bit faster since just the pre-processing takes some time.

https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html

<h1>2. Importing and Preprocessing Some Data</h1>

I decided to go with 20 speakers to start off with. We are taking their names, locating all the files under their names and applying the 3-second windows to each file so we get out a list of tensors and speaker ID's. Our end result will be a TF dataset that we can directly apply to train our network.

Unfortunately, the real dataset is around 36GB, which will be too big to fit in RAM. We'll need some way to break it up in order to use it to train our network.

In [160]:
dev_loc = "../../Dataset/vox1_dev_wav/vox1_dev_wav/wav"

x = tf.data.Dataset.list_files("../../Dataset/vox1_dev_wav/vox1_dev_wav/wav*.wav")

In [161]:
ids = []
for item in x:
    ids.append(os.path.basename(os.path.dirname(os.path.dirname(item.numpy()))))

ids = numpy.array(ids).reshape((-1,1))
encoder = OneHotEncoder(sparse=False)
y = tf.data.Dataset.from_tensor_slices(encoder.fit_transform(ids))
    
    

In [162]:
dataset = tf.data.Dataset.zip((x,y))

In [36]:
dataset = dataset.shuffle(len(dataset))
dataset = dataset.batch(64)

<h1>3. Building the Model</h1>

In [137]:
class PreProcessing(tf.keras.layers.Layer):
    def __init__(self):
        super(PreProcessing, self).__init__()
        
    def build(self, input_shape_):
        self.input_shape_ = input_shape_
        
    def call(self, filepaths):
        return tf.map_fn(self.pre_proc, filepaths)
            
            
    def pre_proc(filepath):
        audio, sample_rate = tf.audio.decode_wav(filepath, desired_samples=1)
        sample = tf.image.random_crop(audio, sample_rate*3)
        return generate_mfccs(sample.numpy())
    
    def generate_mfccs(signal):

        pre_emphasis = 0.97
        emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

        frame_size = .025
        frame_stride = 0.01

        frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate  # Convert from seconds to samples
        signal_length = len(emphasized_signal)
        frame_length = int(round(frame_length))
        frame_step = int(round(frame_step))
        num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))  # Make sure that we have at least 1 frame

        pad_signal_length = num_frames * frame_step + frame_length
        z = numpy.zeros((pad_signal_length - signal_length))
        pad_signal = numpy.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal

        indices = numpy.tile(numpy.arange(0, frame_length), (num_frames, 1)) + numpy.tile(numpy.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
        frames = pad_signal[indices.astype(numpy.int32, copy=False)]
        frames *= numpy.hamming(frame_length)
        # frames *= 0.54 - 0.46 * numpy.cos((2 * numpy.pi * n) / (frame_length - 1))  # Explicit Implementation **
        NFFT = 512

        mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT))  # Magnitude of the FFT
        pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))  # Power Spectrum

        nfilt = 40

        low_freq_mel = 0
        high_freq_mel = (2595 * numpy.log10(1 + (sample_rate / 2) / 700))  # Convert Hz to Mel
        mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
        hz_points = (700 * (10**(mel_points / 2595) - 1))  # Convert Mel to Hz
        bin = numpy.floor((NFFT + 1) * hz_points / sample_rate)

        fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
        for m in range(1, nfilt + 1):
            f_m_minus = int(bin[m - 1])   # left
            f_m = int(bin[m])             # center
            f_m_plus = int(bin[m + 1])    # right

            for k in range(f_m_minus, f_m):
                fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
            for k in range(f_m, f_m_plus):
                fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
        filter_banks = numpy.dot(pow_frames, fbank.T)
        filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks)  # Numerical Stability
        filter_banks = 20 * numpy.log10(filter_banks)  # dB

        mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')

        mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
    
        return mfcc

In [138]:
VGG19 = tf.keras.applications.VGG19(include_top=False, weights=None, input_shape=(298,40,1))

In [139]:
class SelfAttention(tf.keras.layers.Layer):

  #nh - output dimension of last network step
  #nk - number of attention hops
  #nc - hidden size of intermediate layer
  #T  - the sequence length, since I'm sorta used to image processing that's sort of misleading, the sequence length is simply
    #the height of the input image.
    def __init__(self, nk=4, nc=64, initializer=tf.keras.initializers.GlorotNormal()):
        super(SelfAttention, self).__init__()
        self.nh = None
        self.T = None
        self.nk = nk
        self.nc = nc
        self.initializer = initializer

  #T - number of frames we will run through at one time
    def build(self, input_shape):
    #I'm expecting a tensor in this format(batch_size, height, sequence_len, num_filters)
    #out of which I want to extract a flattened tensor height x num_filters
    #sequence len is T
        self.nh = input_shape[2] * input_shape[3]
        self.W1 = self.add_weight(
            shape=(self.nh,self.nc),
            initializer= self.initializer,
            trainable=True
        )
        self.W2 = self.add_weight(
            shape=(self.nc, self.nk),
            initializer= self.initializer,
            trainable=True
        )

  #H - Output of the VGG or other processing network
    @tf.function
    def call(self, H):
        @tf.function
        def operations(H):
            H = tf.reshape(H,tf.convert_to_tensor([H.shape[0],H.shape[1]*H.shape[2]]))
            A = tf.nn.softmax(tf.nn.tanh(tf.linalg.matmul(tf.linalg.matmul(H,self.W1),self.W2)))
            return tf.linalg.matmul(H, A, transpose_a = True)
        return tf.map_fn(operations, H)
    

In [140]:
model = tf.keras.Sequential(PreProcessing())
[model.add(layer) for layer in VGG19.layers[0:8]]
[model.add(layer) for layer in VGG19.layers[11:13]]
model.add(tf.keras.layers.MaxPool2D(padding='same'))
model.add(SelfAttention())
model.add(tf.keras.layers.AveragePooling1D(pool_size=2, padding='same'))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [141]:
history = model.fit(dataset, epochs=60)

Epoch 1/60


RuntimeError: in user code:

    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    <ipython-input-137-634408fda866>:9 call  *
        sess = tf.python.keras.backend.get_session()
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py:627 get_session  **
        session = _get_session(op_input_list)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\backend.py:587 _get_session
        raise RuntimeError('Cannot get session inside Tensorflow graph function.')

    RuntimeError: Cannot get session inside Tensorflow graph function.


In [None]:
#TODO - In order to make it feasible from a memory perspective we need to create a dataset with references to files.
# I think tensorflow might have a sexy automatic way to do this. For each file that gets selected we'll randomly select a 
# 3 second audio clip from it's contents.So the input will be a variable length vector from the .wav file
#TODO - Related to the first one, we need to use the preprocessing function as a preprocessing layer built into the model
# so we get GPU acceleration on that because it's slow. Most of that stuff should be able to be TensorFlow-ized in
#with tf.signal library

#TODO - Add some automatic parameters at the top, get rid of constants
#TODO - Custom loss function for attention layer
#TODO - Grid search, do we really need that?

#TODO - I think maybe it's kinda silly to chop up the VGG network, maybe better to just build layers from scratch so I know
#what I'm getting. I don't know what kind of weird stuff they might've put into the layers.