In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import kaldiio

from scipy.signal import welch
import random
from sklearn.manifold import TSNE
import plotly.graph_objects as go

import plotly.io as pio

pio.renderers.default = "notebook"

from IPython.display import Audio

import tensorflow as tf
import tensorflow_io as tfio

load_dotenv("../.env")
tf.config.list_physical_devices("GPU")

# conda install -c conda-forge tqdm

2023-04-29 19:20:06.333880: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-29 19:20:07.658692: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-29 19:20:07.679916: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-29 19:20:07.679991: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been bu

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
PROJECT_ROOT = os.getenv("PROJECT_ROOT")
os.chdir(PROJECT_ROOT)

CLIPS_PATH = os.getenv("CLIPS_PATH")

VALIDATED_LIST_PATH = os.path.join(os.getenv("CLIPS_META_PATH"), "validated.tsv")
XVECTOR_RESULT_PATH = os.getenv("XVECTOR_RESULT_PATH")
XVECTOR_SCP_PATH = os.path.join(XVECTOR_RESULT_PATH, "xvector.scp")


In [3]:
speakers_xvectors = kaldiio.load_scp(XVECTOR_SCP_PATH)
valid_speakers = set(speakers_xvectors.keys())

raw_clips_meta = pd.read_table(VALIDATED_LIST_PATH)
raw_clips_meta = raw_clips_meta[raw_clips_meta["client_id"].isin(valid_speakers)]


def get_path(row):
    return f"{CLIPS_PATH}/{row}.mp3"


In [4]:
from tensorflow.keras.layers import (
    Input,
    Conv2D,
    Concatenate,
    LeakyReLU,
    BatchNormalization,
    Flatten,
    AveragePooling2D,
    Dense,
    Dropout,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import Model


In [5]:
HYPER_PARAMS = {
    # Model parameters
    "window-width": 65,
    "x-vector-dim": 512,
    # Training parameters
    "batch-size": 32,
    "epochs": 100,
    "learning-rate": 0.0001,
    "learn-test-split": 0.8,
    # FFT parameters
    "nfft": 8192,
    "fft-window": 4096,
    "fft-stride": 512,
    # General
    "min-frequency": 0,
    "max-frequency": 10e3,
    "audio-rate": 48e3,
}

HYPER_PARAMS["window-height"] = int(
    (HYPER_PARAMS["max-frequency"] - HYPER_PARAMS["min-frequency"])
    / HYPER_PARAMS["audio-rate"]
    * (HYPER_PARAMS["nfft"] // 2 + 1)
)

# Model parameters
mel_spectrogram_shape = (
    HYPER_PARAMS["window-width"],
    HYPER_PARAMS["window-height"],
    1
)  # Replace window_size and num_mel_bands with your values

# Leaky ReLU activation function
leaky_relu = LeakyReLU(alpha=0.2)

# Mel-spectrogram input
mel_spectrogram_input = Input(shape=mel_spectrogram_shape, name="mel_spectrogram_input")

# Convolutional layers
x = Conv2D(16, (3, 3), padding="same", activation=leaky_relu)(mel_spectrogram_input)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

# Convolutional layers
x = Conv2D(32, (3, 3), padding="same", activation=leaky_relu)(mel_spectrogram_input)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Conv2D(64, (3, 3), padding="same", activation=leaky_relu)(x)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Conv2D(128, (3, 3), padding="same", activation=leaky_relu)(x)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Flatten()(x)

# X-vector input
x_vector_input = Input(shape=(HYPER_PARAMS["x-vector-dim"],), name="x_vector_input")

# Concatenate flattened CNN output with x-vector input
combined_input = Concatenate()([x, x_vector_input])

# Dense layers
y = Dense(128, activation=leaky_relu)(combined_input)
y = Dropout(0.1)(y)
y = Dense(256, activation=leaky_relu)(y)
y = Dropout(0.1)(y)
y = Dense(256, activation=leaky_relu)(y)
y = Dropout(0.1)(y)
y = Dense(512, activation=leaky_relu)(y)
output = Dense(HYPER_PARAMS["window-height"], activation="linear")(y)

# Construct the model
model = Model(inputs=[mel_spectrogram_input, x_vector_input], outputs=output)
model.summary()

# Compile the model
optimizer = Adam(learning_rate=HYPER_PARAMS["learning-rate"])
loss_fn = MeanSquaredError()
model.compile(optimizer=optimizer, loss=loss_fn)


2023-04-29 19:20:09.249430: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-29 19:20:09.249559: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-29 19:20:09.249610: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-29 19:20:10.023073: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-29 19:20:10.023166: I tensorflow/compile

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 mel_spectrogram_input (InputLa  [(None, 65, 853, 1)  0          []                               
 yer)                           ]                                                                 
                                                                                                  
 conv2d_1 (Conv2D)              (None, 65, 853, 32)  320         ['mel_spectrogram_input[0][0]']  
                                                                                                  
 batch_normalization_1 (BatchNo  (None, 65, 853, 32)  128        ['conv2d_1[0][0]']               
 rmalization)                                                                                     
                                                                                              

mmon_runtime/gpu/gpu_device.cc:1722] Could not identify NUMA node of platform GPU id 0, defaulting to 0.  Your kernel may not have been built with NUMA support.
2023-04-29 19:20:10.023222: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-29 19:20:10.023252: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9554 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:06:00.0, compute capability: 6.1


In [9]:
from random import random

class SamplesLoader:
    
    def __init__(self, audio_samples, x_vectors, modifications, params):
        self.audio_samples = audio_samples
        self.x_vectors = x_vectors

        self.batch_size = params["batch-size"]
        self.split = params["learn-test-split"]
        self.nfft = params["nfft"]
        self.fft_window = params["fft-window"]
        self.fft_stride = params["fft-stride"]

        self.min_frequency = params["min-frequency"]
        self.max_frequency = params["max-frequency"]
        self.audio_rate = params["audio-rate"]

        self.window_width = params["window-width"]
        self.window_height = params["window-height"]

        self.min_frequency = params["min-frequency"]
        self.max_frequency = params["max-frequency"]

        self.x_vector_size = params["x-vector-dim"]

        self.clip_start_index = int(self.min_frequency / self.audio_rate * self.fft_window // 2)
        self.clip_end_index = self.clip_start_index + self.window_height

        # Modification are dict where key is possibility of modification
        # and value is callback that accepts raw sample and returns modified sample
        self.modifications = modifications

        self.active_batch = {
            "input": tf.zeros(shape=(0, self.window_width, self.window_height)),
            "x-vector": tf.zeros(shape=(0, self.x_vector_size)),
            "output": tf.zeros(shape=(0, self.window_height)),
        }

        self.active_samples_iter = None

    def __iter__(self):
        self.active_samples_iter = self.audio_samples.sample(frac=1).iterrows()
        return self

    def _slice_batch(self):
        if self.active_batch["input"].shape[0] < self.batch_size:
            return None

        new_batch = dict()
        for key in self.active_batch.keys():
            new_batch[key] = self.active_batch[key][: self.batch_size]
            self.active_batch[key] = self.active_batch[key][self.batch_size :]

        return new_batch["input"], new_batch["x-vector"], new_batch["output"]


    def _get_spectrogram(self, audio_tensor):
        spectrogram = tfio.audio.spectrogram(
            audio_tensor,
            nfft=self.nfft,
            window=self.fft_window,
            stride=self.fft_stride,
        )

        # Slice away frequencies outside of the human voice range
        sliced_tensor = tf.slice(spectrogram, [0, self.clip_start_index], [spectrogram.shape[0], self.clip_end_index])
        mean = tf.math.reduce_mean(sliced_tensor)
        std_dev = tf.math.reduce_std(sliced_tensor)
        final_tensor = (sliced_tensor - mean) / std_dev  # Normalize the tensor

        return final_tensor

    def _process_audio_tensor(self, audio_tensor):
        final_tensor = self._get_spectrogram(audio_tensor)  # Get normalized tensor

        # Calculate padding width
        pad_width = (self.window_width - 1) // 2

        # Pad the input image on both sides along the width axis
        padded_image = tf.pad(final_tensor, [[pad_width, pad_width], [0, 0]])

        # Extract patches with a sliding window
        patches = tf.image.extract_patches(
            images=tf.expand_dims(tf.expand_dims(padded_image, -1), 0),  # Add a batch dimension to the input image
            sizes=[1, self.window_width, self.window_height, 1],           # Patch size (1, w_w, h, 1)
            strides=[1, 1, 1, 1],                    # Stride (1, 1, 1, 1) for a sliding window with a step of 1
            rates=[1, 1, 1, 1],                      # Dilation rate (1, 1, 1, 1)
            padding='VALID'                          # No padding is required as we already padded the input image
        )


        
        # Reshape the patches tensor to the desired output shape (w, w_w, h)
        patches = tf.reshape(patches, [final_tensor.shape[0], self.window_width, self.window_height])
        return patches

    def __next__(self):
        while True:
            existing_batch = self._slice_batch()
            if existing_batch is not None:
                return existing_batch

            iter_result = self.active_samples_iter.__next__()
            if iter_result is None:
                return None # TODO: Pad left over samples with zeros

            sample_index, new_sample = iter_result
            sample_x_vector = self.x_vectors[new_sample["client_id"]]
            if sample_x_vector is None:
                continue

            # Load audio sample
            audio = tfio.audio.AudioIOTensor(get_path(new_sample["path"]))
            if audio.shape[0] < 1000:  # Skip to short samples
                continue

            # Crop audio beggining and end to remove silence and stop/start button clicks
            raw_audio_tensor = tf.squeeze(audio[300:-350], axis=[-1])
            raw_audio_spectrogram = self._get_spectrogram(raw_audio_tensor)
            tiled_x_vector = tf.tile(tf.expand_dims(sample_x_vector, 0), (raw_audio_spectrogram.shape[0], 1))

            modified_samples = []
            for possibility, callback in self.modifications.items():
                if possibility > random():
                    modified_samples.append(callback(raw_audio_tensor))

            for modified_sample in modified_samples:
                sample_patches = self._process_audio_tensor(modified_sample)

                self.active_batch["input"] = tf.concat(
                    [self.active_batch["input"], sample_patches], axis=0
                )
                self.active_batch["x-vector"] = tf.concat(
                    [self.active_batch["x-vector"], tiled_x_vector], axis=0
                )
                self.active_batch["output"] = tf.concat(
                    [self.active_batch["output"], raw_audio_spectrogram], axis=0
                )

In [None]:
from tensorflow.keras.callbacks import TensorBoard
from tqdm.notebook import tqdm

tensorboard_callback = TensorBoard(histogram_freq=1)

# Training parameters
epochs = 50
train_steps_per_epoch = 2000  # Adjust this value based on your training set size
val_steps_per_epoch = 500  # Adjust this value based on your validation set size

train_test_start_index = int(raw_clips_meta.shape[0] * HYPER_PARAMS["learn-test-split"])
train_dataset = raw_clips_meta.iloc[:train_test_start_index]
test_dataset = raw_clips_meta.iloc[train_test_start_index:]

train_loader = SamplesLoader(train_dataset, speakers_xvectors, {
    1: lambda sample: sample,
}, HYPER_PARAMS)

test_loader = SamplesLoader(test_dataset, speakers_xvectors, {
    1: lambda sample: sample,
}, HYPER_PARAMS)

for epoch in tqdm(range(HYPER_PARAMS["epochs"]), desc="Training"):
    # Train on batches
    for batch_X_mel, batch_X_xvec, batch_y in train_loader:
        train_result = model.train_on_batch(
            x=[batch_X_mel, batch_X_xvec], y=batch_y, reset_metrics=False
        )

        print("WEee")

        # Write train metrics to TensorBoard
        with tensorboard_callback.as_default():
            tf.summary.scalar("loss", train_result, step=epoch)

    # Validate on batches
    # val_losses = []
    # for batch_X_mel, batch_X_xvec, batch_y in val_dataset:
    #     val_result = model.test_on_batch(x=[batch_X_mel, batch_X_xvec], y=batch_y)

    #     # Collect validation losses
    #     val_losses.append(val_result)

    # Write validation metrics to TensorBoard
    # mean_val_loss = np.mean(val_losses)
    # with tensorboard_callback.as_default():
    #     tf.summary.scalar("val_loss", mean_val_loss, step=epoch)

Training:   0%|          | 0/100 [00:00<?, ?it/s]

2023-04-29 19:22:35.090401: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype float and shape [32,853]
	 [[{{node Placeholder/_2}}]]


: 