In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import kaldiio

from scipy.signal import welch
import random
from sklearn.manifold import TSNE
import plotly.graph_objects as go

import plotly.io as pio

pio.renderers.default = "notebook"

from IPython.display import Audio

import tensorflow as tf
import tensorflow_io as tfio

load_dotenv("../.env")
tf.config.list_physical_devices("GPU")


2023-04-30 20:49:25.150050: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-30 20:49:34.466248: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-30 20:49:36.770243: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-30 20:49:36.770313: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been bu

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
PROJECT_ROOT = os.getenv("PROJECT_ROOT")
os.chdir(PROJECT_ROOT)

CLIPS_PATH = os.getenv("CLIPS_PATH")

VALIDATED_LIST_PATH = os.path.join(os.getenv("CLIPS_META_PATH"), "validated.tsv")
XVECTOR_RESULT_PATH = os.getenv("XVECTOR_RESULT_PATH")
XVECTOR_SCP_PATH = os.path.join(XVECTOR_RESULT_PATH, "xvector.scp")

In [3]:
speakers_xvectors = kaldiio.load_scp(XVECTOR_SCP_PATH)
valid_speakers = set(speakers_xvectors.keys())

raw_clips_meta = pd.read_table(VALIDATED_LIST_PATH)
raw_clips_meta = raw_clips_meta[raw_clips_meta["client_id"].isin(valid_speakers)]


def get_path(row):
    return f"{CLIPS_PATH}/{row}.mp3"

In [4]:
from tensorflow.keras.layers import (
    Input,
    Conv2D,
    Concatenate,
    LeakyReLU,
    BatchNormalization,
    Flatten,
    AveragePooling2D,
    Dense,
    Dropout,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import Model

In [11]:
HYPER_PARAMS = {
    # Model parameters
    "window-width": 65,
    "x-vector-dim": 512,
    # Training parameters
    "batch-size": 256,
    "epochs": 10,
    "learning-rate": 0.000005,
    "learn-test-split": 0.8,
    "logs-batch-frequency": 25,
    # FFT parameters
    "nfft": 8192,
    "fft-window": 4096,
    "fft-stride": 512,
    # General
    "min-frequency": 0,
    "max-frequency": 10e3,
    "audio-rate": 48e3,
}

HYPER_PARAMS["window-height"] = int(
    (HYPER_PARAMS["max-frequency"] - HYPER_PARAMS["min-frequency"])
    / HYPER_PARAMS["audio-rate"]
    * (HYPER_PARAMS["nfft"] // 2 + 1)
)

# Model parameters
mel_spectrogram_shape = (
    HYPER_PARAMS["window-width"],
    HYPER_PARAMS["window-height"],
    1,
)  # Replace window_size and num_mel_bands with your values

# Leaky ReLU activation function
leaky_relu = LeakyReLU(alpha=0.2)

# Mel-spectrogram input
mel_spectrogram_input = Input(shape=mel_spectrogram_shape, name="spectrogram_input")

# Convolutional layers
x = Conv2D(16, (3, 3), padding="same", activation=leaky_relu)(mel_spectrogram_input)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

# Convolutional layers
x = Conv2D(32, (3, 3), padding="same", activation=leaky_relu)(mel_spectrogram_input)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Conv2D(64, (3, 3), padding="same", activation=leaky_relu)(x)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Conv2D(128, (3, 3), padding="same", activation=leaky_relu)(x)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Flatten()(x)

# X-vector input
x_vector_input = Input(shape=(HYPER_PARAMS["x-vector-dim"],), name="x_vector_input")

# Concatenate flattened CNN output with x-vector input
combined_input = Concatenate()([x, x_vector_input])

# Dense layers
y = Dense(128, activation=leaky_relu)(combined_input)
y = Dropout(0.1)(y)
y = Dense(192, activation=leaky_relu)(combined_input)
y = Dropout(0.1)(y)
y = Dense(256, activation=leaky_relu)(y)
y = Dropout(0.1)(y)
y = Dense(256, activation=leaky_relu)(y)
y = Dropout(0.1)(y)
y = Dense(512, activation=leaky_relu)(y)
output = Dense(HYPER_PARAMS["window-height"], activation="linear")(y)

# Construct the model
model = Model(inputs=[mel_spectrogram_input, x_vector_input], outputs=output)
model.summary()

# Compile the model
optimizer = Adam(learning_rate=HYPER_PARAMS["learning-rate"])
loss_fn = MeanSquaredError()
model.compile(optimizer=optimizer, loss=loss_fn)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 spectrogram_input (InputLayer)  [(None, 65, 682, 1)  0          []                               
                                ]                                                                 
                                                                                                  
 conv2d_5 (Conv2D)              (None, 65, 682, 32)  320         ['spectrogram_input[0][0]']      
                                                                                                  
 batch_normalization_5 (BatchNo  (None, 65, 682, 32)  128        ['conv2d_5[0][0]']               
 rmalization)                                                                                     
                                                                                            

In [12]:
from random import random


class SamplesLoader:
    def __init__(self, audio_samples, x_vectors, modifications, params):
        self.audio_samples = audio_samples
        self.x_vectors = x_vectors

        self.batch_size = params["batch-size"]
        self.split = params["learn-test-split"]
        self.nfft = params["nfft"]
        self.fft_window = params["fft-window"]
        self.fft_stride = params["fft-stride"]

        self.min_frequency = params["min-frequency"]
        self.max_frequency = params["max-frequency"]
        self.audio_rate = params["audio-rate"]

        self.window_width = params["window-width"]
        self.window_height = params["window-height"]

        self.min_frequency = params["min-frequency"]
        self.max_frequency = params["max-frequency"]

        self.x_vector_size = params["x-vector-dim"]

        self.clip_start_index = int(
            self.min_frequency / self.audio_rate * self.fft_window // 2
        )
        self.clip_end_index = self.clip_start_index + self.window_height

        # Modification are dict where key is possibility of modification
        # and value is callback that accepts raw sample and returns modified sample
        self.modifications = modifications

        self.active_batch = {
            "input": tf.zeros(shape=(0, self.window_width, self.window_height)),
            "x-vector": tf.zeros(shape=(0, self.x_vector_size)),
            "output": tf.zeros(shape=(0, self.window_height)),
        }

        self.active_samples_iter = None

    def __iter__(self):
        self.active_samples_iter = self.audio_samples.sample(frac=1).iterrows()
        return self

    def _slice_batch(self):
        if self.active_batch["input"].shape[0] < self.batch_size:
            return None

        new_batch = dict()
        for key in self.active_batch.keys():
            new_batch[key] = self.active_batch[key][: self.batch_size]
            self.active_batch[key] = self.active_batch[key][self.batch_size :]

        return new_batch["input"], new_batch["x-vector"], new_batch["output"]

    def _get_spectrogram(self, audio_tensor):
        spectrogram = tfio.audio.spectrogram(
            audio_tensor,
            nfft=self.nfft,
            window=self.fft_window,
            stride=self.fft_stride,
        )

        # Slice away frequencies outside of the human voice range
        sliced_tensor = tf.slice(
            spectrogram,
            [0, self.clip_start_index],
            [spectrogram.shape[0], self.clip_end_index],
        )
        mean = tf.math.reduce_mean(sliced_tensor)
        std_dev = tf.math.reduce_std(sliced_tensor)
        final_tensor = (sliced_tensor - mean) / std_dev  # Normalize the tensor

        return final_tensor

    def _process_audio_tensor(self, audio_tensor):
        final_tensor = self._get_spectrogram(audio_tensor)  # Get normalized tensor

        # Calculate padding width
        pad_width = (self.window_width - 1) // 2

        # Pad the input image on both sides along the width axis
        padded_image = tf.pad(final_tensor, [[pad_width, pad_width], [0, 0]])

        # Extract patches with a sliding window
        patches = tf.image.extract_patches(
            images=tf.expand_dims(
                tf.expand_dims(padded_image, -1), 0
            ),  # Add a batch dimension to the input image
            sizes=[
                1,
                self.window_width,
                self.window_height,
                1,
            ],  # Patch size (1, w_w, h, 1)
            strides=[
                1,
                1,
                1,
                1,
            ],  # Stride (1, 1, 1, 1) for a sliding window with a step of 1
            rates=[1, 1, 1, 1],  # Dilation rate (1, 1, 1, 1)
            padding="VALID",  # No padding is required as we already padded the input image
        )

        # Reshape the patches tensor to the desired output shape (w, w_w, h)
        patches = tf.reshape(
            patches, [final_tensor.shape[0], self.window_width, self.window_height]
        )
        return patches

    def __next__(self):
        while True:
            existing_batch = self._slice_batch()
            if existing_batch is not None:
                return existing_batch

            iter_result = self.active_samples_iter.__next__()
            if iter_result is None:
                raise StopIteration  # TODO: Pad left over samples with zeros

            sample_index, new_sample = iter_result
            sample_x_vector = self.x_vectors[new_sample["client_id"]]
            if sample_x_vector is None:
                continue

            # Load audio sample
            audio = tfio.audio.AudioIOTensor(get_path(new_sample["path"]))
            if audio.shape[0] < 1000:  # Skip to short samples
                continue

            # Crop audio beggining and end to remove silence and stop/start button clicks
            raw_audio_tensor = tf.squeeze(audio[300:-350], axis=[-1])
            raw_audio_spectrogram = self._get_spectrogram(raw_audio_tensor)
            tiled_x_vector = tf.tile(
                tf.expand_dims(sample_x_vector, 0), (raw_audio_spectrogram.shape[0], 1)
            )

            modified_samples = []
            for possibility, callback in self.modifications.items():
                if possibility > random():
                    modified_samples.append(callback(raw_audio_tensor))

            for modified_sample in modified_samples:
                sample_patches = self._process_audio_tensor(modified_sample)

                self.active_batch["input"] = tf.concat(
                    [self.active_batch["input"], sample_patches], axis=0
                )
                self.active_batch["x-vector"] = tf.concat(
                    [self.active_batch["x-vector"], tiled_x_vector], axis=0
                )
                self.active_batch["output"] = tf.concat(
                    [self.active_batch["output"], raw_audio_spectrogram], axis=0
                )

In [13]:
train_test_start_index = int(raw_clips_meta.shape[0] * HYPER_PARAMS["learn-test-split"])
train_dataset = raw_clips_meta.iloc[:train_test_start_index]
test_dataset = raw_clips_meta.iloc[train_test_start_index:]


In [14]:
def samples_generator(loader):
    for batch_X_mel, batch_X_xvec, batch_y in loader:
        yield (batch_X_mel, batch_X_xvec), batch_y


train_loader = SamplesLoader(
    train_dataset, speakers_xvectors, {1: lambda sample: sample}, HYPER_PARAMS
)
validation_loader = SamplesLoader(
    test_dataset, speakers_xvectors, {1: lambda sample: sample}, HYPER_PARAMS
)

test_batch = next(iter(validation_loader))
output_signature = (
    tuple(
        [
            tf.TensorSpec.from_tensor(test_batch[0]),
            tf.TensorSpec.from_tensor(test_batch[1]),
        ]
    ),
    tf.TensorSpec.from_tensor(test_batch[2]),
)

train_data = tf.data.Dataset.from_generator(
    lambda: samples_generator(train_loader),
    output_signature=output_signature,
)

validation_data = tf.data.Dataset.from_generator(
    lambda: samples_generator(validation_loader),
    output_signature=output_signature,
)

In [15]:
import wandb
from wandb.keras import WandbCallback, WandbModelCheckpoint
from tensorflow.keras.callbacks import ModelCheckpoint

wandb.login()




True

In [16]:
run = wandb.init(config=HYPER_PARAMS, project="speech-filter")
checkpoint_path = wandb.run.dir + "/model_checkpoint.hdf5"

checkpoint_callback = WandbModelCheckpoint(
    filepath=checkpoint_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode="auto",
    save_freq="epoch",
)

metrics_callback = WandbCallback(
    monitor="val_loss",
    verbose=1,
    mode="auto",
    save_model=True,
    save_graph=True,
    save_weights_only=False,
    log_weights=True,
    log_gradients=True,
    training_data=train_data,
    validation_data=validation_data,
    predictions=64,
    input_type="images",
    output_type="image",
    log_batch_frequency=HYPER_PARAMS["logs-batch-frequency"],
)

model.fit(
    train_data,
    epochs=HYPER_PARAMS["epochs"],
    validation_data=validation_data,
    callbacks=[
        metrics_callback,
        checkpoint_callback,
    ],
)

run.finish()

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669755383342515, max=1.0…

Epoch 1/10


2023-04-30 20:53:28.222485: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-04-30 20:53:28.350331: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-04-30 20:53:43.280703: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.53GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2023-04-30 20:53:43.319914: W tensorflow/tsl/

     24/Unknown - 46s 394ms/step - loss: 1.0323

2023-04-30 20:54:24.416712: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.58GiB (rounded to 1702272000)requested by op gradient_tape/model_2/average_pooling2d_5/AvgPoolGrad
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-04-30 20:54:24.416816: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2023-04-30 20:54:24.416832: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 120, Chunks in use: 120. 30.0KiB allocated for chunks. 30.0KiB in use in bin. 9.7KiB client-requested in use in bin.
2023-04-30 20:54:24.416838: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 33, Chunks in use: 32. 18.8KiB allocated for chunks. 18.0KiB in use in bin. 17.6KiB client-requested in use in bin.
2023-0

ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/model_2/average_pooling2d_5/AvgPoolGrad' defined at (most recent call last):
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/traitlets/config/application.py", line 992, in launch_instance
      app.start()
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 711, in start
      self.io_loop.start()
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 531, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_4758/2943329651.py", line 31, in <module>
      model.fit(
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/wandb/integration/keras/keras.py", line 174, in new_v2
      return old_v2(*args, **kwargs)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/wandb/integration/keras/keras.py", line 174, in new_v2
      return old_v2(*args, **kwargs)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/wandb/integration/keras/keras.py", line 174, in new_v2
      return old_v2(*args, **kwargs)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/keras/engine/training.py", line 1054, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/keras/optimizers/optimizer.py", line 542, in minimize
      grads_and_vars = self.compute_gradients(loss, var_list, tape)
    File "/home/user/anaconda3/envs/qwe/lib/python3.8/site-packages/keras/optimizers/optimizer.py", line 275, in compute_gradients
      grads = tape.gradient(loss, var_list)
Node: 'gradient_tape/model_2/average_pooling2d_5/AvgPoolGrad'
OOM when allocating tensor with shape[300,32,65,682] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node gradient_tape/model_2/average_pooling2d_5/AvgPoolGrad}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_7781]