In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import kaldiio

from scipy.signal import welch
import random
from sklearn.manifold import TSNE
import plotly.graph_objects as go

import plotly.io as pio

pio.renderers.default = "notebook"

from IPython.display import Audio

import tensorflow as tf
import tensorflow_io as tfio

load_dotenv("../.env")
tf.config.list_physical_devices("GPU")


2023-04-30 22:36:54.303940: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-30 22:36:55.871900: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-30 22:36:55.925306: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-30 22:36:55.925449: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:06:00.0/numa_node
Your kernel may have been bu

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
PROJECT_ROOT = os.getenv("PROJECT_ROOT")
os.chdir(PROJECT_ROOT)

CLIPS_PATH = os.getenv("CLIPS_PATH")

VALIDATED_LIST_PATH = os.path.join(os.getenv("CLIPS_META_PATH"), "validated.tsv")
XVECTOR_RESULT_PATH = os.getenv("XVECTOR_RESULT_PATH")
XVECTOR_SCP_PATH = os.path.join(XVECTOR_RESULT_PATH, "xvector.scp")

In [3]:
speakers_xvectors = kaldiio.load_scp(XVECTOR_SCP_PATH)
valid_speakers = set(speakers_xvectors.keys())

raw_clips_meta = pd.read_table(VALIDATED_LIST_PATH)
raw_clips_meta = raw_clips_meta[raw_clips_meta["client_id"].isin(valid_speakers)]


def get_path(row):
    return f"{CLIPS_PATH}/{row}.mp3"

In [4]:
from tensorflow.keras.layers import (
    Input,
    Conv2D,
    Concatenate,
    LeakyReLU,
    BatchNormalization,
    Flatten,
    AveragePooling2D,
    Dense,
    Dropout,
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.models import Model

In [12]:
HYPER_PARAMS = {
    # Model parameters
    "window-width": 65,
    "x-vector-dim": 512,
    # Training parameters
    "batch-size": 256,
    "epochs": 10,
    "learning-rate": 2e-5,
    "learn-test-split": 0.8,
    "logs-batch-frequency": 25,
    # FFT parameters
    "nfft": 4096,
    "fft-window": 2048,
    "fft-stride": 256,
    # General
    "min-frequency": 0,
    "max-frequency": 9e3, # Crop input audio spectrogram to this frequency
    "audio-rate": 48e3,
    # Data augmentation possibilities
    # "augment-raw": 0.05,
    # "augment-empty-noise": 0.05,
    # "augment-noise": 0.2,
    # "augment-dialogue": 0.4,
    # "augment-dialogue-noise": 0.45,
    
    "augment-raw": 1,
    "augment-empty-noise": 1,
    "augment-noise": 1,
    "augment-dialogue": 1,
    "augment-dialogue-noise": 1,
    # Data augmentation parameters
    "noise-min-level": 0.001,
    "noise-max-level": 0.008,
    "dialogue-min-speakers": 1,
    "dialogue-max-speakers": 6,
    "dialogue-normalization-factor": 0.8, # 0 - means no normalization (samples are added as is), 1 - means full normalization (final samples will be divided by number of speakers)
}

HYPER_PARAMS["window-height"] = int(
    (HYPER_PARAMS["max-frequency"] - HYPER_PARAMS["min-frequency"])
    / HYPER_PARAMS["audio-rate"]
    * (HYPER_PARAMS["nfft"] // 2 + 1)
)

# Model parameters
mel_spectrogram_shape = (
    HYPER_PARAMS["window-width"],
    HYPER_PARAMS["window-height"],
    1,
)  # Replace window_size and num_mel_bands with your values

# Leaky ReLU activation function
leaky_relu = LeakyReLU(alpha=0.2)

# Mel-spectrogram input
mel_spectrogram_input = Input(shape=mel_spectrogram_shape, name="spectrogram_input")

# Convolutional layers
x = Conv2D(16, (3, 3), padding="same", activation=leaky_relu)(mel_spectrogram_input)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

# Convolutional layers
x = Conv2D(32, (3, 3), padding="same", activation=leaky_relu)(mel_spectrogram_input)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Conv2D(64, (3, 3), padding="same", activation=leaky_relu)(x)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Conv2D(128, (3, 3), padding="same", activation=leaky_relu)(x)
x = BatchNormalization()(x)
x = AveragePooling2D(pool_size=(2, 2))(x)

x = Flatten()(x)

# X-vector input
x_vector_input = Input(shape=(HYPER_PARAMS["x-vector-dim"],), name="x_vector_input")

# Concatenate flattened CNN output with x-vector input
combined_input = Concatenate()([x, x_vector_input])

# Dense layers
y = Dense(128, activation=leaky_relu)(combined_input)
y = Dropout(0.1)(y)
y = Dense(256, activation=leaky_relu)(y)
y = Dropout(0.1)(y)
y = Dense(512, activation=leaky_relu)(y)
output = Dense(HYPER_PARAMS["window-height"], activation="linear")(y)

# Construct the model
model = Model(inputs=[mel_spectrogram_input, x_vector_input], outputs=output)
model.summary()

# Compile the model
optimizer = Adam(learning_rate=HYPER_PARAMS["learning-rate"])
loss_fn = MeanSquaredError()
model.compile(optimizer=optimizer, loss=loss_fn)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 spectrogram_input (InputLayer)  [(None, 65, 384, 1)  0          []                               
                                ]                                                                 
                                                                                                  
 conv2d_9 (Conv2D)              (None, 65, 384, 32)  320         ['spectrogram_input[0][0]']      
                                                                                                  
 batch_normalization_9 (BatchNo  (None, 65, 384, 32)  128        ['conv2d_9[0][0]']               
 rmalization)                                                                                     
                                                                                            

In [29]:
from random import random

class SamplesLoader:
    def __init__(self, audio_samples, x_vectors, params):
        self.audio_samples = audio_samples
        self.x_vectors = x_vectors

        self.params = params
        self.augmenrations = {k: v for k, v in params.items() if k.startswith("augment")}

        self.clip_start_index = int(
            self.params["min-frequency"] / self.params["audio-rate"] * self.params["fft-window"] // 2
        )
        self.clip_end_index = self.clip_start_index + self.params["window-height"]

        self.active_batch = {
            "input": tf.zeros(shape=(0, self.params["window-width"], self.params["window-height"])),
            "x-vector": tf.zeros(shape=(0, self.params["x-vector-dim"])),
            "output": tf.zeros(shape=(0, self.params["window-height"])),
        }

        self.active_samples_iter = None

    def __iter__(self):
        self.active_samples_iter = self.audio_samples.sample(frac=1).iterrows()
        return self

    def _slice_batch(self):
        if self.active_batch["input"].shape[0] < self.params["batch-size"]:
            return None

        new_batch = dict()
        for key in self.active_batch.keys():
            new_batch[key] = self.active_batch[key][: self.params["batch-size"]]
            self.active_batch[key] = self.active_batch[key][self.params["batch-size"] :]

        return new_batch["input"], new_batch["x-vector"], new_batch["output"]

    def _get_spectrogram(self, audio_tensor):
        spectrogram = tfio.audio.spectrogram(
            audio_tensor,
            nfft=self.params["nfft"],
            window=self.params["fft-window"],
            stride=self.params["fft-stride"],
        )

        # Slice away frequencies outside of the human voice range
        sliced_tensor = tf.slice(
            spectrogram,
            [0, self.clip_start_index],
            [spectrogram.shape[0], self.clip_end_index],
        )
        mean = tf.math.reduce_mean(sliced_tensor)
        std_dev = tf.math.reduce_std(sliced_tensor)
        final_tensor = (sliced_tensor - mean) / std_dev  # Normalize the tensor

        return final_tensor

    def _process_audio_tensor(self, audio_tensor):
        final_tensor = self._get_spectrogram(audio_tensor)  # Get normalized tensor

        # Calculate padding width
        pad_width = (self.params["window-width"] - 1) // 2

        # Pad the input image on both sides along the width axis
        padded_image = tf.pad(final_tensor, [[pad_width, pad_width], [0, 0]])

        # Extract patches with a sliding window
        patches = tf.image.extract_patches(
            images=tf.expand_dims(
                tf.expand_dims(padded_image, -1), 0
            ),  # Add a batch dimension to the input image
            sizes=[
                1,
                self.params["window-width"],
                self.params["window-height"],
                1,
            ],  # Patch size (1, w_w, h, 1)
            strides=[
                1,
                1,
                1,
                1,
            ],  # Stride (1, 1, 1, 1) for a sliding window with a step of 1
            rates=[1, 1, 1, 1],  # Dilation rate (1, 1, 1, 1)
            padding="VALID",  # No padding is required as we already padded the input image
        )

        # Reshape the patches tensor to the desired output shape (w, w_w, h)
        patches = tf.reshape(
            patches, [final_tensor.shape[0], self.params["window-width"], self.params["window-height"]]
        )
        return patches
    
    # Augmentation functions should return input and ouput audio tensors
    def _augment_raw(self, sample):
        return sample, sample

    def _augment_empty_noise(self, sample):
        noise_level = tf.random.uniform(shape=(), minval=self.params["noise-min-level"], maxval=self.params["noise-max-level"], dtype=tf.float32)
        noise_tensor = tf.random.normal(shape=tf.shape(sample), mean=0.0, stddev=noise_level, dtype=tf.float32)
        noise_tensor *= random() * 2
        return noise_tensor, noise_tensor

    def _augment_noise(self, sample):
        noise_level = tf.random.uniform(shape=(), minval=self.params["noise-min-level"], maxval=self.params["noise-max-level"], dtype=tf.float32)
        noise_tensor = tf.random.normal(shape=tf.shape(sample), mean=0.0, stddev=noise_level, dtype=tf.float32)
        return sample + noise_tensor, sample

    def _augment_dialogue(self, sample):
        speakers_count = tf.random.uniform(shape=(), minval=self.params["dialogue-min-speakers"], maxval=self.params["dialogue-max-speakers"], dtype=tf.float32)
        input_tensor = tf.identity(sample)
        input_length = sample.shape[0]

        for _ in range(int(speakers_count)):
            random_sample = self.audio_samples.sample(n=1).iloc[0]
            random_audio = self._load_mp3_sample(random_sample)
            random_audio_length = random_audio.shape[0]
            
            if random_audio_length > input_length:
                start_index = np.random.randint(0, random_audio_length - input_length)
                end_index = start_index + input_length
                input_tensor += random_audio[start_index:end_index]
            else:
                left_padding = (input_length - random_audio_length) // 2
                right_padding = input_length - random_audio_length - left_padding

                padded_tensor = tf.pad(random_audio, paddings=[[left_padding, right_padding]])
                input_tensor += padded_tensor


        input_tensor /= speakers_count * tf.cast(self.params["dialogue-normalization-factor"], dtype=tf.float32)
        return input_tensor, sample

    def _augment_dialogue_noise(self, sample):
        dialogue_sample, _ = self._augment_dialogue(sample)
        noise_sample, _ = self._augment_empty_noise(dialogue_sample)
        return dialogue_sample, noise_sample


    def _match_augmentation(self, sample, augmentation):
        if augmentation == "augment-raw":
            return self._augment_raw(sample)
        elif augmentation == "augment-empty-noise":
            return self._augment_empty_noise(sample)
        elif augmentation == "augment-noise":
            return self._augment_noise(sample)
        elif augmentation == "augment-dialogue":
            return self._augment_dialogue(sample)
        elif augmentation == "augment-dialogue-noise":
            return self._augment_dialogue_noise(sample)
        
    def _load_mp3_sample(self, sample):
        sample_path = get_path(sample["path"])
        sample_binary = tf.io.read_file(sample_path)
        audio = tfio.audio.decode_mp3(sample_binary)

        return tf.squeeze(audio, axis=-1)

    def __next__(self):
        while True:
            existing_batch = self._slice_batch()
            if existing_batch is not None:
                return existing_batch

            iter_result = self.active_samples_iter.__next__()
            if iter_result is None:
                raise StopIteration  # TODO: Pad left over samples with zeros

            sample_index, new_sample = iter_result
            sample_x_vector = self.x_vectors[new_sample["client_id"]]
            if sample_x_vector is None:
                continue

            # Load audio sample
            audio = self._load_mp3_sample(new_sample)

            if audio.shape[0] < 1000:  # Skip to short samples
                continue

            # Crop audio beggining and end to remove silence and stop/start button clicks
            raw_audio_tensor = audio[200:-250]
            raw_audio_spectrogram = self._get_spectrogram(raw_audio_tensor)
            tiled_x_vector = tf.tile(
                tf.expand_dims(sample_x_vector, 0), (raw_audio_spectrogram.shape[0], 1)
            )

            modified_samples = []
            for augmentation_name, possibility in self.augmenrations.items():
                if possibility > random():
                    new_sample = self._match_augmentation(raw_audio_tensor, augmentation_name)
                    if new_sample is None:
                        print(f"WARNING: Match for {augmentation_name} augmentation returned None")
                        continue
                    modified_samples.append(new_sample)

            for input_sample, output_sample in modified_samples:
                input_sample_patches = self._process_audio_tensor(input_sample)
                output_spectrogram = self._get_spectrogram(output_sample)

                self.active_batch["input"] = tf.concat(
                    [self.active_batch["input"], input_sample_patches], axis=0
                )
                self.active_batch["x-vector"] = tf.concat(
                    [self.active_batch["x-vector"], tiled_x_vector], axis=0
                )
                self.active_batch["output"] = tf.concat(
                    [self.active_batch["output"], output_spectrogram], axis=0
                )

In [30]:
train_test_start_index = int(raw_clips_meta.shape[0] * HYPER_PARAMS["learn-test-split"])
train_dataset = raw_clips_meta.iloc[:train_test_start_index]
test_dataset = raw_clips_meta.iloc[train_test_start_index:]

In [31]:
def samples_generator(loader):
    for batch_X_mel, batch_X_xvec, batch_y in loader:
        yield (batch_X_mel, batch_X_xvec), batch_y


train_loader = SamplesLoader(
    train_dataset, speakers_xvectors, HYPER_PARAMS
)
validation_loader = SamplesLoader(
    test_dataset, speakers_xvectors, HYPER_PARAMS
)

test_batch = next(iter(validation_loader))
output_signature = (
    tuple(
        [
            tf.TensorSpec.from_tensor(test_batch[0]),
            tf.TensorSpec.from_tensor(test_batch[1]),
        ]
    ),
    tf.TensorSpec.from_tensor(test_batch[2]),
)

train_data = tf.data.Dataset.from_generator(
    lambda: samples_generator(train_loader),
    output_signature=output_signature,
)

validation_data = tf.data.Dataset.from_generator(
    lambda: samples_generator(validation_loader),
    output_signature=output_signature,
)

In [32]:
import wandb
from wandb.keras import WandbCallback, WandbModelCheckpoint
from tensorflow.keras.callbacks import ModelCheckpoint

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msllowre[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [33]:
run = wandb.init(config=HYPER_PARAMS, project="speech-filter")
checkpoint_path = wandb.run.dir + "/model_checkpoint.hdf5"

checkpoint_callback = WandbModelCheckpoint(
    filepath=checkpoint_path,
    monitor="val_loss",
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode="auto",
    save_freq="epoch",
)

metrics_callback = WandbCallback(
    monitor="val_loss",
    verbose=1,
    mode="auto",
    save_model=True,
    save_graph=True,
    save_weights_only=False,
    log_weights=True,
    log_gradients=True,
    training_data=train_data,
    validation_data=validation_data,
    predictions=64,
    input_type="images",
    output_type="image",
    log_batch_frequency=HYPER_PARAMS["logs-batch-frequency"],
)

model.fit(
    train_data,
    epochs=HYPER_PARAMS["epochs"],
    validation_data=validation_data,
    callbacks=[
        metrics_callback,
        checkpoint_callback,
    ],
)

run.finish()



Epoch 1/10


2023-04-30 23:01:33.522059: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-04-30 23:01:33.665245: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-04-30 23:01:37.478479: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-04-30 23:01:41.096166: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f80a2a5dfd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-30 23:01:41.096222: I tensorf

   2750/Unknown - 479s 170ms/step - loss: 0.9989

KeyboardInterrupt: 