# Speaker Identification

In [None]:
!pip3 install tensorflow==2.8.0
!pip3 install tensorflow-io==0.25.0

In [29]:
import sys
import os
from pathlib import Path
from typing import Tuple

import librosa
import numpy as np
import keras

import tensorflow as tf
import tensorflow_io as tfio
print("TensorFlow version:", tf.__version__)
print("TensorFlow IO version:", tfio.__version__)

TensorFlow version: 2.8.0
TensorFlow IO version: 0.25.0


In [3]:
from google.colab import drive
drive.mount('/content/drive')
ROOT_DIR='/content/drive/MyDrive/College/Research/Linh_2023_Research'

DATASET_PATH=ROOT_DIR+'/test_data/vox'

Mounted at /content/drive


## Dataset preparation
[Keras Speaker Recognition](https://keras.io/examples/audio/speaker_recognition_using_cnn/)

In [36]:
SAMPLING_RATE = 16000
N_MFCC = 13
BATCH_SIZE = 128
SHUFFLE_SEED = 43
TRAIN_VALID_SPLIT = 0.2
EPOCHS = 100

def paths_and_labels_to_dataset(audio_paths, labels):
    """Constructs a dataset of audios and labels."""
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    audio_ds = path_ds.map(lambda x: path_to_audio(x))
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((audio_ds, label_ds))


def path_to_audio(path):
    """Reads and decodes an audio file."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE)
    return audio

def audio_to_fft(audio):
    # # Since tf.signal.fft applies FFT on the innermost dimension,
    # # we need to squeeze the dimensions and then expand them again
    # # after FFT
    # print(type(audio))
    # audio = tf.squeeze(audio, axis=-1)
    # fft = tf.signal.fft(
    #     tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    # )
    # fft = tf.expand_dims(fft, axis=-1)

    # # Return the absolute value of the first half of the FFT
    # # which represents the positive frequencies
    # return tf.math.abs(fft[:, : (audio.shape[1] // 2), :])

    # Convert the audio to MFCC
    stfts = tf.signal.stft(audio, frame_length=1024, frame_step=256, fft_length=1024)
    spectrograms = tf.abs(stfts)

    # Warp the linear scale spectrograms into the mel-scale
    num_spectrogram_bins = stfts.shape[-1]
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, N_MFCC
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, SAMPLING_RATE, lower_edge_hertz, upper_edge_hertz)
    mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))

    # Compute a stabilized log to get log-magnitude mel-scale spectrograms
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

    # Compute MFCCs from log_mel_spectrograms and take the first num_mfcc
    mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)[..., :N_MFCC]

    # Add an extra dimension to match the shape of the original code
    mfccs = tf.expand_dims(mfccs, axis=-1)

    return tf.math.abs(mfccs[:, : (audio.shape[1] // 2), :])

In [37]:
class_names = os.listdir(DATASET_PATH)
audio_paths = []
labels = []
for label, name in enumerate(class_names):
    if label > 4: break
    print("Processing speaker {}".format(name,))
    dir_path = Path(DATASET_PATH) / name
    speaker_sample_paths = [
        os.path.join(dir_path, filepath)
        for filepath in os.listdir(dir_path)
        if filepath.endswith(".wav")
    ]
    audio_paths += speaker_sample_paths
    labels += [label] * len(speaker_sample_paths)

print(
    "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names))
)

# Shuffle
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(audio_paths)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(labels)

# Split into training and validation
num_val_samples = int(TRAIN_VALID_SPLIT * len(audio_paths))
print("Using {} files for training.".format(len(audio_paths) - num_val_samples))
train_audio_paths = audio_paths[:-num_val_samples]
train_labels = labels[:-num_val_samples]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = audio_paths[-num_val_samples:]
valid_labels = labels[-num_val_samples:]

# Create 2 datasets, one for training and the other for validation
train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(
    BATCH_SIZE
)

valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

Processing speaker id10016
Processing speaker id10011
Processing speaker id10004
Processing speaker id10009
Processing speaker id10017
Found 691 files belonging to 395 classes.
Using 553 files for training.
Using 138 files for validation.




## Model

MFCC

FFT (focus on low freq) ---> CNN (max pool in one direction)

Is speaker unique in consonant or vowel?



In [None]:
def residual_block(x, filters, conv_num=3, activation="relu"):
    # Shortcut
    s = keras.layers.Conv1D(filters, 1, padding="same")(x)
    for i in range(conv_num - 1):
        x = keras.layers.Conv1D(filters, 3, padding="same")(x)
        x = keras.layers.Activation(activation)(x)
    x = keras.layers.Conv1D(filters, 3, padding="same")(x)
    x = keras.layers.Add()([x, s])
    x = keras.layers.Activation(activation)(x)
    return keras.layers.MaxPool1D(pool_size=2, strides=2)(x)


def build_model(input_shape, num_classes):
    inputs = keras.layers.Input(shape=input_shape, name="input")

    x = residual_block(inputs, 16, 2)
    x = residual_block(x, 32, 2)
    x = residual_block(x, 64, 3)
    x = residual_block(x, 128, 3)
    x = residual_block(x, 128, 3)

    x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dense(128, activation="relu")(x)

    outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)


# model = build_model((SAMPLING_RATE//2, 1), len(class_names))
model = build_model(N_MFCC, len(class_names))

model.summary()

# Compile the model using Adam's default learning rate
model.compile(
    optimizer="Adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# Add callbacks:
# 'EarlyStopping' to stop training when the model is not enhancing anymore
# 'ModelCheckPoint' to always keep the model that has the best val_accuracy
model_save_filename = "model.h5"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

## Training

`fit()` is for training the model with the given inputs (and corresponding training labels).

`evaluate()` is for evaluating the already trained model using the validation (or test) data and the corresponding labels. Returns the loss value and metrics values for the model.

`predict()` is for the actual prediction. It generates output predictions for the input samples.

In [39]:
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)

Epoch 1/100


InvalidArgumentError: ignored

## Evaluate

In [None]:
print(model.evaluate(valid_ds))

[1.0956804752349854, 0.6086956262588501]


In [None]:
print(model.predict(valid_ds))

[[4.32005882e-01 6.45957235e-03 4.20293927e-01 ... 1.85888614e-11
  1.54312587e-12 3.42603237e-11]
 [6.73325002e-01 7.79852197e-02 1.06511056e-01 ... 3.76609955e-06
  5.92110155e-07 3.57522913e-06]
 [7.45949507e-01 1.10497393e-01 7.59527981e-02 ... 6.52241965e-07
  1.70138080e-07 1.31194531e-06]
 ...
 [4.66970950e-01 1.10281631e-01 3.97728413e-01 ... 1.51410729e-17
  2.42138223e-18 1.37034261e-16]
 [3.45003270e-02 2.79879779e-01 5.97353326e-04 ... 1.14599116e-17
  1.26591938e-18 9.09140645e-18]
 [1.51060745e-01 2.56404877e-01 8.70290771e-03 ... 2.08005835e-09
  5.24520594e-10 1.46849777e-09]]
