# Train voice embedding

Perform sanity check using low amount of data

- Prepare dataset
- Train model

Full scale training

- What can fit memory?

In [1]:
import os
import tarfile
from IPython.display import Audio

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow_io as tfio
from matplotlib import pyplot as plt

audio_tarfile = 'data/en.tar'
audio_tarfile = 'data/pt.tar'
en_total = 1584330

sr = 48000

# tar = tarfile.open("filename.tar.gz", "r:gz")
audios_tar = tarfile.open(name=audio_tarfile, mode='r')

In [2]:
# for the English tarfile, we already know the max
tar_file_list = [x for x in tqdm(audios_tar, total=en_total)]

data_files = {
    'train.tsv': None,
    'dev.tsv': None,
    'test.tsv': None,
    # 'validated.tsv': None
}
n_files = len(data_files.keys())
cur_files = 0

for x in tar_file_list:
    for k in data_files:
        if x.name.endswith(k):
            with audios_tar.extractfile(x) as f:
                df = pd.read_csv(f, sep='\t')
                data_files[k] = df
            cur_files += 1
    if cur_files == n_files:
        break

  0%|          | 0/1584330 [00:00<?, ?it/s]

In [3]:
# learn to map files to id's
path_to_client_dict = dict(zip(data_files['train.tsv'].path, data_files['train.tsv'].client_id))

data_files['train.tsv'].head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,segment
0,d76e872cd8e581ed15344f019d4debe9986a03723be3fb...,common_voice_pt_20464413.mp3,ao treinar um modelo todos os fonemas culturai...,2,0,twenties,male,,pt,
1,d76e872cd8e581ed15344f019d4debe9986a03723be3fb...,common_voice_pt_20464414.mp3,"Eu conto vinte pedaços por dia, ok?",2,0,twenties,male,,pt,
2,d76e872cd8e581ed15344f019d4debe9986a03723be3fb...,common_voice_pt_20464415.mp3,"Infelizmente, o período de teste expirou.",2,1,twenties,male,,pt,
3,d76e872cd8e581ed15344f019d4debe9986a03723be3fb...,common_voice_pt_20464416.mp3,Nós fizemos o nosso melhor,2,0,twenties,male,,pt,
4,d76e872cd8e581ed15344f019d4debe9986a03723be3fb...,common_voice_pt_20464417.mp3,Marte é quase inabitável.,2,0,twenties,male,,pt,


In [4]:
audio_content = {}
for x in tqdm(tar_file_list):
    name_split = x.name.split('/')
    cur_id = path_to_client_dict.get(name_split[-1], False)
    if cur_id:
        audio_data = audios_tar.extractfile(x).read()
        cur_id_dict = audio_content.get(cur_id, [])
        cur_id_dict.append(audio_data)
        audio_content[cur_id] = cur_id_dict

  0%|          | 0/51723 [00:00<?, ?it/s]

## Sanity check

Check if audios from the same person sound like that

In [5]:
temp_list = [x for x in audio_content if len(audio_content[x]) > 2]
cur_idx = np.random.randint(len(temp_list))
audio_samples = audio_content[temp_list[cur_idx]]
len(audio_samples)

183

In [6]:
decoded_mp3 = tfio.audio.decode_mp3(audio_samples[0])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [7]:
decoded_mp3 = tfio.audio.decode_mp3(audio_samples[1])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [8]:
len(audio_samples[1])

26157

# Model training

In [9]:
class PersonIdAudio:
    def __init__(self, audio_content, sr=48000):
        """ Constructor

        Arguments:

        audio_content: dictionary containing
            person_id as keys and a
            list of mp3-encoded samples

        sr: sampling rate
        """
        person_ids = audio_content.keys()
        labels = range(len(person_ids))
        self.id_to_label = dict(zip(person_ids, labels))
        self.n_audios = sum([len(audio_content[x]) for x in audio_content])

    def gen_audios(self):
        """ Generate audios and id's
        Leave the shuffling part to tf dataset
        """
        for person_id in audio_content:
            person_label = self.id_to_label[person_id]
            for item in audio_content[person_id]:
                yield item, person_label

pia = PersonIdAudio(audio_content)

In [10]:
# at this point we could write a tfrecords
# file if we like using TFRecordWriter
# if everything fits in memory so we don't really need to

audio_signature=(
    tf.TensorSpec(shape=(None), dtype=tf.string),
    tf.TensorSpec(shape=(), dtype=tf.int32)
)

audio_dataset = tf.data.Dataset.from_generator(
     pia.gen_audios,
     output_signature=audio_signature
)

In [11]:
samples = [x for x in audio_dataset.take(10)]
decoded_mp3 = tfio.audio.decode_mp3(samples[0][0])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [12]:
decoded_mp3.shape

TensorShape([248832, 1])

## Model definition

In [13]:
from tensorflow.keras import Model
from tensorflow.keras import layers as L
# good example here
# https://www.tensorflow.org/addons/tutorials/losses_triplet
def normalized_mel_spectrogram(x, sr=48000):
    spec_stride = 128
    spec_len = 1024

    spectrogram = tfio.audio.spectrogram(
        x, nfft=spec_len, window=spec_len, stride=spec_stride
    )

    num_spectrogram_bins = spec_len // 2 + 1  # spectrogram.shape[-1]
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 10000.0, 85
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins, num_spectrogram_bins, sr, lower_edge_hertz,
      upper_edge_hertz)
    mel_spectrograms = tf.tensordot(
      spectrogram, linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
      linear_to_mel_weight_matrix.shape[-1:]))

    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    avg = tf.math.reduce_mean(log_mel_spectrograms)
    std = tf.math.reduce_std(log_mel_spectrograms)

    return (log_mel_spectrograms - avg) / std


def BaseSpeechEmbeddingModel(inputLength=None, rnn_func=L.LSTM, rnn_units=64):
    # input is the first channel of the decoded mp3, ie, 
    # tfio.audio.decode_mp3(data)[:, 0]

    inp = L.Input((inputLength,), name='input')
    
    mel_spec = L.Lambda(lambda z: normalized_mel_spectrogram(z), name='normalized_spectrogram')(inp)

    # normalize the spectrogram
    # mel_spec = L.BatchNormalization()(mel_spec)
    # mel_spec = L.LayerNormalization()(mel_spec)

    x = L.Bidirectional(
        rnn_func(rnn_units, return_sequences=True)
    )(mel_spec)  # [b_s, seq_len, vec_dim]
    x = L.Bidirectional(
        rnn_func(rnn_units, return_sequences=False)
    )(x)  # [b_s, seq_len, vec_dim]

    x = L.Dense(rnn_units, activation=None)(x)  # No activation on final dense layer
    # L2 normalize embeddings
    # note: L2 returns normalized, norm
    x = L.Lambda(lambda z: tf.math.l2_normalize(z, axis=1), name='output')(x)
    
    output = x

    model = Model(inputs=[inp], outputs=[output])
    return model

In [14]:
m = BaseSpeechEmbeddingModel()
m.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None)]            0         
_________________________________________________________________
normalized_spectrogram (Lamb (None, None, 85)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         76800     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
output (Lambda)              (None, 64)                0         
Total params: 183,872
Trainable params: 183,872
Non-trainable params: 0
_______________________________________________________

In [15]:
v = tf.expand_dims(decoded_mp3[:, 0], axis=0)
v = tf.concat([v, v], axis=0)
m.predict(v).shape

(2, 64)

TODO:

- Train only with the mel-spectrogram
- Make the tfrecords file

## Training

In [16]:
import tensorflow_addons as tfa

In [17]:
batch_size = 16

def mp3_decode_fn(audio_bytes, audio_class):
    # check if limiting output size helps
    # return tfio.audio.decode_mp3(audio_bytes)[:, 0], audio_class
    return tfio.audio.decode_mp3(audio_bytes)[0:200000, 0], audio_class

train_set = audio_dataset.cache(filename='data/audio_data.cache').repeat(
    ).map(  # Reduce memory usage
        mp3_decode_fn,
        num_parallel_calls=tf.data.AUTOTUNE
    ).padded_batch(  # Vectorize your mapped function
        batch_size,  # batch size
        drop_remainder=True
    ).shuffle(
        20 * batch_size,
        reshuffle_each_iteration=True
    ).prefetch(  # Overlap producer and consumer works
        tf.data.AUTOTUNE
    )

"""
train_set = audio_dataset.cache(
    ).repeat(
    ).map(  # Reduce memory usage
        mp3_decode_fn,
        num_parallel_calls=tf.data.AUTOTUNE
    ).padded_batch(  # Vectorize your mapped function
        batch_size,  # batch size
        drop_remainder=True
    ).shuffle(
        10 * batch_size,
        reshuffle_each_iteration=True
    ).prefetch(  # Overlap producer and consumer works
        tf.data.AUTOTUNE
    )
    
    # .interleave(  # Parallelize data reading
    #     dataset_generator_fun,
    #     num_parallel_calls=tf.data.AUTOTUNE
    # )
    # .map(  # Parallelize map transformation
    #     time_consuming_map,
    #     num_parallel_calls=tf.data.AUTOTUNE
    # )
    .cache()  # Cache data
    .map(  # Reduce memory usage
        mp3_decode_fn,
        num_parallel_calls=tf.data.AUTOTUNE
    )
    .padded_batch(  # Vectorize your mapped function
        256,  # batch size
        drop_remainder=True
    )
    .shuffle(
        5000,
        reshuffle_each_iteration=True
    )
    .prefetch(  # Overlap producer and consumer works
        tf.data.AUTOTUNE
    )
"""

'\ntrain_set = audio_dataset.cache(\n    ).repeat(\n    ).map(  # Reduce memory usage\n        mp3_decode_fn,\n        num_parallel_calls=tf.data.AUTOTUNE\n    ).padded_batch(  # Vectorize your mapped function\n        batch_size,  # batch size\n        drop_remainder=True\n    ).shuffle(\n        10 * batch_size,\n        reshuffle_each_iteration=True\n    ).prefetch(  # Overlap producer and consumer works\n        tf.data.AUTOTUNE\n    )\n    \n    # .interleave(  # Parallelize data reading\n    #     dataset_generator_fun,\n    #     num_parallel_calls=tf.data.AUTOTUNE\n    # )\n    # .map(  # Parallelize map transformation\n    #     time_consuming_map,\n    #     num_parallel_calls=tf.data.AUTOTUNE\n    # )\n    .cache()  # Cache data\n    .map(  # Reduce memory usage\n        mp3_decode_fn,\n        num_parallel_calls=tf.data.AUTOTUNE\n    )\n    .padded_batch(  # Vectorize your mapped function\n        256,  # batch size\n        drop_remainder=True\n    )\n    .shuffle(\n      

In [None]:
sample_train_data = [x for x in train_set.take(1)]

In [None]:
# elements, contents/labels
len(sample_train_data), sample_train_data[0][0].shape, sample_train_data[0][1].shape

In [None]:
cur_sample = sample_train_data[0][0][5].numpy()
Audio(cur_sample, rate=sr)

In [None]:
m.predict(sample_train_data[0][0][0:8]).shape

In [None]:
m.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tfa.losses.TripletSemiHardLoss()
)

In [None]:
history = m.fit(
    train_set,
    steps_per_epoch = pia.n_audios // batch_size,
    epochs=100)

### Debug code

In [None]:
plt.plot(decoded_mp3)

In [None]:
mel_spec = m.predict(tf.expand_dims(decoded_mp3[:, 0], axis=0))
mel_spec = tf.transpose(mel_spec[0])
plt.pcolormesh(mel_spec.numpy())

In [None]:
tf.math.reduce_mean(mel_spec), tf.math.reduce_max(mel_spec), tf.math.reduce_min(mel_spec)

In [None]:
plt.hist(mel_spec.numpy().flatten())

In [None]:
# figure out the mp3 folder inside tar

for x in audios_tar:
    if x.name.endswith('.mp3'):
        mp3_folder = x.name.split('/')
        mp3_folder = '/'.join(mp3_folder[:-1])
        break
print(f'Detected mp3 folder: {mp3_folder}')

In [None]:
# build a dictionary with key -> person id, value -> list of audios from that person

audio_per_person = {}
audio_list = data_files['train.tsv'][['client_id', 'path']].values[0:10]

for (person_id, audio_file) in tqdm(audio_list):
    audio_data = audios_tar.extractfile(mp3_folder + '/' + audio_file).read()
    person_audio_list = audio_per_person.get(person_id, [])
    person_audio_list.append(audio_data)
    audio_per_person[person_id] = person_audio_list

In [None]:
# check a few audios

In [None]:
data_files['train.tsv'].head()