# Prepare tfrecords files

Perform a sanity check using a small amount of data

- Prepare dataset
- Train model

Full scale training

- What can fit in memory?

In [None]:
import os
os.makedirs('data', exist_ok=True)
# The Common Voice archive has to be present in data/ before the rest of the
# notebook runs. The download link that used to be pasted here was a pre-signed
# S3 URL: it embedded temporary AWS credentials (X-Amz-Credential /
# X-Amz-Security-Token) and was only valid for 43200 s, so it must not be
# stored in the notebook. Keep the URL in a variable that is not committed
# (e.g. `download_url`) and interpolate it instead.
# NOTE(review): this guard refers to the "-it" archive while a later cell loads
# the "-pt" one — confirm which language is intended.
# if not os.path.isfile('data/cv-corpus-7.0-2021-07-21-it.tar.gz'):
#     !wget -O "data/cv-corpus-7.0-2021-07-21-it.tar.gz" "{download_url}"

In [None]:
!pip install tensorflow-io -q
!pip install tensorflow-addons -q

In [None]:
import itertools
import tarfile

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_io as tfio
from IPython.display import Audio
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

%load_ext autoreload
%autoreload 2
from create_audio_tfrecords import AudioTarReader, PersonIdAudio

# Archive to read. Point this at another archive (e.g. 'data/en.tar') to switch
# datasets; keep a single assignment so the value in effect is unambiguous.
audio_tarfile = 'data/cv-corpus-7.0-2021-07-21-pt.tar.gz'
# NOTE(review): en_total is not used anywhere in this notebook — presumably the
# clip count of the English archive; confirm or remove.
en_total = 1584330
# Sample rate in Hz handed to the audio players and to PersonIdAudio below.
sr = 48000

atr = AudioTarReader(audio_tarfile)

In [None]:
# Preview the first rows of the 'train.tsv' table exposed by the reader.
atr.data_files['train.tsv'].head()

In [None]:
# Load the audio grouped per speaker. Later cells index the result by key and
# take len() of each value (presumably client_id -> list of encoded mp3
# payloads; confirm in AudioTarReader).
audio_content = atr.retrieve_per_user_data()

## Sanity check

Check that audio clips attributed to the same person actually sound like the same speaker.

In [None]:
# Keep only the keys that have more than two clips, then pick one of them at
# random and show how many clips it has.
temp_list = [
    speaker_id
    for speaker_id in audio_content
    if len(audio_content[speaker_id]) > 2
]
cur_idx = np.random.randint(len(temp_list))
audio_samples = audio_content[temp_list[cur_idx]]
len(audio_samples)

In [None]:
# Decode the first clip of the selected speaker and play its first channel.
decoded_mp3 = tfio.audio.decode_mp3(audio_samples[0])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [None]:
# Decode the second clip of the same speaker and play its first channel.
decoded_mp3 = tfio.audio.decode_mp3(audio_samples[1])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [None]:
# Size of the second clip before decoding (len() of the object that was passed
# to decode_mp3 above).
len(audio_samples[1])

# Model training

## Retrieve the tf.data.Dataset

`PersonIdAudio` contains the code to build a `tf.data.Dataset` from a given `audio_content`.

In [None]:
# Wrap the per-speaker audio in PersonIdAudio and get the dataset used in the
# rest of the notebook; its elements are consumed below as
# (encoded audio, label) pairs.
pia = PersonIdAudio(audio_content, sr)
audio_dataset = pia.get_tf_dataset()

In [None]:
# Materialize the first 10 dataset elements, then decode and play the audio of
# the first one.
samples = list(audio_dataset.take(10))
decoded_mp3 = tfio.audio.decode_mp3(samples[0][0])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [None]:
# Shape of the decoded clip; the notebook always takes [:, 0], i.e. the first
# entry along the second axis.
decoded_mp3.shape

## Write tfrecords file

In [None]:
# Write the dataset to disk; the returned value is what the next cell passes to
# TFRecordDataset as the file to read.
tfrecords_file = pia.save_tfrecords_file('pt-train')

In [None]:
# Check that the tfrecords file can be read back.
# Note the GZIP compression and the deserialization function applied with
# map() to every serialized record.
tfrecords_audio_dataset = tf.data.TFRecordDataset(
    tfrecords_file, compression_type='GZIP'
).map(PersonIdAudio.deserialize_from_tfrecords)

In [None]:
# Materialize the first 4 records read back from disk, then decode and play
# the audio of the second one.
samples = list(tfrecords_audio_dataset.take(4))
decoded_mp3 = tfio.audio.decode_mp3(samples[1][0])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

## Model definition

In [None]:
from tensorflow.keras import Model
from tensorflow.keras import layers as L
# good example here
# https://www.tensorflow.org/addons/tutorials/losses_triplet

# Number of mel bands; also the size of the last axis of the model input.
n_mel_bins = 80


def normalized_mel_spectrogram(x, sr=48000):
    """Convert a waveform into a standardized log-mel spectrogram.

    Args:
        x: waveform tensor accepted by ``tfio.audio.spectrogram`` (the notebook
            passes the first channel of a decoded mp3).
        sr: sample rate in Hz used to build the mel filter bank.

    Returns:
        Log-mel spectrogram with ``n_mel_bins`` values along the last axis,
        shifted and scaled by the mean and standard deviation computed over
        the whole spectrogram. When the standard deviation is zero (e.g. an
        all-zero clip) the result is all zeros instead of NaN/Inf.
    """
    spec_stride = 256
    spec_len = 1024

    spectrogram = tfio.audio.spectrogram(
        x, nfft=spec_len, window=spec_len, stride=spec_stride
    )

    num_spectrogram_bins = spec_len // 2 + 1  # spectrogram.shape[-1]
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 10000.0, n_mel_bins
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins, num_spectrogram_bins, sr, lower_edge_hertz,
      upper_edge_hertz)
    mel_spectrograms = tf.tensordot(
      spectrogram, linear_to_mel_weight_matrix, 1)
    # tensordot loses the static shape; restore it from the two operands.
    mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
      linear_to_mel_weight_matrix.shape[-1:]))

    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    avg = tf.math.reduce_mean(log_mel_spectrograms)
    std = tf.math.reduce_std(log_mel_spectrograms)

    # A constant spectrogram has std == 0; divide_no_nan yields 0 there instead
    # of NaN/Inf and is identical to a plain division everywhere else.
    return tf.math.divide_no_nan(log_mel_spectrograms - avg, std)


def BaseSpeechEmbeddingModel(inputLength=None, rnn_func=L.LSTM, rnn_units=64):
    """Build the speaker-embedding network.

    The model consumes normalized mel spectrograms (not raw audio) and returns
    one L2-normalized vector of size ``rnn_units`` per input sequence.

    Args:
        inputLength: number of spectrogram frames, or None for variable length.
        rnn_func: recurrent layer class wrapped in each Bidirectional layer.
        rnn_units: units of each recurrent layer and of the final Dense layer.

    Returns:
        A Keras ``Model`` whose input layer is named 'input' and whose last
        layer is named 'output'.
    """
    # NOTE(review): no Masking layer is used, so frames added by padded_batch
    # are processed like real frames — confirm this is intended.
    features = L.Input((inputLength, n_mel_bins), name='input')

    # First recurrent block keeps one output per frame: [b_s, seq_len, vec_dim]
    per_frame = L.Bidirectional(
        rnn_func(rnn_units, return_sequences=True)
    )(features)

    # Second recurrent block returns a single vector per sequence: [b_s, vec_dim]
    per_sequence = L.Bidirectional(
        rnn_func(rnn_units, return_sequences=False)
    )(per_frame)

    # Linear projection (no activation) down to the embedding size.
    projected = L.Dense(rnn_units, activation=None)(per_sequence)

    # Scale every embedding to unit L2 norm along the feature axis.
    embedding = L.Lambda(
        lambda z: tf.math.l2_normalize(z, axis=1), name='output'
    )(projected)

    return Model(inputs=[features], outputs=[embedding])

In [None]:
# Instantiate the embedding model with its default arguments and list its layers.
m = BaseSpeechEmbeddingModel()
m.summary()

In [None]:
# Smoke-test the untrained model: build a batch of two identical spectrograms
# from the last decoded clip and look at the shape of the predicted embeddings.
mel_spec = normalized_mel_spectrogram(decoded_mp3[:, 0])
v = tf.expand_dims(mel_spec, axis=0)
v = tf.concat([v, v], axis=0)
pred = m.predict(v)
pred.shape

In [None]:
# Show the spectrogram with time on the x axis and mel bins on the y axis.
fig, ax = plt.subplots(figsize=(15, 5))
ax.pcolormesh(tf.transpose(mel_spec).numpy())
ax.set(title='Normalized log-mel spectrogram', xlabel='frame', ylabel='mel bin');

TODO:

- Train only with the mel-spectrogram
- Make the tfrecords file

## Training

In [None]:
import tensorflow_addons as tfa

In [None]:
batch_size = 48

# When True the dataset yields normalized mel spectrograms; when False it
# yields the decoded waveform itself.
return_mel_spec = True


def mp3_decode_fn(audio_bytes, audio_class):
    """Decode one (encoded mp3, label) element, keeping only the first channel.

    Depending on the module-level ``return_mel_spec`` flag the audio is
    returned either as a waveform or as a normalized mel spectrogram. The
    label is passed through untouched.
    """
    # To cap the clip length, slice [0:48000 * 4, 0] here instead of [:, 0].
    waveform = tfio.audio.decode_mp3(audio_bytes)[:, 0]
    if not return_mel_spec:
        return waveform, audio_class
    return normalized_mel_spectrogram(waveform), audio_class

# Input pipeline, in order:
#   map          decode every mp3 (and compute its spectrogram) in parallel
#   cache        keep the decoded elements in memory after the first pass
#   repeat       loop forever; steps_per_epoch in fit() delimits the epochs
#   shuffle      mix elements within a buffer of 10 batches
#   padded_batch pad the variable-length clips of a batch to a common length
#   prefetch     overlap producing batches with consuming them
train_set = (
    audio_dataset
    .map(mp3_decode_fn, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .repeat()
    .shuffle(10 * batch_size, reshuffle_each_iteration=True)
    .padded_batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# An alternative ordering (not used): cache the still-encoded elements on disk
# with audio_dataset.cache(filename='data/audio_data.cache'), then repeat,
# decode with mp3_decode_fn, padded_batch and finally shuffle whole batches.
# It is kept as a comment rather than as a bare string so that this cell has
# no output.

In [None]:
# sample_train_data = [x for x in train_set.take(1)]

In [None]:
# elements, contents/labels
# len(sample_train_data), sample_train_data[0][0].shape, sample_train_data[0][1].shape

In [None]:
# cur_sample = sample_train_data[0][0][5].numpy()
# Audio(cur_sample, rate=sr)

In [None]:
# m.predict(sample_train_data[0][0][0:8]).shape

In [None]:
# Train with Adam (learning rate 0.001) and the semi-hard triplet loss from
# TensorFlow Addons.
m.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=tfa.losses.TripletSemiHardLoss()
)

In [None]:
# train_set repeats indefinitely, so steps_per_epoch is what defines an epoch.
# NOTE(review): pia.n_audios is presumably the total number of clips; if it is
# smaller than batch_size the integer division gives 0 steps — confirm.
history = m.fit(
    train_set,
    steps_per_epoch = pia.n_audios // batch_size,
    epochs=100)

In [None]:
# Save the model under a name derived from the archive path with '/' and '.'
# stripped. No file extension is given here (presumably the SavedModel
# directory format — confirm for the installed TensorFlow version).
m.save(f"model_{audio_tarfile.replace('/', '').replace('.', '')}")

In [None]:
# Save a second copy under the same derived name with an '.h5' extension.
m.save(f"model_{audio_tarfile.replace('/', '').replace('.', '')}.h5")

### Check what the similarities look like

In [None]:
# Collect the validation ('dev.tsv') clips per speaker straight from the
# archive. Everything needed is taken from names defined earlier in this
# notebook (atr, audio_tarfile), so the cell works on a fresh kernel.
dev_table = atr.data_files['dev.tsv']
val_path_to_client_dict = dict(zip(dev_table.path, dev_table.client_id))
val_audio_content = {}
with tarfile.open(audio_tarfile) as audios_tar:
    for x in tqdm(audios_tar):
        # Entries are matched on the bare file name, ignoring archive folders.
        name_split = x.name.split('/')
        cur_id = val_path_to_client_dict.get(name_split[-1], False)
        if cur_id:
            audio_data = audios_tar.extractfile(x).read()
            val_audio_content.setdefault(cur_id, []).append(audio_data)

In [None]:
def get_embedding(data, model):
    """Return one embedding per encoded mp3 clip in ``data``.

    Each clip is decoded, reduced to its first channel, converted to a
    normalized mel spectrogram and sent through ``model.predict`` as a batch
    of one. The result is a list with one prediction per clip, in input order.
    """
    def _embed_one(mp3_bytes):
        waveform = tfio.audio.decode_mp3(mp3_bytes)[:, 0]
        features = normalized_mel_spectrogram(waveform)
        single_batch = tf.expand_dims(features, axis=0)
        return model.predict(single_batch)[0]

    return [_embed_one(clip) for clip in tqdm(data)]

In [None]:
# Keys of the validation set that have more than one clip, followed by the
# number of clips of each of them.
audio_content_with_repeats = [x for x in val_audio_content if len(val_audio_content[x]) > 1]
print([len(val_audio_content[x]) for x in audio_content_with_repeats])

In [None]:
# Number of clips of the first key that has more than one clip.
len(val_audio_content[audio_content_with_repeats[0]])

In [None]:
# Embed all clips of two different validation keys with the trained model.
# NOTE(review): positions 4 and 18 are hard-coded; this raises IndexError when
# fewer than 19 keys have repeated clips — confirm for the archive in use.
all_keys = audio_content_with_repeats
samples1 = val_audio_content[all_keys[4]]
samples2 = val_audio_content[all_keys[18]]
preds1 = get_embedding(samples1, m)
preds2 = get_embedding(samples2, m)

In [None]:
import numpy as np
def get_dists(list1, list2):
    """Return the Euclidean distance of every (x, y) pair, x from ``list1`` and y from ``list2``.

    Distances are listed with ``list1`` as the outer loop, so the result has
    ``len(list1) * len(list2)`` entries.
    """
    return [
        np.linalg.norm(x - y)
        for x in tqdm(list1)
        for y in list2
    ]

# Within-speaker distances use every unordered pair of distinct clips once.
# Comparing a list with itself through get_dists would also count the zero
# distance of each clip to itself (and every pair twice), which pulls the
# within-speaker mean down and makes the embeddings look better than they are.
local_dists1 = [np.linalg.norm(a - b) for a, b in itertools.combinations(preds1, 2)]
local_dists2 = [np.linalg.norm(a - b) for a, b in itertools.combinations(preds2, 2)]
cross_dists = get_dists(preds1, preds2)

# Mean distance within speaker 1, within speaker 2, and across the two.
np.mean(local_dists1), np.mean(local_dists2), np.mean(cross_dists)

### Debug code

In [None]:
# Always fails — presumably a deliberate stop so that "Run All" halts here and
# the debug cells below are only run by hand.
assert False

In [None]:
# Plot the most recently decoded clip.
plt.plot(decoded_mp3)

In [None]:
# The model takes mel spectrograms as input, so the spectrogram is computed
# directly from the waveform instead of calling the model on raw audio.
mel_spec = normalized_mel_spectrogram(decoded_mp3[:, 0])
mel_spec = tf.transpose(mel_spec)
plt.pcolormesh(mel_spec.numpy())

In [None]:
# Mean, maximum and minimum of the spectrogram values.
tf.math.reduce_mean(mel_spec), tf.math.reduce_max(mel_spec), tf.math.reduce_min(mel_spec)

In [None]:
# Histogram of all spectrogram values.
plt.hist(mel_spec.numpy().flatten())

In [None]:
# figure out the mp3 folder inside tar
# NOTE(review): `audios_tar` is not defined in any cell of this notebook, so
# this cell only works with leftover kernel state. `mp3_folder` also stays
# undefined when no entry ends with '.mp3'.

for x in audios_tar:
    if x.name.endswith('.mp3'):
        # Drop the file name and keep the folder part of the first mp3 entry.
        mp3_folder = x.name.split('/')
        mp3_folder = '/'.join(mp3_folder[:-1])
        break
print(f'Detected mp3 folder: {mp3_folder}')

In [None]:
# build a dictionary with key -> person id, value -> list of audios from that person
# NOTE(review): `data_files` and `audios_tar` are not defined in any cell of
# this notebook; the cell only works with leftover kernel state.

audio_per_person = {}
# Only the first 10 rows of train.tsv are used here.
audio_list = data_files['train.tsv'][['client_id', 'path']].values[0:10]

for (person_id, audio_file) in tqdm(audio_list):
    audio_data = audios_tar.extractfile(mp3_folder + '/' + audio_file).read()
    person_audio_list = audio_per_person.get(person_id, [])
    person_audio_list.append(audio_data)
    audio_per_person[person_id] = person_audio_list

In [None]:
# check a few audios

In [None]:
# Preview the first rows of the 'train.tsv' table.
# NOTE(review): `data_files` is not defined in this notebook; the earlier cells
# read the same table through `atr.data_files`.
data_files['train.tsv'].head()