# Reader for the Mozilla Common Voice dataset

https://commonvoice.mozilla.org/en/datasets

In [None]:
import os
import datetime
import IPython.display as ipd

import tensorflow as tf

%load_ext autoreload
%autoreload 2

from data_readers.mozilla_speech_reader import AudioTarReader  # noqa
from models.alignment_model import PraticantoForcedAligner  # noqa
from models import alignment_losses  # noqa

using_colab = False
if using_colab:
    %pip install pandas -q
    %pip install tqdm -q
    %pip install ipywidgets -q
    # watch this for the correct version 0.21.0 for tf 2.6, 0.24.0 for tf 2.8
    %pip install tensorflow-io==0.24.0 -q

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [None]:
# Path to the downloaded Common Voice archive (the `pt` release, per the file
# name); the reader is constructed directly on the compressed tarball.
file = 'data/cv-corpus-8.0-2022-01-19-pt.tar.gz'
atr = AudioTarReader(file)

In [None]:
# Make sure the output directory exists before any TFRecords file is written.
os.makedirs('data', exist_ok=True)

# Each entry pairs an output path with the extra keyword arguments handed to
# `write_tfrecords_file`. A file that is already on disk is left untouched, so
# re-running the cell does not redo the export.
for data_file, write_kwargs in (
    ('data/validated_not_traintest.tfrecords', {}),
    ('data/train.tfrecords', {'split': 'train'}),
):
    if os.path.isfile(data_file):
        continue
    atr.write_tfrecords_file(data_file, **write_kwargs)

In [None]:
# Read the serialized records, then decode each one with the reader's
# `deserialize` function.
raw_records = tf.data.TFRecordDataset('data/validated_not_traintest.tfrecords')
dataset = raw_records.map(AudioTarReader.deserialize)

In [None]:
# Pull one decoded example out of the dataset: skip three, keep the next one.
picked = dataset.skip(3).take(1)
sample = list(picked)[0]

In [None]:
# sample[1] is decoded from UTF-8 bytes for display; everything from index 2
# onwards is printed next to it unchanged.
sentence_text = sample[1].numpy().decode('UTF-8')
remaining_fields = sample[2:]
print(sentence_text, remaining_fields)

# Play column 0 of the audio tensor at 48 kHz. This stays the last expression
# of the cell so the notebook renders the audio widget.
waveform = sample[0][:, 0].numpy()
ipd.Audio(waveform, rate=48000)

## Check marked strings

Characters with diacritical marks, such as á, ó, etc.

In [None]:
# Wrap the sentence's characters in the [BOS]/[EOS] markers, run them through
# `atr.lookup`, and feed the result back through `atr.lookup_inv`.
v = tf.concat([['[BOS]'], tf.strings.unicode_split(sample[1], 'UTF-8'), ['[EOS]']], axis=0)

# The lookup is computed once and each result printed in its own statement.
# Joining the two print calls with a comma built a `(None, None)` tuple that
# the notebook echoed as the cell's output.
token_ids = atr.lookup(v)
print(token_ids)
print(atr.lookup_inv(token_ids))

In [None]:
# Split the sentence into individual UTF-8 characters and look them up.
# The split result is reused rather than recomputed for the lookup call.
x = tf.strings.unicode_split(sample[1], 'UTF-8')
atr.lookup(x)

In [None]:
# Look up two accented characters individually; the resulting pair is the
# cell's displayed value.
tuple(atr.lookup(accented) for accented in ('á', 'í'))

# Prep for training

In [None]:
# Build the aligner on the reader's token vocabulary. The 48000 sampling rate
# is the same value used for audio playback earlier in this notebook.
pfa = PraticantoForcedAligner(vocab=atr.tokens, sampling_rate=48000, use_cnn=False)
# `build_models` returns the model that is compiled and trained below.
alignment_model = pfa.build_models()
alignment_model.summary()

In [None]:
def prep_inputs(cur_audio, sentence, age, gender):
    """Turn one deserialized record into per-example training tensors.

    Args:
        cur_audio: 2-D audio tensor; only column 0 is used (presumably the
            first channel, to be confirmed against the reader).
        sentence: scalar string tensor holding the transcript.
        age: unused; accepted so the function matches the record layout.
        gender: unused; accepted so the function matches the record layout.

    Returns:
        A tuple `(chars, waveform, shapes)` where `chars` is the transcript
        split into UTF-8 characters and wrapped in '[BOS]' / '[EOS]',
        `waveform` is column 0 of `cur_audio`, and `shapes` concatenates the
        character count with
        `1 + (n_samples - pfa.frame_length) // pfa.frame_step`.
    """
    # Pin the transcript to a scalar so the split yields a 1-D character list.
    scalar_sentence = tf.ensure_shape(sentence, ())
    chars = tf.strings.unicode_split(scalar_sentence, 'UTF-8')
    chars = tf.concat([['[BOS]'], chars, ['[EOS]']], axis=0)

    waveform = cur_audio[:, 0]
    n_frames = 1 + (tf.shape(waveform) - pfa.frame_length) // pfa.frame_step
    shapes = tf.concat([tf.shape(chars), n_frames], axis=0)
    return chars, waveform, shapes


def prep_batch_inputs(cur_txt, cur_audio, seq_lengths):
    """Arrange a padded batch as an `(inputs, targets)` pair.

    Args:
        cur_txt: batched character sequences.
        cur_audio: batched waveforms.
        seq_lengths: batched sequence-length information, passed through
            unchanged as the second element of the result.

    Returns:
        A tuple of a dict with keys 'char_seq' and 'waveform', followed by
        `seq_lengths`.
    """
    model_inputs = dict(char_seq=cur_txt, waveform=cur_audio)
    return model_inputs, seq_lengths

In [None]:
# Count the records across both TFRecords files with one full pass; the total
# is used later to derive the number of steps per epoch.
tfrecord_paths = [
    'data/validated_not_traintest.tfrecords',
    'data/train.tfrecords',
]
dataset = tf.data.TFRecordDataset(tfrecord_paths, num_parallel_reads=2)
n_audio_samples = sum(1 for _record in dataset)
print(f'Training on {n_audio_samples} samples')

In [None]:
batch_size = 32

training_files = [
    'data/validated_not_traintest.tfrecords',
    'data/train.tfrecords',
]
dataset = tf.data.TFRecordDataset(training_files, num_parallel_reads=2)

# Shuffle the still-serialized records and repeat without end; the number of
# steps per epoch is fixed when `fit` is called.
shuffle_buffer = 5 * batch_size + 16
dataset = dataset.shuffle(shuffle_buffer)
dataset = dataset.repeat()

# Decode each record, then derive the per-example tensors.
dataset = dataset.map(
    AudioTarReader.deserialize, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.map(prep_inputs, num_parallel_calls=tf.data.AUTOTUNE)

# Pad every component to the longest example in the batch: '[PAD]' for the
# characters, 0.0 for the waveform and 0 for the length vector.
dataset = dataset.padded_batch(
    batch_size, padding_values=('[PAD]', 0.0, 0), drop_remainder=True)

# Repackage each batch as (inputs dict, targets) and keep batches prefetched.
dataset = dataset.map(prep_batch_inputs, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# One loss object per entry in `alignment_losses.possible_losses`.
model_losses = [
    alignment_losses.alignment_loss(candidate)
    for candidate in alignment_losses.possible_losses
]
print(f'Losses: {model_losses}')

adam = tf.keras.optimizers.Adam(
    learning_rate=1e-3,
    clipnorm=0.1,
    beta_1=0.8,
    beta_2=0.99,
    epsilon=0.1,
)

# The first loss drives training; the remaining ones are only tracked as
# metrics.
training_loss = model_losses[0]
tracked_metrics = model_losses[1:]
alignment_model.compile(
    optimizer=adam,
    loss=training_loss,
    metrics=tracked_metrics,
)

In [None]:
# Checkpoints are written under `checkpoints/`; the file name template is
# filled in with the epoch number and the training loss.
os.makedirs('checkpoints', exist_ok=True)
filepath = 'checkpoints/m_{epoch}_{loss:.3f}.chkpt'

# Save weights only, once per epoch, and only when the monitored training loss
# improves.
checkpoint_options = dict(
    monitor='loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
    save_freq='epoch',
)
chkpt_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath, **checkpoint_options)


def scheduler(epoch, lr):
    """Return the learning rate for the given epoch from a fixed schedule.

    Args:
        epoch: epoch index supplied by the caller.
        lr: current learning rate; accepted but not used.

    Returns:
        1e-3 while `epoch <= 50`, 1e-4 while `epoch <= 150`, and 2e-5
        afterwards.
    """
    # (last epoch of the stage, learning rate for that stage), in order.
    stages = ((50, 1e-3), (150, 1e-4))
    for last_epoch, rate in stages:
        if epoch <= last_epoch:
            return rate
    return 2e-5


# Applies the fixed per-epoch schedule defined by `scheduler`.
lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

# Cuts the learning rate to a fifth of its value after ten epochs without an
# improvement in training loss, down to a floor of 1e-7.
plateau_options = dict(
    monitor='loss',
    factor=0.2,
    patience=10,
    verbose=1,
    mode='auto',
    min_delta=0.0001,
    cooldown=0,
    min_lr=1e-7,
)
reduce_callback = tf.keras.callbacks.ReduceLROnPlateau(**plateau_options)

# Each run logs into its own timestamped TensorBoard directory.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

In [None]:
# The dataset repeats without end, so the epoch length is set explicitly to
# roughly one pass over the counted records.
steps_per_epoch = n_audio_samples // batch_size

# NOTE(review): `reduce_callback` is defined in this notebook but not passed
# here; confirm that is intentional.
training_callbacks = [lr_callback, chkpt_callback, tensorboard_callback]

alignment_model.fit(
    dataset,
    epochs=300,
    steps_per_epoch=steps_per_epoch,
    callbacks=training_callbacks,
)