# Reader for the Mozilla Common Voice Dataset

https://commonvoice.mozilla.org/en/datasets

In [None]:
import os
import IPython.display as ipd

import tensorflow as tf

from matplotlib import pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

from data_readers.mozilla_speech_reader import AudioTarReader  # noqa
from models.alignment_model import PraticantoForcedAligner  # noqa
import models  # noqa

# Flip to True to install the extra packages below from inside the notebook.
using_colab = False
if using_colab:
    # NOTE(review): these installs run after the imports earlier in this cell;
    # confirm that ordering works on a fresh runtime.
    %pip install pandas -q
    %pip install tqdm -q
    %pip install ipywidgets -q
    # tensorflow-io must match the installed TensorFlow version:
    # watch this for the correct version 0.21.0 for tf 2.6, 0.24.0 for tf 2.8
    %pip install tensorflow-io==0.24.0 -q

# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [None]:
# Path of the corpus archive handed to AudioTarReader; it is hard-coded and
# relative to the notebook's working directory.
file = 'data/cv-corpus-8.0-2022-01-19-pt.tar.gz'
# `atr` is reused by later cells (lookup tables, tokens, TFRecord writing).
atr = AudioTarReader(file)

In [None]:
os.makedirs('data', exist_ok=True)

# Each entry: (output path, extra keyword arguments for write_tfrecords_file).
tfrecord_jobs = (
    ('data/validated_not_traintest.tfrecords', {}),
    ('data/train.tfrecords', {'split': 'train'}),
)
for data_file, write_kwargs in tfrecord_jobs:
    # Files that already exist are left untouched, so re-running the cell
    # does not rewrite them.
    if os.path.isfile(data_file):
        continue
    atr.write_tfrecords_file(data_file, **write_kwargs)

In [None]:
# Read the serialized records back and decode each one with the reader's
# own deserializer.
validated_records = tf.data.TFRecordDataset('data/validated_not_traintest.tfrecords')
dataset = validated_records.map(AudioTarReader.deserialize)

In [None]:
# Pull out the fourth record (skip 3, take 1) as an example to inspect.
sample = list(dataset.skip(3).take(1))[0]

In [None]:
# Element 1 holds the sentence bytes; the remaining elements are printed as-is.
sentence_text = sample[1].numpy().decode('UTF-8')
print(sentence_text, sample[2:])

# Play column 0 of the audio tensor; the player is the cell's displayed output.
first_column = sample[0][:, 0].numpy()
ipd.Audio(first_column, rate=48000)

## Check marked strings

Check that characters with diacritics (e.g. á, ó) are handled by the vocabulary lookup.

In [None]:
# Wrap the sentence's characters in the [BOS]/[EOS] markers, then run them
# through the lookup and its inverse to compare the two printed results.
v = tf.concat([['[BOS]'], tf.strings.unicode_split(sample[1], 'UTF-8'), ['[EOS]']], axis=0)

# Look up once and reuse the result. Printing on separate statements avoids
# building a (None, None) tuple that the notebook would display as output.
looked_up = atr.lookup(v)
print(looked_up)
print(atr.lookup_inv(looked_up))

In [None]:
# Split the raw sentence into UTF-8 characters (no [BOS]/[EOS] markers) and
# display their lookup; the split is computed once and reused.
x = tf.strings.unicode_split(sample[1], 'UTF-8')
atr.lookup(x)

In [None]:
# Spot-check two accented characters; both lookups are displayed as a tuple.
tuple(atr.lookup(ch) for ch in ('á', 'í'))

# Prep for training

In [None]:
# Aligner settings: vocabulary from the reader, fixed sampling rate, CNN disabled.
aligner_kwargs = dict(vocab=atr.tokens, sampling_rate=48000, use_cnn=False)
pfa = PraticantoForcedAligner(**aligner_kwargs)

# `pfa` is kept around because prep_inputs reads its frame_length / frame_step.
alignment_model = pfa.build_models()
alignment_model.summary()

In [None]:
def prep_batch_inputs(cur_txt, cur_audio, seq_lengths):
    """Repackage a batch as a ``(features, seq_lengths)`` pair.

    Args:
        cur_txt: value stored under the ``"char_seq"`` key.
        cur_audio: value stored under the ``"waveform"`` key.
        seq_lengths: value passed through unchanged as the second element.

    Returns:
        A 2-tuple whose first element is a dict with the keys ``"char_seq"``
        and ``"waveform"`` and whose second element is ``seq_lengths``.
    """
    features = {}
    features["char_seq"] = cur_txt
    features["waveform"] = cur_audio
    return features, seq_lengths


def prep_inputs(cur_audio, sentence, age, gender):
    """Turn one deserialized record into ``(characters, waveform, lengths)``.

    Args:
        cur_audio: 2-D audio tensor; only column 0 is used
            (presumably the first channel - TODO confirm).
        sentence: scalar string tensor (its shape is enforced to ``()``).
        age: not used by this function.
        gender: not used by this function.

    Returns:
        A 3-tuple ``(cur_txt, waveform, shapes)``: the sentence split into
        UTF-8 characters wrapped in "[BOS]" / "[EOS]", column 0 of
        ``cur_audio``, and a 2-element tensor holding the character count and
        ``1 + (num_samples - frame_length) // frame_step``.
    """
    waveform = cur_audio[:, 0]

    chars = tf.strings.unicode_split(tf.ensure_shape(sentence, ()), 'UTF-8')
    cur_txt = tf.concat([["[BOS]"], chars, ["[EOS]"]], axis=0)

    # frame_length / frame_step come from the notebook-level aligner `pfa`.
    n_chars = tf.shape(cur_txt)
    n_frames = 1 + (tf.shape(waveform) - pfa.frame_length) // pfa.frame_step
    shapes = tf.concat([n_chars, n_frames], axis=0)
    return cur_txt, waveform, shapes

In [None]:
batch_size = 8

# Both TFRecord files are read in parallel (one reader per file).
record_files = ["data/validated_not_traintest.tfrecords", "data/train.tfrecords"]
raw_records = tf.data.TFRecordDataset(record_files, num_parallel_reads=2)

decoded = raw_records.map(AudioTarReader.deserialize, num_parallel_calls=tf.data.AUTOTUNE)
prepared = decoded.map(prep_inputs, num_parallel_calls=tf.data.AUTOTUNE)

# Pad characters with "[PAD]", audio with 0.0 and the length vector with 0;
# incomplete final batches are dropped.
batched = prepared.padded_batch(
    batch_size,
    padding_values=("[PAD]", 0.0, 0),
    drop_remainder=True,
)

dataset = batched.map(prep_batch_inputs, num_parallel_calls=tf.data.AUTOTUNE)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

# Check results

In [None]:
# Restore weights from a saved checkpoint. The path is hard-coded, so this
# cell depends on that checkpoint being present under checkpoints/.
alignment_model.load_weights('checkpoints/m_54_0.181.chkpt')

In [None]:
# Helper models provided by models.alignment_model.
# NOTE(review): m_spec is not referenced in the cells visible here; m_logmel
# is applied to a waveform in the plotting cell.
m_spec = models.alignment_model.get_spectrogram_model()
m_logmel = models.alignment_model.get_melspec_model()

In [None]:
# Materialize the first batch; samples[0] is an (inputs dict, lengths) pair.
samples = list(dataset.take(1))

In [None]:
# Run the model on the batch inputs and compare the prediction shape with the
# shape of the per-example length tensor.
batch_inputs, batch_lengths = samples[0]
preds = alignment_model(batch_inputs)
padded_char_len = preds.shape[1]
preds.shape, batch_lengths.shape

In [None]:
# Example within the batch to visualize.
idx = 0
unpadded_lens = samples[0][1][idx]
char_len = unpadded_lens[0].numpy()
spec_len = unpadded_lens[1].numpy()
print('Unpadded:', unpadded_lens)

# Both figures share the same x-range: the unpadded number of frames.
xmax = spec_len

# One curve per unpadded character position, restricted to the unpadded frames.
fig_align, ax_align = plt.subplots(figsize=(15, 6))
for k in range(char_len):
    ax_align.plot(preds[idx, k, 0:spec_len].numpy(), label=str(k))
# Axis limits do not depend on k, so they are set once after the loop.
ax_align.set_ylim(0, 1)
ax_align.set_xlim(0, xmax)
ax_align.set_xlabel('frame index')
ax_align.set_ylabel('model output')
ax_align.set_title('Alignment model output per character position')

audio_data = samples[0][0]['waveform'][idx]
txt_data = tf.strings.join(samples[0][0]['char_seq'][idx]).numpy().decode('UTF-8').replace('[PAD]', '')

# m_logmel expects a batch dimension, so add one around the single waveform.
logmel = m_logmel(tf.expand_dims(audio_data, axis=0))
print(logmel.shape, txt_data)

fig_mel, ax_mel = plt.subplots(figsize=(15, 6))
# Transposed so that axis 1 of logmel runs along the x-axis.
ax_mel.pcolormesh(tf.transpose(logmel[0]).numpy())
ax_mel.set_xlim(0, xmax)
ax_mel.set_xlabel('frame index')
ax_mel.set_ylabel('mel bin')
ax_mel.set_title('m_logmel output for the same example')

plt.show()