# Prepare tfrecords files

Perform a sanity check using a small amount of data

- Prepare dataset
- Train model

Full scale training

- What can fit in memory?

**Note for Colab users**

Uncomment and edit the command below in order to download the Mozilla Common Voice files. The download link is a pre-signed URL that expires, so paste a fresh one.

In [None]:
import os
# Create the local 'data' directory; exist_ok=True makes re-running this cell harmless.
os.makedirs('data', exist_ok=True)
# if not os.path.isfile('data/cv-corpus-7.0-2021-07-21-it.tar.gz'):
#     !wget -O "data/cv-corpus-7.0-2021-07-21-it.tar.gz" "https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-7.0-2021-07-21/cv-corpus-7.0-2021-07-21-it.tar.gz?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAQ3GQRTO3CHAF2LUR%2F20210801%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20210801T075430Z&X-Amz-Expires=43200&X-Amz-Security-Token=FwoGZXIvYXdzENn%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaDGTMWIPnTt46zTwUTCKSBEzVQleBRJYCDi9aNljFC0TrxdJ5O%2BtQ%2FY0wwhg8b7X4mD6Tu%2BIQ6yAtcJ20qW5vYW4nv0PvpNrq7Ne%2FoZ0RWT9j1wedHxleh2g3JEP4HE8FUKmpVzb5HiaKGmBYRn41nnM0Czk3WHD7KeHHhtQj5rMTxbmHQUTw7gvad7ieRy%2FF4WbOzX%2FPx78dt4Zq2%2BTxl%2Fc4SOhlM9n3SKWc0foqKuDzytDFf1%2FQd45BMUWCiPOd2fyf0l751fgygj7syaEnegchts96%2FZZ0ilaXYuu9jjcO7gJCMB32r6rndMP5g98RbV5ScPe5Ey7TvAgeKZXFuF5LHIj7TraBr0Z6WqX02Salo9c%2Fu5b%2B%2FurhR5Q6%2B5hDbvg9abIrAzpom5egeOJSDFTYzsQHOdboXgs7Ciop7YktBjHXMTPi7ck22%2F4OYI4lqdwLICn%2BHE%2B79%2FcDrTYQ%2BOSLYZonbIc2u9Q2iHwjWr4i9Z%2BGYQhGOyi6L%2BVblvHMjLVBFXNr%2FfnJEM6%2FXE6gIVfM2u9948bzTbBcYLZ552LzXJdBpXqFNQ8t8D4VOYrGNXJOvxCnOI5OlmORzEvHNS1USQhq0rNb1JY8X1N6oVvcIGkrlOJfcJgWT6oTnI0L5CBtzbVVtvwsjvomeb3ZlmbWXCohNxkSCBJouc7zXTRXaejPr8dUBHpxAvgXD6Qch%2Fnm19OoaspXOpxX2oXV5z8wKNibmYgGMiqG2Tnp1d5ZY7Vai14pdD2OijlQWmQsE0FYfpcqYAyL9xbO6Iv3kB5v7MY%3D&X-Amz-Signature=393509c6b2e96db5d0f10557b5973b7bd167a2f7349910073d1c8ddd72fd8992&X-Amz-SignedHeaders=host"

In [None]:
!pip install tensorflow-io -q
!pip install tensorflow-addons -q

In [None]:
from IPython.display import Audio

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow_io as tfio
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 2
from create_audio_tfrecords import AudioTarReader, PersonIdAudio

# Archive to read. Point this at whichever Common Voice tarball is available
# locally, e.g. 'data/en.tar' for the English corpus.
# NOTE(review): the commented-out download command earlier in this notebook
# names the `-it` archive while this path names `-pt` — confirm which is intended.
audio_tarfile = 'data/cv-corpus-7.0-2021-07-21-pt.tar.gz'
# Presumably the number of clips in the English corpus; not referenced by any
# visible cell — TODO confirm before removing.
en_total = 1584330
# Sample rate in Hz, used for audio playback and passed to PersonIdAudio.
sr = 48000

atr = AudioTarReader(audio_tarfile)

In [None]:
# Peek at the first rows of the 'train.tsv' entry held by the reader
# (presumably a pandas DataFrame of clip metadata — TODO confirm).
atr.data_files['train.tsv'].head()

In [None]:
# Collect the audio grouped per user. The cells below treat the result as a
# mapping: iterating yields keys and each value is a sequence of encoded clips.
audio_content = atr.retrieve_per_user_data()

## Sanity check

Check that audio clips attributed to the same person actually sound like the same speaker

In [None]:
# Keep only the people with more than two clips so there are several
# recordings to compare by ear.
temp_list = [
    person
    for person in audio_content
    if len(audio_content[person]) > 2
]
# Draw one of those people uniformly at random.
cur_idx = np.random.randint(0, len(temp_list))
audio_samples = audio_content[temp_list[cur_idx]]
# Displayed as the cell output: how many clips the chosen person has.
len(audio_samples)

In [None]:
# Decode the first clip of the selected person and play column 0 of the decoded
# array (presumably the first channel) at the configured sample rate.
decoded_mp3 = tfio.audio.decode_mp3(audio_samples[0])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [None]:
# Decode the second clip of the same person and play column 0 of the decoded
# array (presumably the first channel) so it can be compared with the first clip.
decoded_mp3 = tfio.audio.decode_mp3(audio_samples[1])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [None]:
# Length of the second clip's still-encoded payload (presumably its size in bytes).
len(audio_samples[1])

# Data preparation

## Retrieve a tf.data.Dataset

`PersonIdAudio` contains the code to build a `tf.data.Dataset` from a given `audio_content`.

In [None]:
# Wrap the per-user audio together with the sample rate, then build the dataset
# that the following cells iterate with .take().
pia = PersonIdAudio(audio_content, sr)
audio_dataset = pia.get_tf_dataset()

In [None]:
# Materialize the first ten dataset elements, then decode and play the audio
# stored in field 0 of the first one.
samples = list(audio_dataset.take(10))
decoded_mp3 = tfio.audio.decode_mp3(samples[0][0])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [None]:
# Shape of the most recently decoded clip.
decoded_mp3.shape

## Write tfrecords file

Save the tfrecords file and check that it can be read back properly.

In [None]:
# Save the tfrecords file through PersonIdAudio; the returned value is used
# below as the filename argument of tf.data.TFRecordDataset.
tfrecords_file = pia.save_tfrecords_file('pt-train')

In [None]:
# Read the saved file back to verify it. The records are opened with GZIP
# compression, then each one is passed through the class's deserializer.
raw_records = tf.data.TFRecordDataset(tfrecords_file, compression_type='GZIP')
tfrecords_audio_dataset = raw_records.map(PersonIdAudio.deserialize_from_tfrecords)

In [None]:
# Pull four deserialized records and play the audio stored in field 0 of the
# second one.
samples = list(tfrecords_audio_dataset.take(4))
decoded_mp3 = tfio.audio.decode_mp3(samples[1][0])
Audio(decoded_mp3.numpy()[:, 0], rate=sr)