# Dataset exploration

This notebook explores the Mozilla Common Voice speech dataset: https://commonvoice.mozilla.org/en/datasets

The version under analysis is Common Voice Corpus 6.1. To run this notebook, download the file `en.tar` and place it in the `data` folder.

We know that the corpus was recorded as mono, 16-bit audio sampled at 48 kHz - see https://arxiv.org/abs/1912.06670.

In [None]:
import os
import tarfile
from IPython.display import Audio

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow_io as tfio
from matplotlib import pyplot as plt

# Location of the Common Voice English archive, relative to the notebook.
audio_tarfile = 'data/en.tar'
# Known number of members in the English archive; lets the progress bar show
# a total while the archive is being listed.
en_total = 1584330

# Pick the tarfile mode from the extension. The handle is intentionally left
# open: later cells extract individual members from it.
if audio_tarfile.endswith('.tar'):
    audios_tar = tarfile.open(audio_tarfile, 'r')
elif audio_tarfile.endswith('.tar.gz'):
    audios_tar = tarfile.open(audio_tarfile, "r:gz")
else:
    # Fail loudly instead of leaving `audios_tar` undefined for later cells.
    raise ValueError(f'Unsupported archive extension: {audio_tarfile}')

In [None]:
# The member count of the English archive is known in advance, so it is
# passed to the progress bar as its total.
tar_file_list = list(tqdm(audios_tar, total=en_total))

## Analyze validated files

The tar file contains train, dev and test splits (see [this thread](https://discourse.mozilla.org/t/why-train-tsv-includes-a-few-files-just-3-of-validated-set/36471/5) for extra explanation).

Let's take a look at them:

- Search for the files inside tar file
- Read contents using pandas

### Sanity checks

Verify that the split files are available and make sure that the splits are correct.

In [None]:
# Split files to locate inside the archive, keyed by file name. Each value is
# replaced by the parsed DataFrame once the corresponding member is found.
data_files = {
    'train.tsv': None,
    'dev.tsv': None,
    'test.tsv': None,
    # 'validated.tsv': None
}
n_files = len(data_files)
cur_files = 0

for member in tar_file_list:
    # Match on the exact file name rather than a suffix, so that e.g.
    # 'invalidated.tsv' can never be picked up for 'validated.tsv'.
    split_name = os.path.basename(member.name)
    is_wanted = split_name in data_files and data_files[split_name] is None
    if is_wanted and member.isfile():
        with audios_tar.extractfile(member) as f:
            data_files[split_name] = pd.read_csv(f, sep='\t')
        cur_files += 1
    # Stop scanning the (very long) member list once every split is loaded.
    if cur_files == n_files:
        break

# Fail here with a clear message rather than on a None lookup in a later cell.
missing_splits = [name for name, frame in data_files.items() if frame is None]
assert not missing_splits, f'Split files not found in the archive: {missing_splits}'

In [None]:
# Speaker-leak check: a client_id must never appear in two different splits.
# Every ordered pair of splits is visited; the diagonal just reports sizes.
for x in data_files:
    for y in data_files:
        if x == y:
            print(f'{x} has {len(set(data_files[x].client_id))} unique ids and {len(data_files[x])} samples')
            continue
        ids_x = set(data_files[x].client_id)
        ids_y = set(data_files[y].client_id)
        assert ids_x.isdisjoint(ids_y), f'Repeated ids in sets {x} and {y}'

In [None]:
# Locate the folder that holds the clips inside the archive: take everything
# before the last '/' of the first member whose name ends in '.mp3'.
for member in audios_tar:
    if not member.name.endswith('.mp3'):
        continue
    mp3_folder = member.name.rpartition('/')[0]
    break
print(f'Detected mp3 folder: {mp3_folder}')

### Load, plot and analyze a random audio

In [None]:
# Draw one row of the training split uniformly at random and display it
# together with its positional index.
train_split = data_files['train.tsv']
sample_idx = np.random.randint(len(train_split))
sample = train_split.iloc[sample_idx]
sample_idx, sample

In [None]:
# Find the archive member of the sampled clip. `next` stops at the first
# match instead of scanning the whole member list.
sample_info = next(
    (member for member in tar_file_list if member.name.endswith(sample.path)),
    None,
)
assert sample_info is not None, f'{sample.path} not found in {audio_tarfile}'

# Read the raw mp3 bytes; the context manager closes the extracted file
# object once the content has been read.
with audios_tar.extractfile(sample_info) as mp3_file:
    mp3_content = mp3_file.read()

# Sampling rate assumed for playback and for every time axis below
# (the corpus is stated to be 48 kHz in the introduction).
sr = 48000
decoded_mp3 = tfio.audio.decode_mp3(mp3_content, shape=None, name=None)

# Listen to column 0 of the decoded tensor
# (presumably the only channel, since the corpus is mono - TODO confirm).
Audio(decoded_mp3.numpy()[:, 0], rate=sr)

In [None]:
# Time axis in seconds: sample index divided by the sampling rate.
n_samples = len(decoded_mp3)
t = np.arange(n_samples) / sr

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Waveform')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Intensity')
ax.plot(t, decoded_mp3)

In [None]:
# Spectrogram parameters, in samples: FFT / window length and frame stride.
spec_len = 1024
spec_stride = 256

# Spectrogram of column 0 of the decoded audio, transposed so that the first
# axis is the one plotted as frequency and the second as time.
spectrogram = tf.transpose(
    tfio.audio.spectrogram(
        decoded_mp3[:, 0], nfft=spec_len, window=spec_len, stride=spec_stride))

# Physical axes: frame index -> seconds, bin index -> Hz.
spectrogram_f = np.arange(spectrogram.shape[0]) * sr / spec_len
spectrogram_t = np.arange(spectrogram.shape[1]) * spec_stride / sr

log_spectrogram = tf.math.log(spectrogram).numpy()

fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Spectrogram')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Frequency (Hz)')

# Only the band up to 10 kHz is shown.
ax.set_ylim(0, 10000)

ax.pcolormesh(spectrogram_t, spectrogram_f, log_spectrogram, shading='auto')

In [None]:
# Warp the linear scale spectrograms into the mel-scale.
# The transposed spectrogram gets its own name instead of overwriting
# `spectrogram`: reassigning it made this cell flip the orientation on every
# re-run, so a second execution fed the wrong axis to the mel projection.
spectrogram_frames = tf.transpose(spectrogram)
num_spectrogram_bins = spectrogram_frames.shape[-1]
lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 10000.0, 85
linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
  num_mel_bins, num_spectrogram_bins, sr, lower_edge_hertz,
  upper_edge_hertz)
# Contract the last axis of the spectrogram with the first axis of the
# weight matrix, replacing linear frequency bins by mel bins.
mel_spectrograms = tf.tensordot(
  spectrogram_frames, linear_to_mel_weight_matrix, 1)
mel_spectrograms.set_shape(spectrogram_frames.shape[:-1].concatenate(
  linear_to_mel_weight_matrix.shape[-1:]))

# Compute a stabilized log to get log-magnitude mel-scale spectrograms.
# The small offset keeps the log finite where the mel energy is zero.
log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
# Transpose back so that rows are mel bins and columns are frames for plotting.
log_mel_spectrograms = tf.transpose(log_mel_spectrograms)

plt.figure(figsize=(15, 5))
plt.title('Mel frequency spectrogram')
# No coordinate arrays are passed to pcolormesh, so both axes are indices.
plt.xlabel('Frame')
plt.ylabel('Mel bin')

pcol = plt.pcolormesh(
    log_mel_spectrograms.numpy(),
    shading='auto',
    linewidth=0, rasterized=True
)
# Draw cell edges in the face colour.
pcol.set_edgecolor('face')