In [1]:
import os

from datasets import load_dataset, IterableDatasetDict, Audio, DatasetDict, Dataset
from transformers import WhisperFeatureExtractor, WhisperTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

Using 🤗 Datasets, downloading and preparing data is extremely simple. 
We can download and prepare the Common Voice splits in just one line of code. 

First, ensure you have accepted the terms of use on the Hugging Face Hub: [mozilla-foundation/common_voice_11_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0). Once you have accepted the terms, you will have full access to the dataset and be able to download the data locally.

Common Voice provides `train` and `test` splits for Spanish (`es`). We load both 
in streaming mode, so examples are downloaded on the fly rather than all at once, 
and keep the `test` split as our held-out test set:

In [2]:
# Read the Hugging Face access token from the environment instead of embedding it
# in the notebook, so the credential is never committed or shared with the file.
# NOTE(review): when HF_TOKEN is unset this passes token=None, which is expected to
# fall back to the locally cached `huggingface-cli login` credentials — confirm.
hf_token = os.environ.get("HF_TOKEN")

# Stream the Spanish ("es") train and test splits rather than downloading them upfront.
common_voice = IterableDatasetDict()
for split in ("train", "test"):
    common_voice[split] = load_dataset(
        "mozilla-foundation/common_voice_11_0",
        "es",
        split=split,
        token=hf_token,
        streaming=True,
    )

print(common_voice)

{'train': <datasets.iterable_dataset.IterableDataset object at 0x7fefa4498e50>, 'test': <datasets.iterable_dataset.IterableDataset object at 0x7feec3ebe210>}


In [3]:
# Metadata columns dropped here; every other column of the dataset is kept.
unused_columns = [
    "accent",
    "age",
    "client_id",
    "down_votes",
    "gender",
    "locale",
    "path",
    "segment",
    "up_votes",
]
common_voice = common_voice.remove_columns(unused_columns)

print(common_voice)

{'train': <datasets.iterable_dataset.IterableDataset object at 0x7feeb15210d0>, 'test': <datasets.iterable_dataset.IterableDataset object at 0x7feeb1523d50>}


Since 
our input audio is sampled at 48kHz, we need to _downsample_ it to 
16kHz prior to passing it to the Whisper feature extractor, 16kHz being the sampling rate expected by the Whisper model. 

We'll set the audio inputs to the correct sampling rate using dataset's 
[`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=cast_column#datasets.DatasetDict.cast_column)
method. This operation does not change the audio in-place, 
but rather signals to `datasets` to resample audio samples _on the fly_ the 
first time that they are loaded:

In [4]:
# Declare the "audio" column as 16 kHz audio; `datasets` resamples lazily on access.
whisper_audio = Audio(sampling_rate=16000)
common_voice = common_voice.cast_column("audio", whisper_audio)

### Load WhisperFeatureExtractor

The Whisper feature extractor performs two operations:
1. Pads / truncates the audio inputs to 30s: any audio inputs shorter than 30s are padded to 30s with silence (zeros), and those longer than 30s are truncated to 30s
2. Converts the audio inputs to _log-Mel spectrogram_ input features, a visual representation of the audio and the form of the input expected by the Whisper model

<figure>
<img src="https://raw.githubusercontent.com/sanchit-gandhi/notebooks/main/spectrogram.jpg" alt="Sampled audio waveform next to its log-Mel spectrogram" style="width:100%">
<figcaption align = "center"><b>Figure 2:</b> Conversion of sampled audio array to log-Mel spectrogram.
Left: sampled 1-dimensional audio signal. Right: corresponding log-Mel spectrogram. Figure source:
<a href="https://ai.googleblog.com/2019/04/specaugment-new-data-augmentation.html">Google SpecAugment Blog</a>.
</figcaption>
</figure>

We'll load the feature extractor from the pre-trained checkpoint with the default values:

In [5]:
# Load the feature extractor settings published with the `openai/whisper-base` checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

### Load WhisperTokenizer

The Whisper model outputs a sequence of _token ids_. The tokenizer maps each of these token ids to their corresponding text string. For Spanish, we can load the pre-trained tokenizer and use it for fine-tuning without any further modifications. We simply have to 
specify the target language and the task. These arguments inform the 
tokenizer to prefix the language and task tokens to the start of encoded 
label sequences:

In [6]:
# Load the `openai/whisper-base` tokenizer; `language` and `task` select the prefix
# tokens that are prepended to every encoded label sequence.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="Spanish", task="transcribe")

### Prepare Data

Let's print the first example of the Common Voice dataset to see 
what form the data is in:

In [7]:
# A streaming dataset cannot be indexed with [0]; pull the first example
# from a fresh iterator over the train split instead.
print(next(iter(common_voice["train"])))

Now we can write a function to prepare our data ready for the model:
1. We load and resample the audio data by calling `batch["audio"]`. As explained above, 🤗 Datasets performs any necessary resampling operations on the fly.
2. We use the feature extractor to compute the log-Mel spectrogram input features from our 1-dimensional audio array.
3. We encode the transcriptions to label ids through the use of the tokenizer.

In [8]:
def prepare_dataset_common_voice_11_0(batch):
    """Normalise the ``sentence`` field of one Common Voice 11.0 example.

    Intended for use with the dataset's ``.map`` method. Follows the
    preprocessing recommended on the dataset card:
    https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0#data-preprocessing-recommended-by-hugging-face

    Args:
        batch: Mapping with a ``"sentence"`` string entry. It is modified in place.

    Returns:
        The same ``batch`` object, with ``"sentence"`` stripped of one pair of
        enclosing double quotes and terminated by ``.``, ``?`` or ``!``.
        An empty sentence is returned unchanged.
    """
    transcription = batch["sentence"]

    # Require at least two characters so a lone '"' is not treated as a quoted string.
    if len(transcription) >= 2 and transcription.startswith('"') and transcription.endswith('"'):
        # enclosing quotation marks do not affect the transcription, so drop them
        transcription = transcription[1:-1]

    # `endswith` is safe on empty strings, unlike indexing with [-1]; an empty
    # transcription is left empty rather than being turned into a bare ".".
    if transcription and not transcription.endswith((".", "?", "!")):
        # append a full-stop to sentences that do not end in punctuation
        transcription = transcription + "."

    batch["sentence"] = transcription

    return batch

def prepare_dataset(batch):
    """Add Whisper model inputs and labels to one example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries) and
    ``batch["sentence"]``, then stores the results under ``"input_features"`` and
    ``"labels"``. Uses the notebook-level ``feature_extractor`` and ``tokenizer``.

    Returns:
        The same ``batch`` object with the two new entries added.
    """
    # Reading the audio entry is what triggers loading/resampling in `datasets`
    clip = batch["audio"]

    # Log-Mel input features for this single clip (first item of the extractor output)
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Label ids for the target transcription
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

def prepare_combined_dataset(batch):
    """Clean the transcription, then compute model inputs and labels for one example."""
    cleaned = prepare_dataset_common_voice_11_0(batch)
    return prepare_dataset(cleaned)

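We can apply the data preparation function to all of our training examples using dataset's `.map` method. For a regular (non-streaming) dataset, the argument `num_proc` specifies how many CPU cores to use. Setting `num_proc` > 1 will enable multiprocessing. If the `.map` method hangs with multiprocessing, set `num_proc=1` and process the dataset sequentially. Because we loaded the data in streaming mode, `.map` is applied lazily as examples are iterated and does not take a `num_proc` argument, so we omit it below.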

In [9]:
# Keep only the first 5000 examples of each split.
# `take` stops reading the stream once the limit is reached, whereas an
# index-based `filter` has to evaluate its predicate on every remaining example.
MAX_EXAMPLES_PER_SPLIT = 5000
common_voice = IterableDatasetDict(
    {
        split_name: stream.take(MAX_EXAMPLES_PER_SPLIT)
        for split_name, stream in common_voice.items()
    }
)

In [10]:
# Attach the preprocessing to every split and drop the columns the train split
# currently exposes, leaving only what `prepare_combined_dataset` adds.
source_columns = common_voice["train"].column_names
common_voice = common_voice.map(prepare_combined_dataset, remove_columns=source_columns)

In [11]:
# Display the dataset dict; for streaming splits this shows only the object reprs.
common_voice

{'train': <datasets.iterable_dataset.IterableDataset at 0x7feeb0414ed0>,
 'test': <datasets.iterable_dataset.IterableDataset at 0x7feeb04153d0>}

In [16]:
def _iter_examples(stream):
    """Yield every example of a streaming dataset, one at a time."""
    yield from stream


# Streaming datasets have no `save_to_disk`, so each split is first materialised
# into a regular `Dataset` by iterating over it, and then written to disk.
# NOTE(review): older `datasets` releases may fail to pickle/fingerprint a streaming
# dataset passed through `gen_kwargs` — confirm on the installed version.
for split, stream in common_voice.items():
    materialized = Dataset.from_generator(_iter_examples, gen_kwargs={"stream": stream})
    materialized.save_to_disk(f"common_voice_{split}_16kHz")

TypeError: cannot pickle 'generator' object

In [None]:
# NOTE(review): `to_store` is not assigned in any visible cell of this notebook, so
# this cell depends on hidden kernel state — confirm its origin or delete the cell.
to_store

In [13]:
# NOTE(review): the recorded output of this cell is an AttributeError because
# `to_store` was an IterableDataset, which has no `save_to_disk`; `to_store` is also
# not assigned in any visible cell — fix or delete this cell.
to_store.save_to_disk("common_voice_es")

AttributeError: 'IterableDataset' object has no attribute 'save_to_disk'

In [None]:
# Show the working directory, then the contents of the saved dataset folder
# and of each of its split sub-folders.
print(os.getcwd())
for directory in ("./combined_dataset/", "./combined_dataset/train", "./combined_dataset/test"):
    print(os.listdir(directory))

/mnt/c/Users/BlondeFer/Documents/Master/2_2/ID2223/Assignment_2
['dataset_dict.json', 'test', 'train']
['data-00000-of-00029.arrow', 'data-00001-of-00029.arrow', 'data-00002-of-00029.arrow', 'data-00003-of-00029.arrow', 'data-00004-of-00029.arrow', 'data-00005-of-00029.arrow', 'data-00006-of-00029.arrow', 'data-00007-of-00029.arrow', 'data-00008-of-00029.arrow', 'data-00009-of-00029.arrow', 'data-00010-of-00029.arrow', 'data-00011-of-00029.arrow', 'data-00012-of-00029.arrow', 'data-00013-of-00029.arrow', 'data-00014-of-00029.arrow', 'data-00015-of-00029.arrow', 'data-00016-of-00029.arrow', 'data-00017-of-00029.arrow', 'data-00018-of-00029.arrow', 'data-00019-of-00029.arrow', 'data-00020-of-00029.arrow', 'data-00021-of-00029.arrow', 'data-00022-of-00029.arrow', 'data-00023-of-00029.arrow', 'data-00024-of-00029.arrow', 'data-00025-of-00029.arrow', 'data-00026-of-00029.arrow', 'data-00027-of-00029.arrow', 'data-00028-of-00029.arrow', 'dataset_info.json', 'state.json']
['data-00000-of-0001

In [None]:
def get_dir_size(path):
    """Return the total size in bytes of all files under ``path``.

    Sub-directories are traversed as well; entries that are neither files nor
    directories contribute nothing to the total.
    """
    size_in_bytes = 0
    # Directories still to be scanned; replaces the recursive call with a work list.
    pending = [path]
    while pending:
        with os.scandir(pending.pop()) as entries:
            for item in entries:
                if item.is_file():
                    size_in_bytes += item.stat().st_size
                elif item.is_dir():
                    pending.append(item.path)
    return size_in_bytes

def sizeof_fmt(num, suffix="B"):
    """Format ``num`` as a human-readable size using binary (1024-based) prefixes.

    Args:
        num: The quantity to format; it is divided by 1024 until its magnitude
            drops below 1024 or the largest listed prefix has been passed.
        suffix: Text appended after the prefix (defaults to ``"B"``).

    Returns:
        A string such as ``"1.5KiB"``, with one digit after the decimal point.
    """
    # Source: https://web.archive.org/web/20111010015624/http://blogmag.net/blog/read/38/Print_human_readable_file_size
    prefixes = ("", "Ki", "Mi", "Gi", "Ti")
    position = 0
    while position < len(prefixes):
        unit = prefixes[position]
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
        position += 1
    # Still 1024 or more after the Ti step, so report the value with the Pi prefix
    return f"{num:.1f}Pi{suffix}"


# Total on-disk size of the saved dataset folder, printed in human-readable form.
sz = get_dir_size("./combined_dataset/")
print(sizeof_fmt(sz))

18.7GiB
