In [None]:
#| default_exp datasets

In [None]:
%%capture
#| hide
#| eval: false

# Notebook-only setup: install the Hugging Face `datasets` package pinned to 1.16.1.
!pip install datasets==1.16.1
# NOTE(review): librosa is not imported in the visible cells — presumably it is
# needed by `datasets` for audio decoding; confirm before removing or pinning.
!pip install librosa

In [None]:
#| export
#| hide

from typing import List, Tuple, Dict
import random
from nbdev.showdoc import *
from IPython.display import display, Audio
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict

# Code

In [None]:
#| export

def _get_labels_dict(labels: List[str]) -> Tuple[Dict[str, str], Dict[str, str]]:
    """Builds maps form id to label and label to id based on label index.

    Args:
        labels (List[str]): List of labels

    Returns:
        Tuple[Dict[str, str], Dict[str, str]]: Maps
    """
    label2id, id2label = dict(), dict()
    for i, label in enumerate(labels):
        label2id[label] = str(i)
        id2label[str(i)] = label
    return id2label, label2id


def dataloader_pipeline(load_dataset_kwargs: dict) -> Dict:
    """Fetch a dataset through `load_dataset` and derive its label lookup tables.

    The label names are read from the `label` feature of the `train` split.

    Args:
        load_dataset_kwargs (dict): Keyword arguments forwarded to huggingface's load_dataset

    Returns:
        Dict: The loaded dataset under 'dataset', plus the 'id2label' and
            'label2id' maps
    """
    loaded = load_dataset(**load_dataset_kwargs)
    train_features = loaded["train"].features
    id_to_label, label_to_id = _get_labels_dict(train_features["label"].names)

    result = {'dataset': loaded}
    result['id2label'] = id_to_label
    result['label2id'] = label_to_id
    return result

In [None]:
#| eval: false

# The helper returns (id2label, label2id); compare both maps in one go.
assert _get_labels_dict(['a', 'b', 'c']) == (
    {'0': 'a', '1': 'b', '2': 'c'},
    {'a': '0', 'b': '1', 'c': '2'},
)

# Loading the SUPERB "ks" configuration must give back a DatasetDict.
data = dataloader_pipeline({'path': "superb", 'name': "ks"})
assert isinstance(data['dataset'], DatasetDict)

Reusing dataset superb (/home/jovyan/.cache/huggingface/datasets/superb/ks/1.9.0/ce836692657f82230c16b3bbcb93eaacdbfd7de4def3be90016f112d68683481)


  0%|          | 0/3 [00:00<?, ?it/s]

# Example

In this example, we load data for the Keyword Spotting task of the [SUPERB Benchmark](https://superbbenchmark.org/).

Keyword Spotting (KS) detects preregistered keywords by classifying utterances into a predefined set of words. SUPERB uses the widely used Speech Commands dataset v1.0 for the task.  
The dataset consists of ten classes of keywords, a class for silence, and an unknown class to account for false positives.  
We will wrap the [🤗 Datasets](https://github.com/huggingface/datasets) library to download the data.

In [None]:
#|filter_stream Reusing
#| eval: false

# Run the pipeline once, then give each entry of the result its own name.
data = dataloader_pipeline({'path': "superb", 'name': "ks"})
dataset: DatasetDict = data['dataset']
id2label: Dict = data['id2label']
label2id: Dict = data['label2id']

Reusing dataset superb (/home/jovyan/.cache/huggingface/datasets/superb/ks/1.9.0/ce836692657f82230c16b3bbcb93eaacdbfd7de4def3be90016f112d68683481)


  0%|          | 0/3 [00:00<?, ?it/s]

The `dataset` object itself is a [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key each for the training, validation, and test sets.

In [None]:
#| eval: false

dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 51094
    })
    validation: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 6798
    })
    test: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 3081
    })
})

To access an actual element, you need to select a split first, then give an index:

In [None]:
#| eval: false

dataset["test"][1000]

{'file': '/home/jovyan/.cache/huggingface/datasets/downloads/extracted/45bfc1522471574e4f4c96727b6574012e571193a6708acd3bb51160d97475fd/go/e41a903b_nohash_4.wav',
 'audio': {'path': '/home/jovyan/.cache/huggingface/datasets/downloads/extracted/45bfc1522471574e4f4c96727b6574012e571193a6708acd3bb51160d97475fd/go/e41a903b_nohash_4.wav',
  'array': array([-1.2207031e-04,  3.0517578e-05,  1.8310547e-04, ...,
         -4.8828125e-04, -5.4931641e-04, -3.3569336e-04], dtype=float32),
  'sampling_rate': 16000},
 'label': 9}

As you can see, the `label` field is not an actual string label. By default the `ClassLabel` fields are encoded into integers for convenience:

In [None]:
#| eval: false

dataset["train"].features["label"]

ClassLabel(num_classes=12, names=['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', '_silence_', '_unknown_'], names_file=None, id=None)

Looking back at the example element above, we can see that the audio file was loaded and resampled on the fly when the element was accessed.

The sampling rate is set to 16 kHz, which is what Wav2Vec2 expects as input.

To get a sense of what the commands sound like, the following snippet will render some audio examples picked randomly from the dataset.

Note: You can run the following cell a couple of times to listen to different audio samples.


In [None]:
#| eval: false

# Preview five randomly chosen training examples: label, array shape and a player.
train_split = dataset["train"]
for _ in range(5):
    # randrange(n) draws from 0..n-1, i.e. any valid row index of the split.
    example = train_split[random.randrange(len(train_split))]
    audio, label = example["audio"], str(example["label"])  # id2label is keyed by string ids
    print(f'Label: {id2label[label]}')
    print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
    display(Audio(audio["array"], rate=audio["sampling_rate"]))
    print()

Label: left
Shape: (16000,), sampling rate: 16000



Label: no
Shape: (16000,), sampling rate: 16000



Label: right
Shape: (16000,), sampling rate: 16000



Label: _unknown_
Shape: (16000,), sampling rate: 16000



Label: down
Shape: (16000,), sampling rate: 16000





In [None]:
#| hide
# Trigger nbdev's export step from inside the notebook.
# NOTE(review): presumably this writes the cells marked `#| export` to the module
# named by `#| default_exp` — confirm against the nbdev documentation.
from nbdev import nbdev_export
nbdev_export()