In [None]:
#| default_exp preprocesses

In [None]:
#| hide
#| eval: false
# %%capture

!pip install git+https://github.com/huggingface/transformers.git



Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-xdhxuiuj
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-xdhxuiuj
  Resolved https://github.com/huggingface/transformers.git to commit d90a36d192e2981a41122c30a765c63158dd0557
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting regex!=2019.12.17
  Downloading regex-2022.8.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (766 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [None]:
#| hide
#| export

from typing import Any, Callable, Dict, List, Optional, Sequence
from nbdev.showdoc import *
from IPython.display import display,SVG
from transformers import AutoFeatureExtractor
from datasets.dataset_dict import DatasetDict
from wav2keyword.datasets import dataloader_pipeline

# Code

In [None]:
#| export

class Preprocessor:
    """Convert raw audio examples into truncated inputs for a pretrained audio model.

    The feature extractor matching ``model_checkpoint`` is loaded once at construction
    time and reused for every batch.
    """

    def __init__(self, model_checkpoint: str = "facebook/wav2vec2-base", max_duration: float = 1.0):
        """Load the feature extractor for ``model_checkpoint``.

        Args:
            model_checkpoint (str): Name or path handed to
                ``AutoFeatureExtractor.from_pretrained``.
            max_duration (float): Maximum clip length in seconds; longer clips are
                truncated to this duration.

        Raises:
            ValueError: If ``max_duration`` is not strictly positive.
        """
        # Validate before the (potentially slow) checkpoint load: a non-positive duration
        # would otherwise yield max_length <= 0 and silently truncate every clip.
        if max_duration <= 0:
            raise ValueError(f"max_duration must be positive, got {max_duration!r}")
        self.model_checkpoint = model_checkpoint
        self.max_duration = max_duration
        # Upper-case attribute name kept for backward compatibility with existing callers.
        self.FEATURE_EXTRACTOR = AutoFeatureExtractor.from_pretrained(model_checkpoint)

    def _preprocess_function(self, examples: Dict[str, list]):
        """Run the feature extractor on one batch of examples.

        Args:
            examples (Dict[str, list]): A batch in column format. Only the ``"audio"``
                column is read, and each of its entries must provide the waveform under
                the ``"array"`` key.

        Returns:
            The feature extractor's output for the batch, with each waveform truncated
            to at most ``max_duration`` seconds at the extractor's sampling rate.
        """
        sampling_rate = self.FEATURE_EXTRACTOR.sampling_rate
        audio_arrays = [audio["array"] for audio in examples["audio"]]
        return self.FEATURE_EXTRACTOR(
            audio_arrays,
            sampling_rate=sampling_rate,
            # Convert the duration limit from seconds to a number of samples.
            max_length=int(sampling_rate * self.max_duration),
            truncation=True,
        )

    def preprocess(
        self,
        dataset: DatasetDict,
        remove_columns: Optional[Sequence[str]] = ("audio", "file"),
        **map_kwargs: Any,
    ):
        """Apply the feature extractor to ``dataset`` through its ``map`` method.

        Args:
            dataset (DatasetDict): Dataset whose examples carry an ``"audio"`` column.
            remove_columns (Optional[Sequence[str]]): Columns to drop from the result.
                Defaults to the raw ``"audio"`` and ``"file"`` columns. A single column
                name may be given as a string; ``None`` is forwarded unchanged.
            **map_kwargs: Extra keyword arguments forwarded to ``dataset.map``
                (for example ``load_from_cache_file=False``). ``batched`` is always
                ``True`` because ``_preprocess_function`` works on whole batches.

        Returns:
            Whatever ``dataset.map`` returns: the preprocessed dataset.
        """
        if isinstance(remove_columns, str):
            # list("audio") would split the name into characters.
            remove_columns = [remove_columns]
        columns = None if remove_columns is None else list(remove_columns)
        return dataset.map(
            self._preprocess_function,
            remove_columns=columns,
            batched=True,
            **map_kwargs,
        )


In [None]:
# Render the documentation entry for the public `preprocess` method.
show_doc(Preprocessor.preprocess)

# Examples

Before we can feed audio clips to our model, we need to preprocess them. This is done by 🤗 Transformers `FeatureExtractor` which will normalize the inputs and put them in a format the model expects, as well as generate the other inputs that the model requires.

We wrote a class that will preprocess our samples. The feature extractor is instantiated once in `Preprocessor.__init__` with the `AutoFeatureExtractor.from_pretrained` method, which ensures that we get a preprocessor that corresponds to the model architecture we want to use. The internal `_preprocess_function` then applies it to a batch of examples.  
The argument `truncation=True`, together with the maximum sample length, ensures that very long inputs like the ones in the `_silence_` class can be safely batched.

In [None]:
#|filter_stream Reusing
#|filter_stream UserWarning
#| eval: false

# Load the "ks" configuration of the "superb" dataset; the pipeline result exposes
# the dataset itself under the 'dataset' key.
data = dataloader_pipeline({'path': "superb", 'name': "ks"})
dataset = data['dataset']
# Slice the first five training examples into one small batch.
sample = dataset['train'][:5]
# Default settings: "facebook/wav2vec2-base" checkpoint, clips limited to 1.0 second.
preprocessor = Preprocessor()
# Call the batch-level helper directly so its output can be inspected on the sample.
preprocessed_sample = preprocessor._preprocess_function(sample)

Reusing dataset superb (/home/jovyan/.cache/huggingface/datasets/superb/ks/1.9.0/ce836692657f82230c16b3bbcb93eaacdbfd7de4def3be90016f112d68683481)


  0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
#| eval: false

# Sampling rate reported by the first raw clip; used to convert sample counts to seconds.
# NOTE(review): the same rate is applied to the preprocessed arrays, which assumes the
# feature extractor's sampling rate equals the raw clips' rate — confirm.
sr = sample['audio'][0]['sampling_rate']
# Duration = number of samples / sampling rate, before and after preprocessing.
print(f"Raw sample durations in seconds: {[a['array'].shape[0]/sr for a in sample['audio']]}")
print(f"preprocessed sample durations in seconds: {[a.shape[0]/sr for a in preprocessed_sample['input_values']]}")


Raw sample durations in seconds: [95.183125, 61.8056875, 61.253875, 60.0, 61.1555]
preprocessed sample durations in seconds: [1.0, 1.0, 1.0, 1.0, 1.0]


To apply this function on all utterances in our dataset, we just use the `map` method of our `dataset` object we created earlier. This will apply the function on all the elements of all the splits in `dataset`, so our training, validation and testing data will be preprocessed in one single command.

This whole process is wrapped in the `preprocess` method. The results are automatically cached by the 🤗 Datasets library to avoid spending time on this step the next time you run your notebook. The 🤗 Datasets library is normally smart enough to detect when the function you pass to `map` has changed (and thus that the cached data must not be reused). 🤗 Datasets warns you when it uses cached files; you can pass `load_from_cache_file=False` in the call to `map` to ignore the cached files and force the preprocessing to be applied again.

In [None]:
#| filter_stream UserWarning|VisibleDeprecationWarning|Parameter|gradient_checkpointing|warnings.warn|tensor
#| eval: false

# Run the preprocessing over the whole dataset loaded earlier.
preprocess_dataset = preprocessor.preprocess(dataset)
# Bare expression so the notebook displays a summary of the result.
preprocess_dataset

  0%|          | 0/52 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_values', 'label'],
        num_rows: 51094
    })
    validation: Dataset({
        features: ['input_values', 'label'],
        num_rows: 6798
    })
    test: Dataset({
        features: ['input_values', 'label'],
        num_rows: 3081
    })
})

In [None]:
#| hide
# Run nbdev's export step; kept as the last cell so it sees the finished notebook.
from nbdev import nbdev_export
nbdev_export()