# Load MInDS-14 dataset

In [19]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
import torch

# Is MPS even available? macOS 12.3+
print(torch.backends.mps.is_available())

# Was the current version of PyTorch built with MPS activated?
print(torch.backends.mps.is_built())

True
True


In [21]:
# https://github.com/pytorch/pytorch/issues/77764
%env PYTORCH_ENABLE_MPS_FALLBACK=1
%env

env: PYTORCH_ENABLE_MPS_FALLBACK=1


{'COLORTERM': 'truecolor',
 'COMMAND_MODE': 'unix2003',
 'CONDA_CHANGEPS1': 'false',
 'ELECTRON_NO_ATTACH_CONSOLE': '1',
 'HOME': '/Users/sarimabbas',
 'HOMEBREW_CELLAR': '/opt/homebrew/Cellar',
 'HOMEBREW_PREFIX': '/opt/homebrew',
 'HOMEBREW_REPOSITORY': '/opt/homebrew',
 'INFOPATH': '/opt/homebrew/share/info:',
 'JAVA_HOME': '/Library/Java/JavaVirtualMachines/jdk-20.jdk/Contents/Home',
 'LANG': 'en_US.UTF-8',
 'LOGNAME': 'sarimabbas',
 'LaunchInstanceID': '9E612F8A-C845-4874-9870-0C73C167B4AF',
 'MANPATH': '/Users/sarimabbas/.nvm/versions/node/v18.14.0/share/man:/opt/homebrew/share/man::',
 'MallocNanoZone': '0',
 'NVM_BIN': '/Users/sarimabbas/.nvm/versions/node/v18.14.0/bin',
 'NVM_CD_FLAGS': '-q',
 'NVM_DIR': '/Users/sarimabbas/.nvm',
 'NVM_INC': '/Users/sarimabbas/.nvm/versions/node/v18.14.0/include/node',
 'ORIGINAL_XDG_CURRENT_DESKTOP': 'undefined',
 'PATH': '/Users/sarimabbas/.pyenv/versions/3.10.13/envs/venv-3.10.13/bin:/opt/homebrew/Cellar/pyenv-virtualenv/1.2.1/shims:/Users/

In [22]:
from datasets import load_dataset, Audio

minds = load_dataset("PolyAI/minds14", name="en-US", split="train")

Split the dataset’s train split into a smaller train and test set with the train_test_split method. This’ll give you a chance to experiment and make sure everything works before spending more time on the full dataset.

In [23]:
minds = minds.train_test_split(test_size=0.2)

In [24]:
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

While the dataset contains a lot of useful information, like lang_id and english_transcription, you’ll focus on the audio and intent_class in this guide. Remove the other columns with the remove_columns method:

In [25]:
minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])

In [26]:
minds["train"][0]

{'audio': {'path': '/Users/sarimabbas/.cache/huggingface/datasets/downloads/extracted/526e836752f63c07e922c55ea15d552caeaa12500644e29ed15c06476e76896b/en-US~BALANCE/602ba1e0963e11ccd901cc51.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00024414,
          0.        ,  0.00024414]),
  'sampling_rate': 8000},
 'intent_class': 4}

There are two fields:

- audio: a 1-dimensional array of the speech signal that must be called to load and resample the audio file.
- intent_class: represents the class id of the speaker’s intent.


To make it easier for the model to get the label name from the label id, create a dictionary that maps the label name to an integer and vice versa:

In [27]:
labels = minds["train"].features["intent_class"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [28]:
id2label[str(2)]

'app_error'

# Preprocess

In [29]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in it’s dataset card), which means you’ll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model:

In [30]:
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
minds["train"][0]

{'audio': {'path': '/Users/sarimabbas/.cache/huggingface/datasets/downloads/extracted/526e836752f63c07e922c55ea15d552caeaa12500644e29ed15c06476e76896b/en-US~BALANCE/602ba1e0963e11ccd901cc51.wav',
  'array': array([ 3.99403507e-06,  2.32464517e-05, -3.40701081e-06, ...,
          2.09816557e-04,  2.59068533e-04,  1.23633334e-04]),
  'sampling_rate': 16000},
 'intent_class': 4}

Now create a preprocessing function that:

1. Calls the audio column to load, and if necessary, resample the audio file.
2. Checks if the sampling rate of the audio file matches the sampling rate of the audio data a model was pretrained with. You can find this information in the Wav2Vec2 model card.
3. Set a maximum input length to batch longer inputs without truncating them.

In [31]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    return inputs

To apply the preprocessing function over the entire dataset, use 🤗 Datasets map function. You can speed up map by setting batched=True to process multiple elements of the dataset at once. Remove the columns you don’t need, and rename intent_class to label because that’s the name the model expects:

In [32]:
encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
encoded_minds = encoded_minds.rename_column("intent_class", "label")

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

# Evaluate

In [33]:
import evaluate

accuracy = evaluate.load("accuracy")

In [34]:
import numpy as np


def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

# Train

In [35]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.bias', 'classifier.weight', 'projector.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/30 [00:00<?, ?it/s]



NotImplementedError: The operator 'aten::_weight_norm_interface' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.