# Install libraries

In [None]:
# Transformers installation
! pip install transformers datasets evaluate



# Audio classification

## Load MInDS-14 dataset

Load the MInDS-14 dataset from the 🤗 Datasets library:

In [None]:
from datasets import load_dataset, Audio

# Fetch the US-English configuration of MInDS-14; only a "train" split is requested.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train",
)

Dataset splitting

In [None]:
# Hold out 20% of the examples for evaluation. The fixed seed keeps the split
# (and therefore every downstream metric) reproducible across kernel restarts.
minds = minds.train_test_split(test_size=0.2, seed=42)

Check on the dataset

In [None]:
# Display the split dataset to confirm the train/test sizes and column names.
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

This notebook uses only the `audio` and `intent_class` columns of the dataset, so we remove all the other columns.

In [None]:
# Drop the metadata columns that are not needed in the rest of the notebook.
minds = minds.remove_columns(
    [
        "path",
        "transcription",
        "english_transcription",
        "lang_id",
    ]
)

Example check

In [None]:
# Inspect one training example (index 42) to see the fields that remain.
minds["train"][42]

{'audio': {'path': '602ba86a963e11ccd901cd1a.wav',
  'array': array([-0.00024414,  0.00024414,  0.        , ..., -0.00024414,
         -0.00024414,  0.        ]),
  'sampling_rate': 8000},
 'intent_class': 12}



Creating a dictionary that maps the label name to an integer and vice versa

In [None]:
# Build the two lookup tables from the intent class names.
# Ids are stored as strings in both directions.
labels = minds["train"].features["intent_class"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

Dictionary test


In [None]:
# Sanity check: look up the label name stored under id "2" (keys are strings).
id2label[str(2)]

'app_error'

## Preprocess

The next step is to load a Wav2Vec2 feature extractor to process the audio signal:

In [None]:
from transformers import AutoFeatureExtractor

# Load the feature extractor that belongs to the base checkpoint used for fine-tuning.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "facebook/wav2vec2-base",
)



The recordings are sampled at 8 kHz, so we resample the dataset to 16 kHz (16,000 Hz), the sampling rate the pretrained Wav2Vec2 model expects:

In [None]:
# Re-declare the audio column with a 16 kHz sampling rate, then peek at one
# training example to confirm the new rate.
minds = minds.cast_column(
    "audio",
    Audio(sampling_rate=16_000),
)
minds["train"][0]

{'audio': {'path': '602bac19bb1e6d0fbce921bb.wav',
  'array': array([-6.25581924e-06, -2.36390479e-05,  6.34115713e-06, ...,
         -2.01943237e-03,  1.77372154e-03,  1.33233704e-03]),
  'sampling_rate': 16000},
 'intent_class': 7}

Preprocessing function

In [None]:
def preprocess_function(examples):
    """Convert a batch of audio examples into feature-extractor outputs.

    Args:
        examples: A batch whose ``"audio"`` entry is a sequence of items,
            each exposing its waveform under the ``"array"`` key.

    Returns:
        Whatever ``feature_extractor`` returns for the collected waveforms,
        called with the extractor's own sampling rate, ``max_length=16000``
        and truncation enabled.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [None]:
# Preprocess both splits in batches, dropping the raw audio column, and rename
# the target column to "label".
encoded_minds = minds.map(
    preprocess_function,
    remove_columns="audio",
    batched=True,
).rename_column("intent_class", "label")

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

## Evaluation

We use accuracy as our evaluation metric:

In [None]:
import evaluate

# Load the accuracy metric once; `compute_metrics` reuses this object on every evaluation.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a set of predictions with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with an argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train

In [None]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# The number of output classes is taken from the label mapping; both mappings
# are handed to the model alongside it.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.bias', 'classifier.weight', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install transformers[torch]



In [None]:
!pip install accelerate -U



In [None]:
# Hub upload is disabled (`push_to_hub=False`), so checkpoints are only written
# to the local `my_model` directory.
# NOTE: newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and `Trainer(tokenizer=...)` to `processing_class=...`;
# switch to the new names if this cell raises a TypeError on your version.

training_args = TrainingArguments(
    output_dir="my_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint is written every epoch. Without a limit all of them are
    # kept on disk; cap the number retained. The best checkpoint is still
    # preserved because `load_best_model_at_end=True`.
    save_total_limit=2,
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    # Effective train batch size per device is 32 * 4 = 128.
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    warmup_ratio=0.1,
    logging_steps=10,
    # Reload the checkpoint with the highest evaluation accuracy after training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    # The feature extractor is passed in the tokenizer slot.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Kept as the last expression so the resulting TrainOutput is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
0,No log,2.63957,0.106195
1,No log,2.63958,0.106195
2,2.634500,2.637948,0.079646
4,2.634500,2.642295,0.053097
5,2.623500,2.650964,0.044248
6,2.623500,2.652777,0.035398
8,2.616900,2.634162,0.070796
9,2.616900,2.640116,0.061947
10,2.609700,2.643286,0.044248
12,2.609700,2.651645,0.044248




TrainOutput(global_step=150, training_loss=2.564316431681315, metrics={'train_runtime': 329.687, 'train_samples_per_second': 68.247, 'train_steps_per_second': 0.455, 'total_flos': 1.63420716672e+17, 'train_loss': 2.564316431681315, 'epoch': 40.0})

## Inference

In [None]:
from datasets import load_dataset, Audio

# Reload the untouched dataset (all columns) with the audio column declared at
# 16 kHz, then pick the first example's file path as the inference input.
dataset = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train",
).cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[0]["audio"]["path"]
dataset[0]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/a19fbc5032eacf25eab0097832db7b7f022b42104fbad6bd5765527704a428b9/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
  'array': array([ 1.70562416e-05,  2.18727451e-04,  2.28099874e-04, ...,
          3.43842403e-05, -5.96364771e-06, -1.76846661e-05]),
  'sampling_rate': 16000},
 'transcription': 'I would like to set up a joint account with my partner',
 'english_transcription': 'I would like to set up a joint account with my partner',
 'intent_class': 11,
 'lang_id': 4}

Trying the model

In [None]:
from transformers import pipeline

# Run inference with the model fine-tuned in this notebook rather than an
# unrelated checkpoint from the Hub: training ran with `push_to_hub=False`, so
# the fine-tuned weights only exist in memory and in the local checkpoints.
# The device is passed explicitly so the pipeline places its inputs on the
# same device the trained model lives on.
classifier = pipeline(
    "audio-classification",
    model=trainer.model,
    feature_extractor=feature_extractor,
    device=trainer.model.device,
)

classifier(audio_file)

config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at stevhliu/my_awesome_minds_model were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at stevhliu/my_awesome_minds_model and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

[{'score': 0.09766869246959686, 'label': 'cash_deposit'},
 {'score': 0.07998877018690109, 'label': 'app_error'},
 {'score': 0.0781070664525032, 'label': 'joint_account'},
 {'score': 0.07667110115289688, 'label': 'pay_bill'},
 {'score': 0.07552521675825119, 'label': 'balance'}]