<a href="https://colab.research.google.com/github/flowfree/ai-labs/blob/main/audio_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Playground** - Use this notebook for quick coding or fast prototyping.


In [None]:
pip install transformers datasets evaluate jiwer

In [None]:
from datasets import load_dataset

# Load the first 20 en-US examples of PolyAI/minds14, split them 80/20 into
# train/test, and drop the columns this notebook does not use.
dataset = (
    load_dataset('PolyAI/minds14', name='en-US', split='train[:20]')
    .train_test_split(test_size=0.2)
    .remove_columns(['english_transcription', 'intent_class', 'lang_id'])
)
dataset

In [25]:
from transformers import AutoProcessor

# Name of the pretrained checkpoint; reused further down when loading the CTC model.
checkpoint = 'facebook/wav2vec2-base'
# Processor for that checkpoint; its .tokenizer and .feature_extractor are used later on.
processor = AutoProcessor.from_pretrained(checkpoint)



In [26]:
from datasets import Audio

def uppercase(example):
    """Return a one-key dict holding the upper-cased transcription.

    Args:
        example: Mapping with a string under ``'transcription'``.

    Returns:
        dict: ``{'transcription': <upper-cased text>}``.
    """
    shouted = example['transcription'].upper()
    return {'transcription': shouted}

def prepare_dataset(batch):
    """Run the processor over one example's audio and transcription.

    Args:
        batch: Example with an ``'audio'`` entry (providing ``'array'`` and
            ``'sampling_rate'``) and a ``'transcription'`` string.

    Returns:
        The processor output, extended with ``'input_length'``: the length of
        the first ``input_values`` row.
    """
    waveform = batch['audio']
    encoded = processor(
        waveform['array'],
        sampling_rate=waveform['sampling_rate'],
        text=batch['transcription'],
    )
    encoded['input_length'] = len(encoded['input_values'][0])
    return encoded

# Cast the audio column to a 16 kHz Audio feature and upper-case the
# transcriptions, then run every example through the processor.
dataset = dataset.cast_column('audio', Audio(sampling_rate=16_000)).map(uppercase)
encoded_dataset = dataset.map(prepare_dataset)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [22]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Collate CTC examples into a single padded batch.

    Audio inputs and label ids are padded in two separate ``processor.pad``
    calls because they have different lengths and need different padding.
    Padded label positions are set to ``-100``.
    """

    # Object whose ``pad`` method pads both the audio inputs and the labels.
    processor: AutoProcessor
    # Padding argument forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = "longest"

    def __call__(
        self,
        features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry.

        Args:
            features: Examples carrying ``"input_values"`` (only the first
                row of each is used) and ``"labels"``.

        Returns:
            The padded input batch with ``"labels"`` added.
        """
        # Gather the audio inputs: first row of each example's input_values.
        audio_items = []
        for item in features:
            audio_items.append({"input_values": item["input_values"][0]})

        # Gather the label ids under the key the label padding expects.
        label_items = []
        for item in features:
            label_items.append({"input_ids": item["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            return_tensors="pt",
        )
        padded_labels = self.processor.pad(
            labels=label_items,
            padding=self.padding,
            return_tensors="pt",
        )

        # Positions whose attention mask is not 1 are padding; mark them -100
        # so they are ignored by the loss.
        label_ids = padded_labels["input_ids"]
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = label_ids.masked_fill(is_padding, -100)

        return batch


# Collator instance passed to the Trainer; uses 'longest' padding.
data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding='longest',
)

In [29]:
from transformers import AutoModelForCTC, TrainingArguments, Trainer
import evaluate
import numpy as np


def compute_metrics(pred):
    """Compute the word error rate (WER) for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (logits; the argmax
            over the last axis gives the predicted token ids) and
            ``label_ids`` (reference token ids in which ``-100`` marks
            positions to be treated as padding).

    Returns:
        dict: ``{'wer': <score>}`` as returned by the ``evaluate`` WER metric.
    """
    # NOTE: the metric is loaded on every call, as in the original code.
    wer_metric = evaluate.load('wer')

    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Replace the -100 sentinel with the pad token id on a new array, so the
    # caller's ``pred.label_ids`` is not modified in place.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # References are decoded with group_tokens=False; predictions use the default.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    score = wer_metric.compute(predictions=pred_str, references=label_str)
    return {'wer': score}
    

# Load the checkpoint with a CTC head, using the tokenizer's pad token id and
# mean-reduced CTC loss.
model = AutoModelForCTC.from_pretrained(
    checkpoint,
    ctc_loss_reduction='mean',
    pad_token_id=processor.tokenizer.pad_token_id,
)

# Training length is governed by max_steps. The previous num_train_epochs=30
# was removed: a positive max_steps overrides num_train_epochs, so that value
# was never used and only made the configuration misleading.
training_args = TrainingArguments(
    output_dir='output_dir',
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 examples per optimizer step per device
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2000,
    gradient_checkpointing=True,
    fp16=True,
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    # save_steps and eval_steps are kept equal so that every evaluation has a
    # matching checkpoint for load_best_model_at_end.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_hid.weight', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.bias', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored