In [1]:
!nvidia-smi

Sun Dec 28 00:52:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   77C    P0             33W /   70W |    3744MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%pip install torch torchcodec transformers datasets



First login to huggingface

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

Load the dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("PolyAI/minds14", "en-US", split="train")

split_dataset = dataset.train_test_split(test_size=113)

print(f"train size: {len(split_dataset['train'])}")
print(f"test size: {len(split_dataset['test'])}")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


train size: 450
test size: 113


Inspect the first sample

In [5]:
split_dataset["train"][0]

{'path': 'en-US~LATEST_TRANSACTIONS/602bae0005f96973d67944eb.wav',
 'audio': <datasets.features._torchcodec.AudioDecoder at 0x7f5b483d9ac0>,
 'transcription': 'please show me my last five transactions',
 'english_transcription': 'please show me my last five transactions',
 'intent_class': 12,
 'lang_id': 4}

Keep only transcription and audio

In [6]:
minds14 = split_dataset.select_columns(["audio", "transcription"])

Set up Whisper for transcription of English

In [7]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small"
)

Let's look at the sampling rate of our audio dataset

In [8]:
minds14["train"].features

{'audio': Audio(sampling_rate=8000, decode=True, stream_index=None),
 'transcription': Value('string')}

Whisper was trained on 16khz audio, so we need to indicate that our audio must be upsampled to 16khz on the fly via the `cast_column` method

In [9]:
from datasets import Audio

sampling_rate = processor.feature_extractor.sampling_rate
minds14 = minds14.cast_column("audio", Audio(sampling_rate=sampling_rate))

Prepare the dataset for processing

In [10]:
def prepare_dataset(example):
    audio = example["audio"]

    example = processor(
        audio=audio["array"],
        sampling_rate=audio["sampling_rate"],
        text=example["transcription"],
    )

    # compute input length of audio sample in seconds
    example["input_length"] = len(audio["array"]) / audio["sampling_rate"]

    return example

Only keep the data returned after mapping to the `prepare_dataset` function

In [11]:
minds14 = minds14.map(
    prepare_dataset, remove_columns=minds14.column_names["train"], num_proc=1
)

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Ensure we don't accidentally pass audio longer than 30s, because if we do Whisper will automatically truncate the audio

In [12]:
max_input_length = 30.0


def is_audio_in_length_range(length):
    return length < max_input_length

Apply the filter function

In [13]:
minds14["train"] = minds14["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],
)

Filter:   0%|          | 0/450 [00:00<?, ? examples/s]

Check for training data that was removed by this process

In [14]:
minds14["train"]

Dataset({
    features: ['input_features', 'labels', 'input_length'],
    num_rows: 446
})

Create the datacollator for this task

In [15]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [
            {"input_features": feature["input_features"][0]} for feature in features
        ]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [16]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

Define the evaluation metrics as WER

In [17]:
%pip install evaluate jiwer



In [18]:
import evaluate

metric = evaluate.load("wer")

Define our compute metrics function

In [19]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer()


def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # compute orthographic wer
    wer_ortho = metric.compute(predictions=pred_str, references=label_str)

    # compute normalised WER
    pred_str_norm = [normalizer(pred) for pred in pred_str]
    label_str_norm = [normalizer(label) for label in label_str]
    # filtering step to only evaluate the samples that correspond to non-zero references:
    pred_str_norm = [
        pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0
    ]
    label_str_norm = [
        label_str_norm[i]
        for i in range(len(label_str_norm))
        if len(label_str_norm[i]) > 0
    ]

    wer = metric.compute(predictions=pred_str_norm, references=label_str_norm)

    return {"wer_ortho": wer_ortho, "wer": wer}

Load the tiny pretrained Whisper checkpoint

In [20]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

In [21]:
# from functools import partial

# # disable cache during training since it's incompatible with gradient checkpointing
# model.config.use_cache = False

# # set language and task for generation and re-enable cache
# model.generate = partial(
#     model.generate, use_cache=True
# )

Define training arguments

In [22]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-finetuned-minds14-en-us",  # name on the HF Hub
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_steps=50,
    max_steps=500,  # increase to 4000 if you have your own GPU or a Colab paid plan
    gradient_checkpointing=False,
    fp16=True,
    fp16_full_eval=True,
    eval_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

Set up trainer

In [23]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=minds14["train"],
    eval_dataset=minds14["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

  trainer = Seq2SeqTrainer(


Initiate training

In [24]:
trainer.train()

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Wer Ortho,Wer
100,0.2901,0.54117,0.294564,0.266789
200,0.0542,0.57593,0.282554,0.257631
300,0.007,0.62408,0.286979,0.263126
400,0.0028,0.638128,0.286979,0.265568
500,0.0022,0.643646,0.290139,0.269231


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


TrainOutput(global_step=500, training_loss=0.33350919549912217, metrics={'train_runtime': 1237.3968, 'train_samples_per_second': 6.465, 'train_steps_per_second': 0.404, 'total_flos': 1.9611403886592e+17, 'train_loss': 0.33350919549912217, 'epoch': 17.857142857142858})

Push to hub

In [25]:
kwargs = {
     "dataset_tags": "PolyAI/minds14",
    "finetuned_from": "openai/whisper-tiny",
    "tasks": "automatic-speech-recognition",
}

In [26]:
trainer.push_to_hub(**kwargs)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...78292.0cdc1210a947.1992.0: 100%|##########| 7.43kB / 7.43kB            

  ...4-en-us/training_args.bin: 100%|##########| 6.03kB / 6.03kB            

  ...3234.0cdc1210a947.22131.0: 100%|##########| 13.2kB / 13.2kB            

  ...78555.0cdc1210a947.1992.1: 100%|##########| 19.9kB / 19.9kB            

  ...4-en-us/model.safetensors:  22%|##2       | 33.5MB /  151MB            

CommitInfo(commit_url='https://huggingface.co/dzur658/whisper-tiny-finetuned-minds14-en-us/commit/b621da8d6b4f13af4952065a7f2deea75b8f6b3b', commit_message='End of training', commit_description='', oid='b621da8d6b4f13af4952065a7f2deea75b8f6b3b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dzur658/whisper-tiny-finetuned-minds14-en-us', endpoint='https://huggingface.co', repo_type='model', repo_id='dzur658/whisper-tiny-finetuned-minds14-en-us'), pr_revision=None, pr_num=None)