-------------------

Prepare the dataset with the Whisper processor


In [None]:
from transformers import WhisperProcessor

model_name = "openai/whisper-medium.en"
language = "english" # Change to your dataset's language
task = "transcribe" # Use "translate" if you're translating to English

processor = WhisperProcessor.from_pretrained(model_name, language=language, task=task)

In [3]:
from datasets import load_dataset, Audio, DatasetDict

# Chunked wav dataset
dataset_path = "C:\\Users\\dacla\\Documents\\DALI-chunks-wav"

raw_dataset = load_dataset("csv", data_files="metadata-wav.csv", split='train')
print("Full dataset", raw_dataset)

# Make a train/test split at this point !
raw_dataset = raw_dataset.train_test_split(test_size=0.2, shuffle=True, seed=555)
print("\nSplit dataset", raw_dataset)

Full dataset Dataset({
    features: ['transcript', 'file-wav'],
    num_rows: 29656
})

Split dataset DatasetDict({
    train: Dataset({
        features: ['transcript', 'file-wav'],
        num_rows: 23724
    })
    test: Dataset({
        features: ['transcript', 'file-wav'],
        num_rows: 5932
    })
})


In [77]:
import librosa

def prepare_dataset(batch):
    # Load and resample audio data
    audio_paths = [f"{dataset_path}\\{fname}" for fname in batch['file-wav']]
    audio_arrays = [librosa.load(path, sr=16000)[0] for path in audio_paths]
    
    # Compute log-Mel input features from the audio
    batch["input_features"] = processor.feature_extractor(audio_arrays, sampling_rate=16000).input_features

    # Encode the transcriptions to label ids
    labels = processor.tokenizer(batch["transcript"]).input_ids
    batch["labels"] = [[label if label != processor.tokenizer.pad_token_id else -100 for label in T] for T in labels]

    return batch

# Apply the function to the entire dataset
processed_dataset = raw_dataset.map(
    prepare_dataset,
    batched=True,
    batch_size=8,
    remove_columns=raw_dataset.column_names["train"])

Map:   0%|          | 0/23724 [00:00<?, ? examples/s]

Map:   0%|          | 0/5932 [00:00<?, ? examples/s]

In [81]:
# Save dataset to disc
processed_dataset.save_to_disk('dataset_whisper')

Saving the dataset (0/46 shards):   0%|          | 0/23724 [00:00<?, ? examples/s]

Saving the dataset (0/12 shards):   0%|          | 0/5932 [00:00<?, ? examples/s]

Start from here if retraining


In [2]:
from transformers import WhisperProcessor

model_name = "openai/whisper-medium.en"
language = "english" # Change to your dataset's language
task = "transcribe" # Use "translate" if you're translating to English

processor = WhisperProcessor.from_pretrained(model_name, language=language, task=task)

In [3]:
from transformers import DataCollatorForSeq2Seq
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import torch

# --- Data Collator ---
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have to be of different lengths and need different padding methods.
        # "input_features" for Whisper-based models (vs. "input_values" for wav2vec...)
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(input_features, 
                                                     return_tensors="pt",
                                                     return_attention_mask=True)
        
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor, padding=True)

Model


In [4]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(model_name)

# This is necessary for the model to work correctly with the Trainer
#model.config.forced_decoder_ids = None
#model.config.suppress_tokens = []

# send to the appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(f'Model {model_name} loaded on {device}')

Model openai/whisper-medium.en loaded on cuda


If continuing to train...

In [1]:
finetuned_model_path = ".\\whisper-ft"
model.load_state_dict(torch.load(finetuned_model_path, map_location=device))

NameError: name 'model' is not defined

In [5]:
# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Except those in the last layer
for param in model.proj_out.parameters():
        param.requires_grad = True

# Verify which layers are trainable
print("\nTrainable parameters after freezing:")
trainable_params = 0
frozen_params = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        trainable_params += param.numel()
        print(f"  - {name} (Trainable, shape: {param.shape})")
    else:
        frozen_params += param.numel()
        # print(f"  - {name} (Frozen)") # Uncomment to see all frozen params

total_params = trainable_params + frozen_params
print(f"\nTotal trainable parameters: {trainable_params}")
print(f"Total frozen parameters: {frozen_params}")
print(f"Total parameters: {total_params}")
print(f"Ratio of trained params to total params: {trainable_params / total_params:.4f}")



Trainable parameters after freezing:
  - model.decoder.embed_tokens.weight (Trainable, shape: torch.Size([51864, 1024]))

Total trainable parameters: 53108736
Total frozen parameters: 710748160
Total parameters: 763856896
Ratio of trained params to total params: 0.0695


Downsample if needed

In [6]:
from datasets import load_from_disk

sample_percentage = 0.1

# Load full prepared dataset
prepared_dataset_path = 'dataset_whisper'
prepared_datasets = load_from_disk(prepared_dataset_path)
print("--- Full Prepared Dataset ---")
print(prepared_datasets)

# Sample 1% from the training set
train_split = prepared_datasets["train"]
sampled_train_split = train_split.train_test_split(train_size=sample_percentage, shuffle=True, seed=555)['train'] # We only want the 'train' part of this new split

test_split = prepared_datasets["test"]
sampled_test_split = test_split.train_test_split(train_size=sample_percentage, shuffle=True, seed=555)['train'] 

# Overwrite the original splits with the sampled splits
prepared_datasets['train'] = sampled_train_split
prepared_datasets['test'] = sampled_test_split

print(f"\n--- Sampled ({sample_percentage*100}%) Dataset ---")
print(prepared_datasets)

# Now, use this smaller `prepared_datasets` object for the rest of your script
# (creating DataLoaders, etc.)

Loading dataset from disk:   0%|          | 0/46 [00:00<?, ?it/s]

--- Full Prepared Dataset ---
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 23724
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 5932
    })
})

--- Sampled (10.0%) Dataset ---
DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2372
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 593
    })
})


Training parameters

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
import re
from tqdm import tqdm

def remove_punctuation(s):
    s = re.sub(r'[^a-zA-Z0-9\s]', '', s)
    return s.lower()

# Training parameters
learning_rate = .0002
train_batch_size = 8
eval_batch_size = 16

# Defined train and test DLs
train_dataloader = DataLoader(prepared_datasets["train"], shuffle=True, collate_fn=data_collator, batch_size=train_batch_size)
eval_dataloader = DataLoader(prepared_datasets["test"], collate_fn=data_collator, batch_size=eval_batch_size)

optimizer = AdamW(model.parameters(), lr=learning_rate)
scaler = torch.amp.GradScaler('cuda')

wer_metric = evaluate.load("wer")



In [None]:
from transformers import get_scheduler

num_train_epochs = 20
num_warmup_steps = 100
total_steps = len(train_dataloader) * num_train_epochs

lr_scheduler = get_scheduler(name="linear",
                             optimizer=optimizer,
                             num_warmup_steps=num_warmup_steps,
                             num_training_steps=total_steps)

# Set initial WER max to inf
best_wer = float('inf')
output_dir = ".\\whisper-ft"

Main training cycle

In [None]:
for epoch in range(num_train_epochs):
    # --- TRAINING ---
    model.train()
    train_loss = 0
    
    # Use tqdm for a progress bar
    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}/{num_train_epochs}")):
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        optimizer.zero_grad()
        
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} | Average Training Loss: {avg_train_loss:.4f}")

    # --- EVALUATION ---
    model.eval()
    all_predictions = []
    all_labels = []

    # Use torch.no_grad() to save memory and computations
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Generate predictions. This is different from the training forward pass.
            generated_ids = model.generate(input_features=batch["input_features"], 
                                           attention_mask=batch["attention_mask"], 
                                           max_length=225)
            
            # Decode predictions
            predictions = processor.batch_decode(generated_ids, skip_special_tokens=True)
            
            # Decode labels, replacing -100 with pad token
            labels = batch["labels"].clone()
            labels[labels == -100] = processor.tokenizer.pad_token_id
            labels_str = processor.batch_decode(labels, skip_special_tokens=True)

            # Remove punctuation and capital letters from transcription
            predictions = [remove_punctuation(p) for p in predictions]

            all_predictions.extend(predictions)
            all_labels.extend(labels_str)

    # Compute WER
    wer = wer_metric.compute(predictions=all_predictions, references=all_labels)
    
    print(f"WER: {wer:.4f}")

    # Save the model if it has the best WER so far
    if wer < best_wer:
        best_wer = wer
        print(f"New best WER: {best_wer}. Saving model...")
        model.save_pretrained(output_dir)
        processor.save_pretrained(output_dir)
        print(f"Model saved to {output_dir}")

print("\n--- Training Complete ---")
print(f"Best WER achieved: {best_wer}")

Training Epoch 1/20:   0%|          | 0/297 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training Epoch 1/20:  19%|█▉        | 57/297 [01:14<05:05,  1.27s/it]

--------------------------

Testing a trained model

In [None]:
import torchaudio

def test_transcribe(audio_path):
    # Put in evaluation mode
    model.eval()

    # Load audio file
    print(f"Loading audio from: {audio_path}...")
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample if necessary (Whisper expects 16kHz)
    if sample_rate != 16000:
        print(f"Resampling audio from {sample_rate}Hz to 16kHz...")
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000 # Update sample rate after resampling

    # Ensure mono audio (Whisper expects single channel)
    if waveform.shape[0] > 1:
        print("Converting stereo audio to mono...")
        waveform = waveform.mean(dim=0, keepdim=True) # Average channels to mono

    # Convert to numpy array (required by feature_extractor for raw audio)
    audio_array = waveform.squeeze().numpy()

    # Extract features (Mel spectrogram)
    processed_audio = processor.feature_extractor(audio_array, 
                                                sampling_rate=sample_rate, 
                                                return_tensors="pt",
                                                return_attention_mask=True)

    input_features = processed_audio.input_features.to(device)
    attention_mask = processed_audio.attention_mask.to(device)

    print("Generating transcription...")
    with torch.no_grad():
        generated_ids = model.generate(input_features=input_features, 
                                       attention_mask=attention_mask,
                                       max_new_tokens=256,
                                       temperature=0.0,
                                       #no_speech_threshold=.3 # Error when using this ?
                                       )
        
    # Create the transcription
    transcription = processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription


# Load and preprocess the audio file
audio_path = 'reg.wav'
print("\nTranscription:\n", test_transcribe(audio_path))

Loading audio from: reg.wav...
Generating transcription...

Transcription:
  It was a clear black night, a clear white moon Warren G was on the streets trying to consume some skirts for the E so I could get some phones rolling in my ride, chilling all alone Just hit the east side of the LBC on a mission trying to find Mr. Warren G. Seen a call full of girls, ain't no need to tweak all you skirts know what's up with 213 So I hooked Celeste
