
A data collator class for CTC (Connectionist Temporal Classification) with padding functionality.
This class handles the batching and padding of input features and labels for wav2vec2 model training.
It processes audio features and their corresponding transcription labels, ensuring proper padding
and tensor conversion.
Args:
	processor (Wav2Vec2Processor): The wav2vec2 processor for handling inputs and labels
	padding (Union[bool, str]): The padding strategy to use. Defaults to True.
	max_length (Optional[int]): Maximum length for input features padding. Defaults to None.
	max_length_labels (Optional[int]): Maximum length for labels padding. Defaults to None.
	pad_to_multiple_of (Optional[int]): Pad input features to be multiple of this value. Defaults to None.
	pad_to_multiple_of_labels (Optional[int]): Pad labels to be multiple of this value. Defaults to None.
Methods:
	__call__(features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
		Processes a batch of features to create padded tensors suitable for model training.
		Args:
			features: List of dictionaries containing input values and labels
		Returns:
			Dict containing padded input tensors and processed labels with -100 for padding tokens


In [35]:
import json
import random
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import os
import numpy as np
import pandas as pd
import torch
import torchaudio
import transformers
from datasets import ClassLabel, load_dataset, load_metric, load_from_disk
from transformers import (Trainer, TrainingArguments, Wav2Vec2CTCTokenizer,
                          Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC,
                          Wav2Vec2Processor)

# Report CUDA availability, the number of visible devices and, when a GPU is
# present, the name of device 0.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)
print(torch.cuda.device_count())
if cuda_ready:
    print(torch.cuda.get_device_name(0))



True
1
NVIDIA GeForce RTX 4070 Ti SUPER


In [36]:
import argparse

# Hyperparameters exposed as command-line style flags so the same code can run
# as a script or inside a notebook kernel.
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default="facebook/wav2vec2-large-xlsr-53")
parser.add_argument('--unfreeze', action='store_true')
parser.add_argument('--lr', type=float, default=3e-4)
parser.add_argument('--warmup', type=float, default=500)
# Jupyter launches the kernel with `-f <connection file>`; this option absorbs it.
parser.add_argument('-f', '--fff', help="dummy argument to avoid error in Jupyter", default="dummy_value")
# parse_known_args() ignores any other argv entries the kernel or launcher may
# add, instead of aborting the cell with SystemExit as parse_args() would.
args, _unknown_args = parser.parse_known_args()

print(f"args: {args}")



args: Namespace(model='facebook/wav2vec2-large-xlsr-53', unfreeze=False, lr=0.0003, warmup=500, fff='c:\\Users\\westw\\AppData\\Roaming\\jupyter\\runtime\\kernel-v310886c7194b81b675aae211578db6de1832a187f.json')


In [37]:
# Load only the Cantonese (zh-HK) configuration of Common Voice 13.0,
# once for the train split and once for the test split.
common_voice_train, common_voice_test = (
    load_dataset("mozilla-foundation/common_voice_13_0", "zh-HK", split=split_name)
    for split_name in ("train", "test")
)

# Drop the metadata columns from both splits.
unused_cols = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
common_voice_train = common_voice_train.remove_columns(unused_cols)
common_voice_test = common_voice_test.remove_columns(unused_cols)



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
# Print the first example of each split as a quick sanity check.
for split in (common_voice_train, common_voice_test):
    print(split[0])


Dataset({
    features: ['path', 'audio', 'sentence', 'variant'],
    num_rows: 5593
})

In [None]:
# data preprocessing

# Character class of punctuation (ASCII and full-width) stripped from transcripts.
# Written as a raw string so the backslashes reach `re` untouched; in a normal
# string literal most of these are invalid escape sequences, which newer Python
# versions warn about.
chars_to_ignore_regex = r'[\丶\,\?\.\!\-\;\:"\“\%\‘\”\�\．\⋯\！\－\：\–\。\》\,\）\,\？\；\～\~\…\︰\，\（\」\‧\《\﹔\、\—\／\,\「\﹖\·\']'

import string
def remove_special_characters(batch):
    """Normalise the transcript of a single example.

    Punctuation matched by ``chars_to_ignore_regex`` is removed, the text is
    lower-cased and a trailing space is appended. If the cleaned text contains
    exactly one ASCII lowercase letter and that letter is "d", it is replaced
    with "啲" (presumably the colloquial spelling of that character — confirm).

    Args:
        batch: Mapping with a ``"sentence"`` string; it is updated in place.

    Returns:
        The same mapping with ``"sentence"`` replaced by the cleaned text.
    """
    cleaned = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    ascii_letter_count = sum(ch in string.ascii_lowercase for ch in cleaned)
    # Only rewrite a lone "d"; sentences with other Latin letters are left alone.
    if ascii_letter_count == 1 and "d" in cleaned:
        cleaned = cleaned.replace("d", "啲")
    batch["sentence"] = cleaned
    return batch

# Apply the transcript clean-up to both splits.
common_voice_train, common_voice_test = [
    split.map(remove_special_characters)
    for split in (common_voice_train, common_voice_test)
]

def extract_all_chars(batch):
    """Collect the distinct characters used across a batch of sentences.

    Args:
        batch: Mapping whose ``"sentence"`` entry is a list of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` holds the list of
        unique characters (in arbitrary order) and ``"all_text"`` holds the
        sentences joined by single spaces.
    """
    joined_text = " ".join(batch["sentence"])
    unique_chars = [*set(joined_text)]
    return dict(vocab=[unique_chars], all_text=[joined_text])

# Run extract_all_chars once over each whole split (batch_size=-1) to get its character set.
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names,)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names,)

# Sort the merged character set so every run assigns the same id to the same
# character; plain set iteration order can differ between kernel sessions,
# which would make a saved checkpoint disagree with a regenerated vocab.json.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
# Keep only non-ASCII characters, then add the space back as the word separator.
vocab_list = [char for char in vocab_list if not char.isascii()]
vocab_list.append(" ")

vocab_dict = {char: index for index, char in enumerate(vocab_list)}
# The tokenizer uses "|" as the visible word delimiter in place of " ".
vocab_dict["|"] = vocab_dict.pop(" ")

# Unknown and padding tokens take the last two ids.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

with open("vocab.json", "w", encoding="utf-8") as vocab_file:
    json.dump(vocab_dict, vocab_file)


Map:   0%|          | 0/8425 [00:00<?, ? examples/s]

Map: 100%|██████████| 8425/8425 [00:00<00:00, 181566.47 examples/s]
Map: 100%|██████████| 5593/5593 [00:00<00:00, 275056.48 examples/s]


In [40]:
# Re-declare the "audio" column of both splits as 16 kHz audio
# (sampling_rate=16000 is passed to the Audio feature).
from datasets import Audio

common_voice_train, common_voice_test = [
    split.cast_column("audio", Audio(sampling_rate=16000))
    for split in (common_voice_train, common_voice_test)
]


In [None]:
# Display the test split summary (feature names and row count).
common_voice_test

Dataset({
    features: ['path', 'audio', 'sentence', 'variant'],
    num_rows: 5593
})

In [None]:
# Build the CTC tokenizer from the generated vocab.json, pair it with a 16 kHz
# feature extractor, and save the combined processor to disk.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained("./wav2vec2-large-xlsr-cantonese")



Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='', vocab_size=3653, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	3651: AddedToken("[UNK]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	3652: AddedToken("[PAD]", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	3653: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3654: AddedToken("<

In [41]:
def prepare_dataset_for_batching(batch, processor_obj=None):
    """Convert a batch of raw rows into unpadded model inputs and labels.

    Args:
        batch: Batched mapping with an ``"audio"`` list (each item exposing
            ``"array"`` and ``"sampling_rate"``) and a ``"sentence"`` list.
        processor_obj: Callable processor that also exposes ``.tokenizer``;
            it must be supplied by the caller.

    Returns:
        The same ``batch`` with three added entries: ``"input_values"``
        (processor output, no padding), ``"labels"`` (token ids without
        special tokens, no padding) and ``"input_length"`` (clip duration in
        seconds, i.e. sample count divided by sampling rate).
    """
    waveforms, rates = [], []
    for clip in batch["audio"]:
        waveforms.append(clip["array"])
        rates.append(clip["sampling_rate"])

    # No padding and no tensor conversion here; batching happens in the collator.
    # The sampling rate of the first clip is passed for the whole batch.
    audio_features = processor_obj(
        waveforms,
        sampling_rate=rates[0],
        padding=False,
        return_tensors=None,
    )
    batch["input_values"] = audio_features.input_values

    # CTC targets are plain character ids: no special tokens, no padding.
    tokenized = processor_obj.tokenizer(
        batch["sentence"],
        add_special_tokens=False,
        padding=False,
    )
    batch["labels"] = tokenized.input_ids

    # Duration of each clip in seconds.
    batch["input_length"] = [
        len(waveform) / rate for waveform, rate in zip(waveforms, rates)
    ]

    return batch

def _preprocess_split(split):
    """Run prepare_dataset_for_batching over one split with the shared settings."""
    return split.map(
        prepare_dataset_for_batching,
        num_proc=5,  # number of worker processes
        batched=True,  # the function expects lists of rows
        fn_kwargs={"processor_obj": processor},  # pass the processor explicitly to each worker
        load_from_cache_file=True,
    )


common_voice_train = _preprocess_split(common_voice_train)
common_voice_test = _preprocess_split(common_voice_test)

# Drop the raw columns from both splits once the model inputs exist.
columns_to_remove = ["path", "audio", "sentence", "variant"]
common_voice_train = common_voice_train.remove_columns(columns_to_remove)
common_voice_test = common_voice_test.remove_columns(columns_to_remove)



Map (num_proc=5): 100%|██████████| 8425/8425 [03:10<00:00, 44.32 examples/s]
Map (num_proc=5):   0%|          | 0/5593 [00:00<?, ? examples/s]

: 

: 

In [None]:
# Define a data collator for CTC with padding and masking
@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of CTC examples into one batch of tensors.

    Audio inputs and label ids are padded separately because they have
    different lengths and padding values. Padded label positions are replaced
    by -100.

    Attributes:
        processor: Wav2Vec2 processor providing the feature extractor and tokenizer.
        padding: Padding strategy forwarded to both ``pad`` calls.
        max_length: ``max_length`` forwarded when padding the audio inputs.
        max_length_labels: ``max_length`` forwarded when padding the labels.
        pad_to_multiple_of: Multiple to which the audio inputs are padded.
        pad_to_multiple_of_labels: Multiple to which the labels are padded.

    Note:
        Per the Hugging Face ``pad`` documentation, ``max_length`` only takes
        effect with ``padding="max_length"``; with ``padding=True`` the batch is
        padded to its longest member and nothing is truncated.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate examples into padded tensors.

        Args:
            features: Examples, each with ``"input_values"`` and ``"labels"``;
                any other keys are ignored.

        Returns:
            The padded audio batch with an added ``"labels"`` tensor in which
            padding positions are set to -100.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad labels through the tokenizer directly; this is what the
        # deprecated `as_target_processor()` context manager delegated to.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Positions where the attention mask is not 1 are padding; mark them -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)
        batch["labels"] = labels

        return batch

In [None]:
# Metrics and model initialization, feature extractor, and model loading
import evaluate

# 15 seconds of audio expressed in samples at 16 kHz.
# NOTE(review): the collator is built with padding=True; confirm whether
# max_length / max_length_labels have any effect in that mode, since they may
# only apply with padding="max_length" and therefore not cap anything.
max_input_samples = 16_000 * 15

data_collator = DataCollatorCTCWithPadding(
    processor=processor,
    padding=True,
    max_length=max_input_samples,
    max_length_labels=512,
    pad_to_multiple_of=16,
    pad_to_multiple_of_labels=8,
)

# Character error rate metric from the `evaluate` library
# (used here instead of datasets.load_metric).
cer_metric = evaluate.load("cer")


def compute_metrics(pred):
    """Compute the character error rate for an evaluation run.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``.
            ``predictions`` may be raw logits of shape (batch, time, vocab) or
            already-argmaxed token ids of shape (batch, time); a tuple is
            accepted and its first element is used. ``label_ids`` uses -100
            for padding.

    Returns:
        ``{"cer": <float>}`` computed with ``cer_metric``.
    """
    pad_token_id = processor.tokenizer.pad_token_id

    predictions = pred.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    if predictions.ndim == 3:
        pred_ids = np.argmax(predictions, axis=-1)
        # When batches of different lengths are concatenated, the Trainer pads
        # the time axis with -100. A fully padded frame would argmax to token 0
        # and add a spurious character, so map those frames to the pad (CTC
        # blank) token instead.
        pred_ids[predictions.max(axis=-1) == -100] = pad_token_id
    else:
        # Already token ids: only the -100 padding needs replacing.
        pred_ids = np.where(predictions == -100, pad_token_id, predictions)

    # np.where returns a new array, so the caller's label_ids is not modified.
    label_ids = np.where(pred.label_ids == -100, pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    # Labels are not CTC-collapsed, so repeated characters must be kept.
    label_str = processor.batch_decode(label_ids, group_tokens=False, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}

# Load the pretrained checkpoint named by --model with a CTC head sized to the
# tokenizer, and use the tokenizer's pad token as the CTC blank/padding id.
model = Wav2Vec2ForCTC.from_pretrained(
    args.model,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

# Unless --unfreeze is given, freeze the feature encoder.
# freeze_feature_encoder() is the current name of the deprecated
# freeze_feature_extractor().
if not args.unfreeze:
    model.freeze_feature_encoder()


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda:0


In [None]:
# Configure PyTorch Dynamo to suppress errors during optimization
# (sets the process-wide torch._dynamo.config.suppress_errors flag).
import torch._dynamo
torch._dynamo.config.suppress_errors = True

# Request expandable segments from the CUDA caching allocator.
# NOTE(review): this is set after torch has been imported and CUDA queried in
# earlier cells — confirm the allocator still reads it at this point, and that
# the option is supported on this platform.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Training arguments configuration
training_args = TrainingArguments(
    # Output directory for model checkpoints and logs
    output_dir="./wav2vec2-large-xlsr-cantonese",

    # Group sequences of similar lengths together to improve efficiency
    group_by_length=True,

    # Batch size per device for training and for evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,

    # Number of gradient accumulation steps
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency

    # Evaluation strategy and frequency
    evaluation_strategy="steps",
    eval_steps=400,
    # Move accumulated predictions to the CPU every few eval steps. Without
    # this, the logits of the whole eval set are gathered on the GPU before
    # compute_metrics runs, which is the likely cause of the multi-GiB
    # allocation in the recorded out-of-memory error.
    # NOTE: the full logits still have to fit in host memory; passing token ids
    # instead (via Trainer's preprocess_logits_for_metrics) would shrink that
    # further, but needs a metrics function that accepts ids.
    eval_accumulation_steps=8,

    # Number of training epochs
    num_train_epochs=10,

    # Mixed precision training settings
    fp16=True,
    fp16_backend="amp",
    fp16_full_eval=True,

    # Logging configuration
    logging_strategy="steps",
    logging_steps=400,

    # Learning rate and warmup, both taken from the parsed arguments
    # (--warmup is parsed as a float, warmup_steps expects an integer)
    learning_rate=args.lr,
    warmup_steps=int(args.warmup),

    # Model checkpointing
    save_steps=2376,
    save_total_limit=3,

    # Data loading configuration
    dataloader_num_workers=0,

    # Optimizer settings
    optim="adamw_8bit",

    # Misc settings
    remove_unused_columns=False,
    torch_compile=False, # support not well on some models, so set to False for now
)

# Move the model to the selected device before handing it to the Trainer.
model = model.to(device)

# Wire together model, arguments, data, collator and metric.
# NOTE(review): the recorded output shows a warning pointing at this call —
# presumably about the `tokenizer` argument; confirm against the installed
# transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

# Start training
trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 13.23 GiB. GPU 0 has a total capacity of 15.99 GiB of which 0 bytes is free. Of the allocated memory 15.01 GiB is allocated by PyTorch, and 2.01 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)