# Trial 2

In [1]:
import os
import re
import json
import random
import string
from dataclasses import dataclass
from typing import Dict, List, Union, Optional

import torch
import torchaudio
import librosa
import evaluate
from datasets import load_dataset, Audio, DatasetDict
from transformers import (
    Wav2Vec2BertForCTC,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2BertProcessor,
    TrainingArguments,
    Trainer,
    set_seed,
)

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
device = "cuda" if torch.cuda.is_available() else "cpu"
set_seed(42)


Torch: 2.8.0+cu126
CUDA available: True


In [2]:
print(device)

cuda


In [3]:
import random

def display10(dataset):
    for i in range(10):
        r = random.randint(0,len(dataset))
        print(i+1 , dataset[r]['sentence'])

In [4]:
# Dataset and language
CV_VERSION = "mozilla-foundation/common_voice_16_0"
LANG_ID = "hi"  # Hindi

# Base SSL model (wav2vec2-bert encoder)
BASE_MODEL = "facebook/w2v-bert-2.0"

# Audio parameters
TARGET_SAMPLING_RATE = 16000

# Training output dir
OUTPUT_DIR = "w2vbert-hi-ctc-cv16"

# Training hyperparameters (tune for your budget)
BATCH_SIZE          = 1      # 1 is the safest, 2 may work if audio lengths are short
GRAD_ACCUM          = 16     # Accumulate gradients for effective batch of 16
LEARNING_RATE       = 2e-4   # Good starting LR for ASR, tune lower if model is unstable
NUM_TRAIN_EPOCHS    = 10
EVAL_STRATEGY       = "steps"
EVAL_STEPS          = 1000   # Evaluate less frequently to save memory
SAVE_STEPS          = 1000   # Save less frequently to reduce disk I/O
LOGGING_STEPS       = 50
WARMUP_RATIO        = 0.05
FP16                = torch.cuda.is_available()      # Enable mixed precision


# If you want to push to the Hub, set these:
PUSH_TO_HUB = True
HF_REPO_ID = "Ed-168/Fine-tuned-wav2vec2-BERT-indian-languages"  # e.g. "username/w2vbert-hi-ctc-cv17"


In [5]:
# This will download and prepare the dataset (first run may take a while)
from datasets import load_dataset

common_voice_train = load_dataset(
    "mozilla-foundation/common_voice_16_0",
    "hi",
    split="train+validation",
    trust_remote_code=True
)
common_voice_test = load_dataset(
    "mozilla-foundation/common_voice_16_0",
    "hi",
    split="test",
    trust_remote_code=True
)



In [9]:
NUM_TRAIN_SAMPLES = 2000
NUM_TEST_SAMPLES = 1000

common_voice_train = common_voice_train.select(range(NUM_TRAIN_SAMPLES))
common_voice_test = common_voice_test.select(range(NUM_TEST_SAMPLES))


In [10]:
print(len(common_voice_train))
print(len(common_voice_test))

2000
1000


In [11]:

common_voice_train = common_voice_train.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes" , "variant"])
common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes" , "variant"])

In [12]:
from datasets import Audio

common_voice_train = common_voice_train.cast_column("audio" , Audio(sampling_rate = 16000))
common_voice_test = common_voice_test.cast_column("audio" , Audio(sampling_rate = 16000))

In [13]:
display10(common_voice_train)

1 फैशन की दुनिया में कामयाब होने के लिए अपनाए तरीके
2 आप क्या पहनने वाले हैं?
3 टॉम ने नीले कपड़े पहने थे।
4 उत्तर कोरिया ने अपने रक्षा मंत्री को तोप से उड़ाया
5 उसने मुझसे वह बात छिपाई।
6 यूपी में मूर्ति, माफिया और मुल्जिमों की सरकार: नकवी
7 उसने विस्तृत रूप से अपनी योजना समझाई।
8 नोएडाः वेब वर्क कंपनी के दफ्तर पर पुलिस का छापा, अहम दस्तावेज बरामद
9 शाह को शरद की चुनौती, कहा- महाराष्ट्र की माटी में आकर ललकारना आसान नहीं
10 टॉम बिलकुल हमारी तरह है।


In [14]:
# Define the regex at the top level so that subprocesses can access it
chars_to_ignore_regex = r"[\"\'\(\)\[\]\{\}\<\>\—\–\-\—\—\–\—\.\,\?\!\:\;\।\d\@\#\$\%\^\&\*\+\=\_\\\/\|~`]+"

def normalize_text(batch):
    text = batch["sentence"]
    text = text.lower()
    text = re.sub(chars_to_ignore_regex, " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    batch["sentence"] = text
    return batch

common_voice_train = common_voice_train.map(normalize_text)
common_voice_test = common_voice_test.map(normalize_text)
display10(common_voice_train)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

1 कोड़ा को न दी जाए जमानत सीबीआई
2 लखनऊ में बुजुर्ग महिला की हत्या रेप की आशंका
3 जीत के बाद विराट बोले हमें हार्दिक के ऑफ कटर्स पर भरोसा था
4 हॉकी मैच के दौरान स्टेडियम में मौजूद रहेंगे प्रधानमंत्री मनमोहन सिंह
5 इस दुर्घटना के लिए कौन ज़िम्मेदार है
6 दिल्ली नॉर्थ एमसीडी में आर्थिक तंगी कमिश्नर को नहीं मिली तीन महीने से सैलरी
7 छत्तीसगढ़ नक्सलगढ़ पर प्रहार का प्लान केंद्रीय गृह सचिव ने बुलाई उच्च स्तरीय बैठक
8 वे विद्यार्थी कोरियाई हैं
9 मैंने उसको पैसे देने की कोशिश करी पर उसने इनकार कर दिया
10 वह अपने प्रयोगों में कबूतरों का उपयोग करता था


In [15]:
# Build a set of characters present in the training transcripts
def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    return {"all_text": [all_text]}

vocabs = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, remove_columns=common_voice_train.column_names)
all_text = " ".join(vocabs["all_text"])
vocab_list = sorted(list(set(list(all_text))))

# Remove the space from the set; we'll add a dedicated word_delimiter_token later.
if " " in vocab_list:
    vocab_list.remove(" ")

# Build vocab dict
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict["|"] = len(vocab_dict)  # word delimiter
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

print("Vocab size:", len(vocab_dict))
print("Sample of vocab keys:", list(vocab_dict.keys())[:60])

# Save vocab to disk
os.makedirs(OUTPUT_DIR, exist_ok=True)
vocab_path = os.path.join(OUTPUT_DIR, "vocab-v2.json")
with open(vocab_path, "w", encoding="utf-8") as f:
    json.dump(vocab_dict, f, ensure_ascii=False, indent=2)
print("Saved vocab to:", vocab_path)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Vocab size: 98
Sample of vocab keys: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ँ', 'ं', 'ः', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ऑ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प']
Saved vocab to: w2vbert-hi-ctc-cv16\vocab-v2.json


In [16]:
# Tokenizer for CTC

from transformers import SeamlessM4TFeatureExtractor

tokenizer = Wav2Vec2CTCTokenizer(
    vocab_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor (handles audio to input features)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(BASE_MODEL)

# Combined processor (specific to wav2vec2-bert)
from transformers import Wav2Vec2BertProcessor

processor = Wav2Vec2BertProcessor(feature_extractor=feature_extractor , tokenizer=tokenizer)

# # Save processor for later use/inference
# processor.save_pretrained(OUTPUT_DIR)
# print("Processor saved to:", OUTPUT_DIR)


In [17]:
rand_clip = random.randint(0 , len(common_voice_train) -1 )
print("Target text:", common_voice_train[rand_clip]["sentence"])
print("Input array shape:", common_voice_train[rand_clip]["audio"]["array"].shape)
print("Sampling rate:", common_voice_train[rand_clip]["audio"]["sampling_rate"])

Target text: ओम पुरी का निधन सलमान ने शेयर की ये खास तस्वीर
Input array shape: (86976,)
Sampling rate: 16000


In [18]:
def prepare_dataset(batch):
    audio = batch['audio']
    batch['input_features'] = processor(audio['array'] , sampling_rate=audio['sampling_rate'])
    batch["input_length"] = len(batch['input_features'])

    batch['labels'] = processor(text = batch['sentence']).input_ids

    return batch

common_voice_train = common_voice_train.map(prepare_dataset , remove_columns=common_voice_train.column_names)
common_voice_test = common_voice_test.map(prepare_dataset , remove_columns = common_voice_test.column_names)

print("Example lengths:", len(common_voice_train[0]["input_features"]), len(common_voice_train[0]["labels"]))


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Example lengths: 2 23


In [19]:
print(common_voice_train[0]['input_features'])

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]], 'input_features': [[[-6.366292953491211, -6.282590866088867, -5.798328876495361, -5.775012969970703, -5.9785237312316895, -6.241710186004639, -6.477811813354492, -6.640229225158691, -6.4594621658325195, -6.023435115814209, -5.965824604034424, -6.000712871551514, -5.979282855987549, -5.993183135986328, -6.2013325691223145, -6.42795991897583, -6.566420078277588, -6.47

In [20]:
import numpy as np

In [21]:
data = [{'attention_mask': [[1, 1, 1, ...]], 
         'input_features': [[[-2.3008129596710205, -2.249695301055908, ...]]]}]

input_features = [item['input_features'] for item in data]

input_features

[[[[-2.3008129596710205, -2.249695301055908, Ellipsis]]]]

In [22]:
from dataclasses import dataclass
from typing import List, Dict, Any
import torch

@dataclass
class DataCollatorCTCWithPadding:
    processor: Any
    padding: str = 'longest'

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Extract feature arrays and masks from your nested structure
        input_feature_arrays = []
        attention_masks = []
        for item in features:
            nested = item["input_features"]
            # 'input_features': [[...]], 'attention_mask': [[...]]
            feats = nested["input_features"]
            mask = nested["attention_mask"]

            # Remove the extra outer list if present
            if isinstance(feats, list) and isinstance(feats[0], list):
                feats = feats[0]
            if isinstance(mask, list) and isinstance(mask[0], list):
                mask = mask[0]

            # Convert to tensors
            feats = torch.tensor(feats, dtype=torch.float32)
            mask = torch.tensor(mask, dtype=torch.long)

            input_feature_arrays.append(feats)
            attention_masks.append(mask)

        # If feature_extractor expects a list of dicts per sample, pass both feature+mask together
        batch = self.processor.feature_extractor.pad(
            [{"input_features": f, "attention_mask": m} for f, m in zip(input_feature_arrays, attention_masks)],
            padding=self.padding,
            return_tensors="pt"
        )

        # Labels: extract directly
        labels = [item["labels"] for item in features]
        # Pad labels with tokenizer
        label_features = [{"input_ids": l} for l in labels]
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt"
        )

        # Replace tokenizer padding ID with -100 for CTC loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch




data_collator = DataCollatorCTCWithPadding(processor=processor, padding='longest')


In [23]:
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    # pred.predictions is float logits of shape (batch, time, vocab_size)
    pred_logits = pred.predictions
    pred_ids = torch.from_numpy(pred_logits).argmax(-1)

    # Decode predictions and references
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    # Replace -100 with pad_token_id for decoding refs
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


In [24]:
# Initialize the CTC head on top of wav2vec2-bert encoder
model = Wav2Vec2BertForCTC.from_pretrained(
    BASE_MODEL,
    vocab_size=len(processor.tokenizer),
    pad_token_id=processor.tokenizer.pad_token_id,
    ctc_loss_reduction="mean",
    # You can set this to True for long-form training stability with LayerDrop models
    # but w2v-bert-2.0 doesn't use LayerDrop by default.
)

# Make sure the model knows the correct special tokens
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = len(processor.tokenizer)
model.to(device)

# Optionally freeze the feature encoder for a few epochs if you have small compute
# (uncomment to try). Often helps stabilize early training.
# if hasattr(model, "freeze_feature_encoder"):
#     model.freeze_feature_encoder()

# Print parameter count
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total params: {total_params:,} | Trainable: {trainable_params:,}")


Some weights of Wav2Vec2BertForCTC were not initialized from the model checkpoint at facebook/w2v-bert-2.0 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total params: 580,595,620 | Trainable: 580,595,620


In [25]:
# TrainingArguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    group_by_length=True,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    save_steps=SAVE_STEPS,
    eval_steps=EVAL_STEPS,
    logging_steps=LOGGING_STEPS,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    warmup_ratio=WARMUP_RATIO,
    fp16=FP16,
    save_total_limit=2,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=PUSH_TO_HUB,
    report_to=["none"],  
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,  # ensures padding works
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Trainer is ready.")


  trainer = Trainer(


Trainer is ready.


In [22]:
# common_voice_train = common_voice_train.rename_column("input_features", "input_values")
# common_voice_train

In [26]:
common_voice_train

Dataset({
    features: ['input_features', 'input_length', 'labels'],
    num_rows: 2000
})

In [27]:
train_result = trainer.train()
trainer.save_model(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)


print("Training complete. Model and processor saved to:", OUTPUT_DIR)

Step,Training Loss


KeyboardInterrupt: 

In [None]:
# After training
metrics = train_result.metrics
metrics["train_samples"] = len(common_voice_train)
metrics["num_epochs"] = training_args.num_train_epochs  

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

print(f"Training complete! Model and processor saved to: {OUTPUT_DIR}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Metrics: {metrics}")
