In [1]:
import re
import torch
import evaluate
from datasets import load_dataset, Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Load model and processor
repo_name = "Ed-168/wav2vec2-large-xls-r-300m-hi/checkpoint-1880"
model = Wav2Vec2ForCTC.from_pretrained(repo_name).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(repo_name)

# Load datasets (only specified number of samples)
common_voice_test = load_dataset("mozilla-foundation/common_voice_16_0", "hi", split="test")
NUM_TEST_SAMPLES = 750
common_voice_test = common_voice_test.select(range(NUM_TEST_SAMPLES))

# Remove irrelevant columns
common_voice_test = common_voice_test.remove_columns(
    ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes", "variant"]
)

# Cast audio column to correct sampling rate
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16000))

# Define regex to normalize text
chars_to_ignore_regex = r"[\"\'\(\)\[\]\{\}\<\>\—\–\-\—\—\–\—\.\,\?\!\:\;\।\d\@\#\$\%\^\&*\+\=_\\/\|~`]+"

def normalize_text(batch):
    text = batch["sentence"]
    text = text.lower()
    text = re.sub(chars_to_ignore_regex, " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    batch["sentence"] = text
    return batch

# Normalize the text in test dataset
common_voice_test = common_voice_test.map(normalize_text)

# Prepare dataset: preprocess audio and labels
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

common_voice_test = common_voice_test.map(prepare_dataset)

# Initialize WER metric
wer_metric = evaluate.load("wer")

# Evaluate on 5 samples
sampled_test = common_voice_test.shuffle(seed=42).select(range(5))

preds = []
refs = []

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Evaluating on 5 samples from preprocessed test dataset...")

for sample in sampled_test:
    input_dict = processor(sample["input_values"], return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(input_dict.input_values.to(device)).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]
    pred = processor.decode(pred_ids, skip_special_tokens=True)
    preds.append(pred)
    refs.append(sample["sentence"])

    print(f"REF: {sample['sentence']}\nHYP: {pred}\n{'-'*80}")

# Compute and print WER
wer_score = wer_metric.compute(predictions=preds, references=refs)
print(f"\n✅ WER on 5 samples: {wer_score:.4f}")


Evaluating on 5 samples from preprocessed test dataset...


It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


REF: जम्मू कश्मीर आतंकवादियों के ठिकानों का भंडाफोड़ विस्फोटक बरामद
HYP: जमूकश्मे रातंक्वादियोंग के ठिकानो क बंडाफोड़ विसफोट टक बरामत
--------------------------------------------------------------------------------
REF: फ्लाइट में चाइल्ड आर्टिस्ट संग भारती सिंह का डांस वीडियो वायरल
HYP: फ्लाइिट में चाहिडाय टीश्ट संन भारती सिंह का डामस ्वीडियो वायरल्
--------------------------------------------------------------------------------


It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


REF: वे कौनसे रंग के हैं
HYP: वेक कोम से रां के है
--------------------------------------------------------------------------------
REF: उसका कहा हुआ एक शब्द भी सुनने लायक नहीं है
HYP: उसका कहा हुआ एक शब्द भी सुनय लायक नही है
--------------------------------------------------------------------------------


It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


REF: ऑटो चालकों को केजरीवाल सरकार का तोहफा किराए में बढ़ोतरी को दी मंजूरी
HYP: चालकूं को केसरीवास सरकार का तौफा कीराएम में बढ़ोत्तरी को दी मंजूरी
--------------------------------------------------------------------------------

✅ WER on 5 samples: 0.6042


In [2]:
import re
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


repo_name = "Ed-168/wav2vec2-large-xls-r-300m-hi/checkpoint-1880"
model = Wav2Vec2ForCTC.from_pretrained(repo_name).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(repo_name)


def transcribe_audio(audio_path):

    speech, sr = librosa.load(audio_path, sr=16000)

    # Tokenize input
    input_values = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True).input_values.to("cuda")

    # Inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Decode
    pred_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    return transcription



audio_file = "test_hindi_clip.wav"  
text = transcribe_audio(audio_file)
print("\n Transcription:")
print(text)



 Transcription:
जाने उत्ता प्रदेश में किस किसकी हैं मुस्लिम वोटों पर नजर


In [2]:
model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec