# 📌 Wav2Vec2 Evaluation with LibriSpeech
This notebook evaluates the pretrained `facebook/wav2vec2-base-960h` Wav2Vec2 model with Hugging Face Transformers on a small sample of the LibriSpeech dataset and reports the word error rate (WER).

In [None]:
!pip install transformers datasets evaluate jiwer torchaudio

In [None]:
from datasets import load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import evaluate


In [None]:
# Load WER metric
wer_metric = evaluate.load("wer")

# Number of utterances to evaluate; kept small so the notebook runs quickly.
NUM_SAMPLES = 10

# Load dataset in streaming mode.
# Use the held-out "test" split: the wav2vec2-base-960h checkpoint was
# fine-tuned on the 960 h LibriSpeech training data (which includes
# "train.100"), so scoring on a training split would understate the WER.
dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True)

# Materialise the first NUM_SAMPLES examples so later cells can iterate freely.
tiny_dataset = list(dataset.take(NUM_SAMPLES))


In [None]:
# Pretrained checkpoint shared by the acoustic model and its processor.
MODEL_ID = "facebook/wav2vec2-base-960h"

# Fetch the CTC model and the matching processor from the same checkpoint.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

# Put the model in evaluation mode for inference.
model.eval()


In [None]:
# Transcribe every sample with greedy (argmax) CTC decoding and collect
# aligned (prediction, reference) pairs for WER scoring.
predictions, references = [], []

for sample_number, sample in enumerate(tiny_dataset, start=1):
    try:
        reference = sample["text"].lower()

        audio = sample["audio"]
        # The processor accepts the raw 1-D waveform directly, so no manual
        # tensor conversion or unsqueeze/squeeze round-trip is needed.
        inputs = processor(
            audio["array"],
            sampling_rate=audio["sampling_rate"],
            return_tensors="pt",
            padding=True,
        )

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0].lower()

    except Exception as e:
        # Best-effort evaluation: report the failure and skip this sample.
        # Nothing has been appended yet, so the two lists stay aligned.
        print(f"Error processing sample {sample_number}: {e}")
        continue

    # Record the pair only after the whole pipeline succeeded, so that
    # predictions[k] always corresponds to references[k].
    predictions.append(transcription)
    references.append(reference)

    print(f"Sample {sample_number}")
    print(f"Original Text   : {reference}")
    print(f"Predicted Text  : {transcription}\n")


In [None]:
# Fail early with a clear message instead of an opaque error from the metric
# when there is nothing to score or the two lists have drifted apart.
if not predictions:
    raise ValueError("No samples were transcribed successfully; cannot compute WER.")
if len(predictions) != len(references):
    raise ValueError(
        f"Mismatched counts: {len(predictions)} predictions vs "
        f"{len(references)} references."
    )

overall_wer = wer_metric.compute(predictions=predictions, references=references)

# WER can exceed 1.0 (e.g. many inserted words), so clamp the derived
# "accuracy" at 0 rather than reporting a negative percentage.
accuracy = max(0.0, 1 - overall_wer) * 100

print("📊 Evaluation Summary")
print(f"Word Error Rate (WER): {overall_wer:.4f}")
print(f"Approximate Accuracy : {accuracy:.2f}%")
