In [None]:
# Install jiwer (used for the character-error-rate metric). %pip installs into
# the running kernel's environment; the pin keeps re-runs on the same release.
%pip install jiwer==3.0.4

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.9.3


In [None]:
import os
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import jiwer

def load_audio(file_path, target_sample_rate=16000):
    """Load an audio file as a mono, 1-D NumPy array.

    Args:
        file_path: Path to an audio file readable by ``torchaudio.load``.
        target_sample_rate: Sample rate (Hz) of the returned signal. The file
            is resampled when its native rate differs. Defaults to 16000.

    Returns:
        A 1-D ``numpy.ndarray`` of samples clamped to ``[-1, 1]``.
    """
    waveform, sample_rate = torchaudio.load(file_path)

    # The loaded tensor is (channels, samples). Average the channels so that a
    # stereo file yields one signal rather than a 2-D array after squeezing.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)

    # Clamp to [-1, 1]. This is equivalent to torchaudio's Vol transform with a
    # unit amplitude gain, which rescales nothing and only clamps; it is not a
    # loudness normalization.
    waveform = waveform.clamp(-1.0, 1.0)

    # Drop only the channel axis so that even a one-sample clip stays 1-D.
    return waveform.squeeze(0).numpy()

def read_text(file_path):
    """Return the contents of a text file with surrounding whitespace removed.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        The file's text with leading and trailing whitespace stripped.
    """
    # Decode explicitly instead of relying on the platform's default encoding.
    # 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
    # which str.strip() would keep and which would then count as an error
    # character when the text is compared against a transcription.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read().strip()

def evaluate_asr_and_calculate_cer(audio_dir, text_dir, model_name='facebook/wav2vec2-large-960h-lv60-self'):
    """Transcribe each ``.wav`` file in ``audio_dir`` and print its character error rate.

    Every ``<name>.wav`` in ``audio_dir`` is paired with ``<name>.txt`` in
    ``text_dir``. Audio files without a matching text file are skipped
    silently. Files whose reference or transcription is empty are reported and
    excluded from the average.

    Args:
        audio_dir: Directory containing the ``.wav`` recordings.
        text_dir: Directory containing the reference transcripts.
        model_name: Name or path of the pretrained checkpoint passed to
            ``Wav2Vec2Processor.from_pretrained`` and
            ``Wav2Vec2ForCTC.from_pretrained``.

    Returns:
        None. Per-file results and the mean of the per-file CER values are
        printed.
    """
    audio_suffix = '.wav'

    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    model.eval()

    cer_scores = []

    # os.listdir returns entries in arbitrary order; sort so that the printed
    # report is the same on every run.
    for audio_file in sorted(os.listdir(audio_dir)):
        if not audio_file.endswith(audio_suffix):
            continue

        # Swap only the trailing extension. str.replace would rewrite every
        # '.wav' occurrence in the name (e.g. 'a.wav.b.wav' -> 'a.txt.b.txt')
        # and the transcript would never be found.
        text_file = audio_file[:-len(audio_suffix)] + '.txt'
        text_path = os.path.join(text_dir, text_file)
        if not os.path.exists(text_path):
            continue

        # Load and preprocess the audio
        input_audio = load_audio(os.path.join(audio_dir, audio_file))
        input_values = processor(input_audio, sampling_rate=16000, return_tensors="pt").input_values

        # Perform ASR: take the highest-scoring token at each position and
        # let the processor decode the id sequence to text.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        # Load reference transcription
        reference = read_text(text_path)

        # Only score pairs where both strings are non-empty.
        if not (reference and transcription):
            print(f"Skipping file {audio_file}: Reference or transcription is empty.")
            print("---------")
            continue

        cer = jiwer.cer(reference, transcription)
        cer_scores.append(cer)

        print(f"File: {audio_file}")
        print(f"Reference: {reference}")
        print(f"Transcription: {transcription}")
        print(f"CER: {cer:.4f}")
        print("---------")

    if cer_scores:
        average_cer = sum(cer_scores) / len(cer_scores)
        print(f"Average CER: {average_cer:.4f}")
    else:
        print("No valid samples found to compute CER.")


# Input folders for the evaluation run: the recordings, and the reference
# transcripts that are matched to them by file name.
audio_directory = '/content/drive/MyDrive/wav'
text_directory = '/content/drive/MyDrive/corrected_txt'

# Run with the function's default model checkpoint.
evaluate_asr_and_calculate_cer(
    audio_dir=audio_directory,
    text_dir=text_directory,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.maske

File: 000351e792c6e0e90a93cd73d2d8fca99950db6de0122db253aa1336.wav
Reference: WE HAVE PROVIDED IS ACTUALLY WE CAN OPTIMIZE THE REINFORCEMENT LAYERS BECAUSE AS WE SEE
Transcription: WE HAVE PROVIDED W ACTUALLY WECAN AN OPTIMIZE THE REINFORCEMENT LAYERS BECAUSE AS WE
CER: 0.1149
---------
File: 000104538b2f194bfbe315e09483c0d345f1549ba72a7dd4571ad984.wav
Reference: ARE PUT INTO AN INSTRUMENT CALLED STRAIN MEASURING BRIDGE WERE THIS EMF AND CONNECTION MAGNIFICATION
Transcription: ARE PUT INTO AN INSTRUMENT CALLED STAN MEASURING BRIDGE WHERE THIS EME AND THE CONNECTION
CER: 0.1900
---------
File: 00012fd31a06366368f7a824e76ea8f1c83311be593242717e9daea5.wav
Reference: STRAND IT IS GOING TO LINK WITH THE T ON THE OTHER STRAND NOW IF YOU LOOK VERY CAREFULLY THE
Transcription: STRAND IT IS GOING TO LINK WITH THE T ON THE OTHER STRAND NOW IF YOU LOOK VERY CAREFULLY THE
CER: 0.0000
---------
File: 0002927d79779cca83dc81d5e2f8ca026e16997e7f5840079edcb26b.wav
Reference: WITH PARAMETERS CAPITAL N A