<a href="https://colab.research.google.com/github/devajithsb/Automatic-Speech-Recognition---CTC/blob/main/CTC_working.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.12.1


In [None]:
import os
from collections import Counter

import torch
import torchaudio
from jiwer import wer  # For Word Error Rate calculation
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# 1. Model and Processor
# Both objects are loaded once at module level from the same checkpoint name
# and are used as globals by transcribe_audio() below.
model_name = "facebook/wav2vec2-base-960h"  # Or a larger variant
# The processor prepares raw audio as model inputs and decodes predicted
# token ids back to text (see its use in transcribe_audio).
processor = Wav2Vec2Processor.from_pretrained(model_name)
# CTC model whose forward pass produces the per-frame logits that are decoded.
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# 2. Audio Transcription (with Chunking for long audio)
def transcribe_audio(audio_path, chunk_length_seconds=10):
    """Transcribe an audio file with the module-level ``processor``/``model``.

    The file is loaded with ``torchaudio.load``, resampled to 16 kHz when its
    native rate differs, downmixed to a single channel, and then decoded
    greedily (arg-max over the CTC logits) in consecutive, non-overlapping
    chunks so long recordings do not have to go through the model in one pass.

    Args:
        audio_path: Path of the audio file to transcribe.
        chunk_length_seconds: Length of each chunk in seconds; must be
            positive.

    Returns:
        The non-empty chunk transcriptions joined by single spaces (an empty
        string when nothing was recognised), or ``None`` if any step raised.
        Failures are reported with ``print`` rather than propagated, so
        callers must check for ``None``.
    """
    target_sample_rate = 16000

    try:
        chunk_size = int(chunk_length_seconds * target_sample_rate)
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_length_seconds must be positive, got {chunk_length_seconds!r}"
            )

        waveform, sample_rate = torchaudio.load(audio_path)
        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
            waveform = resampler(waveform)

        # Average the channels so stereo/multi-channel files become one 1-D
        # signal; for mono input this is the same samples with the channel
        # axis removed.
        samples = waveform.mean(dim=0)
        total_samples = samples.shape[0]

        # NOTE(review): a very short trailing chunk may be too short for the
        # model's convolutional front end and would make the whole file fail
        # via the except below -- confirm, and consider merging such a tail
        # into the previous chunk.
        all_transcriptions = []
        for start in range(0, total_samples, chunk_size):
            chunk = samples[start:start + chunk_size]

            # Passing sampling_rate lets the processor verify the audio rate
            # instead of emitting a "strongly recommended" warning per chunk.
            input_values = processor(
                chunk,
                sampling_rate=target_sample_rate,
                return_tensors="pt",
            ).input_values
            with torch.no_grad():
                logits = model(input_values).logits

            predicted_ids = torch.argmax(logits, dim=-1)
            all_transcriptions.append(processor.batch_decode(predicted_ids)[0])

        # Drop empty chunk results (e.g. silence) so the joined text does not
        # contain leading, trailing, or doubled spaces.
        return " ".join(text for text in all_transcriptions if text)

    except Exception as e:
        # Deliberate best-effort boundary: report and signal failure with None.
        print(f"Error transcribing {audio_path}: {e}")
        return None



# 3. Evaluation (Word Error Rate)
def calculate_wer(ground_truth, predicted):
    """Return the word error rate of ``predicted`` against ``ground_truth``.

    Both strings are lower-cased and have their whitespace collapsed before
    scoring. ``jiwer.wer`` does not lowercase by default, so without this a
    pure casing difference between reference and transcription would count
    every word as an error.

    Args:
        ground_truth: Reference transcript, or ``None``.
        predicted: Hypothesis transcript, or ``None``.

    Returns:
        ``1.0`` if either argument is ``None``. If the reference contains no
        words, ``0.0`` when the hypothesis is also empty and ``1.0``
        otherwise (WER is undefined for an empty reference). Otherwise the
        value returned by ``jiwer.wer``.
    """
    if ground_truth is None or predicted is None:
        return 1.0  # Treat a missing reference or hypothesis as a total miss

    reference = " ".join(ground_truth.lower().split())
    hypothesis = " ".join(predicted.lower().split())

    if not reference:
        # Avoid handing an empty reference to jiwer; nothing to get wrong
        # only if nothing was predicted.
        return 0.0 if not hypothesis else 1.0

    return wer(reference, hypothesis)


def calculate_metrics(ground_truth, predicted):
    """Compute case-insensitive bag-of-words overlap metrics.

    Words are obtained by lower-casing and splitting on whitespace. Overlap
    is counted as a multiset, so a word that occurs twice in both texts
    counts twice; numerators and denominators therefore use the same unit
    (word occurrences) and a perfect match scores 1.0 even when words repeat.
    Word order is ignored.

    Args:
        ground_truth: Reference transcript, or ``None``.
        predicted: Hypothesis transcript, or ``None``.

    Returns:
        A dict with keys ``'precision'`` (overlap / predicted words),
        ``'recall'`` (overlap / reference words), ``'f1_score'`` (harmonic
        mean of the two) and ``'accuracy'`` (overlap / multiset union, i.e.
        a Jaccard index). Every value is 0 when an argument is ``None`` or
        when the corresponding denominator is zero.
    """
    if ground_truth is None or predicted is None:
        return {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'accuracy': 0.0}

    truth_counts = Counter(ground_truth.lower().split())
    predicted_counts = Counter(predicted.lower().split())

    # Counter & / | give the per-word minimum / maximum counts.
    overlap = sum((truth_counts & predicted_counts).values())
    union = sum((truth_counts | predicted_counts).values())
    truth_total = sum(truth_counts.values())
    predicted_total = sum(predicted_counts.values())

    precision = overlap / predicted_total if predicted_total else 0.0
    recall = overlap / truth_total if truth_total else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    accuracy = overlap / union if union else 0.0
    return {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}



# 4. Dataset and Evaluation Loop (with weights and metrics)
# Evaluation set: one (audio_path, ground_truth, weight) tuple per recording.
# The evaluation loop below unpacks the tuples in that order; `weight` scales
# the recording's contribution to the weighted averages.
audio_files_and_labels = [
    ("sounds/Life is a beautiful (alfred-british).wav", "Life is a beautiful journey", 1),
    ("sounds/Hello Myself Devajit.wav", "hello myself devajit", 1),
    ("sounds/Hello My self Ryan(canadian -ryan).wav", "hello myself ryan", 1),
    ("sounds/Hello My self Rishi(indian-rishi).wav", "hello myself rishi", 1),
    ("sounds/Hello Myself Devajit (irish-cillian).wav", "hello myself Devajit", 1),
    ("sounds/Life is a beautiful (shirley-scottish).wav", "Life is a beautiful journey", 1),
]

# Running weighted sums of each metric over the successfully transcribed files.
total_wer = 0
total_precision = 0
total_recall = 0
total_f1 = 0
total_accuracy = 0
total_weight = 0
num_transcriptions = 0

for audio_path, ground_truth, weight in audio_files_and_labels:
    if not os.path.exists(audio_path):
        print(f"File not found: {audio_path}")
        continue

    transcription = transcribe_audio(audio_path)
    # Only None signals a failure. An empty string is a valid (if wrong)
    # transcription and must still be scored, otherwise files on which the
    # model recognised nothing would be silently dropped from the averages.
    if transcription is None:
        print(f"Transcription failed for: {audio_path}")
        continue

    wer_value = calculate_wer(ground_truth, transcription)
    metrics = calculate_metrics(ground_truth, transcription)

    total_wer += wer_value * weight
    total_precision += metrics['precision'] * weight
    total_recall += metrics['recall'] * weight
    total_f1 += metrics['f1_score'] * weight
    total_accuracy += metrics['accuracy'] * weight
    total_weight += weight
    num_transcriptions += 1

if num_transcriptions > 0:
    # The weight guard covers the case where every scored file has weight 0.
    weighted_average_wer = total_wer / total_weight if total_weight > 0 else 0
    weighted_average_precision = total_precision / total_weight if total_weight > 0 else 0
    weighted_average_recall = total_recall / total_weight if total_weight > 0 else 0
    weighted_average_f1 = total_f1 / total_weight if total_weight > 0 else 0
    weighted_average_accuracy = total_accuracy / total_weight if total_weight > 0 else 0

    print("\nWeighted Averages:")
    print(f"WER: {weighted_average_wer:.4f}")
    print(f"Precision: {weighted_average_precision:.4f}")
    print(f"Recall: {weighted_average_recall:.4f}")
    print(f"F1-score: {weighted_average_f1:.4f}")
    print(f"Accuracy: {weighted_average_accuracy:.4f}")
else:
    print("No successful transcriptions to calculate weighted averages.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failin


Weighted Averages:
Precision: 0.6667
Recall: 0.6667
F1-score: 0.6667
Accuracy: 0.5667
