In [2]:
! pip install jiwer

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.13.0


In [None]:
import os
from jiwer import wer, cer
from collections import defaultdict
import re

def normalize_text(text):
    """
    Normalize a transcript so that scoring ignores formatting differences.

    Steps, applied in this order:
      1. Remove every character that is not a letter, digit, or whitespace.
         Underscores are removed as well: the regex word class treats "_"
         as a word character, so it has to be listed explicitly.
      2. Convert to lowercase.
      3. Collapse each run of whitespace (spaces, tabs, newlines) into a
         single space.
      4. Strip leading and trailing whitespace.

    Args:
        text: The raw transcript string.

    Returns:
        The normalized string; empty if nothing but symbols/whitespace
        was present.
    """
    # Removed characters are deleted, not replaced by a space, so
    # "well-known" becomes "wellknown" and "foo_bar" becomes "foobar".
    text = re.sub(r'[^\w\s]|_', '', text)
    # Convert to lowercase
    text = text.lower()
    # Replace any whitespace run with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading and trailing spaces
    return text.strip()

def load_transcripts(folder_path, strip_prefix="", strip_suffix=""):
    """
    Load every .txt file in a folder into a dict of normalized transcripts.

    The dict key is the file name with `strip_prefix` removed from the front
    and `strip_suffix` removed from the end, when each is non-empty and
    actually present. The ".txt" extension stays in the key unless it is
    part of `strip_suffix`. The extension check is case-insensitive.

    Args:
        folder_path: Directory to scan (not recursive).
        strip_prefix: Optional leading text to drop from file names.
        strip_suffix: Optional trailing text to drop from file names.

    Returns:
        dict mapping core name -> transcript text after normalize_text().
    """
    loaded = {}
    for entry in os.listdir(folder_path):
        if not entry.lower().endswith(".txt"):
            continue

        core_name = entry
        if strip_prefix and core_name.startswith(strip_prefix):
            core_name = core_name[len(strip_prefix):]
        if strip_suffix and core_name.endswith(strip_suffix):
            core_name = core_name[:len(core_name) - len(strip_suffix)]

        with open(os.path.join(folder_path, entry), "r", encoding="utf-8") as handle:
            raw_text = handle.read()

        # Normalization removes symbols and case differences before scoring
        loaded[core_name] = normalize_text(raw_text.strip())
    return loaded

def evaluate_asr(gt_folder, pred_folder):
    """
    Score predicted transcripts against ground-truth transcripts.

    Ground-truth file names have the "transcription_" prefix removed so they
    can be paired with prediction files of the same remaining name. Only
    names present in both folders are scored.

    Args:
        gt_folder: Folder holding the reference transcripts.
        pred_folder: Folder holding the predicted transcripts.

    Returns:
        An empty dict (after printing a notice) when no file names match.
        Otherwise a dict with:
          "per_file"    - name -> {"wer": ..., "cer": ...}
          "average_wer" - mean of the per-file WER values
          "average_cer" - mean of the per-file CER values
          "corpus_wer"  - WER computed over all pairs in one call
          "corpus_cer"  - CER computed over all pairs in one call
    """
    references = load_transcripts(gt_folder, strip_prefix="transcription_")
    hypotheses = load_transcripts(pred_folder)

    # Pair files by their shared core name, in sorted order
    shared_names = sorted(references.keys() & hypotheses.keys())
    if len(shared_names) == 0:
        print("No matching files found. Check naming conventions.")
        return {}

    ref_texts = [references[name] for name in shared_names]
    hyp_texts = [hypotheses[name] for name in shared_names]

    per_file = {}
    for name, ref_text, hyp_text in zip(shared_names, ref_texts, hyp_texts):
        file_wer = wer(ref_text, hyp_text)
        file_cer = cer(ref_text, hyp_text)
        per_file[name] = {"wer": file_wer, "cer": file_cer}

    file_wers = [scores["wer"] for scores in per_file.values()]
    file_cers = [scores["cer"] for scores in per_file.values()]

    summary = {"per_file": per_file}
    summary["average_wer"] = sum(file_wers) / len(file_wers)
    summary["average_cer"] = sum(file_cers) / len(file_cers)
    summary["corpus_wer"] = wer(ref_texts, hyp_texts)
    summary["corpus_cer"] = cer(ref_texts, hyp_texts)
    return summary

def compute_group_wise_averages(per_file_results):
    """
    Average the per-file WER and CER within each file-name group.

    A file's group is the part of its name before the first underscore
    (for example 'noisy' from 'noisy_1.txt'); a name without an underscore
    forms its own group.

    Args:
        per_file_results: dict of file name -> {"wer": float, "cer": float}.

    Returns:
        dict of group -> {"average_wer": float, "average_cer": float},
        with groups in order of first appearance.
    """
    wer_by_group = {}
    cer_by_group = {}
    for file_name, metrics in per_file_results.items():
        prefix = file_name.partition('_')[0]
        wer_by_group.setdefault(prefix, []).append(metrics["wer"])
        cer_by_group.setdefault(prefix, []).append(metrics["cer"])

    return {
        prefix: {
            "average_wer": sum(wer_by_group[prefix]) / len(wer_by_group[prefix]),
            "average_cer": sum(cer_by_group[prefix]) / len(cer_by_group[prefix]),
        }
        for prefix in wer_by_group
    }

def run_evaluation(gt_folder="data/transcription", systems=None):
    """
    Print group-wise and overall WER/CER for each recognition system.

    For every system, the predictions are scored against the ground truth
    with evaluate_asr(); per-group figures are the mean of per-file scores
    and the "Overall" line reports the corpus-level scores.

    Args:
        gt_folder: Folder holding the ground-truth transcripts.
        systems: Optional iterable of (label, prediction_folder) pairs.
            Defaults to the ASR, VSR, and multimodal result folders.

    Returns:
        None. Results are printed.
    """
    if systems is None:
        systems = (
            ("ASR", "data/ASR_result"),
            ("VSR", "data/VSR_result"),
            ("Multimodal", "data/multimodal_result"),
        )

    print("Running evaluation with symbol normalization...")

    for label, pred_folder in systems:
        print(f"\n{label} Evaluation:")
        results = evaluate_asr(gt_folder, pred_folder)
        if not results:
            # evaluate_asr() returns {} (and has already printed a notice)
            # when no file names match; indexing it would raise KeyError.
            continue

        group_results = compute_group_wise_averages(results["per_file"])
        for group, metrics in group_results.items():
            print(f"{group}: WER = {metrics['average_wer']:.2%}, CER = {metrics['average_cer']:.2%}")
        print(f"Overall: WER = {results['corpus_wer']:.2%}, CER = {results['corpus_cer']:.2%}")



In [12]:
# Score the ASR, VSR, and multimodal outputs against the ground-truth
# transcripts and print per-group and overall WER/CER.
run_evaluation()

Running evaluation with symbol normalization...

ASR Evaluation:
midnoise: WER = 9.95%, CER = 7.26%
noisy: WER = 100.94%, CER = 85.61%
quite: WER = 8.95%, CER = 2.48%
Overall: WER = 39.80%, CER = 30.73%

VSR Evaluation:
midnoise: WER = 31.08%, CER = 21.11%
noisy: WER = 37.27%, CER = 27.13%
quite: WER = 27.18%, CER = 17.90%
Overall: WER = 31.89%, CER = 21.55%

Multimodal Evaluation:
midnoise: WER = 21.54%, CER = 14.92%
noisy: WER = 37.27%, CER = 27.13%
quite: WER = 14.77%, CER = 7.79%
Overall: WER = 25.00%, CER = 16.47%
