In [13]:
import os
from torchmetrics.text import CharErrorRate, EditDistance, WordErrorRate

def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read().strip()

def evaluate_transcriptions(pred_folder, target_folder, file_limit=529):
    cer = CharErrorRate()
    wer = WordErrorRate()
    edit_distance = EditDistance()

    total_cer = 0
    total_wer = 0
    total_edit_distance = 0
    file_count = 0

    for filename in sorted(os.listdir(pred_folder))[:file_limit]:
        if filename.endswith('.txt'):
            pred_path = os.path.join(pred_folder, filename)
            target_path = os.path.join(target_folder, filename)

            if os.path.exists(target_path):
                pred_text = read_file(pred_path)
                target_text = read_file(target_path)

                file_cer = cer([pred_text], [target_text])
                file_wer = wer([pred_text], [target_text])
                file_edit_distance = edit_distance([pred_text], [target_text])

                total_cer += file_cer
                total_wer += file_wer
                total_edit_distance += file_edit_distance
                file_count += 1

                print(f"File {file_count}: {filename}")
                print(f"WER: {file_wer:.4f}")
                print(f"CER: {file_cer:.4f}")
                print(f"Edit Distance: {file_edit_distance:.4f}")
                print("--------------------")

            if file_count >= file_limit:
                break

    if file_count > 0:
        avg_cer = total_cer / file_count
        avg_wer = total_wer / file_count
        avg_edit_distance = total_edit_distance / file_count

        print("\nOverall Results:")
        print(f"Average WER: {avg_wer:.4f}")
        print(f"Average CER: {avg_cer:.4f}")
        print(f"Average Edit Distance: {avg_edit_distance:.4f}")
        print(f"Total files processed: {file_count}")
    else:
        print("No matching files found.")

# Example usage
pred_folder = 'transcriptions_IAM_pixtral'
target_folder = 'IAM/txt'
evaluate_transcriptions(pred_folder, target_folder)

File 1: c04-156.txt
WER: 0.1224
CER: 0.0688
Edit Distance: 19.0000
--------------------
File 2: c04-160.txt
WER: 0.0800
CER: 0.0619
Edit Distance: 27.0000
--------------------
File 3: c06-083.txt
WER: 0.2000
CER: 0.1480
Edit Distance: 70.0000
--------------------
File 4: d01-024.txt
WER: 0.0820
CER: 0.1185
Edit Distance: 98.0000
--------------------
File 5: d01-056.txt
WER: 0.0822
CER: 0.0425
Edit Distance: 17.0000
--------------------
File 6: d01-060.txt
WER: 0.3750
CER: 0.3405
Edit Distance: 379.0000
--------------------
File 7: d03-112.txt
WER: 0.4918
CER: 0.5373
Edit Distance: 269.0000
--------------------
File 8: d04-005.txt
WER: 0.1833
CER: 0.1308
Edit Distance: 45.0000
--------------------
File 9: d04-008.txt
WER: 0.0741
CER: 0.1070
Edit Distance: 40.0000
--------------------
File 10: d04-071.txt
WER: 0.1356
CER: 0.0874
Edit Distance: 27.0000
--------------------
File 11: d04-075.txt
WER: 0.7132
CER: 0.6735
Edit Distance: 559.0000
--------------------
File 12: d04-081.txt
WER: 0

In [1]:
import os
import numpy as np

def levenshtein_distance(s1: str, s2: str) -> int:
    """Calculates the Levenshtein distance between two strings."""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

def cer_single(prediction: str, ground_truth: str) -> float:
    """Calculates the Character Error Rate (CER) between two strings."""
    # Calculate the Levenshtein distance (edit distance)
    edit_distance = levenshtein_distance(prediction, ground_truth)

    # CER = edit distance / number of characters in the ground truth
    return edit_distance / len(ground_truth) if len(ground_truth) > 0 else 0

def calculate_cer_over_dataset(predictions_folder: str, ground_truth_folder: str) -> float:
    """Calculates the average CER over a dataset of transcriptions."""
    cer_scores = []

    # Ensure both folders exist
    if not os.path.exists(predictions_folder) or not os.path.exists(ground_truth_folder):
        raise ValueError("Predictions or Ground Truth folder does not exist.")

    # List of files in both folders (assuming .txt files)
    prediction_files = [f for f in os.listdir(predictions_folder) if f.endswith('.txt')]
    ground_truth_files = [f for f in os.listdir(ground_truth_folder) if f.endswith('.txt')]

    # Ensure both folders contain the same files
    common_files = set(prediction_files).intersection(set(ground_truth_files))

    for file_name in common_files:
        pred_file_path = os.path.join(predictions_folder, file_name)
        gt_file_path = os.path.join(ground_truth_folder, file_name)

        # Read the predicted and ground truth transcriptions
        with open(pred_file_path, 'r', encoding='utf-8') as pred_file:
            prediction_text = pred_file.read().strip()

        with open(gt_file_path, 'r', encoding='utf-8') as gt_file:
            ground_truth_text = gt_file.read().strip()

        # Calculate CER for the current pair and convert it to percentage
        cer = cer_single(prediction_text, ground_truth_text) * 100
        cer_scores.append(cer)

        print(f"{file_name} - CER: {cer:.2f}%")

    # Return the average CER across the dataset as percentage
    return np.mean(cer_scores)

# Example usage
predictions_folder = "transcriptions_IAM_claude"  # Path to the folder containing predicted transcriptions
ground_truth_folder = "IAM/aachen_validation_txt"  # Path to the folder containing ground truth transcriptions

average_cer = calculate_cer_over_dataset(predictions_folder, ground_truth_folder)
print(f"Average CER over the dataset: {average_cer:.2f}%")


c06-083.txt - CER: 3.17%
p03-142.txt - CER: 6.45%
f04-100.txt - CER: 19.25%
d01-060.txt - CER: 32.43%
n03-082.txt - CER: 8.65%
p03-103.txt - CER: 8.81%
e01-018.txt - CER: 6.80%
d04-005.txt - CER: 2.99%
m04-145.txt - CER: 4.03%
m04-251.txt - CER: 9.47%
m02-083.txt - CER: 2.58%
n04-039.txt - CER: 5.04%
d06-050.txt - CER: 2.62%
d04-117.txt - CER: 4.99%
m04-209.txt - CER: 3.06%
p03-057.txt - CER: 9.32%
g04-039.txt - CER: 8.40%
m03-095.txt - CER: 3.96%
m03-062.txt - CER: 4.46%
p02-131.txt - CER: 2.64%
g01-083.txt - CER: 7.06%
d04-121.txt - CER: 4.43%
d04-008.txt - CER: 2.68%
p03-069.txt - CER: 4.19%
f04-049.txt - CER: 2.90%
p02-127.txt - CER: 5.88%
n06-156.txt - CER: 2.75%
m04-152.txt - CER: 4.29%
m04-200.txt - CER: 3.60%
n06-163.txt - CER: 5.98%
n02-098.txt - CER: 7.29%
m02-052.txt - CER: 6.06%
d06-015.txt - CER: 2.79%
n04-195.txt - CER: 2.82%
e06-006.txt - CER: 3.56%
f07-028b.txt - CER: 1.22%
n04-183.txt - CER: 2.97%
f04-046.txt - CER: 2.42%
g03-058.txt - CER: 4.08%
g04-036.txt - CER: 7.6

In [2]:
import os
import numpy as np

def levenshtein_distance(s1: list, s2: list) -> int:
    """Calculates the Levenshtein distance between two lists of words."""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, word1 in enumerate(s1):
        current_row = [i + 1]
        for j, word2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (word1 != word2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

def wer_single(prediction: str, ground_truth: str) -> float:
    """Calculates the Word Error Rate (WER) between two strings."""
    # Split the strings into words
    prediction_words = prediction.split()
    ground_truth_words = ground_truth.split()

    # Calculate the Levenshtein distance (edit distance) for words
    edit_distance = levenshtein_distance(prediction_words, ground_truth_words)

    # WER = edit distance / number of words in the ground truth
    return edit_distance / len(ground_truth_words) if len(ground_truth_words) > 0 else 0

def calculate_wer_over_dataset(predictions_folder: str, ground_truth_folder: str) -> float:
    """Calculates the average WER over a dataset of transcriptions."""
    wer_scores = []

    # Ensure both folders exist
    if not os.path.exists(predictions_folder) or not os.path.exists(ground_truth_folder):
        raise ValueError("Predictions or Ground Truth folder does not exist.")

    # List of files in both folders (assuming .txt files)
    prediction_files = [f for f in os.listdir(predictions_folder) if f.endswith('.txt')]
    ground_truth_files = [f for f in os.listdir(ground_truth_folder) if f.endswith('.txt')]

    # Ensure both folders contain the same files
    common_files = set(prediction_files).intersection(set(ground_truth_files))

    for file_name in common_files:
        pred_file_path = os.path.join(predictions_folder, file_name)
        gt_file_path = os.path.join(ground_truth_folder, file_name)

        # Read the predicted and ground truth transcriptions
        with open(pred_file_path, 'r', encoding='utf-8') as pred_file:
            prediction_text = pred_file.read().strip()

        with open(gt_file_path, 'r', encoding='utf-8') as gt_file:
            ground_truth_text = gt_file.read().strip()

        # Calculate WER for the current pair and convert it to percentage
        wer = wer_single(prediction_text, ground_truth_text) * 100
        wer_scores.append(wer)

        print(f"{file_name} - WER: {wer:.2f}%")

    # Return the average WER across the dataset as percentage
    return np.mean(wer_scores)

# Example usage
predictions_folder = "transcriptions_IAM_claude"  # Path to the folder containing predicted transcriptions
ground_truth_folder = "IAM/aachen_validation_txt"  # Path to the folder containing ground truth transcriptions

average_wer = calculate_wer_over_dataset(predictions_folder, ground_truth_folder)
print(f"Average WER over the dataset: {average_wer:.2f}%")


c06-083.txt - WER: 5.33%
p03-142.txt - WER: 5.26%
f04-100.txt - WER: 17.86%
d01-060.txt - WER: 31.25%
n03-082.txt - WER: 7.02%
p03-103.txt - WER: 7.84%
e01-018.txt - WER: 5.80%
d04-005.txt - WER: 1.67%
m04-145.txt - WER: 3.33%
m04-251.txt - WER: 6.25%
m02-083.txt - WER: 2.59%
n04-039.txt - WER: 2.17%
d06-050.txt - WER: 0.00%
d04-117.txt - WER: 3.08%
m04-209.txt - WER: 0.00%
p03-057.txt - WER: 3.28%
g04-039.txt - WER: 8.22%
m03-095.txt - WER: 3.77%
m03-062.txt - WER: 5.00%
p02-131.txt - WER: 0.00%
g01-083.txt - WER: 7.08%
d04-121.txt - WER: 3.12%
d04-008.txt - WER: 0.00%
p03-069.txt - WER: 0.00%
f04-049.txt - WER: 3.80%
p02-127.txt - WER: 1.52%
n06-156.txt - WER: 3.12%
m04-152.txt - WER: 10.53%
m04-200.txt - WER: 3.45%
n06-163.txt - WER: 3.80%
n02-098.txt - WER: 3.70%
m02-052.txt - WER: 3.45%
d06-015.txt - WER: 0.00%
n04-195.txt - WER: 0.00%
e06-006.txt - WER: 0.00%
f07-028b.txt - WER: 0.00%
n04-183.txt - WER: 0.00%
f04-046.txt - WER: 0.00%
g03-058.txt - WER: 6.90%
g04-036.txt - WER: 8.