In [3]:
import os
from torchmetrics.text import CharErrorRate, EditDistance, WordErrorRate

def read_file(file_path):
    """Return the UTF-8 text stored at ``file_path`` with leading/trailing whitespace removed."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.strip()

def evaluate_transcriptions(pred_folder, target_folder, file_limit=529):
    """Score predicted transcriptions against references and print the results.

    ``.txt`` files in ``pred_folder`` are paired with ``.txt`` files in
    ``target_folder`` that share the same base name (the part before the
    extension). Each pair is scored on its own with WER, CER and edit
    distance, the per-file scores are printed, and the unweighted mean of
    the per-file scores is printed at the end.

    Args:
        pred_folder: Directory holding the predicted transcriptions.
        target_folder: Directory holding the reference transcriptions.
        file_limit: Only the first ``file_limit`` prediction files (in sorted
            name order) are considered. Prediction files that have no
            matching reference still count towards this limit. ``None``
            disables the limit.

    Returns:
        None. All results are written to standard output.
    """
    cer = CharErrorRate()
    wer = WordErrorRate()
    edit_distance = EditDistance()

    total_cer = 0
    total_wer = 0
    total_edit_distance = 0
    file_count = 0

    # Get lists of files from both folders
    pred_files = sorted(f for f in os.listdir(pred_folder) if f.endswith('.txt'))
    target_files = sorted(f for f in os.listdir(target_folder) if f.endswith('.txt'))

    # Index the references by base name once, instead of scanning the whole
    # list for every prediction. setdefault keeps the first file (in sorted
    # order) should two references ever share a base name.
    target_by_id = {}
    for target_filename in target_files:
        target_by_id.setdefault(os.path.splitext(target_filename)[0], target_filename)

    for pred_filename in pred_files[:file_limit]:
        pred_id = os.path.splitext(pred_filename)[0]  # Get the ID before .txt

        # Look up the reference file that carries the same ID, if any
        matching_target_file = target_by_id.get(pred_id)

        if matching_target_file:
            pred_path = os.path.join(pred_folder, pred_filename)
            target_path = os.path.join(target_folder, matching_target_file)

            pred_text = read_file(pred_path)
            target_text = read_file(target_path)

            # Each metric is called on a one-element batch; the value it
            # returns is used as this file's score.
            file_cer = cer([pred_text], [target_text])
            file_wer = wer([pred_text], [target_text])
            file_edit_distance = edit_distance([pred_text], [target_text])

            total_cer += file_cer
            total_wer += file_wer
            total_edit_distance += file_edit_distance
            file_count += 1

            print(f"File {file_count}: {pred_filename}")
            print(f"WER: {file_wer:.4f}")
            print(f"CER: {file_cer:.4f}")
            print(f"Edit Distance: {file_edit_distance:.4f}")
            print("--------------------")

        # Stop once file_limit files have been scored. The check is skipped
        # when no limit is set, because comparing an int with None raises.
        if file_limit is not None and file_count >= file_limit:
            break

    if file_count > 0:
        avg_cer = total_cer / file_count
        avg_wer = total_wer / file_count
        avg_edit_distance = total_edit_distance / file_count

        print("\nOverall Results:")
        print(f"Average WER: {avg_wer:.4f}")
        print(f"Average CER: {avg_cer:.4f}")
        print(f"Average Edit Distance: {avg_edit_distance:.4f}")
        print(f"Total files processed: {file_count}")
    else:
        print("No matching files found.")

# Example usage: score every prediction in pred_folder against the
# same-named reference in target_folder (paths are relative to the
# notebook's working directory).
pred_folder = 'BGdataset/transkribus_transcription'
target_folder = 'BGdataset/filtered100txt'
evaluate_transcriptions(pred_folder=pred_folder, target_folder=target_folder)


File 1: 00003_1.txt
WER: 0.9518
CER: 0.7629
Edit Distance: 298.0000
--------------------
File 2: 00003_3.txt
WER: 0.5437
CER: 0.3476
Edit Distance: 171.0000
--------------------
File 3: 00004_1.txt
WER: 0.9786
CER: 0.7624
Edit Distance: 545.0000
--------------------
File 4: 00010_7.txt
WER: 0.9521
CER: 0.7905
Edit Distance: 547.0000
--------------------
File 5: 00012_13.txt
WER: 0.9664
CER: 0.7595
Edit Distance: 529.0000
--------------------
File 6: 00019_15.txt
WER: 0.6275
CER: 0.4875
Edit Distance: 138.0000
--------------------
File 7: 00021_7.txt
WER: 0.9508
CER: 0.7578
Edit Distance: 502.0000
--------------------
File 8: 00028_1.txt
WER: 0.9521
CER: 0.7811
Edit Distance: 598.0000
--------------------
File 9: 00029_11.txt
WER: 0.9558
CER: 0.7653
Edit Distance: 637.0000
--------------------
File 10: 00029_9.txt
WER: 0.9542
CER: 0.8011
Edit Distance: 591.0000
--------------------
File 11: 00030_1.txt
WER: 0.9700
CER: 0.8381
Edit Distance: 410.0000
--------------------
File 12: 00031_2

In [4]:
import os
import numpy as np

def levenshtein_distance(s1: str, s2: str) -> int:
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only two rows of the dynamic-programming table are held at a time, and
    the shorter string is used for the row width.
    """
    # The distance is symmetric, so order the inputs by length up front.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    width = len(shorter)
    if width == 0:
        # Everything in the longer string has to be inserted/deleted.
        return len(longer)

    # prev_costs[k] = distance between the processed prefix of `longer`
    # and the first k characters of `shorter`.
    prev_costs = list(range(width + 1))
    for row_number, long_char in enumerate(longer, start=1):
        row_costs = [row_number] + [0] * width
        for col in range(1, width + 1):
            from_above = prev_costs[col] + 1
            from_left = row_costs[col - 1] + 1
            from_diagonal = prev_costs[col - 1] + (long_char != shorter[col - 1])
            row_costs[col] = min(from_above, from_left, from_diagonal)
        prev_costs = row_costs

    return prev_costs[-1]

def cer_single(prediction: str, ground_truth: str) -> float:
    """Calculates the Character Error Rate (CER) between two strings.

    CER is the character-level edit distance between ``prediction`` and
    ``ground_truth`` divided by the number of characters in ``ground_truth``.

    Args:
        prediction: Transcription being evaluated.
        ground_truth: Reference transcription.

    Returns:
        The CER as a fraction (``0.0`` means the strings are identical). The
        value can exceed ``1.0`` when the prediction is much longer than the
        reference. An empty ``ground_truth`` always yields ``0.0``.
    """
    reference_length = len(ground_truth)

    # The rate is undefined for an empty reference. Report 0.0 (a float, to
    # match the annotated return type) without computing a distance.
    # NOTE(review): this also scores a non-empty prediction against an empty
    # reference as perfect -- confirm that this is the intended convention.
    if reference_length == 0:
        return 0.0

    # CER = edit distance / number of characters in the ground truth
    return levenshtein_distance(prediction, ground_truth) / reference_length

def calculate_cer_over_dataset(predictions_folder: str, ground_truth_folder: str) -> float:
    """Calculates the average CER over a dataset of transcriptions.

    Every ``.txt`` file name present in both folders is treated as one
    prediction/reference pair. The CER of each pair is printed as a
    percentage, and the unweighted mean of those percentages is returned.

    Args:
        predictions_folder: Directory containing the predicted transcriptions.
        ground_truth_folder: Directory containing the reference transcriptions.

    Returns:
        The mean per-file CER in percent, or ``nan`` when the two folders
        share no ``.txt`` file names.

    Raises:
        ValueError: If either folder does not exist.
    """
    # Ensure both folders exist
    if not (os.path.exists(predictions_folder) and os.path.exists(ground_truth_folder)):
        raise ValueError("Predictions or Ground Truth folder does not exist.")

    # Names of the .txt files in each folder
    prediction_files = {f for f in os.listdir(predictions_folder) if f.endswith('.txt')}
    ground_truth_files = {f for f in os.listdir(ground_truth_folder) if f.endswith('.txt')}

    # Only names found in both folders can be scored. Sorting makes the
    # processing and print order the same on every run; iterating the raw
    # set gives an order that can change between interpreter sessions.
    common_files = sorted(prediction_files & ground_truth_files)

    # Nothing to average: return nan explicitly instead of letting
    # np.mean([]) emit a RuntimeWarning for the empty list.
    if not common_files:
        return float("nan")

    cer_scores = []
    for file_name in common_files:
        pred_file_path = os.path.join(predictions_folder, file_name)
        gt_file_path = os.path.join(ground_truth_folder, file_name)

        # Read the predicted and ground truth transcriptions
        with open(pred_file_path, 'r', encoding='utf-8') as pred_file:
            prediction_text = pred_file.read().strip()

        with open(gt_file_path, 'r', encoding='utf-8') as gt_file:
            ground_truth_text = gt_file.read().strip()

        # Calculate CER for the current pair and convert it to percentage
        cer = cer_single(prediction_text, ground_truth_text) * 100
        cer_scores.append(cer)

        print(f"{file_name} - CER: {cer:.2f}%")

    # Average CER across the dataset as a percentage; float() turns the
    # NumPy scalar into the plain float promised by the annotation.
    return float(np.mean(cer_scores))

# Example usage
# Path to the folder containing predicted transcriptions
predictions_folder = "transcriptions_IAM_gpt4"
# Path to the folder containing ground truth transcriptions
ground_truth_folder = "IAM/aachen_validation_txt"

average_cer = calculate_cer_over_dataset(
    predictions_folder=predictions_folder,
    ground_truth_folder=ground_truth_folder,
)
print(f"Average CER over the dataset: {average_cer:.2f}%")


c04-160.txt - CER: 3.76%
e01-018.txt - CER: 9.57%
m04-019.txt - CER: 7.87%
n06-182.txt - CER: 7.76%
g04-036.txt - CER: 9.61%
n02-098.txt - CER: 12.50%
n04-195.txt - CER: 2.82%
p03-185.txt - CER: 3.85%
n06-148.txt - CER: 6.98%
m02-052.txt - CER: 2.73%
n06-186.txt - CER: 9.61%
m04-190.txt - CER: 2.94%
m04-164.txt - CER: 4.80%
m01-090.txt - CER: 5.01%
d01-024.txt - CER: 7.23%
p03-103.txt - CER: 11.01%
f04-049.txt - CER: 3.14%
n03-082.txt - CER: 4.50%
m04-209.txt - CER: 3.06%
e06-006.txt - CER: 3.56%
m04-024.txt - CER: 5.08%
p03-057.txt - CER: 9.89%
d04-117.txt - CER: 3.94%
p03-121.txt - CER: 5.45%
n02-000.txt - CER: 3.48%
n04-190.txt - CER: 3.42%
m06-019.txt - CER: 4.22%
m04-222.txt - CER: 4.37%
d01-056.txt - CER: 3.00%
f07-046b.txt - CER: 3.23%
n06-163.txt - CER: 7.13%
m04-180.txt - CER: 4.46%
n06-201.txt - CER: 8.02%
m02-090.txt - CER: 4.23%
d04-071.txt - CER: 4.21%
n02-082a.txt - CER: 6.25%
p03-135.txt - CER: 9.69%
p03-087.txt - CER: 7.32%
d03-112.txt - CER: 40.68%
n04-044.txt - CER: 4

In [5]:
import os
import numpy as np

def levenshtein_distance(s1: list, s2: list) -> int:
    """Return the Levenshtein distance between two word lists.

    The distance is the minimum number of word insertions, deletions and
    substitutions needed to turn one list into the other. A single row of
    costs, sized by the shorter list, is updated in place for each word of
    the longer list.
    """
    # The distance is symmetric; make s1 the longer sequence.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    costs = list(range(len(s2) + 1))
    for row, word1 in enumerate(s1, start=1):
        # `diagonal` carries the previous row's value one column to the left.
        diagonal = costs[0]
        costs[0] = row
        for col, word2 in enumerate(s2, start=1):
            above = costs[col]  # previous row, same column
            costs[col] = min(
                above + 1,
                costs[col - 1] + 1,
                diagonal + (word1 != word2),
            )
            diagonal = above

    return costs[-1]

def wer_single(prediction: str, ground_truth: str) -> float:
    """Calculates the Word Error Rate (WER) between two strings.

    Both strings are split on whitespace; WER is the word-level edit
    distance divided by the number of words in ``ground_truth``.

    Args:
        prediction: Transcription being evaluated.
        ground_truth: Reference transcription.

    Returns:
        The WER as a fraction (``0.0`` means the word sequences are
        identical). The value can exceed ``1.0`` when the prediction has
        many more words than the reference. A ``ground_truth`` with no
        words always yields ``0.0``.
    """
    ground_truth_words = ground_truth.split()

    # The rate is undefined for a reference without words. Report 0.0 (a
    # float, to match the annotated return type) without computing a distance.
    # NOTE(review): this also scores a non-empty prediction against an empty
    # reference as perfect -- confirm that this is the intended convention.
    if not ground_truth_words:
        return 0.0

    prediction_words = prediction.split()

    # WER = edit distance / number of words in the ground truth
    return levenshtein_distance(prediction_words, ground_truth_words) / len(ground_truth_words)

def calculate_wer_over_dataset(predictions_folder: str, ground_truth_folder: str) -> float:
    """Calculates the average WER over a dataset of transcriptions.

    Every ``.txt`` file name present in both folders is treated as one
    prediction/reference pair. The WER of each pair is printed as a
    percentage, and the unweighted mean of those percentages is returned.

    Args:
        predictions_folder: Directory containing the predicted transcriptions.
        ground_truth_folder: Directory containing the reference transcriptions.

    Returns:
        The mean per-file WER in percent, or ``nan`` when the two folders
        share no ``.txt`` file names.

    Raises:
        ValueError: If either folder does not exist.
    """
    # Ensure both folders exist
    if not (os.path.exists(predictions_folder) and os.path.exists(ground_truth_folder)):
        raise ValueError("Predictions or Ground Truth folder does not exist.")

    # Names of the .txt files in each folder
    prediction_files = {f for f in os.listdir(predictions_folder) if f.endswith('.txt')}
    ground_truth_files = {f for f in os.listdir(ground_truth_folder) if f.endswith('.txt')}

    # Only names found in both folders can be scored. Sorting makes the
    # processing and print order the same on every run; iterating the raw
    # set gives an order that can change between interpreter sessions.
    common_files = sorted(prediction_files & ground_truth_files)

    # Nothing to average: return nan explicitly instead of letting
    # np.mean([]) emit a RuntimeWarning for the empty list.
    if not common_files:
        return float("nan")

    wer_scores = []
    for file_name in common_files:
        pred_file_path = os.path.join(predictions_folder, file_name)
        gt_file_path = os.path.join(ground_truth_folder, file_name)

        # Read the predicted and ground truth transcriptions
        with open(pred_file_path, 'r', encoding='utf-8') as pred_file:
            prediction_text = pred_file.read().strip()

        with open(gt_file_path, 'r', encoding='utf-8') as gt_file:
            ground_truth_text = gt_file.read().strip()

        # Calculate WER for the current pair and convert it to percentage
        wer = wer_single(prediction_text, ground_truth_text) * 100
        wer_scores.append(wer)

        print(f"{file_name} - WER: {wer:.2f}%")

    # Average WER across the dataset as a percentage; float() turns the
    # NumPy scalar into the plain float promised by the annotation.
    return float(np.mean(wer_scores))

# Example usage: folders holding the predicted and the ground truth
# transcriptions (paths are relative to the notebook's working directory).
predictions_folder = "transcriptions_IAM_gpt4"
ground_truth_folder = "IAM/aachen_validation_txt"

average_wer = calculate_wer_over_dataset(
    predictions_folder=predictions_folder,
    ground_truth_folder=ground_truth_folder,
)
print(f"Average WER over the dataset: {average_wer:.2f}%")


c04-160.txt - WER: 5.33%
e01-018.txt - WER: 7.25%
m04-019.txt - WER: 3.57%
n06-182.txt - WER: 6.25%
g04-036.txt - WER: 9.46%
n02-098.txt - WER: 7.41%
n04-195.txt - WER: 0.00%
p03-185.txt - WER: 7.59%
n06-148.txt - WER: 4.11%
m02-052.txt - WER: 3.45%
n06-186.txt - WER: 8.33%
m04-190.txt - WER: 2.56%
m04-164.txt - WER: 1.54%
m01-090.txt - WER: 1.39%
d01-024.txt - WER: 4.92%
p03-103.txt - WER: 9.80%
f04-049.txt - WER: 3.80%
n03-082.txt - WER: 8.77%
m04-209.txt - WER: 0.00%
e06-006.txt - WER: 0.00%
m04-024.txt - WER: 1.72%
p03-057.txt - WER: 8.20%
d04-117.txt - WER: 7.69%
p03-121.txt - WER: 16.13%
n02-000.txt - WER: 1.56%
n04-190.txt - WER: 3.64%
m06-019.txt - WER: 7.94%
m04-222.txt - WER: 1.79%
d01-056.txt - WER: 6.85%
f07-046b.txt - WER: 1.37%
n06-163.txt - WER: 5.06%
m04-180.txt - WER: 12.50%
n06-201.txt - WER: 5.97%
m02-090.txt - WER: 7.55%
d04-071.txt - WER: 11.86%
n02-082a.txt - WER: 18.18%
p03-135.txt - WER: 3.17%
p03-087.txt - WER: 3.95%
d03-112.txt - WER: 34.43%
n04-044.txt - WER: