In [21]:
import os
from tqdm import tqdm
import glob
from xml.etree import ElementTree as ET
import re
import difflib
from torchmetrics.text import CharErrorRate, WordErrorRate
import jiwer

In [22]:
# Project root. Defaults to the original checkout location; set the
# LATIN_TRANSCRIPTION_BASE environment variable to run the notebook elsewhere.
BASE = os.environ.get(
    "LATIN_TRANSCRIPTION_BASE",
    "/home/efittsc1/projects/latin-transcription/",
)
# os.path.join (rather than "+") keeps these paths valid whether or not BASE
# ends with a path separator.
# Directory holding the Transkribus model output XML files.
TRANSCRIBUS_SOURCE = os.path.join(BASE, "transkribus_model_result/page")
# Directory holding the ground-truth XML files.
GROUND_TRUTH_SOURCE = os.path.join(BASE, "source")

In [23]:
# Index the Transkribus output files by the name of the page they transcribe.
# Each output file is expected to be named "<prefix>_<original filename>"; the
# part after the underscore is the key used to look up ground-truth files.
transcribus_files = glob.glob(os.path.join(TRANSCRIBUS_SOURCE, "*.xml"))
transcribus_files_dics = {}

for file in transcribus_files:
    basename = os.path.basename(file)
    # Split on the FIRST underscore only: unpacking a plain split("_") raises
    # ValueError as soon as a name has no underscore or more than one.
    # NOTE(review): assumes the prefix itself never contains "_" -- confirm.
    _, separator, original_filename = basename.partition("_")
    if not separator:
        # No prefix present: treat the file name itself as the original name.
        original_filename = basename
    transcribus_files_dics[original_filename] = file



In [24]:
def get_namespace(element):
    """Return the namespace URI embedded in an element's tag.

    ElementTree stores qualified tags as ``{uri}localname``.

    Args:
        element: An ``xml.etree.ElementTree`` element.

    Returns:
        The URI between the leading braces, or ``''`` when the tag carries no
        namespace or is not a string (comments and processing instructions
        use a function object as their tag).
    """
    tag = element.tag
    if not isinstance(tag, str):
        return ''
    # Raw string: "\{" in a normal literal is an invalid escape sequence.
    m = re.match(r'\{([^}]*)\}', tag)
    return m.group(1) if m else ''

In [47]:
def extract_lines_from_xml(xml_file):
    """Collect the transcribed text of every text line in an XML file.

    The file is searched for ``TextRegion`` elements, and within each region
    for ``TextLine`` elements, all in the root element's namespace. For every
    line, the text of its own ``TextEquiv/Unicode`` child is stripped, has
    commas replaced by full stops, and is appended to the result.

    Lines without a ``TextEquiv``/``Unicode`` element, or with an empty one,
    are skipped individually, so one untranscribed line does not discard the
    rest of the file.

    Args:
        xml_file: Path (or file object) accepted by ``ElementTree.parse``.

    Returns:
        A list of line strings in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(xml_file).getroot()
    ns = {"ns": get_namespace(root)}
    # Process-wide setting that only affects later serialisation; it is not
    # needed for the reads below.
    ET.register_namespace('', ns['ns'])

    lines = []
    for text_region in root.findall('.//ns:TextRegion', ns):
        for text_line in text_region.findall('.//ns:TextLine', ns):
            # Use the line's direct TextEquiv child. A descendant search
            # (".//") returns the first TextEquiv in document order, which is
            # a word-level one when the line contains Word elements.
            # NOTE(review): assumes PAGE-style layout, where Word children
            # precede the line-level TextEquiv -- confirm against the data.
            unicode_el = text_line.find('ns:TextEquiv/ns:Unicode', ns)
            if unicode_el is None or unicode_el.text is None:
                continue
            text = unicode_el.text.strip()
            text = text.replace(",", ".")
            lines.append(text)
    return lines


In [48]:
# Normalisation pipeline applied to text before scoring: lower-case it, drop
# punctuation, collapse runs of spaces, then trim the ends.
_normalisation_steps = [
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
]
transformation = jiwer.Compose(_normalisation_steps)

In [51]:
# Report every ground-truth file that has no Transkribus counterpart.
for filename in tqdm(glob.glob(GROUND_TRUTH_SOURCE + "/*.xml")):
    gt_basename = os.path.basename(filename)
    if gt_basename in transcribus_files_dics:
        continue
    print(f"{gt_basename} not found in transcribus files")

100%|██████████| 103/103 [00:00<00:00, 560328.55it/s]

JUST1-633m38.xml not found in transcribus files
JUST1-633m11.xml not found in transcribus files
JUST1-633m12d.xml not found in transcribus files
JUST1-633m20.xml not found in transcribus files





In [52]:
# Accumulate corpus-level character and word error rates over every
# ground-truth file that has a Transkribus counterpart, and keep the raw
# (un-normalised) page texts in `data` for later inspection.
char_error_rate = CharErrorRate()
word_error_rate = WordErrorRate()

data = {}

for filename in tqdm(glob.glob(GROUND_TRUTH_SOURCE + "/*.xml")):
    page_name = os.path.basename(filename)
    if page_name not in transcribus_files_dics:
        # No model output for this page; leave it out of the score.
        continue

    transkribus_file = transcribus_files_dics[page_name]
    ground_truth_file = filename

    # One string per page: the extracted lines joined with single spaces.
    transkribus_lines = extract_lines_from_xml(transkribus_file)
    ground_truth_lines = extract_lines_from_xml(ground_truth_file)
    transkribus_text = " ".join(transkribus_lines)
    ground_truth_text = " ".join(ground_truth_lines)

    # Metrics are fed the normalised text (prediction first, then reference).
    gt = transformation(ground_truth_text)
    hyp = transformation(transkribus_text)
    for metric in (char_error_rate, word_error_rate):
        metric.update(hyp, gt)

    data[page_name] = {
        "ground_truth": ground_truth_text,
        "hypothesis": transkribus_text,
    }


100%|██████████| 103/103 [02:46<00:00,  1.62s/it]


In [53]:
# Reduce the accumulated metric states to plain floats and print them.
# NOTE(review): `word_error_rate` is rebound here from the metric object to a
# float, so re-running this cell on its own fails with AttributeError on
# `.compute()`; re-run the accumulation cell first.
character_error_rate, word_error_rate = (
    float(char_error_rate.compute()),
    float(word_error_rate.compute()),
)
print(f"Character Error Rate: {character_error_rate:.3f}")
print(f"Word Error Rate: {word_error_rate:.3f}")

In [57]:
# Detailed jiwer scoring of a single page: the most recently scored one.
# The key is taken from `data` itself rather than from a loop variable left
# over from an earlier cell; that variable names the last globbed file, which
# has no entry in `data` when it was skipped for lacking a Transkribus file.
sample_name = list(data)[-1]
gt = data[sample_name]["ground_truth"]
hyp = data[sample_name]["hypothesis"]
# NOTE: these are the raw stored texts, not passed through `transformation`,
# so case and punctuation differences count as errors here.
wer = jiwer.process_words(
    gt,
    hyp,
)
cer = jiwer.process_characters(
    gt,
    hyp,
)
# Only a cell's final expression is displayed, so show both rates together.
wer.wer, cer.cer

In [62]:
# Print jiwer's word-level alignment for the page held in `wer`.
alignment_report = jiwer.visualize_alignment(wer)
print(alignment_report)

sentence 1
REF:      ¶ Abbas de  Persouere summonitus fuit ad respondendum Matheo de   Besill de placito  quare levavit quoddam mercatum apud  Hauekebir ad *********** nocumentum mercati predicti  Mathei de Schorstan etc. Et  unde predictus Matheus per attornatum suum queritur quod cum habebat qualibet septimana per diem   martis quoddam mercatum ex dono domini regis       per cartam suam in villa ipsius Mathei de Schorstan et antiquius impetratum quam mercatum predicti Abbatis de Hauekebir. predictus Abbas de novo levavit predictum mercatum suum in Hauekebir scilicet qualibet  septimana per diem lune ita quod mercandie mercatorum que vendi debent et  solent ad mercatum suum in Schorstan modo venduntur ad  mercatum ipsius Abbatis de Hauekebir Et similiter teolonia et ** stallagia et alie consuetudines que dari  solent et debent pro merchandis mercatorum emptis in  mercato ipsius Mathei de Schorstan modo dantur et  capiuntur in mercato ipsius Abbatis de Hauekebir unde dicit quod per hoc