In [1]:
import os
from tqdm import tqdm
import glob
from xml.etree import ElementTree as ET
import re
import difflib
from torchmetrics.text import CharErrorRate, WordErrorRate
import jiwer
import numpy as np
#import cv2
import matplotlib.pyplot as plt
import cv2
from shapely.affinity import scale
import random


In [45]:
# Root of the project checkout. It must keep its trailing "/" because paths
# are built from it by plain string concatenation.
BASE = "/home/efittsc1/projects/latin-transcription/"
# Directory holding the XML files produced by the Transkribus model.
TRANSCRIBUS_SOURCE = f"{BASE}transkribus_model_result/page"
# Directory holding the ground-truth XML files (and, next to them, the page images).
GROUND_TRUTH_SOURCE = f"{BASE}source"

In [46]:
# Index the Transkribus result files by the name of the ground-truth file they
# belong to. Result files are expected to be named "<prefix>_<original name>".
transcribus_files = glob.glob(os.path.join(TRANSCRIBUS_SOURCE, "*.xml"))
transcribus_files_dics = {}

for file in transcribus_files:
    # Split on the FIRST underscore only, so original names that themselves
    # contain "_" are kept intact instead of breaking the unpacking.
    _prefix, separator, original_filename = os.path.basename(file).partition("_")
    if not separator:
        # No underscore at all: not a "<prefix>_<name>" file, so it cannot be
        # mapped to a ground-truth page.
        print(f"Skipping {file}: expected a '<prefix>_<original name>' file name")
        continue
    transcribus_files_dics[original_filename] = file



In [47]:
def get_namespace(element):
    """Return the namespace URI of an element's tag.

    ElementTree stores a namespaced tag as ``"{uri}localname"``; this returns
    the ``uri`` part.

    Args:
        element: Any object with a string ``tag`` attribute (e.g. an
            ``xml.etree.ElementTree.Element``).

    Returns:
        The namespace URI as a string, or ``''`` when the tag has no
        ``{...}`` prefix.
    """
    # Raw string: in a normal literal "\{" and "\}" are invalid escape
    # sequences, which newer Python versions warn about.
    m = re.match(r'\{([^}]*)\}', element.tag)
    return m.group(1) if m else ''

In [48]:
def extract_lines_from_xml(xml_file):
    """Read the text lines and their baselines from a PAGE-style XML file.

    The file is searched for ``TextRegion`` elements and, inside them, for
    ``TextLine`` elements carrying a ``TextEquiv/Unicode`` text and a
    ``Baseline`` with a ``points`` attribute ("x1,y1 x2,y2 ...").

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        A list of ``(text, baseline)`` tuples in document order. ``text`` is
        the line text, stripped and with every "," replaced by "."; ``baseline``
        is an integer array of shape ``(n_points, 2)`` holding ``x, y`` pairs.
        Lines without text, without a baseline, or with an unparsable baseline
        are skipped (the latter two with a printed message) instead of
        discarding the remainder of the file.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    ns = {"ns": get_namespace(root)}
    ET.register_namespace('', ns['ns'])

    lines = []
    for text_region in root.findall('.//ns:TextRegion', ns):
        for text_line in text_region.findall('.//ns:TextLine', ns):
            # Use the line's OWN TextEquiv (a direct child). A descendant
            # search would return the first TextEquiv in document order, which
            # can belong to a nested element (e.g. a word-level entry) and
            # would then yield only a fragment of the line.
            unicode_element = text_line.find('ns:TextEquiv/ns:Unicode', ns)
            if unicode_element is None or unicode_element.text is None:
                continue

            baseline_element = text_line.find('ns:Baseline', ns)
            baseline = None if baseline_element is None else baseline_element.get('points')
            if not baseline:
                print(f"Error processing {xml_file}: text line without baseline points")
                continue

            try:
                # split() without argument tolerates repeated/odd whitespace.
                baseline_list = np.array([p.split(",") for p in baseline.split()], dtype=int)
            except ValueError as e:
                print(f"Error processing {xml_file}: {e}")
                continue
            if baseline_list.ndim != 2 or baseline_list.shape[1] != 2:
                print(f"Error processing {xml_file}: malformed baseline {baseline!r}")
                continue

            text = unicode_element.text.strip().replace(",", ".")
            lines.append((text, baseline_list))
    return lines


In [49]:
# Text normalisation applied before scoring: lower-case, drop punctuation,
# collapse runs of spaces, and trim leading/trailing whitespace (in this order).
_normalisation_steps = [
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
]
transformation = jiwer.Compose(_normalisation_steps)

In [50]:
# Sanity check: list every ground-truth page that has no Transkribus result.
# This only reports; pages without a result are not treated as an error here.
for filename in tqdm(glob.glob(GROUND_TRUTH_SOURCE + "/*.xml")):
    page_name = os.path.basename(filename)
    if page_name in transcribus_files_dics:
        continue
    print(f"{page_name} not found in transcribus files")

100%|██████████| 100/100 [00:00<00:00, 602629.89it/s]

JUST1-633m20.xml not found in transcribus files





In [51]:
def get_line_spacing(baselines):
    """Estimate the typical vertical distance between consecutive baselines.

    Every baseline is evaluated (by linear interpolation) at one common x
    position, the median of the baselines' horizontal midpoints, and the median
    of the differences between consecutive y values is returned.

    Args:
        baselines: Sequence of arrays of shape ``(n_points, 2)`` holding
            ``x, y`` points, in the order in which the lines follow each other.

    Returns:
        The median spacing, rounded to an ``int``.

    Raises:
        ValueError: If fewer than two baselines are given, since no spacing
            can be derived from them.
    """
    if len(baselines) < 2:
        raise ValueError("need at least two baselines to estimate the line spacing")

    center_x = np.median([(l.T[0][0] + l.T[0][-1]) / 2 for l in baselines])

    center_ys = []
    for line in baselines:
        xs, ys = line.T
        # np.interp silently returns meaningless values unless the x
        # coordinates are increasing, so order the points by x first
        # (covers baselines stored right-to-left).
        order = np.argsort(xs, kind="stable")
        center_ys.append(np.interp(center_x, xs[order], ys[order]))

    med_spacing = np.median(np.diff(center_ys))
    return int(round(med_spacing))

In [52]:
def box_from_baseline(baseline_points, med_spacing, height):
    """Build a band-shaped polygon around a baseline.

    The band reaches 23% of ``med_spacing`` below and 77% of ``med_spacing``
    above every baseline point. Its vertices are the shifted-down points in
    their original order followed by the shifted-up points in reverse order,
    so the sequence walks around the band.

    Args:
        baseline_points: Sequence of ``(x, y)`` points of the baseline.
        med_spacing: Line spacing used to size the band.
        height: Image height; y values are kept within ``[0, height - 1]``.

    Returns:
        ``np.int32`` array of shape ``(2 * n_points, 2)`` with the polygon
        vertices. Negative x values are clamped to 0.
    """
    lower_spacing = int(round(0.23 * med_spacing))
    upper_spacing = int(round(0.77 * med_spacing))

    # Clamp AFTER shifting: clamping the baseline first and then adding the
    # offset lets the lower edge run past the bottom of the image.
    lower_edge = [[max(p[0], 0), min(height - 1, p[1] + lower_spacing)] for p in baseline_points]
    upper_edge = [[max(p[0], 0), max(0, p[1] - upper_spacing)] for p in baseline_points[::-1]]
    return np.array(lower_edge + upper_edge, dtype=np.int32)

In [53]:
from shapely.geometry import Polygon
from shapely.strtree import STRtree

In [54]:
# Directories for the rendered line-matching images. Both are anchored at BASE
# so their location does not depend on the current working directory.
dirs = [
    BASE + "transkribus_model_result/images/matched_lines",
    BASE + "transkribus_model_result/images/unmatched_lines",
]

for d in dirs:
    # Make sure the directory exists, then empty it so that images from a
    # previous run cannot be mistaken for current results.
    os.makedirs(d, exist_ok=True)
    for f in os.listdir(d):
        path = os.path.join(d, f)
        if os.path.isfile(path):
            os.remove(path)
    

In [55]:
def polygon_to_points(polygon):
    """Return the exterior vertices of ``polygon`` as an ``np.int32`` array.

    The last coordinate of the exterior ring is dropped, so a closed ring
    whose final vertex repeats the first one yields each vertex once.
    Coordinates are converted to ``int32``.
    """
    closed_ring = np.array(polygon.exterior.coords, dtype=np.int32)
    return closed_ring[:len(closed_ring) - 1]

In [56]:
def scale_np_array_y(array, scale):
    """Stretch a set of points vertically, column by column.

    Points sharing the same x value form a group; within each group the y
    values are scaled by ``scale`` about the group's mean y. The x values are
    left unchanged.

    Args:
        array: Array of shape ``(n_points, 2)`` with ``x, y`` rows. It is not
            modified.
        scale: Factor applied to the distance of each y from its group mean.

    Returns:
        Integer array of shape ``(n_points, 2)`` with the scaled points.
    """
    points = array.astype(float)  # working copy; the caller's array stays intact
    xs = points[:, 0]
    ys = points[:, 1]

    for x in np.unique(xs):
        same_x = xs == x
        center = ys[same_x].mean()
        ys[same_x] = center + scale * (ys[same_x] - center)

    return np.column_stack((xs, ys)).astype(int)


In [57]:
# Line-level evaluation.
# For every ground-truth page that has a Transkribus result:
#   1. build a band polygon around each baseline of both files,
#   2. pair Transkribus and ground-truth lines whose bands overlap,
#   3. save renderings of the matched / unmatched bands for inspection,
#   4. accumulate character and word error rates over the paired lines.

# A band is stretched vertically by this factor before the overlap test, which
# tolerates small vertical offsets between the two segmentations.
LINE_BAND_Y_SCALE = 1.6
# Share of a band's area that must lie inside the other (stretched) band.
MIN_OVERLAP_FRACTION = 0.9

# Write the renderings to absolute locations under BASE and make sure they
# exist: writing into a missing directory fails without producing a file.
MATCHED_DIR = os.path.join(BASE, "transkribus_model_result/images/matched_lines")
UNMATCHED_DIR = os.path.join(BASE, "transkribus_model_result/images/unmatched_lines")
for out_dir in (MATCHED_DIR, UNMATCHED_DIR):
    os.makedirs(out_dir, exist_ok=True)

char_error_rate = CharErrorRate()
word_error_rate = WordErrorRate()

# data[page name][pair number] -> texts and line indices of one matched pair
data = {}

for filename in tqdm(glob.glob(GROUND_TRUTH_SOURCE + "/*.xml")):
    page_name = os.path.basename(filename)
    if page_name not in transcribus_files_dics:
        continue
    transkribus_file = transcribus_files_dics[page_name]
    ground_truth_file = filename
    transkribus_lines = extract_lines_from_xml(transkribus_file)
    ground_truth_lines = extract_lines_from_xml(ground_truth_file)

    t_baselines = [t[1] for t in transkribus_lines]
    gt_baselines = [t[1] for t in ground_truth_lines]

    try:
        med_spacing_gt = get_line_spacing(gt_baselines)
    except Exception as e:
        # The spacing cannot be estimated for this page; report and skip it.
        print(filename)
        print(f"transkribus_lines: {t_baselines}")
        print(f"ground_truth_lines: {gt_baselines}")
        print(f"Error processing {filename}: {e}")
        continue

    # The page image sits next to the XML file. Swap only the extension;
    # str.replace would also rewrite ".xml" occurring elsewhere in the path.
    path_stem = os.path.splitext(filename)[0]
    image_path = None
    for extension in (".jpeg", ".jpg", ".png"):
        if os.path.exists(path_stem + extension):
            image_path = path_stem + extension
            break
    if image_path is None:
        raise FileNotFoundError(f"Image not found for {filename}")

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports unreadable files by returning None.
        raise OSError(f"Could not read image {image_path}")
    height = image.shape[0]

    # The ground-truth spacing sizes the bands of BOTH files so that they are
    # directly comparable.
    t_polygons = [box_from_baseline(b, med_spacing_gt, height) for b in t_baselines]
    gt_polygons = [box_from_baseline(b, med_spacing_gt, height) for b in gt_baselines]

    shapely_t_polygons = [Polygon(p.tolist()) for p in t_polygons]
    shapely_gt_polygons = [Polygon(p.tolist()) for p in gt_polygons]
    # Stretched ground-truth bands do not depend on the Transkribus line being
    # tested, so build them once per page.
    scaled_gt_polygons = [
        Polygon(scale_np_array_y(polygon_to_points(p), LINE_BAND_Y_SCALE).tolist())
        for p in shapely_gt_polygons
    ]

    gt_strtree = STRtree(shapely_gt_polygons)

    # (transkribus line index, ground-truth line index) pairs
    matched_polygons = []

    for i, t_polygon in enumerate(shapely_t_polygons):
        scaled_t_polygon = Polygon(
            scale_np_array_y(polygon_to_points(t_polygon), LINE_BAND_Y_SCALE).tolist()
        )
        for match_idx in gt_strtree.query(t_polygon):
            # The tree already yields the position in shapely_gt_polygons;
            # searching the list by equality is slower and picks the wrong
            # entry when two ground-truth bands are identical.
            j = int(match_idx)
            match = shapely_gt_polygons[j]
            intersection = t_polygon.intersection(scaled_gt_polygons[j])
            intersection2 = scaled_t_polygon.intersection(match)
            # Require mutual containment: each band must lie (almost)
            # entirely inside the stretched version of the other.
            if (intersection.area > MIN_OVERLAP_FRACTION * t_polygon.area
                    and intersection2.area > MIN_OVERLAP_FRACTION * match.area):
                matched_polygons.append((i, j))

    # Draw each matched pair in its own random colour (thicker outline for the
    # Transkribus band).
    for i, j in matched_polygons:
        random_color = tuple(int(c) for c in np.random.randint(0, 256, size=3))
        cv2.polylines(image, [t_polygons[i]], isClosed=True, color=random_color, thickness=3)
        cv2.polylines(image, [gt_polygons[j]], isClosed=True, color=random_color, thickness=2)

    image_name = page_name.split('.')[0] + ".png"
    matched_path = os.path.join(MATCHED_DIR, image_name)
    if not cv2.imwrite(matched_path, image):
        print(f"Could not write {matched_path}")

    matched_t = {i for i, _j in matched_polygons}
    matched_gt = {j for _i, j in matched_polygons}
    unmatched_polygons_t = [i for i in range(len(t_polygons)) if i not in matched_t]
    unmatched_polygons_gt = [i for i in range(len(gt_polygons)) if i not in matched_gt]

    if unmatched_polygons_t or unmatched_polygons_gt:
        # Fresh copy of the page showing only the unmatched bands, stacked
        # below the rendering of the matched ones.
        new_image = cv2.imread(image_path)
        print(f"Unmatched polygons for {filename}")
        for i in unmatched_polygons_t:
            cv2.polylines(new_image, [t_polygons[i]], isClosed=True, color=(0, 255, 0), thickness=3)
        for i in unmatched_polygons_gt:
            cv2.polylines(new_image, [gt_polygons[i]], isClosed=True, color=(255, 0, 0), thickness=2)

        stacked_image = np.vstack([image, new_image])
        unmatched_path = os.path.join(UNMATCHED_DIR, image_name)
        if not cv2.imwrite(unmatched_path, stacked_image):
            print(f"Could not write {unmatched_path}")

    data[page_name] = {}
    for index, (i, j) in enumerate(matched_polygons):
        gt = transformation(ground_truth_lines[j][0])
        hyp = transformation(transkribus_lines[i][0])

        # The metrics take (predictions, targets).
        char_error_rate.update(hyp, gt)
        word_error_rate.update(hyp, gt)

        data[page_name][index] = {
            "ground_truth": ground_truth_lines[j][0],
            "hypothesis": transkribus_lines[i][0],
            "ground_truth_index": j,
            "hypothesis_index": i,
        }


  0%|          | 0/100 [00:00<?, ?it/s]

  1%|          | 1/100 [00:00<00:27,  3.66it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m3d.xml


  2%|▏         | 2/100 [00:00<00:32,  3.01it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m84d.xml


  3%|▎         | 3/100 [00:00<00:28,  3.42it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m13f.xml


  4%|▍         | 4/100 [00:01<00:26,  3.65it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m48d.xml


  5%|▌         | 5/100 [00:01<00:21,  4.35it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-632m3d.xml


  8%|▊         | 8/100 [00:01<00:14,  6.36it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/memb2.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m3.xml


  9%|▉         | 9/100 [00:01<00:16,  5.47it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m11.xml


 10%|█         | 10/100 [00:02<00:19,  4.55it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-820m18.xml


 11%|█         | 11/100 [00:02<00:18,  4.78it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m2d.xml


 15%|█▌        | 15/100 [00:02<00:09,  8.59it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/norw5.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/norw2.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m2.xml


 16%|█▌        | 16/100 [00:02<00:10,  7.64it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m13.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m35d.xml


 18%|█▊        | 18/100 [00:03<00:09,  8.98it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/memb1.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m15d.xml


 19%|█▉        | 19/100 [00:03<00:10,  7.99it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/norw6.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m13d.xml


 22%|██▏       | 22/100 [00:03<00:12,  6.06it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m4.xml


 23%|██▎       | 23/100 [00:04<00:13,  5.54it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m17.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m5a.xml


 26%|██▌       | 26/100 [00:04<00:15,  4.63it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m58.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/norw4.xml


 28%|██▊       | 28/100 [00:04<00:11,  6.07it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m8d.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m2d.xml


 30%|███       | 30/100 [00:05<00:11,  6.12it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m52.xml


 32%|███▏      | 32/100 [00:05<00:14,  4.64it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m4da.xml


 33%|███▎      | 33/100 [00:05<00:12,  5.18it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m5da.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m6.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m37.xml


 36%|███▌      | 36/100 [00:06<00:08,  7.12it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m7a.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-277m15da.xml


 39%|███▉      | 39/100 [00:06<00:08,  6.86it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m30d.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m6f.xml


 41%|████      | 41/100 [00:06<00:07,  7.39it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m38d.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/norw1.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m16d.xml


 45%|████▌     | 45/100 [00:07<00:07,  7.24it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m12d.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m36d.xml


 47%|████▋     | 47/100 [00:07<00:09,  5.56it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m31.xml


 48%|████▊     | 48/100 [00:08<00:09,  5.38it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m3d.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m95.xml


 49%|████▉     | 49/100 [00:08<00:09,  5.48it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-277m15d.xml


 51%|█████     | 51/100 [00:08<00:11,  4.41it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m17d.xml


 53%|█████▎    | 53/100 [00:09<00:10,  4.61it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m6da.xml


 54%|█████▍    | 54/100 [00:09<00:11,  4.10it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m45d.xml


 55%|█████▌    | 55/100 [00:09<00:12,  3.69it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST-633m12d.xml


 56%|█████▌    | 56/100 [00:10<00:10,  4.17it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m32.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m21d.xml


 58%|█████▊    | 58/100 [00:10<00:07,  5.68it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-230m2.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m11d.xml


 60%|██████    | 60/100 [00:10<00:08,  4.70it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m14da.xml


 61%|██████    | 61/100 [00:11<00:08,  4.61it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m29.xml


 63%|██████▎   | 63/100 [00:11<00:09,  4.10it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m15a.xml


 66%|██████▌   | 66/100 [00:12<00:06,  5.56it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m12.xml


 69%|██████▉   | 69/100 [00:12<00:04,  6.22it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-235m13.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m7.xml


 72%|███████▏  | 72/100 [00:13<00:05,  5.35it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m80.xml


 75%|███████▌  | 75/100 [00:13<00:03,  7.05it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/norw7.xml


 76%|███████▌  | 76/100 [00:13<00:03,  6.41it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-635m89.xml


 77%|███████▋  | 77/100 [00:13<00:04,  4.92it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST-633m38.xml


 78%|███████▊  | 78/100 [00:13<00:03,  5.53it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/norw3.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m167d.xml


 82%|████████▏ | 82/100 [00:14<00:02,  6.99it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m5a.xml


 83%|████████▎ | 83/100 [00:14<00:03,  5.59it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m23d.xml


 84%|████████▍ | 84/100 [00:15<00:03,  4.49it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m6d.xml


 85%|████████▌ | 85/100 [00:15<00:02,  5.03it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m8.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m1.xml


 87%|████████▋ | 87/100 [00:15<00:02,  4.71it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m10d.xml


 88%|████████▊ | 88/100 [00:15<00:02,  4.56it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST-633m11.xml


 89%|████████▉ | 89/100 [00:16<00:02,  4.13it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m32d.xml


 90%|█████████ | 90/100 [00:16<00:02,  4.02it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m18d.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m5.xml


 92%|█████████▏| 92/100 [00:17<00:02,  2.99it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m22.xml


 94%|█████████▍| 94/100 [00:17<00:01,  4.31it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m3a.xml
Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m18.xml


 96%|█████████▌| 96/100 [00:17<00:00,  4.91it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST-633m20.xml


 97%|█████████▋| 97/100 [00:18<00:00,  4.52it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m12d.xml


 98%|█████████▊| 98/100 [00:18<00:00,  5.01it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m3b.xml


 99%|█████████▉| 99/100 [00:18<00:00,  4.44it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-633m9d.xml


100%|██████████| 100/100 [00:18<00:00,  5.28it/s]

Unmatched polygons for /home/efittsc1/projects/latin-transcription/source/JUST1-734m17da.xml





In [58]:
# Collapse the accumulated line-level metrics into plain Python floats.
character_error_rate_, word_error_rate_ = (
    float(metric.compute()) for metric in (char_error_rate, word_error_rate)
)


In [59]:
# Report the line-level scores, one per output line.
print(
    f"Character Error Rate: {character_error_rate_}",
    f"Word Error Rate: {word_error_rate_}",
    sep="\n",
)

Character Error Rate: 0.22566388547420502
Word Error Rate: 0.5422852039337158


In [60]:
# Page-level evaluation: all lines of a page are joined into one string per
# file and scored as a whole, without pairing individual lines.
char_error_rate_old = CharErrorRate()
word_error_rate_old = WordErrorRate()

# data_old[page name] -> full ground-truth and hypothesis text of the page
data_old = {}

for filename in tqdm(glob.glob(GROUND_TRUTH_SOURCE + "/*.xml")):
    page_name = os.path.basename(filename)
    if page_name in transcribus_files_dics:
        transkribus_file = transcribus_files_dics[page_name]
        ground_truth_file = filename

        # Keep only the text of each (text, baseline) entry.
        transkribus_lines = [text for text, _baseline in extract_lines_from_xml(transkribus_file)]
        ground_truth_lines = [text for text, _baseline in extract_lines_from_xml(ground_truth_file)]

        transkribus_text = " ".join(transkribus_lines)
        ground_truth_text = " ".join(ground_truth_lines)

        gt = transformation(ground_truth_text)
        hyp = transformation(transkribus_text)

        for metric in (char_error_rate_old, word_error_rate_old):
            metric.update(hyp, gt)

        data_old[page_name] = {
            "ground_truth": ground_truth_text,
            "hypothesis": transkribus_text,
        }

  1%|          | 1/99 [00:02<04:09,  2.55s/it]


KeyboardInterrupt: 

In [None]:
# Collapse the page-level metrics into floats and report them.
# The word-level result gets its own name (trailing underscore): assigning it
# to `word_error_rate_old` would replace the metric object with a float, and
# re-running this cell would then fail on `.compute()`.
character_error_rate_old = float(char_error_rate_old.compute())
word_error_rate_old_ = float(word_error_rate_old.compute())
print(f"Character Error Rate: {character_error_rate_old}")
print(f"Word Error Rate: {word_error_rate_old_}")

Character Error Rate: 0.2582835257053375
Word Error Rate: 0.566498339176178
