<a href="https://colab.research.google.com/github/eugeniavd/magic_tagger/blob/main/notebooks/03_transformer_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1.Setup

In [2]:
!git clone https://github.com/eugeniavd/magic_tagger.git
%cd magic_tagger
!ls -la


Cloning into 'magic_tagger'...
remote: Enumerating objects: 181, done.[K
remote: Counting objects: 100% (181/181), done.[K
remote: Compressing objects: 100% (136/136), done.[K
remote: Total 181 (delta 55), reused 161 (delta 35), pack-reused 0 (from 0)[K
Receiving objects: 100% (181/181), 21.13 MiB | 23.78 MiB/s, done.
Resolving deltas: 100% (55/55), done.
/content/magic_tagger
total 44
drwxr-xr-x 7 root root 4096 Dec 28 18:09 .
drwxr-xr-x 1 root root 4096 Dec 28 18:08 ..
drwxr-xr-x 5 root root 4096 Dec 28 18:09 app
drwxr-xr-x 5 root root 4096 Dec 28 18:09 data
drwxr-xr-x 7 root root 4096 Dec 28 18:09 docs
drwxr-xr-x 8 root root 4096 Dec 28 18:09 .git
-rw-r--r-- 1 root root   66 Dec 28 18:09 .gitattributes
-rw-r--r-- 1 root root  387 Dec 28 18:09 .gitignore
-rw-r--r-- 1 root root    0 Dec 28 18:09 LICENSE
drwxr-xr-x 3 root root 4096 Dec 28 18:09 notebooks
-rw-r--r-- 1 root root 4208 Dec 28 18:09 README.md


In [3]:
!pip -q install -r ./notebooks/requirements.txt


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m109.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m143.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m130.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m128.0 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Data loading and preprocessing

### 1. Data loading

In [25]:
# === Imports ===
from pathlib import Path
import pandas as pd
from lxml import etree
from PIL import Image
from typing import Iterable, Union
import fitz
import re
import numpy as np
import torch
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import requests

In [5]:
# === Paths ===
PROJECT_ROOT = Path(".").resolve()

PAGEXML_DIR_CANDIDATES = [
    PROJECT_ROOT / "data" / "raw" / "scans" / "eval_pages_xml"
]

EVAL_SUMMARY_CANDIDATES = [
    PROJECT_ROOT / "docs" / "htr" / "eval_summary.xlsx"
]

GOLDEN_TRUTH_XLSX_CANDIDATES = [
    PROJECT_ROOT / "docs" / "htr" / "golden_truth.xlsx"
]

SCANS_ROOT = PROJECT_ROOT / "data" / "raw" / "scans"
assert SCANS_ROOT.exists(), f"SCANS_ROOT not found: {SCANS_ROOT}"


In [6]:
def first_existing(paths):
    for p in paths:
        if p.exists():
            return p
    return None


eval_summary_path = first_existing(EVAL_SUMMARY_CANDIDATES)
gt_xlsx_path = first_existing(GOLDEN_TRUTH_XLSX_CANDIDATES)

print("eval_summary_path:", eval_summary_path)
print("gt_xlsx_path:", gt_xlsx_path)

eval_summary_path: /content/magic_tagger/docs/htr/eval_summary.xlsx
gt_xlsx_path: /content/magic_tagger/docs/htr/golden_truth.xlsx


In [7]:
def find_page_image_in_tale(tale_id: str, page_id: str, scans_root: Path) -> Path | None:
    tale_dir = scans_root / tale_id
    if not tale_dir.exists():
        return None

    exts = ("*.pdf")
    for ext in ("pdf"):
        p = tale_dir / f"{page_id}.{ext}"
        if p.exists():
            return p

    for pattern in exts:
        for p in tale_dir.glob(pattern):
            if p.stem.startswith(page_id):
                return p
    return None


def load_image(tale_id: str, page_id: str, scans_root: Path) -> Image.Image:
    img_path = find_page_image_in_tale(tale_id, page_id, scans_root)
    if not img_path:
        raise FileNotFoundError(f"No image found for tale_id={tale_id}, page_id={page_id} in {scans_root/tale_id}")
    return Image.open(img_path).convert("RGB")


In [8]:
def first_dir_with_files(candidates: Union[Path, str, Iterable[Union[Path, str]]], pattern: str):

    # Нормализуем вход: если это один Path/str — оборачиваем в список
    if isinstance(candidates, (Path, str)):
        candidates = [candidates]

    for cand in candidates:
        cand = Path(cand)
        if not cand.exists():
            continue
        if any(cand.rglob(pattern)):
            return cand
    return None


pagexml_root = first_dir_with_files(PAGEXML_DIR_CANDIDATES, "*.xml")
pdf_root     = first_dir_with_files(SCANS_ROOT, "*.pdf")

print("pagexml_root:", pagexml_root)
print("pdf_root:", pdf_root)


pagexml_root: /content/magic_tagger/data/raw/scans/eval_pages_xml
pdf_root: /content/magic_tagger/data/raw/scans


In [9]:
dfs = {}

if eval_summary_path:
    dfs["eval_summary"] = pd.read_excel(eval_summary_path)
    print("Loaded eval_summary:", dfs["eval_summary"].shape)
    display(dfs["eval_summary"].head(3))

if gt_xlsx_path:
    dfs["golden_truth_xlsx"] = pd.read_excel(gt_xlsx_path)
    print("Loaded golden_truth.xlsx:", dfs["golden_truth_xlsx"].shape)
    display(dfs["golden_truth_xlsx"].head(3))


Loaded eval_summary: (10, 10)


Unnamed: 0,handwriting_id,n_pages_made,CER_transkrib,WER_transkrib,CER_transkrib_average,WER_transkrib_average,CER_transkrib_median,WER_transkrib_median,p_CER_le_25_transkrib_weighted,p_CER_ge_50_transcrib_weighted
0,H01,2,10.77,0.2464,33.43,0.522247,23.575,0.4703,1,0.0
1,H02,2,49.18,0.5924,33.43,0.522247,23.575,0.4703,0,0.564103
2,H03,2,76.82,0.8894,33.43,0.522247,23.575,0.4703,0,1.0


Loaded golden_truth.xlsx: (20, 19)


Unnamed: 0,collector_ids,collector_count,handwriting_id,handwriting_status (unknown/assigned),handwriting_confidence (low | medium | high),tale_id,image_filename,page_no,page_side,page_id,lines_count,lines_selected,quality_hint (good|ok|bad),major_issues,gt_status,CER_transkrib,WER_transkrib,p_CER_le_25_transkrib_weighted,p_CER_ge_50_transcrib_weighted
0,Федор Коняев,1,H01,assigned,high,era_vene_2_605_4,era_vene_02_606-607.pdf,606,L,era_vene_2_606,18,20,good,,made,10.77,0.2464,1,0.0
1,Федор Коняев,1,H01,assigned,high,era_vene_2_605_4,era_vene_02_606-607.pdf,607,R,era_vene_2_607,19,21,good,,made,10.77,0.2464,1,0.0
2,Nina Valkevitš,1,H02,assigned,high,era_vene_5_167_9,era_vene_05_166-167.pdf,167,R,era_vene_5_167,20,17,good,,made,49.18,0.5924,0,0.564103


In [10]:
def parse_pagexml(xml_path: Path) -> pd.DataFrame:
    """
    parse transkribus export and return DataFrame of with lines coordinates and golden truth text:
    """
    parser = etree.XMLParser(recover=True, huge_tree=True)
    tree = etree.parse(str(xml_path), parser)
    root = tree.getroot()

    # Namespace handling
    ns = root.nsmap.get(None)  # default namespace
    if ns:
        N = {"p": ns}
        xpath_line = ".//p:TextLine"
        xpath_unicode = ".//p:Unicode"
        xpath_coords = ".//p:Coords"
    else:
        N = {}
        xpath_line = ".//TextLine"
        xpath_unicode = ".//Unicode"
        xpath_coords = ".//Coords"

    rows = []

    page_id = xml_path.stem

    for line in root.xpath(xpath_line, namespaces=N):
        line_id = line.get("id") or line.get("{http://www.w3.org/XML/1998/namespace}id") or ""

        # Unicode может быть внутри TextEquiv
        unicode_el = line.xpath(xpath_unicode, namespaces=N)
        text = ""
        if unicode_el:
            # берём последний (часто лучший) вариант
            text = unicode_el[-1].text or ""

        # Coords: points="x1,y1 x2,y2 ..."
        coords_el = line.xpath(xpath_coords, namespaces=N)
        x = y = w = h = None
        if coords_el:
            pts = coords_el[0].get("points")
            if pts:
                xy = []
                for p in pts.split():
                    if "," in p:
                        px, py = p.split(",")
                        xy.append((int(float(px)), int(float(py))))
                if xy:
                    xs = [p[0] for p in xy]
                    ys = [p[1] for p in xy]
                    x, y = min(xs), min(ys)
                    w, h = max(xs) - x, max(ys) - y

        rows.append({
            "pagexml_file": str(xml_path.name),
            "page_id": page_id,
            "line_id": line_id,
            "x": x, "y": y, "w": w, "h": h,
            "gt_text": text,
        })

    return pd.DataFrame(rows)


def load_pagexml_folder(pagexml_folder: Path) -> pd.DataFrame:
    all_rows = []
    for xml_path in sorted(pagexml_folder.glob("*.xml")):
        try:
            df = parse_pagexml(xml_path)
            all_rows.append(df)
        except Exception as e:
            print(f"[WARN] Failed parsing {xml_path.name}: {e}")

    if not all_rows:
        return pd.DataFrame(columns=["pagexml_file","page_id","line_id","x","y","w","h","gt_text"])
    out = pd.concat(all_rows, ignore_index=True)
    return out


if pagexml_root:
    df_gt_lines = load_pagexml_folder(pagexml_root)
    print("Loaded GT lines from PageXML:", df_gt_lines.shape)
    display(df_gt_lines.head(5))
else:
    df_gt_lines = pd.DataFrame()
    print("No PageXML folder found. Put *.xml exports into e.g. data/gt_pagexml/")

Loaded GT lines from PageXML: (351, 8)


Unnamed: 0,pagexml_file,page_id,line_id,x,y,w,h,gt_text
0,era_rkm_vene_03_258-259.xml,era_rkm_vene_03_258-259,tr_2_tl_1,401,387,1079,145,"чулок вязать, кто хошь"
1,era_rkm_vene_03_258-259.xml,era_rkm_vene_03_258-259,tr_2_tl_2,405,487,1173,136,(сеть) вязти кто что. Се¬
2,era_rkm_vene_03_258-259.xml,era_rkm_vene_03_258-259,tr_2_tl_3,416,580,1138,107,одна супряженка к сто-
3,era_rkm_vene_03_258-259.xml,era_rkm_vene_03_258-259,tr_2_tl_4,404,676,1108,121,лу вязть. Под столом
4,era_rkm_vene_03_258-259.xml,era_rkm_vene_03_258-259,tr_2_tl_5,404,793,1101,92,была в подпол большая


In [11]:
def find_pdf_for_tale(pdf_root: Path, tale_id: str) -> Path:
    tale_dir = pdf_root / tale_id
    if not tale_dir.exists():
        raise FileNotFoundError(f"No tale folder under scans: {tale_dir}")

    pdfs = sorted(tale_dir.glob("*.pdf"))
    if not pdfs:
        raise FileNotFoundError(f"No PDFs found in: {tale_dir}")

    return pdfs[0]

In [12]:
tale_id = "era_vene_8_210_126"

pdf_path = find_pdf_for_tale(pdf_root, tale_id)

print("PDF:", pdf_path)


PDF: /content/magic_tagger/data/raw/scans/era_vene_8_210_126/era_vene_08_210-211.pdf


In [13]:
def build_tale_to_pdf_map(pdf_root: Path) -> dict[str, Path]:
    m = {}
    for tale_dir in sorted([p for p in pdf_root.iterdir() if p.is_dir()]):
        pdfs = sorted(tale_dir.glob("*.pdf"))
        if pdfs:
            m[tale_dir.name] = pdfs[0]
    return m

tale2pdf = build_tale_to_pdf_map(pdf_root)
print("Found tales with PDFs:", len(tale2pdf))
print("Example:", next(iter(tale2pdf.items())))


Found tales with PDFs: 10
Example: ('era_vene_12_137_98', PosixPath('/content/magic_tagger/data/raw/scans/era_vene_12_137_98/era_vene_12_138-139.pdf'))


In [14]:
def render_pdf_to_images(pdf_path: Path, out_dir: Path, dpi: int = 250) -> list[Path]:
    """
    render PDF to PNG.
    return paths to PNG
    """
    pdf_path = Path(pdf_path)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(pdf_path)
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)

    out_paths = []
    for i in range(doc.page_count):
        out_png = out_dir / f"page_{i:04d}.png"
        if not out_png.exists():
            page = doc.load_page(i)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            pix.save(out_png.as_posix())
        out_paths.append(out_png)

    doc.close()
    return out_paths

In [15]:
tale_id = "era_vene_8_210_126"
pdf_path = find_pdf_for_tale(pdf_root, tale_id)

render_dir = PROJECT_ROOT / "data" / "processed" / "renders" / tale_id
page_images = render_pdf_to_images(pdf_path, render_dir, dpi=250)

print("PDF:", pdf_path)
print("Rendered pages:", len(page_images))
print("First image:", page_images[0])


PDF: /content/magic_tagger/data/raw/scans/era_vene_8_210_126/era_vene_08_210-211.pdf
Rendered pages: 1
First image: /content/magic_tagger/data/processed/renders/era_vene_8_210_126/page_0000.png


### 2. Cutting images to lines

In [16]:
def find_pagexml_paths(pagexml_root: Path, tale_id: str) -> list[Path]:
    pagexml_root = Path(pagexml_root)

    hits = sorted(pagexml_root.rglob(f"*{tale_id}*.xml"))
    if hits:
        return hits

    return sorted(pagexml_root.rglob("*.xml"))

In [17]:
xml_paths_all = find_pagexml_paths(pagexml_root, tale_id)
print("Found xml candidates:", len(xml_paths_all))
print("First 5:", xml_paths_all[:5])


Found xml candidates: 10
First 5: [PosixPath('/content/magic_tagger/data/raw/scans/eval_pages_xml/era_rkm_vene_03_258-259.xml'), PosixPath('/content/magic_tagger/data/raw/scans/eval_pages_xml/era_vene_02_606-607.xml'), PosixPath('/content/magic_tagger/data/raw/scans/eval_pages_xml/era_vene_05_166-167.xml'), PosixPath('/content/magic_tagger/data/raw/scans/eval_pages_xml/era_vene_05_442-443.xml'), PosixPath('/content/magic_tagger/data/raw/scans/eval_pages_xml/era_vene_06_380-381.xml')]


In [18]:
xml_paths_all = find_pagexml_paths(pagexml_root, tale_id)

if len(xml_paths_all) == 0:
    raise RuntimeError("We have not finded PageXML in the repo")

if len(xml_paths_all) == len(page_images):
    xml_paths = xml_paths_all
else:
    if len(page_images) == 1:
        xml_paths = [xml_paths_all[0]]
    else:
        xml_paths = xml_paths_all[:len(page_images)]

print("Using xml_paths:", len(xml_paths), "for pages:", len(page_images))
print("Example:", xml_paths[0], page_images[0])


Using xml_paths: 1 for pages: 1
Example: /content/magic_tagger/data/raw/scans/eval_pages_xml/era_rkm_vene_03_258-259.xml /content/magic_tagger/data/processed/renders/era_vene_8_210_126/page_0000.png


In [19]:
def normalize_text(s: str) -> str:
    if s is None:
        return ""
    s = s.strip()

    s = re.sub(r"\s+", " ", s)
    return s

def render_pdf_first_page(pdf_path: Path, dpi: int = 200) -> Image.Image:
    doc = fitz.open(str(pdf_path))
    page = doc.load_page(0)
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    doc.close()
    return img

def safe_crop(img: Image.Image, x: int, y: int, w: int, h: int, pad: int = 0) -> Image.Image:
    W, H = img.size
    x0 = max(0, int(x) - pad)
    y0 = max(0, int(y) - pad)
    x1 = min(W, int(x) + int(w) + pad)
    y1 = min(H, int(y) + int(h) + pad)

    if x1 <= x0 or y1 <= y0:
        return Image.new("RGB", (10, 10), (255, 255, 255))
    return img.crop((x0, y0, x1, y1))


In [20]:


def build_pageid_to_pdf(scans_root: Path, page_ids: list[str]) -> dict[str, Path]:
    pageid_to_pdf = {}
    missing = []
    for pid in page_ids:
        hits = list(scans_root.rglob(f"{pid}.pdf"))
        if not hits:

            hits = list(scans_root.rglob(f"*{pid}*.pdf"))
        if hits:

            hits = sorted(hits, key=lambda p: len(str(p)))
            pageid_to_pdf[pid] = hits[0]
        else:
            missing.append(pid)
    if missing:
        print("WARN: no PDF found for page_id (first 10):", missing[:10], "… total:", len(missing))
    return pageid_to_pdf

page_ids = sorted(df_gt_lines["page_id"].unique().tolist())
pageid_to_pdf = build_pageid_to_pdf(SCANS_ROOT, page_ids)
print("Resolved PDFs:", len(pageid_to_pdf), "out of", len(page_ids))


Resolved PDFs: 10 out of 10


In [21]:

pageid_to_pdf = {}
for pdf in SCANS_ROOT.rglob("*.pdf"):
    pageid_to_pdf[pdf.stem] = pdf

print("PDFs indexed:", len(pageid_to_pdf))

missing = sorted(set(df_gt_lines["page_id"]) - set(pageid_to_pdf.keys()))
print("Missing page_ids:", len(missing))
print("Example missing:", missing[:5])

PDFs indexed: 11
Missing page_ids: 0
Example missing: []


## 3. Run TrOCR

In [22]:
TROCR_MODEL_NAME = "kazars24/trocr-base-handwritten-ru"

processor = TrOCRProcessor.from_pretrained(TROCR_MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(TROCR_MODEL_NAME)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(DEVICE)
model.eval()

BATCH_SIZE = 8
CROP_PAD = 6
DPI = 200


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.46.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

generation_config.json:   0%|          | 0.00/273 [00:00<?, ?B/s]

In [23]:
print(torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)

True NVIDIA L4


In [28]:
from tqdm.auto import tqdm


def trocr_predict_lines_for_page(page_df: pd.DataFrame, page_img: Image.Image) -> list[str]:
    crops = []
    for _, r in page_df.iterrows():
        crop = safe_crop(
            page_img,
            x=int(r["x"]), y=int(r["y"]),
            w=int(r["w"]), h=int(r["h"]),
            pad=CROP_PAD
        ).convert("RGB")
        crops.append(crop)

    preds = []
    for i in range(0, len(crops), BATCH_SIZE):
        batch = crops[i:i+BATCH_SIZE]
        with torch.no_grad():
            pixel_values = processor(images=batch, return_tensors="pt").pixel_values.to(DEVICE)
            generated_ids = model.generate(pixel_values, max_new_tokens=128)
            texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
        preds.extend([normalize_text(t) for t in texts])

    return preds

# --- Ensure numeric coords ---
df_gt_lines = df_gt_lines.copy()
for col in ["x", "y", "w", "h"]:
    df_gt_lines[col] = pd.to_numeric(df_gt_lines[col], errors="coerce").fillna(0).astype(int)

df_gt_lines["pred_trocr"] = ""

page_image_cache = {}

for page_id, group in tqdm(df_gt_lines.groupby("page_id", sort=False), total=df_gt_lines["page_id"].nunique()):
    pdf_path = pageid_to_pdf.get(page_id)
    if pdf_path is None:
        continue

    if page_id not in page_image_cache:
        page_image_cache[page_id] = render_pdf_first_page(pdf_path, dpi=DPI)

    page_img = page_image_cache[page_id]

    preds = trocr_predict_lines_for_page(group, page_img)

    if len(preds) != len(group):
        raise RuntimeError(f"Pred/rows mismatch for {page_id}: preds={len(preds)} rows={len(group)}")

    df_gt_lines.loc[group.index, "pred_trocr"] = preds

filled = (df_gt_lines["pred_trocr"].str.len() > 0).sum()
print("Filled predictions:", filled, "/", len(df_gt_lines))


  0%|          | 0/10 [00:00<?, ?it/s]

Filled predictions: 351 / 351


In [42]:
df_gt_lines.tail(20)

Unnamed: 0,pagexml_file,page_id,line_id,x,y,w,h,gt_text,pred_trocr
331,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_1,1936,394,1204,141,за мяня гамуж?» А ана,по
332,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_2,1933,509,1212,114,"сказала: ""Ну как мне ийти, у",И
333,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_3,1929,603,1228,97,"мяня нет больше ничаво, как",по
334,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_4,1933,665,1256,133,"только что на теле"". Он сказал:",л.
335,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_5,1926,747,1299,135,"""Ну, что ты хогешь я принясу?""",И
336,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_6,1924,854,1270,81,Онва захатела платьев и он при-,по
337,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_7,1936,929,1240,123,нес ей два сундука. Тут пра-,по
338,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_8,1926,1004,1263,113,пел петушок и чорт правалил,по
339,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_9,1941,1089,1258,143,ся. Приходит утра. Она паш-,го
340,era_vene_15_134-135.xml,era_vene_15_134-135,tr_3_tl_10,1926,1177,1271,97,"ла дамой и говорит: “Папа,",5


проверка

In [44]:
NS = {"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"}

def get_pagexml_size(xml_path):
    root = etree.parse(str(xml_path)).getroot()
    page = root.find(".//pc:Page", namespaces=NS)
    w = int(page.get("imageWidth"))
    h = int(page.get("imageHeight"))
    return w, h

In [45]:
def scale_bbox(x, y, w, h, src_w, src_h, dst_w, dst_h):
    sx = dst_w / src_w
    sy = dst_h / src_h
    return (
        int(round(x * sx)),
        int(round(y * sy)),
        int(round(w * sx)),
        int(round(h * sy)),
    )


In [49]:
from pathlib import Path

def build_pageid_to_xml(pagexml_root: Path) -> dict[str, Path]:
    xml_paths = list(pagexml_root.rglob("*.xml"))
    pageid_to_xml = {}
    for p in xml_paths:
        key = p.stem
        pageid_to_xml[key] = p
    return pageid_to_xml

pageid_to_xml = build_pageid_to_xml(pagexml_root)


missing = [pid for pid in df_gt_lines["page_id"].unique() if pid not in pageid_to_xml]
print("XML total:", len(pageid_to_xml))
print("page_id unique:", df_gt_lines["page_id"].nunique())
print("missing page_ids:", missing[:20], " ... total:", len(missing))


XML total: 10
page_id unique: 10
missing page_ids: []  ... total: 0


In [51]:
pagexml_size_cache = {}

for page_id, group in df_gt_lines.groupby("page_id", sort=False):
    xml_path = pageid_to_xml.get(page_id)
    if xml_path is None:
        continue

    if page_id not in pagexml_size_cache:
        pagexml_size_cache[page_id] = get_pagexml_size(xml_path)

    src_w, src_h = pagexml_size_cache[page_id]
    page_img = render_pdf_first_page(pageid_to_pdf[page_id], dpi=DPI)

    preds = trocr_predict_lines_for_page(group, page_img, src_w, src_h)
    df_gt_lines.loc[group.index, "pred_trocr"] = preds


AttributeError: 'NoneType' object has no attribute 'get'

In [52]:
def trocr_predict_lines_for_page(page_df, page_img, src_w, src_h):
    dst_w, dst_h = page_img.size
    crops = []

    for _, r in page_df.iterrows():
        x, y, w, h = int(r["x"]), int(r["y"]), int(r["w"]), int(r["h"])
        x2, y2, w2, h2 = scale_bbox(x, y, w, h, src_w, src_h, dst_w, dst_h)
        crop = safe_crop(page_img, x2, y2, w2, h2, pad=CROP_PAD).convert("RGB")
        crops.append(crop)

        preds = []
    for i in range(0, len(crops), BATCH_SIZE):
        batch = crops[i:i+BATCH_SIZE]
        with torch.no_grad():
            pixel_values = processor(images=batch, return_tensors="pt").pixel_values.to(DEVICE)
            generated_ids = model.generate(pixel_values, max_new_tokens=128)
            texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
        preds.extend([normalize_text(t) for t in texts])

    return preds

# --- Ensure numeric coords ---
df_gt_lines = df_gt_lines.copy()
for col in ["x", "y", "w", "h"]:
    df_gt_lines[col] = pd.to_numeric(df_gt_lines[col], errors="coerce").fillna(0).astype(int)

df_gt_lines["pred_trocr"] = ""

page_image_cache = {}

for page_id, group in tqdm(df_gt_lines.groupby("page_id", sort=False), total=df_gt_lines["page_id"].nunique()):
    pdf_path = pageid_to_pdf.get(page_id)
    if pdf_path is None:
        continue

    if page_id not in page_image_cache:
        page_image_cache[page_id] = render_pdf_first_page(pdf_path, dpi=DPI)

    page_img = page_image_cache[page_id]

    preds = trocr_predict_lines_for_page(group, page_img)

    if len(preds) != len(group):
        raise RuntimeError(f"Pred/rows mismatch for {page_id}: preds={len(preds)} rows={len(group)}")

    df_gt_lines.loc[group.index, "pred_trocr"] = preds

filled = (df_gt_lines["pred_trocr"].str.len() > 0).sum()
print("Filled predictions:", filled, "/", len(df_gt_lines))

  0%|          | 0/10 [00:00<?, ?it/s]

TypeError: trocr_predict_lines_for_page() missing 2 required positional arguments: 'src_w' and 'src_h'

## 4. Compute quality metrics

## 5. Export results