In [13]:
# JUPYTER: LayoutLMv3 token classification on DocLayNet (ALL classes, with images)
# Robust to nested pdf_cells; uses processor(images=..., text=...); skips safely; drops helper cols.
import os, time
from pathlib import Path
from typing import Any, Dict, List, Tuple, Iterable

import numpy as np
import torch
from datasets import load_dataset, load_from_disk, Dataset
from transformers import (
    LayoutLMv3Processor,
    LayoutLMv3ForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
)
from PIL import Image

# =========================
# CONFIG
# =========================
BASE_PATH = Path("/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout")
IMAGE_ROOT = BASE_PATH / "images"   # images/{split}/{idx}.png

MODEL_NAME   = "microsoft/layoutlmv3-base"
OUTPUT_DIR   = "./layoutlmv3_doclaynet_tokcls_all"
MAX_LEN      = 512
LR           = 3e-5
EPOCHS       = 4
BATCH_TRAIN  = 2
BATCH_EVAL   = 2

# Labels: ALL DocLayNet classes + "O" (12 total)
DOC_LAYNET_CLASSES = [
    "CAPTION","FOOTNOTE","FORMULA","LIST-ITEM","PAGE-FOOTER",
    "PAGE-HEADER","PICTURE","SECTION-HEADER","TABLE","TEXT","TITLE"
]
LABELS   = ["O"] + DOC_LAYNET_CLASSES
label2id = {l: i for i, l in enumerate(LABELS)}
id2label = {i: l for l, i in label2id.items()}
ID2NAME_FALLBACK = {  # DocLayNet int ids → names
    1:"CAPTION", 2:"FOOTNOTE", 3:"FORMULA", 4:"LIST-ITEM", 5:"PAGE-FOOTER",
    6:"PAGE-HEADER", 7:"PICTURE", 8:"SECTION-HEADER", 9:"TABLE", 10:"TEXT", 11:"TITLE"
}

# =========================
# HELPERS — create {split}_with_images only if missing
# =========================
def get_metadata_keys(dataset) -> set:
    return {tuple(sorted(meta.items())) for meta in dataset["metadata"]}

def save_image_file(pil_img: Image.Image, image_path: Path):
    image_path.parent.mkdir(parents=True, exist_ok=True)
    pil_img.save(image_path)

def ensure_with_images_split(split: str):
    with_images_dir = BASE_PATH / f"{split}_with_images"
    if with_images_dir.exists():
        print(f"[i] Found existing '{split}_with_images' at {with_images_dir}")
        return
    subset_dir = BASE_PATH / split
    if not subset_dir.exists():
        raise FileNotFoundError(f"Subset not found: {subset_dir}")
    print(f"[i] Creating '{split}_with_images' by attaching images...")
    ann_subset = load_from_disk(str(subset_dir))
    meta_keys = get_metadata_keys(ann_subset)
    if not meta_keys:
        raise ValueError(f"No 'metadata' column in {subset_dir}; cannot match images.")
    ann_dict = {tuple(sorted(meta.items())): i for i, meta in enumerate(ann_subset["metadata"])}
    image_paths = [None] * len(ann_subset)

    t0 = time.time()
    stream = load_dataset("ds4sd/DocLayNet-v1.2", split=split, streaming=True)
    matched = 0
    for sample in stream:
        key = tuple(sorted(sample["metadata"].items()))
        if key in meta_keys:
            idx = ann_dict.get(key)
            if idx is None:
                continue
            img_dir  = IMAGE_ROOT / split
            img_path = img_dir / f"{idx}.png"
            save_image_file(sample["image"], img_path)
            image_paths[idx] = str(img_path)
            matched += 1
            meta_keys.remove(key)
            if not meta_keys:
                break
    print(f"   ✅ Saved {matched} images to {IMAGE_ROOT/split} in {time.time()-t0:.1f}s")
    ann_subset = ann_subset.add_column("image_path", image_paths)
    ann_subset.save_to_disk(str(with_images_dir))
    print(f"   💾 Saved '{split}_with_images' at {with_images_dir}")

# Ensure with-images sets exist (you already have them; this won’t rebuild)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "1")
for split in ["train", "validation"]:
    ensure_with_images_split(split)

# =========================
# GEOMETRY & LABELING
# =========================
def clamp(v, lo, hi): 
    return max(lo, min(hi, v))

def to_1000_space(boxes: List[List[float]], w: int, h: int) -> List[List[int]]:
    mv = max((max(b) for b in boxes if b), default=0.0)
    out = []
    for b in boxes:
        if len(b) != 4:
            out.append([0,0,1,1]); continue
        x0,y0,x1,y1 = b
        if x1 < x0: x0,x1 = x1,x0
        if y1 < y0: y0,y1 = y1,y0
        if mv <= 1.0001:
            X0,Y0,X1,Y1 = int(round(x0*1000)),int(round(y0*1000)),int(round(x1*1000)),int(round(y1*1000))
        elif w>0 and h>0:
            X0,Y0,X1,Y1 = int(round((x0/w)*1000)),int(round((y0/h)*1000)),int(round((x1/w)*1000)),int(round((y1/h)*1000))
        else:
            X0,Y0,X1,Y1 = int(round(x0)),int(round(y0)),int(round(x1)),int(round(y1))
        X0,Y0,X1,Y1 = clamp(X0,0,1000),clamp(Y0,0,1000),clamp(X1,0,1000),clamp(Y1,0,1000)
        if X1 == X0: X1 = min(1000, X0+1)
        if Y1 == Y0: Y1 = min(1000, Y0+1)
        out.append([X0,Y0,X1,Y1])
    return out

def box_iou(a: List[float], b: List[float]) -> float:
    ax0, ay0, ax1, ay1 = a
    bx0, by0, bx1, by1 = b
    ix0, iy0 = max(ax0, bx0), max(ay0, by0)
    ix1, iy1 = min(ax1, bx1), min(ay1, by1)
    iw, ih = max(0.0, ix1-ix0), max(0.0, iy1-iy0)
    inter = iw*ih
    areaA = max(0.0, ax1-ax0)*max(0.0, ay1-ay0)
    areaB = max(0.0, bx1-bx0)*max(0.0, by1-by0)
    union = areaA + areaB - inter + 1e-6
    return inter/union

def normalize_region_name(name: str) -> str:
    return "PICTURE" if name == "FIGURE" else name

def region_name_from_label(l: Any) -> str:
    if isinstance(l, str): return normalize_region_name(l)
    if isinstance(l, int): return normalize_region_name(ID2NAME_FALLBACK.get(l, "TEXT"))
    return "TEXT"

def label_id_from_name(name: str) -> int:
    name = normalize_region_name(name)
    if name not in DOC_LAYNET_CLASSES:
        name = "TEXT"
    return label2id[name]

# Flatten nested pdf_cells → dicts with text & bbox
def _iter_cells(obj: Any):
    if obj is None:
        return
    if isinstance(obj, dict):
        if "text" in obj and ("bbox" in obj or "box" in obj):
            yield obj
        for k in ("cells","items","data"):
            if k in obj:
                yield from _iter_cells(obj[k])
    elif isinstance(obj, (list, tuple)):
        for it in obj:
            yield from _iter_cells(it)

def derive_words_from_pdf_cells(ex: Dict[str, Any]) -> Tuple[List[str], List[List[float]]]:
    words, boxes = [], []
    for c in _iter_cells(ex.get("pdf_cells")):
        bb  = c.get("bbox", c.get("box"))
        txt = c.get("text")
        if not txt or not bb or len(bb) != 4: 
            continue
        toks = str(txt).strip().split()
        if not toks: 
            continue
        words.extend(toks)
        boxes.extend([bb]*len(toks))
    return words, boxes

def assign_cell_label(cell_box, region_boxes, region_labels) -> int:
    if not region_boxes or not region_labels:
        return 0
    best, best_iou = -1, 0.0
    for i, rb in enumerate(region_boxes):
        iou = box_iou(cell_box, rb)
        if iou > best_iou:
            best_iou, best = iou, i
    if best < 0:
        return 0
    rname = region_name_from_label(region_labels[best])
    return label_id_from_name(rname)

def detect_image(ex: Dict[str, Any]) -> Image.Image | None:
    p = ex.get("image_path")
    if p:
        try: return Image.open(p).convert("RGB")
        except Exception: return None
    img = ex.get("image")
    if img is not None:
        if isinstance(img, Image.Image): return img.convert("RGB")
        try: return Image.fromarray(img).convert("RGB")
        except Exception: return None
    return None

# =========================
# PREPROCESS (dummy pack + images=/text=)
# =========================
def build_preprocess(processor):
    skip = {"no_image":0, "no_words":0, "len_mismatch":0}

    # dummy pack so skipped rows still emit full columns
    _dummy_img = Image.new("RGB", (8,8), color=(255,255,255))
    _dummy_enc = processor(
        images=_dummy_img, text=["_"], boxes=[[0,0,1,1]], word_labels=[0],
        truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt",
    )
    DUMMY_PACK = {k: v.squeeze(0) for k, v in _dummy_enc.items()}
    DUMMY_PACK["_skip"] = True

    def preprocess(ex):
        image = detect_image(ex)
        if image is None:
            skip["no_image"] += 1
            return DUMMY_PACK.copy()

        # prefer existing perfectly aligned word-level data
        words = ex.get("words") or ex.get("tokens") or []
        word_boxes = ex.get("bboxes") or []
        direct_labels = ex.get("word_labels") or ex.get("labels")
        ok_direct = bool(words) and bool(word_boxes) and bool(direct_labels) and \
                    len(words) == len(word_boxes) == len(direct_labels)

        if not ok_direct:
            words, word_boxes = derive_words_from_pdf_cells(ex)
            if not words or not word_boxes:
                skip["no_words"] += 1
                return DUMMY_PACK.copy()
            if len(words) != len(word_boxes):
                skip["len_mismatch"] += 1
                return DUMMY_PACK.copy()
            region_boxes  = ex.get("bboxes") or []
            region_labels = ex.get("category_id") or []
            direct_labels = [assign_cell_label(bb, region_boxes, region_labels) for bb in word_boxes]

        # normalize labels into ids 0..11 (0="O")
        norm = []
        for l in direct_labels:
            if isinstance(l, int):
                if l == 0:
                    norm.append(0)
                elif 1 <= l <= 11:
                    cname = ID2NAME_FALLBACK.get(l, "TEXT")
                    norm.append(label_id_from_name(cname))
                elif 0 <= l < len(LABELS):
                    norm.append(l)
                else:
                    norm.append(0)
            elif isinstance(l, str):
                norm.append(label_id_from_name(l))
            else:
                norm.append(0)

        # hard trim
        if len(words) > MAX_LEN:
            words = words[:MAX_LEN]; word_boxes = word_boxes[:MAX_LEN]; norm = norm[:MAX_LEN]

        W, H = image.size
        boxes_1000 = to_1000_space(word_boxes, W, H)

        enc = processor(
            images=image, text=words, boxes=boxes_1000, word_labels=norm,
            truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt",
        )
        out = {k: v.squeeze(0) for k, v in enc.items()}
        out["_skip"] = False
        return out

    return preprocess, skip

# =========================
# LOAD, MAP, FILTER, CLEAN
# =========================
train_raw = load_from_disk(str(BASE_PATH / "train_with_images"))
val_raw   = load_from_disk(str(BASE_PATH / "validation_with_images"))
print("[train] size:", len(train_raw), "columns:", train_raw.column_names)
print("[val]   size:", len(val_raw),   "columns:", val_raw.column_names)

processor = LayoutLMv3Processor.from_pretrained(MODEL_NAME, apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(LABELS), id2label=id2label, label2id=label2id
)

preprocess, skip = build_preprocess(processor)
train_ds = train_raw.map(preprocess, batched=False, remove_columns=train_raw.column_names)
val_ds   = val_raw.map(preprocess,   batched=False, remove_columns=val_raw.column_names)

train_ds = train_ds.filter(lambda ex: ex["_skip"] is False)
val_ds   = val_ds.filter(lambda ex: ex["_skip"] is False)
print(f"[train] kept {len(train_ds)} samples")
print(f"[val]   kept {len(val_ds)} samples")
print("skip reasons:", skip)

# drop helper + keep only model inputs
if "_skip" in train_ds.column_names: train_ds = train_ds.remove_columns(["_skip"])
if "_skip" in val_ds.column_names:   val_ds   = val_ds.remove_columns(["_skip"])
MODEL_COLS = ["input_ids", "attention_mask", "bbox", "pixel_values", "labels"]
train_ds = train_ds.remove_columns([c for c in train_ds.column_names if c not in MODEL_COLS])
val_ds   = val_ds.remove_columns([c for c in val_ds.column_names   if c not in MODEL_COLS])

train_ds.set_format(type="torch")
val_ds.set_format(type="torch")

# =========================
# TRAIN
# =========================
def compute_metrics(p):
    # p.predictions, p.label_ids are NumPy arrays here
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    mask = labels != -100
    if mask.any():
        acc = (preds[mask] == labels[mask]).mean()  # NumPy mean -> float
        return {"token_acc": float(acc)}
    return {"token_acc": 0.0}

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
    save_strategy="epoch",
)

data_collator = DataCollatorForTokenClassification(
    tokenizer=processor.tokenizer,
    padding="max_length"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
print("✅ Training finished (ALL classes).")

[i] Found existing 'train_with_images' at /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/train_with_images
[i] Found existing 'validation_with_images' at /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/validation_with_images
[train] size: 300 columns: ['metadata', 'bboxes', 'category_id', 'segmentation', 'area', 'pdf_cells', 'image_path']
[val]   size: 97 columns: ['metadata', 'bboxes', 'category_id', 'segmentation', 'area', 'pdf_cells', 'image_path']


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/97 [00:00<?, ? examples/s]

[train] kept 292 samples
[val]   kept 96 samples
skip reasons: {'no_image': 0, 'no_words': 9, 'len_mismatch': 0}




Epoch,Training Loss,Validation Loss,Token Acc
1,0.0015,0.012875,0.999907
2,0.0009,0.011723,0.999907
3,0.001,0.011081,0.999907
4,0.0093,0.00959,0.999907




✅ Training finished (ALL classes).


In [1]:
import sys
import numpy, torch, torchvision

import transformers, datasets
import accelerate
print("py", sys.version)
print("numpy", numpy.__version__)
print("torch", torch.__version__)
print("torchvision", torchvision.__version__)



print("transformers", transformers.__version__)
print("datasets", datasets.__version__)
print('accelerate', accelerate.__version__)

py 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:08:06) [GCC 11.3.0]
numpy 1.26.4
torch 2.9.0+cu128
torchvision 0.24.0+cu128
transformers 4.45.2
datasets 4.2.0
accelerate 1.10.1


In [12]:
pip install --upgrade datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install "datasets==2.20.0" "transformers==4.45.2" "pyarrow>=14.0.1" "timm>=0.9.12" "accelerate>=0.26"

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets==2.20.0
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting accelerate>=0.26
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets==2.20.0)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.20.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets==2.20.0)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers==4.45.2)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━

In [4]:
import os
from pathlib import Path
from typing import Any, List, Dict

import torch
from datasets import load_from_disk
from transformers import (
    LayoutLMv3Processor,
    LayoutLMv3ForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification,
)
from PIL import Image

from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

# Paths - edit if needed
BASE_PATH = Path("/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout")
EVAL_DIR = BASE_PATH / "test_with_images"
FALLBACK = BASE_PATH / "validation_with_images"
CKPT_PATH = Path("./layoutlmv3_doclaynet_tokcls_all/checkpoint-584")
OUTPUT_EVAL = "./eval_report_allclasses.txt"

# Label setup
DOC_LAYNET_CLASSES = [
    "CAPTION","FOOTNOTE","FORMULA","LIST-ITEM","PAGE-FOOTER",
    "PAGE-HEADER","PICTURE","SECTION-HEADER","TABLE","TEXT","TITLE"
]
LABELS = ["O"] + DOC_LAYNET_CLASSES
label2id = {l: i for i, l in enumerate(LABELS)}
id2label = {i: l for i, l in enumerate(LABELS)}
ID2NAME_FALLBACK = {1:"CAPTION", 2:"FOOTNOTE", 3:"FORMULA", 4:"LIST-ITEM", 5:"PAGE-FOOTER",
                    6:"PAGE-HEADER", 7:"PICTURE", 8:"SECTION-HEADER", 9:"TABLE", 10:"TEXT", 11:"TITLE"}

MAX_LEN = 512
BATCH_EVAL = 2

# Helper functions (detect_image, to_1000_space, etc.) - Add your existing helper functions here

def detect_image(ex: Dict[str, Any]) -> Image.Image | None:
    p = ex.get("image_path")
    if p:
        try: return Image.open(p).convert("RGB")
        except Exception: return None
    img = ex.get("image")
    if img is not None:
        if isinstance(img, Image.Image): return img.convert("RGB")
        try: return Image.fromarray(img).convert("RGB")
        except Exception: return None
    return None

def clamp(v, lo, hi): 
    return max(lo, min(hi, v))

def to_1000_space(boxes: List[List[float]], w: int, h: int) -> List[List[int]]:
    mv = max((max(b) for b in boxes if b), default=0.0)
    out = []
    for b in boxes:
        if len(b) != 4:
            out.append([0,0,1,1])
            continue
        x0,y0,x1,y1 = b
        if x1 < x0: x0,x1 = x1,x0
        if y1 < y0: y0,y1 = y1,y0
        if mv <= 1.0001:
            X0,Y0,X1,Y1 = int(round(x0*1000)),int(round(y0*1000)),int(round(x1*1000)),int(round(y1*1000))
        elif w>0 and h>0:
            X0,Y0,X1,Y1 = int(round((x0/w)*1000)),int(round((y0/h)*1000)),int(round((x1/w)*1000)),int(round((y1/h)*1000))
        else:
            X0,Y0,X1,Y1 = int(round(x0)),int(round(y0)),int(round(x1)),int(round(y1))
        X0,Y0,X1,Y1 = clamp(X0,0,1000),clamp(Y0,0,1000),clamp(X1,0,1000),clamp(Y1,0,1000)
        if X1 == X0: X1 = min(1000, X0+1)
        if Y1 == Y0: Y1 = min(1000, Y0+1)
        out.append([X0,Y0,X1,Y1])
    return out

# Align labels to tokens for proper evaluation ignoring subword tokens
def align_labels_with_tokens(words: List[str], labels: List[int], tokenizer) -> List[int]:
    tokenized = tokenizer(words, truncation=True, is_split_into_words=True)
    word_ids = tokenized.word_ids()
    previous_word_idx = None
    aligned_labels = []
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            aligned_labels.append(labels[word_idx])
        else:
            aligned_labels.append(-100)
        previous_word_idx = word_idx
    return aligned_labels

# Preprocess function for evaluation
def build_preprocess(processor):
    _dummy_img = Image.new("RGB", (8,8), color=(255,255,255))
    _dummy_enc = processor(images=_dummy_img, text=["_"], boxes=[[0,0,1,1]], word_labels=[0],
                           truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt")
    DUMMY_PACK = {k: v.squeeze(0) for k, v in _dummy_enc.items()}
    DUMMY_PACK["_skip"] = True
    skip = {"no_image":0, "no_words":0, "len_mismatch":0}

    def preprocess(ex):
        image = detect_image(ex)
        if image is None:
            skip["no_image"] += 1
            return DUMMY_PACK.copy()

        words = ex.get("words") or ex.get("tokens") or []
        word_boxes = ex.get("bboxes") or []
        direct_labels = ex.get("word_labels") or ex.get("labels")
        if not (words and word_boxes and direct_labels and len(words) == len(word_boxes) == len(direct_labels)):
            skip["no_words"] += 1
            return DUMMY_PACK.copy()

        # Normalize labels
        norm = []
        for l in direct_labels:
            if isinstance(l, int):
                if 0 <= l < len(LABELS):
                    norm.append(l)
                else:
                    norm.append(0)
            elif isinstance(l, str):
                norm.append(label2id.get(l, 0))
            else:
                norm.append(0)

        if len(words) > MAX_LEN:
            words, word_boxes, norm = words[:MAX_LEN], word_boxes[:MAX_LEN], norm[:MAX_LEN]

        W, H = image.size
        boxes_1000 = to_1000_space(word_boxes, W, H)

        aligned_labels = align_labels_with_tokens(words, norm, processor.tokenizer)

        enc = processor(images=image, text=words, boxes=boxes_1000, word_labels=aligned_labels,
                        truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt")
        out = {k: v.squeeze(0) for k, v in enc.items()}
        out["_skip"] = False
        return out

    return preprocess, skip

# Load eval dataset
eval_raw = load_from_disk(str(EVAL_DIR)) if EVAL_DIR.exists() else load_from_disk(str(FALLBACK))
print(f"Evaluating on dataset with {len(eval_raw)} samples")

# Check checkpoint
if not CKPT_PATH.exists():
    raise FileNotFoundError(f"Checkpoint not found: {CKPT_PATH}")

processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained(CKPT_PATH).to("cuda" if torch.cuda.is_available() else "cpu")

preprocess, skip = build_preprocess(processor)
eval_ds = eval_raw.map(preprocess, batched=False, remove_columns=eval_raw.column_names)
eval_ds = eval_ds.filter(lambda ex: ex["_skip"] is False)
if "_skip" in eval_ds.column_names: eval_ds = eval_ds.remove_columns(["_skip"])
MODEL_COLS = ["input_ids", "attention_mask", "bbox", "pixel_values", "labels"]
eval_ds = eval_ds.remove_columns([c for c in eval_ds.column_names if c not in MODEL_COLS])
eval_ds.set_format(type="torch")

print(f"Evaluation dataset prepared: {len(eval_ds)} samples, skipped: {skip}")

# Setup Trainer for prediction
args = TrainingArguments(
    output_dir="./tmp_eval_only",
    per_device_eval_batch_size=BATCH_EVAL,
    report_to="none",
    remove_unused_columns=False,
)

data_collator = DataCollatorForTokenClassification(tokenizer=processor.tokenizer, padding="max_length")

eval_trainer = Trainer(
    model=model,
    args=args,
    eval_dataset=eval_ds,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
)

pred = eval_trainer.predict(eval_ds)
logits = pred.predictions
labels = pred.label_ids

pred_ids = logits.argmax(-1)
mask = labels != -100
token_acc = float((pred_ids[mask] == labels[mask]).mean()) if mask.any() else 0.0

y_true, y_pred = [], []
for logit, lab in zip(logits, labels):
    pi = logit.argmax(-1)
    m = lab != -100
    y_true.append([id2label[int(i)] for i in lab[m]])
    y_pred.append([id2label[int(i)] for i in pi[m]])

report = classification_report(y_true, y_pred, digits=3)
macro_f1 = f1_score(y_true, y_pred, average="macro")
micro_f1 = f1_score(y_true, y_pred, average="micro")
macro_p = precision_score(y_true, y_pred, average="macro")
macro_r = recall_score(y_true, y_pred, average="macro")

print("\n=== Evaluation Summary ===")
print(f"Checkpoint: {CKPT_PATH}")
print(f"Token accuracy: {token_acc:.4f}")
print(f"Macro F1: {macro_f1:.4f} | Micro F1: {micro_f1:.4f} | Macro Precision: {macro_p:.4f} | Macro Recall: {macro_r:.4f}")
print("\nPer-class report:\n", report)

with open(OUTPUT_EVAL, "w") as f:
    f.write(f"Checkpoint: {CKPT_PATH}\n")
    f.write(f"Token accuracy: {token_acc:.6f}\n")
    f.write(f"Macro F1: {macro_f1:.6f} | Micro F1: {micro_f1:.6f} | Macro Precision: {macro_p:.6f} | Macro Recall: {macro_r:.6f}\n\n")
    f.write(report)
print(f"Report saved to: {OUTPUT_EVAL}")

Evaluating on dataset with 100 samples


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Evaluation dataset prepared: 0 samples, skipped: {'no_image': 0, 'no_words': 100, 'len_mismatch': 0}


AttributeError: 'NoneType' object has no attribute 'argmax'

In [2]:
%pip install -qU pip setuptools wheel
%pip install -qU --use-pep517 --no-build-isolation seqeval

[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
# ========= Region-level evaluation (majority vote over tokens per region) — FIXED =========
from collections import Counter
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from PIL import Image

IOU_THRESH = 0.10  # try 0.05 if nothing overlaps

ID2NAME_FALLBACK = {
    1:"CAPTION", 2:"FOOTNOTE", 3:"FORMULA", 4:"LIST-ITEM", 5:"PAGE-FOOTER",
    6:"PAGE-HEADER", 7:"PICTURE", 8:"SECTION-HEADER", 9:"TABLE", 10:"TEXT", 11:"TITLE"
}

def normalize_region_name(name: str) -> str:
    return "PICTURE" if name == "FIGURE" else name

def region_name_from_label(l):
    if isinstance(l, str): return normalize_region_name(l)
    if isinstance(l, int): return normalize_region_name(ID2NAME_FALLBACK.get(l, "TEXT"))
    return "TEXT"

def label_id_from_name(name: str) -> int:
    name = normalize_region_name(name)
    if name not in label2id:
        name = "TEXT"
    return label2id[name]

def box_iou_1000(a, b) -> float:
    ax0, ay0, ax1, ay1 = a
    bx0, by0, bx1, by1 = b
    ix0, iy0 = max(ax0, bx0), max(ay0, by0)
    ix1, iy1 = min(ax1, bx1), min(ay1, by1)
    iw, ih = max(0, ix1 - ix0), max(0, iy1 - iy0)
    inter = iw * ih
    areaA = max(0, ax1 - ax0) * max(0, ay1 - ay0)
    areaB = max(0, bx1 - bx0) * max(0, by1 - by0)
    union = areaA + areaB - inter + 1e-6
    return inter / union

region_true = []
region_pred_majority = []       # majority including "O"
region_pred_majority_nonO = []  # majority ignoring "O"

N = len(eval_ds)
for i in range(N):
    ex = eval_raw[i]
    gt_boxes_px   = ex.get("bboxes") or []
    gt_labels_raw = ex.get("category_id") or []
    if len(gt_boxes_px) == 0 or len(gt_labels_raw) == 0:
        continue

    # convert GT boxes to 0..1000 space
    img = Image.open(ex["image_path"]).convert("RGB")
    W, H = img.size
    gt_boxes_1000 = to_1000_space(gt_boxes_px, W, H)
    gt_label_ids  = [label_id_from_name(region_name_from_label(l)) for l in gt_labels_raw]

    # predicted token labels for this sample
    pred_ids = logits[i].argmax(-1)
    lab_i    = labels[i]
    mask     = (lab_i != -100)

    token_boxes_1000 = np.array(eval_ds[i]["bbox"])[mask]
    token_preds      = pred_ids[mask]

    # vote per region
    for r_box, r_true in zip(gt_boxes_1000, gt_label_ids):
        hits = []
        for tb, tpred in zip(token_boxes_1000, token_preds):
            if box_iou_1000(tb, r_box) > IOU_THRESH:
                hits.append(int(tpred))

        if not hits:
            maj = label2id["O"]
            maj_nonO = label2id["O"]
        else:
            maj = Counter(hits).most_common(1)[0][0]
            nonO = [h for h in hits if h != label2id["O"]]
            maj_nonO = Counter(nonO).most_common(1)[0][0] if nonO else label2id["O"]

        region_true.append(r_true)
        region_pred_majority.append(maj)
        region_pred_majority_nonO.append(maj_nonO)

# If nothing got collected, exit early with a hint
if len(region_true) == 0:
    print("No regions evaluated. Try lowering IOU_THRESH to 0.05 or check that eval_raw has 'bboxes'/'category_id'.")
else:
    # labels for report: **exclude 'O'** because GT regions never use it
    class_names = [name for name, idx in sorted(label2id.items(), key=lambda x: x[1]) if name != "O"]
    class_ids   = [label2id[name] for name in class_names]

    print("\n=== Region-level (majority vote, including 'O') ===")
    print("Accuracy:", accuracy_score(region_true, region_pred_majority))
    print(classification_report(
        region_true, region_pred_majority,
        labels=class_ids, target_names=class_names, digits=3, zero_division=0
    ))

    print("\n=== Region-level (majority vote, ignoring 'O' in the vote) ===")
    print("Accuracy:", accuracy_score(region_true, region_pred_majority_nonO))
    print(classification_report(
        region_true, region_pred_majority_nonO,
        labels=class_ids, target_names=class_names, digits=3, zero_division=0
    ))

No regions evaluated. Try lowering IOU_THRESH to 0.05 or check that eval_raw has 'bboxes'/'category_id'.


In [6]:
import torch
import numpy as np
from pathlib import Path
from PIL import Image
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification

# ============== CONFIG ==============
CKPT_PATH = Path("./layoutlmv3_doclaynet_tokcls_all/checkpoint-584")
EVAL_SPLIT = "/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/test_with_images"  # change to validation if needed
BATCH_SIZE = 2
IOU_THRESH = 0.05  # lowered for region overlap
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ============== LOAD MODEL & PROCESSOR ==============
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained(CKPT_PATH).to(DEVICE)

# ============== LOAD DATA ==============
from datasets import load_from_disk
eval_raw = load_from_disk(EVAL_SPLIT)
eval_ds = eval_raw

print(f"[i] Loaded {len(eval_raw)} samples from {EVAL_SPLIT}")

# ============== PREDICT (logits + labels) ==============
def _dummy_metrics(_): return {}

args = TrainingArguments(
    output_dir="./eval_tmp",
    per_device_eval_batch_size=BATCH_SIZE,
    remove_unused_columns=False,
    report_to="none"
)

data_collator = DataCollatorForTokenClassification(
    tokenizer=processor.tokenizer, padding="max_length"
)

trainer = Trainer(
    model=model,
    args=args,
    eval_dataset=eval_ds,
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
    compute_metrics=_dummy_metrics
)

pred = trainer.predict(eval_ds)
logits = pred.predictions
labels = pred.label_ids

# ============== TOKEN-LEVEL EVALUATION ==============
pred_ids = logits.argmax(-1)
mask = labels != -100

token_acc = float((pred_ids[mask] == labels[mask]).mean()) if mask.any() else 0.0
print(f"\n✅ Token Accuracy: {token_acc:.4f}")

# build token-level true/pred label lists excluding -100 and 'O'
y_true, y_pred = [], []
for lrow, prow in zip(labels, pred_ids):
    for l, p in zip(lrow, prow):
        if l == -100: continue
        y_true.append(int(l))
        y_pred.append(int(p))

# prepare class names (excluding O if needed)
id2label = model.config.id2label
label2id = {v: int(k) for k, v in model.config.label2id.items()}
class_names = [id2label[str(i)] for i in range(len(id2label))]
non_o_ids = [i for i, name in enumerate(class_names) if name != "O"]
non_o_names = [name for name in class_names if name != "O"]

print("\n=== TOKEN-LEVEL REPORT (excluding 'O') ===")
print(classification_report(
    y_true, y_pred,
    labels=non_o_ids,
    target_names=non_o_names,
    digits=3,
    zero_division=0
))

# Confusion matrix visualization
import matplotlib.pyplot as plt
cm = confusion_matrix(y_true, y_pred, labels=non_o_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=non_o_names)
fig, ax = plt.subplots(figsize=(8, 8))
disp.plot(ax=ax, xticks_rotation='vertical', cmap='Blues')
plt.title("Confusion Matrix (Token Level, Non-O)")
plt.show()

# ============== REGION-LEVEL EVALUATION ==============
def box_iou_1000(a, b):
    ax0, ay0, ax1, ay1 = a
    bx0, by0, bx1, by1 = b
    ix0, iy0 = max(ax0, bx0), max(ay0, by0)
    ix1, iy1 = min(ax1, bx1), min(ay1, by1)
    iw, ih = max(0, ix1 - ix0), max(0, iy1 - iy0)
    inter = iw * ih
    areaA = max(0, ax1 - ax0) * max(0, ay1 - ay0)
    areaB = max(0, bx1 - bx0) * max(0, by1 - by0)
    union = areaA + areaB - inter + 1e-6
    return inter / union

region_true, region_pred = [], []

for i in range(len(eval_raw)):
    ex = eval_raw[i]
    if not ex.get("bboxes") or not ex.get("category_id"):
        continue

    # GT regions
    img = Image.open(ex["image_path"]).convert("RGB")
    W, H = img.size
    gt_boxes = np.array([[b[0]*1000/W, b[1]*1000/H, b[2]*1000/W, b[3]*1000/H] for b in ex["bboxes"]])
    gt_labels = [int(l) if isinstance(l, int) else label2id[l] for l in ex["category_id"]]

    # Token-level predictions for this page
    token_boxes = np.array(eval_ds[i]["bbox"])
    page_pred = logits[i].argmax(-1)
    page_mask = labels[i] != -100
    token_boxes = token_boxes[page_mask]
    page_pred = page_pred[page_mask]

    # vote per region
    for r_box, r_label in zip(gt_boxes, gt_labels):
        hits = [int(pred) for tbox, pred in zip(token_boxes, page_pred)
                if box_iou_1000(tbox, r_box) > IOU_THRESH]
        if not hits:
            region_pred.append(label2id["O"])
        else:
            region_pred.append(Counter(hits).most_common(1)[0][0])
        region_true.append(r_label)

if len(region_true) == 0:
    print("\n❌ No regions evaluated. Check your eval_raw['bboxes'] or IOU threshold.")
else:
    print("\n=== REGION-LEVEL REPORT (majority vote) ===")
    print(classification_report(
        region_true, region_pred,
        labels=non_o_ids,
        target_names=non_o_names,
        digits=3,
        zero_division=0
    ))
    print("Region-level Accuracy:", accuracy_score(region_true, region_pred))


[i] Loaded 100 samples from /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/test_with_images


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['metadata', 'bboxes', 'category_id', 'segmentation', 'area', 'pdf_cells', 'image_path']

In [2]:
from datasets import load_from_disk
from collections import Counter
from pathlib import Path

BASE_PATH = Path("/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout")
ID2NAME_FALLBACK = {
    1:"CAPTION",2:"FOOTNOTE",3:"FORMULA",4:"LIST-ITEM",5:"PAGE-FOOTER",
    6:"PAGE-HEADER",7:"PICTURE",8:"SECTION-HEADER",9:"TABLE",10:"TEXT",11:"TITLE"
}

train_raw = load_from_disk(str(BASE_PATH / "train_with_images"))
region_counts = Counter()
for ex in train_raw:
    for lab in ex.get("category_id", []):
        region_counts[ID2NAME_FALLBACK.get(int(lab), "TEXT")] += 1

print("Region (page-level) counts:")
for k,v in region_counts.most_common():
    print(f"{k:>14}: {v}")

Region (page-level) counts:
          TEXT: 593
     LIST-ITEM: 394
SECTION-HEADER: 385
       PICTURE: 278
         TABLE: 232
   PAGE-FOOTER: 205
       CAPTION: 158
   PAGE-HEADER: 140
      FOOTNOTE: 136
         TITLE: 49
