In [3]:
!pip install --upgrade pip
!pip install torch torchvision pycocotools opencv-python tqdm

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting pycocotools
  Downloading pycocotools-2.0.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting opencv-python
  Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (19 kB)
Collecting numpy (from torchvision)
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Downloading pycocotools-2.0.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (455 kB)
Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m13.5 MB/s[0m  [33m0:00:04[0mm0:00:01[0m00:01[0m
[?25hDownloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB

In [4]:
pip install detectron2==0.6 opencv-python pycocotools tqdm

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement detectron2==0.6 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for detectron2==0.6[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
from pathlib import Path
from datasets import load_from_disk

BASE = Path("/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout")  # << adjust if needed
IMG_ROOT = BASE / "images"   # images/train/{i}.png etc.

# you already have these
print("train exists:", (BASE/"train").exists())
print("train_with_images exists:", (BASE/"train_with_images").exists())
print("validation_with_images exists:", (BASE/"validation_with_images").exists())

# peek a couple of rows to confirm image paths are mapped
if (BASE/"train_with_images").exists():
    ds = load_from_disk(str(BASE/"train_with_images"))
    print("columns:", ds.column_names)
    for i in range(min(3, len(ds))):
        print(i, ds[i].get("image_path"))

train exists: True
train_with_images exists: True
validation_with_images exists: True
columns: ['metadata', 'bboxes', 'category_id', 'segmentation', 'area', 'pdf_cells', 'image_path']
0 /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/images/train/0.png
1 /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/images/train/1.png
2 /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/images/train/2.png


In [6]:
import os, json
from tqdm import tqdm
from PIL import Image

OUT = BASE / "MaskRCNN_torchvision"   # where COCO jsons + model outputs will live
OUT.mkdir(parents=True, exist_ok=True)

TARGETS = ["TEXT", "TABLE", "PICTURE", "CAPTION"]
CAT2ID  = {c:i+1 for i,c in enumerate(TARGETS)}  # COCO category ids 1..4

ID2NAME_FALLBACK = {
    1:"CAPTION", 2:"FOOTNOTE", 3:"FORMULA", 4:"LIST-ITEM", 5:"PAGE-FOOTER",
    6:"PAGE-HEADER", 7:"PICTURE", 8:"SECTION-HEADER", 9:"TABLE", 10:"TEXT", 11:"TITLE"
}

def norm_name(name_or_id):
    """Map DocLayNet label (int or str) → our 4 targets; FIGURE→PICTURE."""
    if isinstance(name_or_id, int):
        name = ID2NAME_FALLBACK.get(name_or_id, "TEXT")
    else:
        name = str(name_or_id)
    if name == "FIGURE":
        name = "PICTURE"
    return name

def to_xywh(box, W, H):
    """Accept [x0,y0,x1,y1] in pixels or normalized; return pixel [x,y,w,h]."""
    if not box or len(box) != 4: return None
    x0,y0,x1,y1 = map(float, box)
    if max(abs(x0),abs(y0),abs(x1),abs(y1)) <= 1.05 and W>1 and H>1:
        x0,y0,x1,y1 = x0*W, y0*H, x1*W, y1*H
    x0,x1 = min(x0,x1), max(x0,x1)
    y0,y1 = min(y0,y1), max(y0,y1)
    x0 = max(0, min(W-1, x0)); y0 = max(0, min(H-1, y0))
    x1 = max(1, min(W,   x1)); y1 = max(1, min(H,   y1))
    w  = max(1, x1 - x0); h = max(1, y1 - y0)
    return [float(x0), float(y0), float(w), float(h)]

def build_coco_for_split(split_with_images: str):
    """split_with_images: 'train_with_images' or 'validation_with_images'"""
    ds_dir = BASE / split_with_images
    ds = load_from_disk(str(ds_dir))

    split = split_with_images.replace("_with_images","")
    images_dir = IMG_ROOT / split

    coco = {
        "images": [],
        "annotations": [],
        "categories": [{"id": CAT2ID[name], "name": name} for name in TARGETS]
    }

    ann_id = 1
    for img_id, ex in enumerate(tqdm(ds, desc=f"COCO {split}"), start=1):
        img_path = ex.get("image_path") or str(images_dir / f"{img_id-1}.png")
        try:
            with Image.open(img_path) as im:
                W, H = im.size
        except Exception:
            continue

        coco["images"].append({
            "id": img_id,
            "file_name": os.path.basename(img_path),
            "width": W,
            "height": H
        })

        boxes = ex.get("bboxes") or []
        cats  = ex.get("category_id") or []
        for b, c in zip(boxes, cats):
            cname = norm_name(c)
            if cname not in TARGETS:
                continue
            bbox = to_xywh(b, W, H)
            if bbox is None:
                continue
            coco["annotations"].append({
                "id": ann_id,
                "image_id": img_id,
                "category_id": CAT2ID[cname],
                "bbox": bbox,
                "area": float(bbox[2] * bbox[3]),
                "iscrowd": 0
            })
            ann_id += 1

    out_json = OUT / f"{split}.json"
    with open(out_json, "w") as f:
        json.dump(coco, f)
    print(f"✔ wrote {out_json}  | images={len(coco['images'])}  anns={len(coco['annotations'])}")
    print(f"   uses image root: {images_dir}")
    return str(out_json), str(images_dir)

train_json, train_imgdir = build_coco_for_split("train_with_images")
val_json,   val_imgdir   = build_coco_for_split("validation_with_images")

COCO train: 100%|██████████| 300/300 [00:01<00:00, 205.06it/s]


✔ wrote /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/train.json  | images=300  anns=1261
   uses image root: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/images/train


COCO validation: 100%|██████████| 97/97 [00:00<00:00, 177.63it/s]

✔ wrote /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/validation.json  | images=97  anns=521
   uses image root: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/images/validation





In [2]:
import os, json, numpy as np, torch, torchvision, random
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.transforms import functional as TF
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

TARGETS = ["TEXT","TABLE","PICTURE","CAPTION"]
NUM_CLASSES = 1 + len(TARGETS)   # background + 4

# ---------- minimal COCO dataset ----------
class COCOSimple(Dataset):
    """
    Returns:
      image (FloatTensor [C,H,W])
      target dict: boxes [N,4] (xyxy), labels [N], masks [N,H,W], image_id
    Masks are rectangles derived from boxes (simple but effective for mask head).
    """
    def __init__(self, json_path, images_dir):
        with open(json_path, "r") as f:
            coco = json.load(f)
        self.images_dir = images_dir
        self.imgs = {img["id"]: img for img in coco["images"]}
        self.by_img = {img_id: [] for img_id in self.imgs}
        for ann in coco["annotations"]:
            self.by_img[ann["image_id"]].append(ann)
        self.ids = list(self.imgs.keys())

    def __len__(self): return len(self.ids)

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        meta = self.imgs[img_id]
        path = os.path.join(self.images_dir, meta["file_name"])
        img = Image.open(path).convert("RGB")
        W, H = img.size

        anns = self.by_img.get(img_id, [])
        boxes, labels, masks = [], [], []

        for a in anns:
            x,y,w,h = a["bbox"]
            if w <= 1 or h <= 1: 
                continue
            x0,y0,x1,y1 = x, y, x+w, y+h
            boxes.append([x0,y0,x1,y1])
            labels.append(int(a["category_id"]))
            m = np.zeros((H,W), dtype=np.uint8)
            m[int(y0):int(y1), int(x0):int(x1)] = 1
            masks.append(m)

        if len(boxes)==0:
            boxes  = [[0.0,0.0,1.0,1.0]]
            labels = [0]
            masks  = [np.zeros((H,W), dtype=np.uint8)]

        boxes  = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        masks  = torch.as_tensor(np.stack(masks,0), dtype=torch.uint8)
        image  = TF.to_tensor(img)  # [0,1] float32

        target = {
            "boxes": boxes, "labels": labels, "masks": masks,
            "image_id": torch.tensor([img_id]),
            "iscrowd": torch.zeros((boxes.shape[0],), dtype=torch.int64),
            "area": (boxes[:,2]-boxes[:,0])*(boxes[:,3]-boxes[:,1]),
        }
        return image, target

def collate_fn(batch):
    imgs, tgts = list(zip(*batch))
    return list(imgs), list(tgts)

# loaders
train_ds = COCOSimple(train_json, train_imgdir)
val_ds   = COCOSimple(val_json,   val_imgdir)
train_loader = DataLoader(train_ds, batch_size=2, shuffle=True,  num_workers=2, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=2, shuffle=False, num_workers=2, collate_fn=collate_fn)

# model
model = maskrcnn_resnet50_fpn(weights="COCO_V1")
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

# replace heads
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUM_CLASSES)
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, NUM_CLASSES)

model.to(device)

optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=2.5e-4, weight_decay=1e-4)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1)

EPOCHS = 10
save_dir = OUT / "tv_maskrcnn_output"
save_dir.mkdir(parents=True, exist_ok=True)
best_val = float("inf")

for epoch in range(EPOCHS):
    # ---- train ----
    model.train()
    tr_loss = 0.0
    for images, targets in tqdm(train_loader, desc=f"Train {epoch+1}/{EPOCHS}"):
        images  = [im.to(device) for im in images]
        targets = [{k:v.to(device) for k,v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        loss = sum(loss for loss in loss_dict.values())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
    lr_scheduler.step()
    tr_loss /= max(1, len(train_loader))

    # ---- val (loss) ----
    model.train()  # to get detection losses
    va_loss = 0.0
    with torch.no_grad():
        for images, targets in tqdm(val_loader, desc="Val"):
            images  = [im.to(device) for im in images]
            targets = [{k:v.to(device) for k,v in t.items()} for t in targets]
            ldict = model(images, targets)
            va_loss += sum(ldict.values()).item()
    va_loss /= max(1, len(val_loader))
    print(f"Epoch {epoch+1}: train_loss={tr_loss:.4f}  val_loss={va_loss:.4f}")

    if va_loss < best_val:
        best_val = va_loss
        torch.save(model.state_dict(), str(save_dir/"model_best.pth"))
        print("✔ saved best:", save_dir/"model_best.pth")

torch.save(model.state_dict(), str(save_dir/"model_last.pth"))
print("✔ training complete →", save_dir/"model_last.pth")

device: cuda


NameError: name 'train_json' is not defined

In [9]:
# === 30-epoch Mask R-CNN training (torchvision), self-contained ===
import os, json, numpy as np, torch, random
from pathlib import Path
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import functional as TF
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

# ---- Paths (adjust BASE if needed) ----
BASE = Path("/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout")
OUT  = BASE / "MaskRCNN_torchvision"
train_json   = str(OUT / "train.json")
val_json     = str(OUT / "validation.json")
train_imgdir = str(BASE / "images" / "train")
val_imgdir   = str(BASE / "images" / "validation")

assert os.path.isfile(train_json), f"Missing {train_json}. Run the COCO conversion step first."
assert os.path.isfile(val_json),   f"Missing {val_json}. Run the COCO conversion step first."

# ---- Config ----
TARGETS = ["TEXT","TABLE","PICTURE","CAPTION"]
NUM_CLASSES = 1 + len(TARGETS)  # background + 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# ---- Minimal COCO dataset with light augments for train ----
# --- DROP-IN: robust dataset with safe flip + bbox sanitize ---
import os, json, random, numpy as np, torch
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms.functional as TF

class COCOSimple(Dataset):
    def __init__(self, json_path, img_root, is_train=False):
        with open(json_path, "r") as f:
            coco = json.load(f)
        self.img_root = img_root
        self.images = {im["id"]: im for im in coco["images"]}
        self.anns_by_img = {imid: [] for imid in self.images}
        for a in coco["annotations"]:
            self.anns_by_img[a["image_id"]].append(a)
        self.ids = list(self.images.keys())
        self.is_train = is_train  # apply augments only on train

    def _sanitize(self, boxes, labels, masks, W, H, min_size=1.0):
        """
        boxes: Tensor [N,4] (xyxy), labels: Tensor [N], masks: Tensor [N,H,W] (uint8)
        Ensures x0<=x1, y0<=y1, clamps to image bounds, removes degenerate boxes.
        """
        if boxes.numel() == 0:
            return boxes, labels, masks

        # clamp to bounds (allow x1==W, y1==H so width/height stay positive after max with min_size)
        boxes[:, 0] = boxes[:, 0].clamp(0, W)
        boxes[:, 2] = boxes[:, 2].clamp(0, W)
        boxes[:, 1] = boxes[:, 1].clamp(0, H)
        boxes[:, 3] = boxes[:, 3].clamp(0, H)

        # ensure proper ordering
        x0 = torch.minimum(boxes[:, 0], boxes[:, 2])
        x1 = torch.maximum(boxes[:, 0], boxes[:, 2])
        y0 = torch.minimum(boxes[:, 1], boxes[:, 3])
        y1 = torch.maximum(boxes[:, 1], boxes[:, 3])
        boxes = torch.stack([x0, y0, x1, y1], dim=1)

        # enforce strictly positive width/height
        w = boxes[:, 2] - boxes[:, 0]
        h = boxes[:, 3] - boxes[:, 1]
        keep = (w > min_size) & (h > min_size)

        if keep.any():
            boxes  = boxes[keep]
            labels = labels[keep]
            masks  = masks[keep] if masks is not None and masks.numel() > 0 else masks
        else:
            # if everything got filtered, return one tiny background box to avoid crashes
            boxes  = torch.tensor([[0.0, 0.0, 1.0, 1.0]], dtype=torch.float32)
            labels = torch.tensor([0], dtype=torch.int64)
            masks  = torch.zeros((1, H, W), dtype=torch.uint8)

        return boxes, labels, masks

    def __len__(self): 
        return len(self.ids)

    def __getitem__(self, idx):
        im_id = self.ids[idx]
        meta  = self.images[im_id]
        path  = os.path.join(self.img_root, meta["file_name"])
        img   = Image.open(path).convert("RGB")
        W, H  = img.size

        # build targets from COCO (xywh -> xyxy)
        anns = self.anns_by_img.get(im_id, [])
        boxes, labels, masks = [], [], []
        for a in anns:
            x, y, w, h = a["bbox"]
            if w <= 1 or h <= 1:
                continue
            x0, y0, x1, y1 = x, y, x + w, y + h
            boxes.append([x0, y0, x1, y1])
            labels.append(int(a["category_id"]))
            m = np.zeros((H, W), dtype=np.uint8)
            m[int(y0):int(y1), int(x0):int(x1)] = 1
            masks.append(m)

        if len(boxes) == 0:
            boxes  = [[0.0, 0.0, 1.0, 1.0]]
            labels = [0]
            masks  = [np.zeros((H, W), dtype=np.uint8)]

        boxes  = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        masks  = torch.as_tensor(np.stack(masks, 0), dtype=torch.uint8)

        # --- light data augments for training only ---
        if self.is_train:
            # Horizontal flip with proper bbox/mask update
            if random.random() < 0.5:
                img   = img.transpose(Image.FLIP_LEFT_RIGHT)
                # flip boxes: x' = W - x
                x0, y0, x1, y1 = boxes[:, 0].clone(), boxes[:, 1], boxes[:, 2].clone(), boxes[:, 3]
                boxes[:, 0] = W - x1
                boxes[:, 2] = W - x0
                masks = torch.flip(masks, dims=[2])  # flip width dimension

            # mild brightness/contrast jitter
            img_t = TF.to_tensor(img)
            # contrast
            c = 1.0 + (random.random() - 0.5) * 0.4  # ±20%
            img_t = torch.clamp((img_t - 0.5) * c + 0.5, 0, 1)
            # brightness
            b = 1.0 + (random.random() - 0.5) * 0.4
            img_t = torch.clamp(img_t * b, 0, 1)
            img   = TF.to_pil_image(img_t)

        # --- sanitize after any augments ---
        boxes, labels, masks = self._sanitize(boxes, labels, masks, W, H, min_size=1.0)

        image = TF.to_tensor(img)
        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": torch.tensor([im_id]),
            "iscrowd": torch.zeros((boxes.shape[0],), dtype=torch.int64),
            "area": (boxes[:,2]-boxes[:,0]) * (boxes[:,3]-boxes[:,1]),
        }
        return image, target

def collate_fn(batch):
    imgs, tgts = zip(*batch)
    return list(imgs), list(tgts)

# ---- DataLoaders ----
train_ds = COCOSimple(train_json, train_imgdir, is_train=True)
val_ds   = COCOSimple(val_json,   val_imgdir,   is_train=False)

train_loader = DataLoader(train_ds, batch_size=2, shuffle=True,  num_workers=2, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=2, shuffle=False, num_workers=2, collate_fn=collate_fn)

# ---- Model ----
model = maskrcnn_resnet50_fpn(weights="COCO_V1")
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUM_CLASSES)
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, NUM_CLASSES)
model.to(device)

optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=2.5e-4, weight_decay=1e-4)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1)

# ---- Train for 30 epochs, save to a NEW output dir ----
EPOCHS = 30
save_dir = OUT / "tv_maskrcnn_output_e30"
save_dir.mkdir(parents=True, exist_ok=True)
best_val = float("inf")

for epoch in range(EPOCHS):
    # train
    model.train()
    train_loss = 0.0
    for images, targets in tqdm(train_loader, desc=f"Train {epoch+1}/{EPOCHS}"):
        images  = [im.to(device) for im in images]
        targets = [{k:v.to(device) for k,v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        loss = sum(loss_dict.values())
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        train_loss += loss.item()
    lr_scheduler.step()
    train_loss /= max(1, len(train_loader))

    # val (compute losses)
    model.train()
    val_loss = 0.0
    with torch.no_grad():
        for images, targets in tqdm(val_loader, desc="Val"):
            images  = [im.to(device) for im in images]
            targets = [{k:v.to(device) for k,v in t.items()} for t in targets]
            ldict = model(images, targets)
            val_loss += sum(ldict.values()).item()
    val_loss /= max(1, len(val_loader))
    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}  val_loss={val_loss:.4f}")

    if val_loss < best_val:
        best_val = val_loss
        torch.save(model.state_dict(), str(save_dir / "model_best.pth"))
        print("✅ Saved best:", save_dir / "model_best.pth")

torch.save(model.state_dict(), str(save_dir / "model_last.pth"))
print("✔ Training complete →", save_dir / "model_last.pth")


device: cuda


Train 1/30: 100%|██████████| 150/150 [00:36<00:00,  4.07it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.08it/s]


Epoch 1: train_loss=1.5172  val_loss=1.3520
✅ Saved best: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/tv_maskrcnn_output_e30/model_best.pth


Train 2/30: 100%|██████████| 150/150 [00:38<00:00,  3.90it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.07it/s]


Epoch 2: train_loss=1.2448  val_loss=1.2335
✅ Saved best: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/tv_maskrcnn_output_e30/model_best.pth


Train 3/30: 100%|██████████| 150/150 [00:39<00:00,  3.84it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.20it/s]


Epoch 3: train_loss=1.2216  val_loss=1.2674


Train 4/30: 100%|██████████| 150/150 [00:39<00:00,  3.84it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.16it/s]


Epoch 4: train_loss=1.1787  val_loss=1.2181
✅ Saved best: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/tv_maskrcnn_output_e30/model_best.pth


Train 5/30: 100%|██████████| 150/150 [00:39<00:00,  3.80it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.16it/s]


Epoch 5: train_loss=1.1437  val_loss=1.2044
✅ Saved best: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/tv_maskrcnn_output_e30/model_best.pth


Train 6/30: 100%|██████████| 150/150 [00:38<00:00,  3.93it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.04it/s]


Epoch 6: train_loss=1.1571  val_loss=1.2691


Train 7/30: 100%|██████████| 150/150 [00:39<00:00,  3.84it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.11it/s]


Epoch 7: train_loss=1.1374  val_loss=1.2502


Train 8/30: 100%|██████████| 150/150 [00:39<00:00,  3.77it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.08it/s]


Epoch 8: train_loss=1.1074  val_loss=1.2600


Train 9/30: 100%|██████████| 150/150 [00:39<00:00,  3.75it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.12it/s]


Epoch 9: train_loss=1.0338  val_loss=1.1907
✅ Saved best: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/tv_maskrcnn_output_e30/model_best.pth


Train 10/30: 100%|██████████| 150/150 [00:39<00:00,  3.80it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.12it/s]


Epoch 10: train_loss=0.9967  val_loss=1.1619
✅ Saved best: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/tv_maskrcnn_output_e30/model_best.pth


Train 11/30: 100%|██████████| 150/150 [00:40<00:00,  3.74it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.03it/s]


Epoch 11: train_loss=0.9676  val_loss=1.1644


Train 12/30: 100%|██████████| 150/150 [00:39<00:00,  3.83it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.05it/s]


Epoch 12: train_loss=0.9586  val_loss=1.1734


Train 13/30: 100%|██████████| 150/150 [00:39<00:00,  3.80it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.95it/s]


Epoch 13: train_loss=0.9517  val_loss=1.1761


Train 14/30: 100%|██████████| 150/150 [00:39<00:00,  3.76it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.97it/s]


Epoch 14: train_loss=0.9296  val_loss=1.1816


Train 15/30: 100%|██████████| 150/150 [00:39<00:00,  3.78it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.01it/s]


Epoch 15: train_loss=0.9219  val_loss=1.1993


Train 16/30: 100%|██████████| 150/150 [00:40<00:00,  3.72it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  7.00it/s]


Epoch 16: train_loss=0.9040  val_loss=1.2029


Train 17/30: 100%|██████████| 150/150 [00:39<00:00,  3.77it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.82it/s]


Epoch 17: train_loss=0.8939  val_loss=1.1898


Train 18/30: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.94it/s]


Epoch 18: train_loss=0.8806  val_loss=1.1830


Train 19/30: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.00it/s]


Epoch 19: train_loss=0.8693  val_loss=1.1943


Train 20/30: 100%|██████████| 150/150 [00:40<00:00,  3.72it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.03it/s]


Epoch 20: train_loss=0.8750  val_loss=1.1938


Train 21/30: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.94it/s]


Epoch 21: train_loss=0.8757  val_loss=1.2082


Train 22/30: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.78it/s]


Epoch 22: train_loss=0.8738  val_loss=1.1991


Train 23/30: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.78it/s]


Epoch 23: train_loss=0.8737  val_loss=1.2099


Train 24/30: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.98it/s]


Epoch 24: train_loss=0.8701  val_loss=1.2068


Train 25/30: 100%|██████████| 150/150 [00:39<00:00,  3.75it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.95it/s]


Epoch 25: train_loss=0.8599  val_loss=1.2146


Train 26/30: 100%|██████████| 150/150 [00:40<00:00,  3.70it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.85it/s]


Epoch 26: train_loss=0.8632  val_loss=1.2117


Train 27/30: 100%|██████████| 150/150 [00:40<00:00,  3.73it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.02it/s]


Epoch 27: train_loss=0.8640  val_loss=1.2137


Train 28/30: 100%|██████████| 150/150 [00:40<00:00,  3.71it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.94it/s]


Epoch 28: train_loss=0.8639  val_loss=1.2080


Train 29/30: 100%|██████████| 150/150 [00:41<00:00,  3.64it/s]
Val: 100%|██████████| 49/49 [00:06<00:00,  7.08it/s]


Epoch 29: train_loss=0.8542  val_loss=1.2123


Train 30/30: 100%|██████████| 150/150 [00:40<00:00,  3.73it/s]
Val: 100%|██████████| 49/49 [00:07<00:00,  6.90it/s]


Epoch 30: train_loss=0.8603  val_loss=1.2106
✔ Training complete → /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/tv_maskrcnn_output_e30/model_last.pth


In [10]:
import os, torch
from PIL import Image
import torchvision
from torchvision.utils import draw_bounding_boxes
import torchvision.transforms.functional as TF

model.eval()
model.to(device)

label_names = ["__bg__"] + TARGETS
pred_dir = OUT / "val_preds"
pred_dir.mkdir(parents=True, exist_ok=True)

for i in range(min(12, len(val_ds))):
    img, _ = val_ds[i]
    with torch.no_grad():
        pred = model([img.to(device)])[0]
    boxes  = pred["boxes"].cpu()
    labels = pred["labels"].cpu()
    scores = pred["scores"].cpu()

    keep = scores > 0.5
    boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

    drawn = draw_bounding_boxes(
        (img*255).to(torch.uint8),
        boxes,
        [f"{label_names[int(l)]}:{float(s):.2f}" for l,s in zip(labels, scores)],
        width=2
    )
    Image.fromarray(drawn.permute(1,2,0).numpy()).save(pred_dir/f"val_{i:04d}.jpg")

print("✔ wrote previews to:", pred_dir)



✔ wrote previews to: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/val_preds


In [1]:
import os, json, torch
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import torchvision.transforms.functional as TF
from torchvision.models.detection import maskrcnn_resnet50_fpn

BASE = Path("/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout")
OUT  = BASE / "MaskRCNN_torchvision"
save_dir = OUT / "tv_maskrcnn_output_e30"

val_json   = str(OUT / "validation.json")
val_imgdir = str(BASE / "images" / "validation")

TARGETS = ["TEXT","TABLE","PICTURE","CAPTION"]
NUM_CLASSES = 1 + len(TARGETS)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- rebuild model and load best weights ---
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

model = maskrcnn_resnet50_fpn(weights="COCO_V1")
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUM_CLASSES)
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, NUM_CLASSES)
model.load_state_dict(torch.load(save_dir / "model_best.pth", map_location="cpu"))
model.to(device).eval()

# --- load COCO val to iterate in order ---
with open(val_json, "r") as f: coco_val = json.load(f)
images = coco_val["images"]

SCORE_TH = 0.5  # adjust if you want more/less detections
preds_for_coco = []  # list of dicts: {image_id, category_id, bbox, score}

for im in tqdm(images, desc="Infer val"):
    path = os.path.join(val_imgdir, im["file_name"])
    img = Image.open(path).convert("RGB")
    tensor = TF.to_tensor(img).to(device)
    with torch.no_grad():
        out = model([tensor])[0]

    boxes  = out["boxes"].cpu()
    labels = out["labels"].cpu()
    scores = out["scores"].cpu()

    keep = scores > SCORE_TH
    boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

    # xyxy -> xywh
    if len(boxes):
        xywh = boxes.clone()
        xywh[:,2] = boxes[:,2] - boxes[:,0]
        xywh[:,3] = boxes[:,3] - boxes[:,1]
        xywh[:,0] = boxes[:,0]
        xywh[:,1] = boxes[:,1]
        for b, lab, sc in zip(xywh.tolist(), labels.tolist(), scores.tolist()):
            preds_for_coco.append({
                "image_id": im["id"],
                "category_id": int(lab),   # already 1..4
                "bbox": [float(b[0]), float(b[1]), float(b[2]), float(b[3])],
                "score": float(sc)
            })

pred_path = OUT / "pred_val.json"
with open(pred_path, "w") as f: json.dump(preds_for_coco, f)
print("✓ wrote predictions:", pred_path)

Infer val: 100%|██████████| 97/97 [00:20<00:00,  4.72it/s]

✓ wrote predictions: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/pred_val.json





In [13]:
import json
from pathlib import Path

BASE = Path("/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout")
OUT  = BASE / "MaskRCNN_torchvision"

def ensure_coco_headers(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    # add minimal headers if missing
    data.setdefault("info", {
        "description": "DocLayNet 4-class (TEXT/TABLE/PICTURE/CAPTION)",
        "version": "1.0",
        "year": 2025,
        "contributor": "auto-converted",
        "date_created": ""
    })
    data.setdefault("licenses", [])
    # (optional) ensure required top-level keys exist
    data.setdefault("images", [])
    data.setdefault("annotations", [])
    data.setdefault("categories", [])
    with open(json_path, "w") as f:
        json.dump(data, f)
    print("Fixed headers in:", json_path)

ensure_coco_headers(str(OUT / "validation.json"))
# (optional) ensure_coco_headers(str(OUT / "train.json"))

Fixed headers in: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/validation.json


In [4]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import json

val_json = str(OUT / "validation.json")
pred_path = str(OUT / "pred_val.json")  # from the inference step

gt = COCO(val_json)
dt = gt.loadRes(pred_path)

evaluator = COCOeval(gt, dt, iouType="bbox")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.06s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.98s).
Accumulating evaluation results...
DONE (t=0.23s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.022
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.078
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.023
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.042
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.110
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

In [2]:
import os, json, torch
from PIL import Image
from tqdm import tqdm
import torchvision.transforms.functional as TF

from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

BASE = "/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout"
OUT  = f"{BASE}/MaskRCNN_torchvision"
val_json   = f"{OUT}/validation.json"
val_imgdir = f"{BASE}/images/validation"
best_ckpt  = f"{OUT}/tv_maskrcnn_output_e30/model_best.pth"

TARGETS = ["TEXT","TABLE","PICTURE","CAPTION"]
NUM_CLASSES = 1 + len(TARGETS)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# rebuild model
model = maskrcnn_resnet50_fpn(weights="COCO_V1")
in_feat = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_feat, NUM_CLASSES)
in_feat_m = model.roi_heads.mask_predictor.conv5_mask.in_channels
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_feat_m, 256, NUM_CLASSES)
model.load_state_dict(torch.load(best_ckpt, map_location="cpu"))
model.to(device).eval()

with open(val_json,"r") as f: coco_val = json.load(f)
images = coco_val["images"]

preds = []
for im in tqdm(images, desc="Infer (no-threshold)"):
    path = os.path.join(val_imgdir, im["file_name"])
    img  = Image.open(path).convert("RGB")
    tens = TF.to_tensor(img).to(device)
    with torch.no_grad():
        out = model([tens])[0]
    # sort by score desc and keep top 100
    scores = out["scores"].cpu()
    order  = torch.argsort(scores, descending=True)[:100]
    boxes  = out["boxes"].cpu()[order]
    labels = out["labels"].cpu()[order]
    scores = scores[order]

    # xyxy → xywh
    xywh = boxes.clone()
    xywh[:,2] = boxes[:,2] - boxes[:,0]
    xywh[:,3] = boxes[:,3] - boxes[:,1]
    xywh[:,0] = boxes[:,0]
    xywh[:,1] = boxes[:,1]

    for b, lab, sc in zip(xywh.tolist(), labels.tolist(), scores.tolist()):
        preds.append({
            "image_id": im["id"],
            "category_id": int(lab),           # MUST be 1..4
            "bbox": [float(b[0]), float(b[1]), float(b[2]), float(b[3])],  # pixels
            "score": float(sc)
        })

pred_path = f"{OUT}/pred_val.json"
with open(pred_path, "w") as f: json.dump(preds, f)
print("wrote:", pred_path)

Infer (no-threshold): 100%|██████████| 97/97 [00:14<00:00,  6.87it/s]


wrote: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/pred_val.json


In [10]:
import os, json, torch
from PIL import Image, ImageDraw
import torchvision.transforms.functional as TF

val_json = f"{OUT}/validation.json"
with open(val_json,"r") as f: coco_val=json.load(f)
by_img = {im["id"]: im for im in coco_val["images"]}
gt_by_img = {}
for ann in coco_val["annotations"]:
    gt_by_img.setdefault(ann["image_id"], []).append(ann)

# pick 5 random images
import random
ids = random.sample(list(by_img.keys()), k=min(5, len(by_img)))
ID2NAME = {1:"TEXT",2:"TABLE",3:"PICTURE",4:"CAPTION"}

with open(f"{OUT}/pred_val.json","r") as f: preds = json.load(f)
preds_map = {}
for p in preds:
    preds_map.setdefault(p["image_id"], []).append(p)

os.makedirs(f"{OUT}/vis_gt_pred", exist_ok=True)
for im_id in ids:
    meta = by_img[im_id]
    path = os.path.join(val_imgdir, meta["file_name"])
    img  = Image.open(path).convert("RGB")
    draw = ImageDraw.Draw(img, "RGBA")

    # draw GT in green
    for a in gt_by_img.get(im_id, []):
        x,y,w,h = a["bbox"]
        draw.rectangle([x,y,x+w,y+h], outline=(0,255,0,255), width=3)
        draw.text((x,y), f"GT:{ID2NAME[a['category_id']]}", fill=(0,255,0,255))

    # draw Pred in red
    for p in preds_map.get(im_id, []):
        x,y,w,h = p["bbox"]
        draw.rectangle([x,y,x+w,y+h], outline=(255,0,0,255), width=2)
        draw.text((x,y+h), f"P:{ID2NAME.get(p['category_id'],'?')} {p['score']:.2f}", fill=(255,0,0,255))

    img.save(f"{OUT}/vis_gt_pred/{meta['file_name']}")
print("Wrote overlays to:", f"{OUT}/vis_gt_pred")

Wrote overlays to: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/vis_gt_pred


In [3]:
import os, json, random, torch
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms.functional as TF
from torchvision.models.detection import maskrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.ops import nms

# ---- Paths (adjust BASE if yours differs) ----
BASE = Path("/home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout")
OUT  = BASE / "MaskRCNN_torchvision"
save_dir = OUT / "tv_maskrcnn_output_e30"
val_json   = OUT / "validation.json"
val_imgdir = BASE / "images" / "validation"
vis_dir    = OUT / "vis_gt_pred_nms"
vis_dir.mkdir(parents=True, exist_ok=True)

# ---- Labels ----
TARGETS = ["TEXT","TABLE","PICTURE","CAPTION"]
ID2NAME = {1:"TEXT", 2:"TABLE", 3:"PICTURE", 4:"CAPTION"}

# ---- Load model (best) ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = maskrcnn_resnet50_fpn(weights="COCO_V1")
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 1 + len(TARGETS))
in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, 256, 1 + len(TARGETS))
model.load_state_dict(torch.load(save_dir / "model_best.pth", map_location="cpu"))
model.to(device).eval()

# ---- Load GT COCO for overlays ----
with open(val_json, "r") as f:
    coco_val = json.load(f)
images = coco_val["images"]
gt_by_img = {}
for ann in coco_val["annotations"]:
    gt_by_img.setdefault(ann["image_id"], []).append(ann)

# ---- Viz settings ----
CONF_TH = 0.5       # score threshold
NMS_IOU = 0.5       # NMS IoU threshold
NUM_SAMPLES = min(10, len(images))

# ---- Draw helper ----
def draw_box(draw, xywh, color, text=None, width=3):
    x,y,w,h = xywh
    draw.rectangle([x,y,x+w,y+h], outline=color, width=width)
    if text:
        draw.text((x, max(0, y-12)), text, fill=color)

# ---- Run on a few random images ----
for im in random.sample(images, NUM_SAMPLES):
    img_path = val_imgdir / im["file_name"]
    img = Image.open(img_path).convert("RGB")

    # Predict
    tens = TF.to_tensor(img).to(device)
    with torch.no_grad():
        out = model([tens])[0]

    boxes_xyxy = out["boxes"].cpu()
    labels     = out["labels"].cpu()
    scores     = out["scores"].cpu()

    # Filter by score
    keep = scores > CONF_TH
    boxes_xyxy = boxes_xyxy[keep]
    labels     = labels[keep]
    scores     = scores[keep]

    # Apply NMS
    if len(boxes_xyxy) > 0:
        keep_n = nms(boxes_xyxy, scores, NMS_IOU)
        boxes_xyxy = boxes_xyxy[keep_n]
        labels     = labels[keep_n]
        scores     = scores[keep_n]

    # Convert xyxy -> xywh for drawing convenience
    boxes_xywh = []
    for b in boxes_xyxy.tolist():
        x0,y0,x1,y1 = b
        boxes_xywh.append([x0, y0, x1-x0, y1-y0])

    # Draw GT (green) + Pred (red)
    canvas = img.copy()
    draw = ImageDraw.Draw(canvas, "RGBA")

    # GT in green
    for a in gt_by_img.get(im["id"], []):
        x,y,w,h = a["bbox"]
        draw_box(draw, [x,y,w,h], color=(0,255,0,255), text=f"GT:{ID2NAME.get(a['category_id'],'?')}", width=3)

    # Pred in red (after threshold+NMS)
    for (x,y,w,h), lab, sc in zip(boxes_xywh, labels.tolist(), scores.tolist()):
        draw_box(draw, [x,y,w,h], color=(255,0,0,255), text=f"P:{ID2NAME.get(lab,'?')} {sc:.2f}", width=2)

    out_path = vis_dir / im["file_name"]
    canvas.save(out_path)

print("✓ Saved cleaned overlays (GT=green, Pred=red) to:", vis_dir)

✓ Saved cleaned overlays (GT=green, Pred=red) to: /home/jupyter-24251d5803/DLS_dataset/DocLayNet-Balanced-Layout/MaskRCNN_torchvision/vis_gt_pred_nms
