# Detectron2 Layout Analysis Training and Inference Notebook
This notebook fine-tunes a Detectron2 layout model on your custom dataset and provides utilities for building a predictor, inference (single image, batch, PDF), and evaluation.

## 1. Setup Dependencies
Installs required packages. Run once per environment.

In [None]:
!pip install tqdm
!pip install "Pillow==9.5.0" #use a downgrade version of PIL
!pip install torchvision
!pip install torch 
!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip install -U layoutparser
!pip install pytesseract
!pip install tensorboard

## 2. Imports & Configuration
Imports libraries, sets paths, and configures hyperparameters. Update paths before running.

In [None]:
import os
from pathlib import Path
import layoutparser as lp
from detectron2.config import get_cfg
from detectron2 import model_zoo
from pdf2image import convert_from_path
import cv2, matplotlib.pyplot as plt
# --- Dataset paths: edit these before running ---
DATA_DIR = Path('/path/to/dataset')
# Both splits read images from the same folder.
# NOTE(review): presumably the two annotation files decide which images belong
# to which split; confirm this matches your dataset layout.
TRAIN_IMAGES = DATA_DIR/'images'
VAL_IMAGES   = DATA_DIR/'images'
TRAIN_ANN    = DATA_DIR/'annotations/train.json'
VAL_ANN      = DATA_DIR/'annotations/val.json'
# Checkpoint to fine-tune from (placeholder; replace with a real path).
PRETRAINED_WEIGHTS = 'your path to model weights'
# Root folder for everything this notebook writes.
OUTPUT_DIR   = Path('./output')
# parents=True so a nested OUTPUT_DIR (e.g. runs/exp1/output) also works.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Hyperparameters ---
NUM_CLASSES        = 5  # PubLayNet classes
BACKBONE_FREEZE_AT = 2     # value written to cfg.MODEL.BACKBONE.FREEZE_AT
LEARNING_RATE      = 1e-4  # value written to cfg.SOLVER.BASE_LR
MAX_ITER           = 5000  # value written to cfg.SOLVER.MAX_ITER
IMS_PER_BATCH      = 2     # value written to cfg.SOLVER.IMS_PER_BATCH

## 3. Register COCO Datasets
Registers train/val sets.

In [None]:
from detectron2.data import DatasetCatalog
from detectron2.data.datasets import register_coco_instances

# Registering a name twice raises, which makes this cell fail when it is re-run
# in the same kernel. Skip names that are already registered instead.
# If you change the paths above, restart the kernel so the new paths are used.
for split_name, ann_file, image_root in (
    ('my_train', TRAIN_ANN, TRAIN_IMAGES),
    ('my_val',   VAL_ANN,   VAL_IMAGES),
):
    if split_name not in DatasetCatalog.list():
        register_coco_instances(split_name, {}, str(ann_file), str(image_root))

In [None]:
from detectron2.data import MetadataCatalog

# Sanity check: fetch the metadata entry of the training split and echo the
# image folder and annotation file it points to.
metadata = MetadataCatalog.get("my_train")
print(f"Image root: {metadata.image_root}")
print(f"Annotation file: {metadata.json_file}")

## 4. Model Initialization
Loads pretrained PubLayNet weights and applies overrides.

In [None]:
cfg = get_cfg()
# Base model config; change it if you fine-tune a different architecture.
# NOTE(review): the lp:// scheme is a layoutparser model-catalog path. Confirm
# that detectron2's config loader can open it in your environment; if not,
# point this at a local copy of the config YAML.
cfg.merge_from_file("lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config")
# Fine-tune from the checkpoint configured in the setup cell rather than a
# second, separately hard-coded placeholder string.
cfg.MODEL.WEIGHTS = str(PRETRAINED_WEIGHTS)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = NUM_CLASSES
cfg.MODEL.BACKBONE.FREEZE_AT = BACKBONE_FREEZE_AT
cfg.SOLVER.BASE_LR = LEARNING_RATE
cfg.SOLVER.MAX_ITER = MAX_ITER
cfg.SOLVER.IMS_PER_BATCH = IMS_PER_BATCH
# Leave this False if you do not want to train the mask head.
cfg.MODEL.MASK_ON = False
cfg.DATASETS.TRAIN = ('my_train',)
cfg.DATASETS.TEST = ('my_val',)
# Outputs go to a sub-folder of OUTPUT_DIR.
cfg.OUTPUT_DIR = str(OUTPUT_DIR/'model_outputs')

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

## 5. Training
Defines a tqdm progress-bar hook and starts fine-tuning from the pretrained weights (a fresh run, not resumed from an earlier checkpoint).

In [None]:
from detectron2.engine import DefaultTrainer, HookBase
from tqdm import tqdm
class TQDMWithLossHook(HookBase):
    """Trainer hook that shows a tqdm progress bar with the latest total loss.

    The bar is created in ``before_train``, advanced once per iteration in
    ``after_step`` and closed in ``after_train``.
    """

    # Stays None until before_train() runs, so the other callbacks can tell
    # whether a bar exists instead of raising AttributeError.
    pbar = None

    def before_train(self):
        """Create the progress bar, starting at the trainer's start iteration."""
        self.pbar = tqdm(
            total=self.trainer.max_iter,
            # A resumed run does not start at 0; without `initial` the bar
            # would never reach 100%.
            initial=getattr(self.trainer, "start_iter", 0),
            desc="Training",
            unit="iter",
        )

    def after_step(self):
        """Advance the bar by one step and show the most recent ``total_loss``."""
        if self.pbar is None:
            return

        raw = self.trainer.storage.latest().get("total_loss", None)
        # The stored entry may be a plain number or a tuple/list whose first
        # item is the value; unpack it before converting to float.
        loss_value = raw[0] if isinstance(raw, (tuple, list)) else raw

        if loss_value is not None:
            self.pbar.set_postfix(loss=float(loss_value))
        else:
            self.pbar.set_postfix(loss="N/A")

        self.pbar.update(1)

    def after_train(self):
        """Close the bar if one was created."""
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None


# Build the trainer from cfg and attach the progress-bar hook before starting.
trainer = DefaultTrainer(cfg)
trainer.register_hooks([TQDMWithLossHook()])
# IMPORTANT: resume=False is for the first run only (start from cfg.MODEL.WEIGHTS).
# To continue an interrupted run, switch to resume=True (see the
# DefaultTrainer.resume_or_load documentation).
trainer.resume_or_load(resume=False)
trainer.train()

## 6. Build Predictor
Creates DefaultPredictor with trained cfg and explicit checkpoint path.

In [None]:
from detectron2.engine import DefaultPredictor

# Work on a copy of the config: the inference-only settings below must not leak
# into `cfg`, otherwise re-running the training cell would silently start from
# the fine-tuned checkpoint.
inference_cfg = cfg.clone()

# After training completes, load the final checkpoint.
final_weights = os.path.join(cfg.OUTPUT_DIR, 'model_final.pth')
if not os.path.isfile(final_weights):
    raise FileNotFoundError(
        f"Trained checkpoint not found: {final_weights}. Run the training cell to completion first."
    )
inference_cfg.MODEL.WEIGHTS = final_weights  # trained weights

# Inference-only score threshold.
inference_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8
predictor = DefaultPredictor(inference_cfg)

## 7. Inference Utilities
Helper functions for single, batch, and PDF inference.

In [None]:
def single_inference(image_path, predictor, save_path=None, show=False):
    """Run the predictor on one image and optionally save or show the detected boxes.

    Args:
        image_path: Path of the image to read with OpenCV.
        predictor: Callable that takes the BGR image array and returns a dict
            with an 'instances' entry (e.g. a detectron2 DefaultPredictor).
        save_path: If given, the box visualization is written to this path.
        show: If True, the visualization is displayed with matplotlib.

    Returns:
        The raw output of ``predictor(img)``.

    Raises:
        FileNotFoundError: If the image cannot be read.
        OSError: If the visualization cannot be written to ``save_path``.
    """
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread signals failure with None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    outputs = predictor(img)
    instances = outputs['instances'].to('cpu')

    # lp.draw_box expects layoutparser elements, not detectron2 Boxes, so
    # convert each detection into a TextBlock carrying its class and score.
    layout = lp.Layout([
        lp.TextBlock(lp.Rectangle(*box), type=int(cls), score=float(score))
        for box, cls, score in zip(
            instances.pred_boxes.tensor.tolist(),
            instances.pred_classes.tolist(),
            instances.scores.tolist(),
        )
    ])

    if len(layout) > 0:
        # draw_box works on RGB input and returns a PIL image; convert back to
        # a BGR array so the OpenCV calls below receive what they expect.
        drawn = lp.draw_box(cv2.cvtColor(img, cv2.COLOR_BGR2RGB), layout, box_width=2)
        vis = cv2.cvtColor(np.asarray(drawn.convert('RGB')), cv2.COLOR_RGB2BGR)
    else:
        # Nothing detected: the visualization is just the input image.
        vis = img.copy()

    if save_path:
        # cv2.imwrite reports failure by returning False.
        if not cv2.imwrite(str(save_path), vis):
            raise OSError(f"Could not write visualization to: {save_path}")
    if show:
        plt.imshow(cv2.cvtColor(vis, cv2.COLOR_BGR2RGB))
        plt.axis('off')
        plt.show()
    return outputs

def batch_inference(input_dir, output_dir, predictor, extensions=('.jpg', '.jpeg')):
    """Run single_inference on every matching image in a folder.

    Args:
        input_dir: Folder to scan (not recursive) for images.
        output_dir: Folder for the visualizations; created with any missing
            parents. Each output keeps the name of its input file.
        predictor: Predictor forwarded to single_inference.
        extensions: File suffixes to process, compared case-insensitively.
            Defaults to JPEG suffixes.

    Raises:
        FileNotFoundError: If ``input_dir`` does not exist.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    wanted = {ext.lower() for ext in extensions}
    # Sorted so files are processed in a deterministic order.
    image_files = sorted(
        path for path in Path(input_dir).iterdir()
        if path.is_file() and path.suffix.lower() in wanted
    )
    for img_file in image_files:
        single_inference(img_file, predictor, save_path=output_dir / img_file.name)

def pdf_inference(pdf_path, image_dir, output_dir, predictor, dpi=200):
    """Render each page of a PDF to JPEG and run inference on every rendered page.

    Args:
        pdf_path: PDF file to convert.
        image_dir: Folder for the rendered pages (``page_<n>.jpg``, 1-based);
            created with any missing parents.
        output_dir: Folder for the visualizations; created with any missing
            parents. Each output keeps the name of its page image.
        predictor: Predictor forwarded to single_inference.
        dpi: Rendering resolution passed to convert_from_path.
    """
    pages = convert_from_path(str(pdf_path), dpi=dpi)

    image_dir = Path(image_dir)
    image_dir.mkdir(parents=True, exist_ok=True)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for i, page in enumerate(pages, start=1):
        jpg = image_dir / f'page_{i}.jpg'
        page.save(jpg, 'JPEG')
        # Run on exactly the page just written, so leftover JPEGs from an
        # earlier (longer) PDF in image_dir are not processed again.
        single_inference(jpg, predictor, save_path=output_dir / jpg.name)

## 7.1 Single-Image Inference Example
Run detection on one image and display the result inline.


In [None]:
# Demo: run detection on one image and preview the boxes inline.
from pathlib import Path

img_path = Path('/path/to/some/image.jpg')
# show=True makes single_inference display the box visualization via plt.imshow().
outputs = single_inference(img_path, predictor, show=True)
# Print the detections from the CPU copy to inspect the raw box/tensor data.
print(outputs['instances'].to('cpu'))


## 7.2 Batch Inference Example
Process all JPEGs in a folder and write visualized outputs to another.


In [None]:
# Demo: run detection on a folder of JPEGs and save one visualization per image.
input_folder  = '/path/to/jpg/folder'
output_folder = '/path/to/save/results'
batch_inference(input_folder, output_folder, predictor)
print(f"Wrote visualizations to {output_folder}")


## 7.3 PDF Inference Example
Convert each PDF page to JPEG, then run inference on every page.


In [None]:
# Demo: render a PDF to page images, then run detection on each page.
pdf_file     = '/path/to/document.pdf'
tmp_img_dir  = './temp_pages'
result_dir   = './pdf_results'
# dpi sets the rendering resolution of the pages; adjust it if needed.
pdf_inference(pdf_file, tmp_img_dir, result_dir, predictor, dpi=150)
print(f"Pages processed and outputs saved under {result_dir}")


## 8. Evaluation
Runs COCO evaluation on validation set.

In [None]:
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader
# COCO-style evaluator for the 'my_val' split, with output_dir set to cfg.OUTPUT_DIR.
# NOTE(review): the third positional argument (False) is presumably the
# `distributed` flag; confirm against the installed detectron2 version.
evaluator = COCOEvaluator('my_val', cfg, False, output_dir=cfg.OUTPUT_DIR)
# Test-time data loader over the same validation split.
val_loader = build_detection_test_loader(cfg, 'my_val')
# NOTE(review): predictor.model was built with SCORE_THRESH_TEST = 0.8, so
# low-score detections are presumably dropped before scoring, which would lower
# the reported AP. Confirm, and consider evaluating with a low threshold.
metrics = inference_on_dataset(predictor.model, val_loader, evaluator)
print(metrics)