# 📄 Fine-Tuning Complex Law Document Layout Detection in Google Colab

# ---
# Setup Info:
# Model: Faster R-CNN R50-FPN pretrained on PubLayNet
# Framework: Detectron2
# Goal: Fine-tune to handle multi-column, tables, footnotes, headers, signatures, logos.

# ---


In [50]:
!pip install tqdm
!pip install "Pillow==9.5.0" #use a downgrade version of PIL
!pip install torchvision



In [11]:
pip show layoutparser pillow

Name: layoutparser
Version: 0.3.4
Summary: A unified toolkit for Deep Learning Based Document Image Analysis
Home-page: https://github.com/Layout-Parser/layout-parser
Author: Zejiang Shen, Ruochen Zhang, and Layout Parser Model Contributors
Author-email: layoutparser@gmail.com
License: Apache-2.0
Location: /opt/anaconda3/envs/layoutparser_experiment/lib/python3.11/site-packages
Requires: iopath, numpy, opencv-python, pandas, pdf2image, pdfplumber, pillow, pyyaml, scipy
Required-by: 
---
Name: Pillow
Version: 9.5.0
Summary: Python Imaging Library (Fork)
Home-page: https://python-pillow.org
Author: Jeffrey A. Clark (Alex)
Author-email: aclark@aclark.net
License: HPND
Location: /opt/anaconda3/envs/layoutparser_experiment/lib/python3.11/site-packages
Requires: 
Required-by: detectron2, fvcore, layoutparser, matplotlib, pdf2image, pdfplumber, pytesseract, torchvision
Note: you may need to restart the kernel to use updated packages.


In [12]:
!pip install torch 
!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip install -U layoutparser
!pip install pytesseract

Collecting detectron2
  Cloning https://github.com/facebookresearch/detectron2.git (to revision v0.4) to /private/var/folders/57/0byy2pcx5fnckzr0s35zly7w0000gn/T/pip-install-r70qnbz1/detectron2_56eff9c5783440758026371bea81fc62
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /private/var/folders/57/0byy2pcx5fnckzr0s35zly7w0000gn/T/pip-install-r70qnbz1/detectron2_56eff9c5783440758026371bea81fc62
  Running command git checkout -q 4aca4bdaa9ad48b8e91d7520e0d0815bb8ca0fb1
  Resolved https://github.com/facebookresearch/detectron2.git to commit 4aca4bdaa9ad48b8e91d7520e0d0815bb8ca0fb1
  Preparing metadata (setup.py) ... [?25ldone


In [13]:
# 2. Import libraries
import layoutparser as lp
import pytesseract
from PIL import Image

In [14]:
import layoutparser as lp
model = lp.Detectron2LayoutModel(
            config_path ='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config', # In model catalog
            label_map   ={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}, # In model`label_map`
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8] # Optional
        )


In [43]:
# 4. Access images and PDFs from local directory
image_dir = "./research paper"  # 📂 Change this to your folder
image_paths = glob.glob(os.path.join(image_dir, "*.png")) + \
              glob.glob(os.path.join(image_dir, "*.jpg")) + \
              glob.glob(os.path.join(image_dir, "*.jpeg"))

pdf_paths = glob.glob(os.path.join(image_dir, "*.pdf"))

In [41]:
# 5. Convert PDFs to images (each page = 1 jpg)
from pdf2image import convert_from_path
for pdf_path in pdf_paths:
    pages = convert_from_path(pdf_path, dpi=300)  # Convert each page to an image
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    for idx, page in enumerate(pages):
        img_save_path = os.path.join(image_dir, f"{base_name}_page{idx+1}.jpg")
        page.save(img_save_path, "JPEG")  # Save each page as JPG
        image_paths.append(img_save_path)  # Add new JPGs to processing list

In [44]:
import numpy as np
import json
from PIL import Image, ImageDraw

for image_path in image_paths:
    image = Image.open(image_path).convert("RGB")  # Open image and ensure RGB format
    image_np = np.array(image)  # Convert PIL image to numpy array for LayoutParser
    layout = model.detect(image_np)  # Perform layout detection

    draw = ImageDraw.Draw(image)  # Prepare to draw bounding boxes on the image
    results = []  # Store extracted structured data here

    for block in layout:
        # Crop each detected block from the full image
        segment_image = block.crop_image(image_np)

        # Perform OCR on the cropped segment
        text = ocr_agent.detect(segment_image)

        # Draw bounding box around detected block
        x_1, y_1, x_2, y_2 = block.coordinates
        draw.rectangle([x_1, y_1, x_2, y_2], outline="red", width=3)  # Red rectangle
        draw.text((x_1, y_1 - 10), block.type, fill="red")  # Write block type above box

        # Save the block's structured information
        results.append({
            "type": block.type,
            "text": text,
            "bounding_box": block.coordinates
        })

    # Save extracted results as JSON
    base_name = os.path.basename(image_path)
    base_no_ext = os.path.splitext(base_name)[0]

    json_path = os.path.join(image_dir, base_no_ext + "_output.json")
    with open(json_path, "w") as f:
        json.dump(results, f, indent=4)  # Write formatted JSON

    # Save the image with bounding boxes
    boxed_image_path = os.path.join(image_dir, base_no_ext + "_boxed.png")
    image.save(boxed_image_path)

    print(f"✅ Exported structured JSON and boxed image for {base_name}!")

✅ Exported structured JSON and boxed image for Multimodal Healthcare AI Identifying and Designing Clinically Relevant Vision-Language Applications for Radiology-page11_page_1.jpg!
✅ Exported structured JSON and boxed image for Multimodal Healthcare AI Identifying and Designing Clinically Relevant Vision-Language Applications for Radiology-page2_page_1.jpg!
✅ Exported structured JSON and boxed image for Multimodal Healthcare AI Identifying and Designing Clinically Relevant Vision-Language Applications for Radiology-page7_page_1.jpg!
✅ Exported structured JSON and boxed image for Multimodal Healthcare AI Identifying and Designing Clinically Relevant Vision-Language Applications for Radiology-page19_page_1.jpg!
✅ Exported structured JSON and boxed image for Multimodal Healthcare AI Identifying and Designing Clinically Relevant Vision-Language Applications for Radiology-page14_page_1.jpg!
✅ Exported structured JSON and boxed image for Multimodal Healthcare AI Identifying and Designing Clin

## Setup folder structure

remember to ensure the project folder loooks like this:
```
project
├── data
│   ├── images
│   └── annotations


In [5]:
from detectron2.data.datasets import register_coco_instances

# Register the dataset
register_coco_instances(
    "doclaynet_train", {}, 
    "./datasets/DocLayNet/annotations/train.json", 
    "./datasets/DocLayNet/images"
)

register_coco_instances(
    "doclaynet_val", {}, 
    "./datasets/DocLayNet/annotations/val.json", 
    "./datasets/DocLayNet/images"
)


AssertionError: Dataset 'doclaynet_train' is already registered!

## Configuration

In [13]:
from detectron2.config import get_cfg
from detectron2 import model_zoo
import os

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))  # Base config

cfg.DATASETS.TRAIN = ("doclaynet_train",)
cfg.DATASETS.TEST = ("doclaynet_val",)

cfg.DATALOADER.NUM_WORKERS = 8
cfg.MODEL.WEIGHTS = "detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl"  # Pretrained COCO weights
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025  # Lower learning rate for fine-tuning
cfg.SOLVER.MAX_ITER = 5000    # Adjust based on your dataset size
cfg.SOLVER.STEPS = []         # No learning rate decay
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128  
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 11  # DocLayNet has 11 classes

# Force CPU usage (no GPU on macOS)
cfg.MODEL.DEVICE = "cpu"  # This ensures the model runs on CPU

# Output directory
cfg.OUTPUT_DIR = "./output_doclaynet"
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)


## Training

In [14]:
from detectron2.engine import HookBase
from tqdm import tqdm

from detectron2.engine import HookBase
from tqdm import tqdm

class TQDMWithLossHook(HookBase):
    def before_train(self):
        self.pbar = tqdm(total=self.trainer.max_iter, desc="Training Progress", unit="iter")

    def after_step(self):
        # Get latest logs from storage
        storage = self.trainer.storage
        loss_dict = storage.latest()  # latest() returns a dict of metrics

        # Safely get total_loss if it exists
        loss_value = loss_dict.get('total_loss', None)

        # Update progress bar
        if loss_value is not None:
            self.pbar.set_postfix(loss=float(loss_value))
        else:
            self.pbar.set_postfix(loss="N/A")

        self.pbar.update(1)

    def after_train(self):
        self.pbar.close()

# Then after you create your trainer:

trainer = DefaultTrainer(cfg)
trainer.model = model  # attach the model (fine-tuned one)

# Add the tqdm hook
trainer.register_hooks([TQDMWithLossHook()])

# Start training
trainer.resume_or_load(resume=False)
trainer.train()


[32m[04/28 00:21:00 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

: 

## Inference (after training)

In [None]:
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")  # Load your fine-tuned model
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7   # Set threshold for this model
predictor = DefaultPredictor(cfg)

# Test on new image
from detectron2.utils.visualizer import Visualizer
im = cv2.imread("./datasets/DocLayNet/images/example.png")

outputs = predictor(im)

v = Visualizer(im[:, :, ::-1], scale=1.2)
out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
cv2.imshow("result", out.get_image()[:, :, ::-1])
cv2.waitKey(0)
