In [1]:
!pip install tqdm
!pip install "Pillow==9.5.0" #use a downgrade version of PIL
!pip install torchvision
!pip install torch 
!pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
!pip install -U layoutparser
!pip install pytesseract
!pip install tensorboard


Collecting detectron2
  Cloning https://github.com/facebookresearch/detectron2.git (to revision v0.4) to /private/var/folders/57/0byy2pcx5fnckzr0s35zly7w0000gn/T/pip-install-v8bf6ywn/detectron2_c039ca61bf954ce19bbfc9ff303012cb
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/detectron2.git /private/var/folders/57/0byy2pcx5fnckzr0s35zly7w0000gn/T/pip-install-v8bf6ywn/detectron2_c039ca61bf954ce19bbfc9ff303012cb
  Running command git checkout -q 4aca4bdaa9ad48b8e91d7520e0d0815bb8ca0fb1
  Resolved https://github.com/facebookresearch/detectron2.git to commit 4aca4bdaa9ad48b8e91d7520e0d0815bb8ca0fb1
  Preparing metadata (setup.py) ... [?25ldone


# Initial Importing (from the publaynet dataset pretrained model) -> FOR INITIAL INFERENCE ONLY

In [5]:
import layoutparser as lp
model = lp.Detectron2LayoutModel(
            config_path ='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config', # In model catalog
            label_map   ={0: "Text", 1: "Title", 2: "List", 3:"Table", 4:"Figure"}, # In model`label_map`
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8] # Optional
        )

# (NO NEEDED!! )Remap COCO -> as by DEFAULT the category_ids start from 1

As the model is pretrained on PubLayNet, we need to convert the annotations to the PubLayNet format -> labels have start with 0

In [None]:
# # remap_coco.py
# import json

# # old→new mapping
# MAPPING = {1: 0, 2: 1, 3: 2, 4: 3}

# def remap_coco(input_json, output_json):
#     with open(input_json, 'r') as f:
#         data = json.load(f)

#     # remap categories
#     for cat in data['categories']:
#         cat['id'] = MAPPING[cat['id']]

#     # remap each annotation
#     for ann in data['annotations']:
#         ann['category_id'] = MAPPING[ann['category_id']]

#     # write out
#     with open(output_json, 'w') as f:
#         json.dump(data, f, indent=2)

# if __name__ == "__main__":
#     remap_coco("datasets/phase_1_training_minimal/annotations/train_phase_1_minimal_v2.json", "datasets/phase_1_training_minimal/annotations/train_phase_1_minimal_v2_remapped.json")
#     remap_coco("datasets/phase_1_training_minimal/annotations/val_phase_1_minimal_v2.json",   "datasets/phase_1_training_minimal/annotations/val_phase_1_minimal_v2_remapped.json")


# Register dataset

In [1]:
from detectron2.data.datasets import register_coco_instances
# ── Register your NEW 4-class splits ───────────────────────────────
register_coco_instances(
    "train_phase_1_minimal_v2_remapped", {}, 
    "./datasets/phase_1_training_minimal/annotations/train_phase_1_minimal_v2_remapped.json",
    "./datasets/phase_1_training_minimal/images"
)
register_coco_instances(
    "val_phase_1_minimal_v2_remapped",   {}, 
    "./datasets/phase_1_training_minimal/annotations/val_phase_1_minimal_v2_remapped.json",
    "./datasets/phase_1_training_minimal/images"
)


# Just to check your environment

In [2]:
from detectron2.data import DatasetCatalog, MetadataCatalog

# List all registered datasets
# print("Registered datasets:", DatasetCatalog.list())

metadata = MetadataCatalog.get("val_phase_1_minimal_v2_remapped")
print("Image root:", metadata.image_root)
print("Annotation file:", metadata.json_file)

Image root: ./datasets/phase_1_training_minimal/images
Annotation file: ./datasets/phase_1_training_minimal/annotations/val_phase_1_minimal_v2_remapped.json


In [6]:
from detectron2.config import get_cfg
from detectron2 import model_zoo
import os


cfg = get_cfg()

cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))  # Base config

cfg.DATASETS.TRAIN = ("train_phase_1_minimal_v2_remapped",)
cfg.DATASETS.TEST = ("val_phase_1_minimal_v2_remapped",)

cfg.DATALOADER.NUM_WORKERS = 8 
# NOTE dont change weights -> USE PUBLAYNET PRETRAINED MODEL
cfg.MODEL.WEIGHTS = "pretrained_models/model_final.pth"  
cfg.SOLVER.LOG_PERIOD = 50 # Log every 50 iterations


cfg.SOLVER.IMS_PER_BATCH = 4
cfg.SOLVER.BASE_LR = 0.00025  # Lower learning rate for fine-tuning

# NOTE rule of thumb for count of iterations: num_images / IMS_PER_BATCH 
# if you have 40 images and IMS_PER_BATCH = 4, and you want to train for 20 epochs, then MAX_ITER = 40 / 4  = 200
"""
I have 64 images in train_phase_1_minimal_v2_remapped:
Assuming IMS_PER_BATCH = 4 and you want to train for 20 epochs:

- Iterations per epoch: 64 / 4 = 16
- For 20 epochs: 16 * 20 = 320
"""

cfg.SOLVER.MAX_ITER = 320    # Adjust based on your dataset size
cfg.SOLVER.STEPS = []         # No learning rate decay
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128  
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8

# Force CPU usage (no GPU on macOS)
cfg.MODEL.DEVICE = "cpu"  # This ensures the model runs on CPU

cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4 #IMPORTANT - number of classes

# Output directory -> NOTE u can change this
cfg.OUTPUT_DIR = "./training_results/training3_output_phase_1_minimal_detectron_ready"
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)


# Training (dont change)


In [7]:
from detectron2.engine import HookBase
from tqdm import tqdm
from detectron2.engine import DefaultTrainer
from detectron2.engine import HookBase
from tqdm import tqdm

class TQDMWithLossHook(HookBase):
    def before_train(self):
        self.pbar = tqdm(total=self.trainer.max_iter, desc="Training", unit="iter")

    def after_step(self):
        storage   = self.trainer.storage
        loss_dict = storage.latest()
        raw       = loss_dict.get("total_loss", None)

        # Unpack tuple if necessary
        if isinstance(raw, (tuple, list)):
            loss_value = raw[0]
        else:
            loss_value = raw

        # Now it's safe to float()
        if loss_value is not None:
            self.pbar.set_postfix(loss=float(loss_value))
        else:
            self.pbar.set_postfix(loss="N/A")

        self.pbar.update(1)

    def after_train(self):
        self.pbar.close()

# Then after you create your trainer:

trainer = DefaultTrainer(cfg)

# Add the tqdm hook
trainer.register_hooks([TQDMWithLossHook()])

# Start training
trainer.resume_or_load(resume=False) # <-- ensures a fresh load of the pretrained weights
trainer.train()


[32m[05/05 19:34:45 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (7, 1024) in the checkpoint but (5, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (7,) in the checkpoint but (5,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (24, 1024) in the checkpoint but (16, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (24,) in the checkpoint but (16,) in the model! You might want to double check if this is expected.


[32m[05/05 19:34:45 d2.engine.train_loop]: [0mStarting training from iteration 0


  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Training:   6%|▌         | 19/320 [01:27<22:37,  4.51s/iter, loss=2.54]

[32m[05/05 19:36:18 d2.utils.events]: [0m eta: 0:23:06  iter: 19  total_loss: 2.67  loss_cls: 1.501  loss_box_reg: 0.855  loss_rpn_cls: 0.09199  loss_rpn_loc: 0.1878  time: 4.5002  data_time: 0.0737  lr: 4.9953e-06  


Training:  12%|█▏        | 39/320 [02:59<20:37,  4.40s/iter, loss=2.11]

[32m[05/05 19:37:49 d2.utils.events]: [0m eta: 0:21:27  iter: 39  total_loss: 2.482  loss_cls: 1.277  loss_box_reg: 0.8536  loss_rpn_cls: 0.1008  loss_rpn_loc: 0.228  time: 4.5173  data_time: 0.0013  lr: 9.9902e-06  


Training:  18%|█▊        | 59/320 [04:29<20:18,  4.67s/iter, loss=2]   

[32m[05/05 19:39:19 d2.utils.events]: [0m eta: 0:19:59  iter: 59  total_loss: 2.116  loss_cls: 0.9425  loss_box_reg: 0.8638  loss_rpn_cls: 0.05858  loss_rpn_loc: 0.2023  time: 4.5111  data_time: 0.0013  lr: 1.4985e-05  


Training:  25%|██▍       | 79/320 [06:00<17:44,  4.42s/iter, loss=2.26]

[32m[05/05 19:40:51 d2.utils.events]: [0m eta: 0:18:33  iter: 79  total_loss: 1.886  loss_cls: 0.765  loss_box_reg: 0.8353  loss_rpn_cls: 0.04882  loss_rpn_loc: 0.1969  time: 4.5408  data_time: 0.0013  lr: 1.998e-05  


Training:  31%|███       | 99/320 [07:31<16:47,  4.56s/iter, loss=1.81]

[32m[05/05 19:42:21 d2.utils.events]: [0m eta: 0:16:55  iter: 99  total_loss: 1.796  loss_cls: 0.6924  loss_box_reg: 0.8449  loss_rpn_cls: 0.04306  loss_rpn_loc: 0.176  time: 4.5289  data_time: 0.0013  lr: 2.4975e-05  


Training:  37%|███▋      | 119/320 [09:00<15:03,  4.49s/iter, loss=1.56]

[32m[05/05 19:43:50 d2.utils.events]: [0m eta: 0:15:17  iter: 119  total_loss: 1.702  loss_cls: 0.6334  loss_box_reg: 0.8373  loss_rpn_cls: 0.04382  loss_rpn_loc: 0.1539  time: 4.5151  data_time: 0.0012  lr: 2.997e-05  


Training:  43%|████▎     | 139/320 [10:27<12:41,  4.21s/iter, loss=1.51]

[32m[05/05 19:45:17 d2.utils.events]: [0m eta: 0:13:42  iter: 139  total_loss: 1.64  loss_cls: 0.6139  loss_box_reg: 0.8119  loss_rpn_cls: 0.04227  loss_rpn_loc: 0.1812  time: 4.4958  data_time: 0.0012  lr: 3.4965e-05  


Training:  50%|████▉     | 159/320 [11:54<11:56,  4.45s/iter, loss=1.53]

[32m[05/05 19:46:45 d2.utils.events]: [0m eta: 0:12:06  iter: 159  total_loss: 1.637  loss_cls: 0.5821  loss_box_reg: 0.83  loss_rpn_cls: 0.04502  loss_rpn_loc: 0.193  time: 4.4782  data_time: 0.0012  lr: 3.996e-05  


Training:  56%|█████▌    | 179/320 [13:21<10:35,  4.50s/iter, loss=1.61]

[32m[05/05 19:48:11 d2.utils.events]: [0m eta: 0:10:24  iter: 179  total_loss: 1.528  loss_cls: 0.5676  loss_box_reg: 0.7877  loss_rpn_cls: 0.03975  loss_rpn_loc: 0.1741  time: 4.4629  data_time: 0.0012  lr: 4.4955e-05  


Training:  62%|██████▏   | 199/320 [14:51<09:32,  4.73s/iter, loss=0.973]

[32m[05/05 19:49:42 d2.utils.events]: [0m eta: 0:08:56  iter: 199  total_loss: 1.549  loss_cls: 0.5341  loss_box_reg: 0.776  loss_rpn_cls: 0.03582  loss_rpn_loc: 0.1725  time: 4.4678  data_time: 0.0013  lr: 4.995e-05  


Training:  68%|██████▊   | 219/320 [16:19<07:30,  4.46s/iter, loss=1.44] 

[32m[05/05 19:51:10 d2.utils.events]: [0m eta: 0:07:26  iter: 219  total_loss: 1.448  loss_cls: 0.474  loss_box_reg: 0.7412  loss_rpn_cls: 0.03476  loss_rpn_loc: 0.1803  time: 4.4617  data_time: 0.0013  lr: 5.4945e-05  


Training:  75%|███████▍  | 239/320 [17:53<06:39,  4.94s/iter, loss=1.36]

[32m[05/05 19:52:43 d2.utils.events]: [0m eta: 0:05:58  iter: 239  total_loss: 1.401  loss_cls: 0.4929  loss_box_reg: 0.6946  loss_rpn_cls: 0.03311  loss_rpn_loc: 0.187  time: 4.4775  data_time: 0.0013  lr: 5.994e-05  


Training:  81%|████████  | 259/320 [19:30<04:49,  4.74s/iter, loss=1.28]

[32m[05/05 19:54:21 d2.utils.events]: [0m eta: 0:04:31  iter: 259  total_loss: 1.329  loss_cls: 0.4466  loss_box_reg: 0.6633  loss_rpn_cls: 0.03483  loss_rpn_loc: 0.1693  time: 4.5094  data_time: 0.0014  lr: 6.4935e-05  


Training:  87%|████████▋ | 279/320 [21:03<03:13,  4.72s/iter, loss=1.16]

[32m[05/05 19:55:53 d2.utils.events]: [0m eta: 0:03:01  iter: 279  total_loss: 1.264  loss_cls: 0.4254  loss_box_reg: 0.6332  loss_rpn_cls: 0.03518  loss_rpn_loc: 0.1714  time: 4.5189  data_time: 0.0014  lr: 6.993e-05  


Training:  93%|█████████▎| 299/320 [22:37<01:40,  4.80s/iter, loss=0.97] 

[32m[05/05 19:57:28 d2.utils.events]: [0m eta: 0:01:30  iter: 299  total_loss: 1.202  loss_cls: 0.4074  loss_box_reg: 0.5704  loss_rpn_cls: 0.04136  loss_rpn_loc: 0.1687  time: 4.5320  data_time: 0.0013  lr: 7.4925e-05  


Training: 100%|█████████▉| 319/320 [24:05<00:04,  4.22s/iter, loss=1.3]  

[32m[05/05 19:58:55 d2.utils.events]: [0m eta: 0:00:00  iter: 319  total_loss: 1.123  loss_cls: 0.3845  loss_box_reg: 0.5273  loss_rpn_cls: 0.0376  loss_rpn_loc: 0.1787  time: 4.5215  data_time: 0.0012  lr: 7.992e-05  


Training: 100%|██████████| 320/320 [24:10<00:00,  4.17s/iter, loss=0.944]

[32m[05/05 19:58:55 d2.engine.hooks]: [0mOverall training speed: 318 iterations in 0:23:57 (4.5215 s / it)
[32m[05/05 19:58:55 d2.engine.hooks]: [0mTotal training time: 0:23:58 (0:00:00 on hooks)
[32m[05/05 19:58:55 d2.data.datasets.coco]: [0mLoaded 16 images in COCO format from ./datasets/phase_1_training_minimal/annotations/val_phase_1_minimal_v2.json
[32m[05/05 19:58:55 d2.data.dataset_mapper]: [0m[DatasetMapper] Augmentations used in inference: [ResizeShortestEdge(short_edge_length=(800, 800), max_size=1333, sample_style='choice')]
[32m[05/05 19:58:55 d2.data.common]: [0mSerializing 16 elements to byte tensors and concatenating them all ...
[32m[05/05 19:58:55 d2.data.common]: [0mSerialized dataset takes 0.01 MiB


Training: 100%|██████████| 320/320 [24:10<00:00,  4.53s/iter, loss=0.944]


# Inference

### Step 1 : Build config same as training

In [8]:
import os, cv2
from detectron2.config        import get_cfg
from detectron2 import model_zoo
from detectron2.engine        import DefaultPredictor
from detectron2.data          import MetadataCatalog
from detectron2.utils.visualizer import Visualizer


cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
))
cfg.MODEL.ROI_HEADS.NUM_CLASSES       = 4
cfg.MODEL.WEIGHTS                     = "training_results/training3_output_phase_1_minimal_detectron_ready/model_final.pth"
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8
cfg.MODEL.DEVICE                      = "cpu"

### Step 2 Create predictor and metadata

In [9]:
predictor = DefaultPredictor(cfg)
metadata  = MetadataCatalog.get("val_phase_1_minimal_v2_remapped")


### Step 3 Inference and save

In [7]:
def infer_and_save(image_path, predictor, metadata, output_dir):
    im = cv2.imread(image_path)
    outputs = predictor(im)
    instances = outputs["instances"].to("cpu")

    v = Visualizer(im[:, :, ::-1], metadata=metadata, scale=1.2)
    out = v.draw_instance_predictions(instances)

    os.makedirs(output_dir, exist_ok=True)
    base = os.path.splitext(os.path.basename(image_path))[0]
    save_path = os.path.join(output_dir, f"{base}_output.jpg")
    cv2.imwrite(save_path, out.get_image()[:, :, ::-1])
    print("Saved:", save_path)

# Usage
infer_and_save(
    "/Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/datasets/phase_1_training_minimal/images/920cb6ed__Appellant___Appellant_s_Bundle_of_Documents_Volume_1_Tab2_page_7.png",
    predictor,
    metadata,
    "/Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/training_results/training3_output_phase_1_minimal_detectron_ready"
)

Saved: /Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/training_results/training3_output_phase_1_minimal_detectron_ready/920cb6ed__Appellant___Appellant_s_Bundle_of_Documents_Volume_1_Tab2_page_7_output.jpg


# Batch Inference 
1. Inferene the images available in the folder
2. Based on the json file

In [10]:
import json
# --- Paths (modify as needed) ---
images_dir = "/Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/datasets/phase_1_training_minimal/images"
json_path  = "/Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/datasets/phase_1_training_minimal/annotations/train_phase_1_minimal_v2_remapped.json"  # or val_phase_1_minimal_v2_remapped.json
output_dir = "/Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/training_results/training3_output_phase_1_minimal_detectron_ready/training_images"

# 4) Prepare output directory for batch inference
os.makedirs(output_dir, exist_ok=True)

# 5) Load list of images from JSON
with open(json_path, 'r') as f:
    imgs = json.load(f)["images"]

# 6) Run inference & save visualizations
for img_info in imgs:
    img_file = img_info["file_name"] # get the image file name
    img_path = os.path.join(images_dir, img_file)
    if not os.path.exists(img_path):
        continue

    im = cv2.imread(img_path) # load image
    outputs = predictor(im) # run inference
    instances = outputs["instances"].to("cpu")

    # Save the visualized image
    v = Visualizer(im[:, :, ::-1], metadata=metadata, scale=1.2)
    out = v.draw_instance_predictions(instances)
    save_name = os.path.splitext(img_file)[0] + "_output.jpg"
    save_path = os.path.join(output_dir, save_name)

    cv2.imwrite(save_path, out.get_image()[:, :, ::-1])
    print(f"Saved: {save_path}")


Saved: /Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/training_results/training3_output_phase_1_minimal_detectron_ready/training_images/920cb6ed__Appellant___Appellant_s_Bundle_of_Documents_Volume_1_Tab2_page_7_output.jpg
Saved: /Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/training_results/training3_output_phase_1_minimal_detectron_ready/training_images/78affaa4__Appellant_s_Bundle_of_Documents_Volume_1_Tab6_Tab7_page_8_output.jpg
Saved: /Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/training_results/training3_output_phase_1_minimal_detectron_ready/training_images/dbd47ee1__Appellant___Appellant_s_Bundle_of_Documents_Volume_1_Tab2_page_19_output.jpg
Saved: /Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/training_results/training3_output_phase_1_minimal_detectron_ready/training_images/bc80bfcb__Appellant_s_Bundle_of_Documents_Volume_1_Tab6_Tab1_page_2_output.jpg
Saved: /Users/doodledaron/Documents/F

# Evaluate the model

In [11]:
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader

In [12]:
# Create evaluator and data loader for validation set
evaluator = COCOEvaluator("val_phase_1_minimal_v2_remapped", cfg, False, output_dir=cfg.OUTPUT_DIR)
val_loader = build_detection_test_loader(cfg, "val_phase_1_minimal_v2_remapped")

# Run inference and evaluation
results = inference_on_dataset(trainer.model, val_loader, evaluator)
print(results)

# Save results to file
eval_output_dir = "/Users/doodledaron/Documents/Freelances/Leon/layoutparser_experiment/training_results/training3_output_phase_1_minimal_detectron_ready/evaluator"
os.makedirs(eval_output_dir, exist_ok=True)
with open(os.path.join(eval_output_dir, "eval_results.json"), "w") as f:
    json.dump(results, f, indent=2)

[32m[05/05 20:05:54 d2.data.datasets.coco]: [0mLoaded 16 images in COCO format from ./datasets/phase_1_training_minimal/annotations/val_phase_1_minimal_v2.json
[32m[05/05 20:05:54 d2.data.dataset_mapper]: [0m[DatasetMapper] Augmentations used in inference: [ResizeShortestEdge(short_edge_length=(800, 800), max_size=1333, sample_style='choice')]
[32m[05/05 20:05:54 d2.data.common]: [0mSerializing 16 elements to byte tensors and concatenating them all ...
[32m[05/05 20:05:54 d2.data.common]: [0mSerialized dataset takes 0.01 MiB
[32m[05/05 20:05:54 d2.evaluation.evaluator]: [0mStart inference on 16 images
[32m[05/05 20:06:02 d2.evaluation.evaluator]: [0mInference done 11/16. 0.5720 s / img. ETA=0:00:02
[32m[05/05 20:06:25 d2.evaluation.evaluator]: [0mTotal inference time: 0:00:26.003141 (2.363922 s / img per device, on 1 devices)
[32m[05/05 20:06:25 d2.evaluation.evaluator]: [0mTotal inference pure compute time: 0:00:05 (0.544590 s / img per device, on 1 devices)
[32m[05/0

# Issue: Class Imbalance
### Strategy:
1. Freeze the backbone : cfg.MODEL.BACKBONE.FREEZE_AT = 5  # Freeze all ResNet stages (0–4)
2. Enable RepeatFactorTrainingSampler (RFS)
3. Increase Iterations to 1000 and Tune LR Schedule
4. (Optional Bonus) 🔁 Use Augmentation (Drop-in)

### Configuration Setup

In [3]:
from detectron2.config import get_cfg
from detectron2 import model_zoo
import os

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))

cfg.DATASETS.TRAIN = ("train_phase_1_minimal_v2_remapped",)
cfg.DATASETS.TEST  = ("val_phase_1_minimal_v2_remapped",)

# load the pretrained model weights from publaynet
cfg.MODEL.WEIGHTS = "pretrained_models/model_final.pth"  # from PubLayNet
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8

cfg.SOLVER.IMS_PER_BATCH = 4
cfg.SOLVER.BASE_LR       = 0.00025
cfg.SOLVER.MAX_ITER      = 1000 # # ~62.5 epochs at batch 4
"""
How it works :
- Training starts with your base learning rate ( cfg.SOLVER.BASE_LR = 0.00025 )
- At iteration 600, the learning rate will be multiplied by cfg.SOLVER.GAMMA = 0.1
- At iteration 800, it will be multiplied by cfg.SOLVER.GAMMA = 0.1 again

- Initial LR: 0.00025
- At iteration 600: LR becomes 0.000025
- At iteration 800: LR becomes 0.0000025
"""
cfg.SOLVER.STEPS         = (600, 800)
cfg.SOLVER.GAMMA         = 0.1

"""
- First 100 iterations: LR increases linearly from ~0 to 0.00025
- Helps stabilize early training
"""
cfg.SOLVER.WARMUP_ITERS  = 100
cfg.SOLVER.WARMUP_METHOD = "linear"

cfg.SOLVER.LOG_PERIOD    = 50
cfg.DATALOADER.NUM_WORKERS = 8  # use lower if CPU limited

"""
- Helps with class imbalance by oversampling rare categories
- Images with rare categories appear more frequently
"""
cfg.DATALOADER.SAMPLER_TRAIN = "RepeatFactorTrainingSampler"

"""
- Setting REPEAT_THRESHOLD = 0.1 means any category that appears in less than 10% of your images will be oversampled
- This helps balance your training data where "Text" is dominant but "Figure", "Table", and "Title" are rare
"""
cfg.DATALOADER.REPEAT_THRESHOLD = 0.1

# freeze the first 5 layers of the backbone and train the remaining layers
# helps with faster training and less overfitting
# also helps with generalization as it reduces the risk of overfitting to the training data
cfg.MODEL.BACKBONE.FREEZE_AT = 5


cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128
cfg.MODEL.DEVICE = "cpu"

cfg.OUTPUT_DIR = "./training_results/training3_output_phase_1_minimal_detectron_ready_solving_class_imbalance"
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)


###  Augmented Trainer

In [4]:
from detectron2.engine import DefaultTrainer
from detectron2.data import DatasetMapper, build_detection_train_loader
import detectron2.data.transforms as T

class AugmentedTrainer(DefaultTrainer):
    @classmethod
    def build_train_loader(cls, cfg):
        aug = [
            T.RandomRotation(angle=[-5, 5]),
            T.RandomBrightness(0.9, 1.1),
            T.RandomFlip(horizontal=True, vertical=False)
        ]
        mapper = DatasetMapper(cfg, is_train=True, augmentations=aug)
        return build_detection_train_loader(cfg, mapper=mapper)


###  Add Progress Bar Hook (Optional but Nice)

In [5]:
from detectron2.engine import HookBase
from tqdm import tqdm

class TQDMWithLossHook(HookBase):
    def before_train(self):
        self.pbar = tqdm(total=self.trainer.max_iter, desc="Training", unit="iter")
    def after_step(self):
        storage = self.trainer.storage
        loss_dict = storage.latest()
        loss = loss_dict.get("total_loss", None)
        if isinstance(loss, (tuple, list)):
            loss = loss[0]
        self.pbar.set_postfix(loss=float(loss) if loss else "N/A")
        self.pbar.update(1)
    def after_train(self):
        self.pbar.close()


In [6]:
trainer = AugmentedTrainer(cfg)
trainer.register_hooks([TQDMWithLossHook()])
trainer.resume_or_load(resume=False)
trainer.train()


[32m[05/06 15:33:22 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (7, 1024) in the checkpoint but (5, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (7,) in the checkpoint but (5,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (24, 1024) in the checkpoint but (16, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (24,) in the checkpoint but (16,) in the model! You might want to double check if this is expected.


[32m[05/06 15:33:22 d2.engine.train_loop]: [0mStarting training from iteration 0


  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Training:   1%|          | 8/1000 [43:54<202:42:22, 735.63s/iter, loss=3.5]

[32m[05/06 16:17:22 d2.engine.hooks]: [0mOverall training speed: 6 iterations in 0:06:46 (67.7240 s / it)
[32m[05/06 16:17:22 d2.engine.hooks]: [0mTotal training time: 0:06:46 (0:00:00 on hooks)
[32m[05/06 16:17:22 d2.utils.events]: [0m eta: 18:13:52  iter: 8  total_loss: 3.524  loss_cls: 1.894  loss_box_reg: 0.4738  loss_rpn_cls: 0.3745  loss_rpn_loc: 0.8135  time: 66.8219  data_time: 0.1868  lr: 1.7733e-05  


Training:   1%|          | 8/1000 [43:59<90:55:44, 329.98s/iter, loss=3.5] 


KeyboardInterrupt: 