In [1]:
# !pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [2]:
# !pip freeze > requirements.txt

In [3]:
print("use yolov8 env to run this code")

use yolov8 env to run this code


In [4]:
import torch, detectron2
print(torch.__version__)  # Should match Colab's default (e.g., 2.x)
print(detectron2.__version__)  # Should print a version number

2.6.0+cu124
0.6


In [5]:
import os
import torch
import detectron2
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2.evaluation import COCOEvaluator
from detectron2.utils.logger import setup_logger
from detectron2.model_zoo import get_config_file, get_checkpoint_url
import numpy as np
import time
# from google.colab import drive

In [6]:
drive_folder = "/home/ai_train/skyhub_project/june/23rd_new_dataset_to_train/train"  # Change as needed
os.makedirs(drive_folder, exist_ok=True)

print("Training started...")
setup_logger()


Training started...


<Logger detectron2 (DEBUG)>

In [7]:
# Paths
dataset_folder = "/home/ai_train/skyhub_project/june/23rd_new_dataset_to_train"  # Update this if dataset is elsewhere
train_json ='/home/ai_train/skyhub_project/june/23rd_new_dataset_to_train/train/train.json' # os.path.join(dataset_folder, "train_annotations.json")
val_json ='/home/ai_train/skyhub_project/june/23rd_new_dataset_to_train/test/test.json'  #os.path.join(dataset_folder, "val_annotations.json")
train_images = os.path.join(dataset_folder, "train/images")
val_images = os.path.join(dataset_folder, "test/images")
output_dir = os.path.join(drive_folder, "output")  # Save output to Google Drive

In [8]:
# Register datasets
register_coco_instances("my_dataset_train", {}, train_json, train_images)
register_coco_instances("my_dataset_val", {}, val_json, val_images)

In [9]:
# Get metadata
train_metadata = MetadataCatalog.get("my_dataset_train")
val_metadata = MetadataCatalog.get("my_dataset_val")


#### Training configuration 

In [10]:
from detectron2.config import get_cfg
from detectron2.model_zoo import get_config_file, get_checkpoint_url
import torch

cfg = get_cfg()

# Load base configuration file from Detectron2 model zoo
cfg.merge_from_file(get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))

# Register the custom training and validation datasets
cfg.DATASETS.TRAIN = ("my_dataset_train",)  # Replace with the name used during dataset registration
cfg.DATASETS.TEST = ("my_dataset_val",)     # Replace with the name used during dataset registration

# Number of data loading workers (increase for better performance if you have more CPU cores)
cfg.DATALOADER.NUM_WORKERS = 2

# Use pre-trained COCO weights to fine-tune the model
cfg.MODEL.WEIGHTS = get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")

# Number of images per batch across all GPUs (adjust based on GPU memory; higher = faster)
cfg.SOLVER.IMS_PER_BATCH = 2

# Base learning rate for the optimizer
cfg.SOLVER.BASE_LR = 0.00025

# Maximum number of training iterations (not epochs); adjust based on dataset size
cfg.SOLVER.MAX_ITER = 8900

# Learning rate decay steps (iteration numbers where learning rate will be reduced)
cfg.SOLVER.STEPS = (6100, 7900)

# Number of RoI (Region of Interest) samples per image used for training
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512

# Number of object classes (excluding background); set to 1 for a single-class segmentation task
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1

# Evaluate model every 500 iterations using the validation set
cfg.TEST.EVAL_PERIOD = 500

# Directory where output files (models, logs) will be saved
cfg.OUTPUT_DIR = output_dir  # Make sure this directory exists or create it using os.makedirs

# Specify whether to use GPU ('cuda') or CPU based on availability
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [11]:
# cfg = get_cfg()
# cfg.merge_from_file(get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
# cfg.DATASETS.TRAIN = ("my_dataset_train",)
# cfg.DATASETS.TEST = ("my_dataset_val",)
# cfg.DATALOADER.NUM_WORKERS = 2  # Reduce for Colab
# cfg.MODEL.WEIGHTS = get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
# cfg.SOLVER.IMS_PER_BATCH = 2  # Reduce for Colab
# cfg.SOLVER.BASE_LR = 0.00025
# cfg.SOLVER.MAX_ITER = 2500  # Total iterations for 50 epochs
# cfg.SOLVER.STEPS = (1750, 2250)  # Learning rate decay steps at 60% and 80% of the total iterations
# cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
# cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
# cfg.TEST.EVAL_PERIOD = 500
# cfg.OUTPUT_DIR = output_dir
# cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
# Create output directory
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)


In [13]:
# Early Stopping & Model Saving Hook
class EarlyStoppingHook(HookBase):
    def __init__(self, patience=3):  # Stop if no improvement for 3 evals
        self.patience = patience
        self.best_loss = float('inf')
        self.counter = 0

    def after_step(self):
        if self.trainer.iter % cfg.TEST.EVAL_PERIOD == 0:
            eval_results = self.trainer.storage.latest()
            val_loss = eval_results.get("total_loss", None)

            print("val_loss", val_loss)

            # Check if val_loss is not None and is a tuple, then access the first element
            if val_loss is not None:
                val_loss_value = val_loss[0]  # Access the actual validation loss value

                # Print the validation loss correctly
                print(f"Iteration {self.trainer.iter}: Validation Loss = {val_loss_value:.4f}")

                model_path = os.path.join(cfg.OUTPUT_DIR, f"model_iter_{self.trainer.iter}.pth")
                torch.save(self.trainer.model.state_dict(), model_path)
                print(f"Model saved at: {model_path}")

                # Compare and save the best model based on the loss
                if val_loss_value < self.best_loss:
                    self.best_loss = val_loss_value
                    self.counter = 0
                    # Save the best model
                    torch.save(self.trainer.model.state_dict(), os.path.join(cfg.OUTPUT_DIR, "best_model.pth"))
                else:
                    self.counter += 1
                    if self.counter >= self.patience:
                        print("Early stopping triggered. Stopping training.")
                        self.trainer.iter = cfg.SOLVER.MAX_ITER  # Force training to stop

In [14]:
# Custom Trainer
class CocoTrainer(DefaultTrainer):
    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "eval")
        os.makedirs(output_folder, exist_ok=True)
        return COCOEvaluator(dataset_name, cfg, False, output_folder)

    def build_hooks(self):
        hooks = super().build_hooks()
        hooks.append(EarlyStoppingHook(patience=3))  # Add early stopping & model saving
        return hooks

In [15]:
# Train
trainer = CocoTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

[32m[06/23 12:51:34 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (2, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (2,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (4, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (4,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.mask_head.predictor.weight' to the model due to incompatible shapes: (80, 256, 1, 1) in the checkpoint but (1, 256, 1, 1) in

[32m[06/23 12:51:34 d2.engine.train_loop]: [0mStarting training from iteration 0


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


val_loss (2.468878650572151, 0)
Iteration 0: Validation Loss = 2.4689
Model saved at: /home/ai_train/skyhub_project/june/23rd_new_dataset_to_train/train/output/model_iter_0.pth
[32m[06/23 12:51:41 d2.utils.events]: [0m eta: 0:42:35  iter: 19  total_loss: 2.524  loss_cls: 0.5866  loss_box_reg: 0.007786  loss_mask: 0.6911  loss_rpn_cls: 1.16  loss_rpn_loc: 0.09155    time: 0.2930  last_time: 0.2857  data_time: 0.0138  last_data_time: 0.0037   lr: 4.9953e-06  max_mem: 2255M
[32m[06/23 12:51:47 d2.utils.events]: [0m eta: 0:43:43  iter: 39  total_loss: 1.881  loss_cls: 0.4994  loss_box_reg: 0.003791  loss_mask: 0.6893  loss_rpn_cls: 0.6464  loss_rpn_loc: 0.05156    time: 0.2942  last_time: 0.3237  data_time: 0.0036  last_data_time: 0.0036   lr: 9.9903e-06  max_mem: 2333M
[32m[06/23 12:51:53 d2.utils.events]: [0m eta: 0:43:46  iter: 59  total_loss: 2.068  loss_cls: 0.366  loss_box_reg: 0.01259  loss_mask: 0.6868  loss_rpn_cls: 0.8279  loss_rpn_loc: 0.07914    time: 0.2952  last_time: 0

In [16]:
# Save final model
final_model_path = os.path.join(cfg.OUTPUT_DIR, "model_final_1.pth")
torch.save(trainer.model.state_dict(), final_model_path)
print(f"Training complete. Model saved at: {final_model_path}")


Training complete. Model saved at: /home/ai_train/skyhub_project/june/23rd_new_dataset_to_train/train/output/model_final_1.pth


In [17]:
import os
import torch
import detectron2
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2.evaluation import COCOEvaluator
from detectron2.utils.logger import setup_logger
from detectron2.model_zoo import get_config_file, get_checkpoint_url
import numpy as np
import time
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

drive_folder = "/content/drive/My Drive/detectron2_training"  # Change as needed
os.makedirs(drive_folder, exist_ok=True)

print("Training started...")
setup_logger()

# Paths
dataset_folder = "/content/dataset"  # Update this if dataset is elsewhere
train_json = os.path.join(dataset_folder, "train_annotations.json")
val_json = os.path.join(dataset_folder, "val_annotations.json")
train_images = os.path.join(dataset_folder, "train")
val_images = os.path.join(dataset_folder, "val")
output_dir = os.path.join(drive_folder, "output")  # Save output to Google Drive

# Register datasets
register_coco_instances("my_dataset_train", {}, train_json, train_images)
register_coco_instances("my_dataset_val", {}, val_json, val_images)

# Get metadata
train_metadata = MetadataCatalog.get("my_dataset_train")
val_metadata = MetadataCatalog.get("my_dataset_val")

# Configuration
cfg = get_cfg()
cfg.merge_from_file(get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("my_dataset_train",)
cfg.DATASETS.TEST = ("my_dataset_val",)
cfg.DATALOADER.NUM_WORKERS = 2  # Reduce for Colab
cfg.MODEL.WEIGHTS = get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.SOLVER.IMS_PER_BATCH = 2  # Reduce for Colab
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 3750
cfg.SOLVER.STEPS = (2500, 3200)
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
cfg.TEST.EVAL_PERIOD = 500
cfg.OUTPUT_DIR = output_dir
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Create output directory
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

# Early Stopping & Model Saving Hook
class EarlyStoppingHook(HookBase):
    def __init__(self, patience=3):  # Stop if no improvement for 3 evals
        self.patience = patience
        self.best_loss = float('inf')
        self.counter = 0

    def after_step(self):
        if self.trainer.iter % cfg.TEST.EVAL_PERIOD == 0:
            eval_results = self.trainer.storage.latest()
            val_loss = eval_results.get("total_loss", None)
            if val_loss is not None:
                print(f"Iteration {self.trainer.iter}: Validation Loss = {val_loss:.4f}")
                model_path = os.path.join(cfg.OUTPUT_DIR, f"model_iter_{self.trainer.iter}.pth")
                torch.save(self.trainer.model.state_dict(), model_path)
                print(f"Model saved at: {model_path}")
                if val_loss < self.best_loss:
                    self.best_loss = val_loss
                    self.counter = 0
                    # Save the best model
                    torch.save(self.trainer.model.state_dict(), os.path.join(cfg.OUTPUT_DIR, "best_model.pth"))
                else:
                    self.counter += 1
                    if self.counter >= self.patience:
                        print("Early stopping triggered. Stopping training.")
                        self.trainer.iter = cfg.SOLVER.MAX_ITER  # Force training to stop

# Custom Trainer
class CocoTrainer(DefaultTrainer):
    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "eval")
        os.makedirs(output_folder, exist_ok=True)
        return COCOEvaluator(dataset_name, cfg, False, output_folder)

    def build_hooks(self):
        hooks = super().build_hooks()
        hooks.append(EarlyStoppingHook(patience=3))  # Add early stopping & model saving
        return hooks

# Train
trainer = CocoTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

# Save final model
final_model_path = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
torch.save(trainer.model.state_dict(), final_model_path)
print(f"Training complete. Model saved at: {final_model_path}")


ModuleNotFoundError: No module named 'google.colab'

In [None]:
import os
import torch
import detectron2
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer, HookBase
from detectron2.config import get_cfg
from detectron2.evaluation import COCOEvaluator
from detectron2.utils.logger import setup_logger
from detectron2.model_zoo import get_config_file, get_checkpoint_url
import numpy as np
import time
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

drive_folder = "/content/drive/My Drive/detectron2_training"  # Change as needed
os.makedirs(drive_folder, exist_ok=True)

print("Training started...")
setup_logger()

# Paths
dataset_folder = "/content/dataset"  # Update this if dataset is elsewhere
train_json = os.path.join(dataset_folder, "train_annotations.json")
val_json = os.path.join(dataset_folder, "val_annotations.json")
train_images = os.path.join(dataset_folder, "train")
val_images = os.path.join(dataset_folder, "val")
output_dir = os.path.join(drive_folder, "output")  # Save output to Google Drive

# Register datasets
register_coco_instances("my_dataset_train", {}, train_json, train_images)
register_coco_instances("my_dataset_val", {}, val_json, val_images)

# Get metadata
train_metadata = MetadataCatalog.get("my_dataset_train")
val_metadata = MetadataCatalog.get("my_dataset_val")

# Configuration
cfg = get_cfg()
cfg.merge_from_file(get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("my_dataset_train",)
cfg.DATASETS.TEST = ("my_dataset_val",)
cfg.DATALOADER.NUM_WORKERS = 2  # Reduce for Colab
cfg.MODEL.WEIGHTS = get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.SOLVER.IMS_PER_BATCH = 2  # Reduce for Colab
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 3750
cfg.SOLVER.STEPS = (2500, 3200)
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1
cfg.TEST.EVAL_PERIOD = 500
cfg.OUTPUT_DIR = output_dir
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Create output directory
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

# Early Stopping Hook
class EarlyStoppingHook(HookBase):
    def __init__(self, patience=3):  # Stop if no improvement for 3 evals
        self.patience = patience
        self.best_loss = float('inf')
        self.counter = 0

    def after_step(self):
        if self.trainer.iter % cfg.TEST.EVAL_PERIOD == 0:
            eval_results = self.trainer.storage.latest()
            val_loss = eval_results.get("total_loss", None)
            if val_loss is not None:
                print(f"Iteration {self.trainer.iter}: Validation Loss = {val_loss:.4f}")
                if val_loss < self.best_loss:
                    self.best_loss = val_loss
                    self.counter = 0
                    # Save the best model
                    torch.save(self.trainer.model.state_dict(), os.path.join(cfg.OUTPUT_DIR, "best_model.pth"))
                else:
                    self.counter += 1
                    if self.counter >= self.patience:
                        print("Early stopping triggered. Stopping training.")
                        self.trainer.iter = cfg.SOLVER.MAX_ITER  # Force training to stop

# Custom Trainer
class CocoTrainer(DefaultTrainer):
    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "eval")
        os.makedirs(output_folder, exist_ok=True)
        return COCOEvaluator(dataset_name, cfg, False, output_folder)

    def build_hooks(self):
        hooks = super().build_hooks()
        hooks.append(EarlyStoppingHook(patience=3))  # Add early stopping
        return hooks

# Train
trainer = CocoTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

# Save final model
final_model_path = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
torch.save(trainer.model.state_dict(), final_model_path)
print(f"Training complete. Model saved at: {final_model_path}")
