# Training

In [1]:
import os


def leave_notebook_subdir(subdir_names=("models", "notebooks")):
    """Move the working directory up to the project root and return it.

    The notebook may be launched from a ``models`` or ``notebooks`` folder
    inside the project; the project-local ``src`` imports in the next cell
    need the working directory to be the project root instead.

    Only the *last* component of the current directory is compared, so an
    ancestor directory whose name happens to contain one of the words (for
    example ``/data/models_archive/<project>``) no longer causes a spurious
    ``chdir("..")`` out of the project root.

    Args:
        subdir_names: Folder names to step out of, checked in this order.
            The default order means ``<root>/notebooks/models`` is unwound
            in two steps.

    Returns:
        The working directory after any moves, as reported by ``os.getcwd()``.
    """
    for name in subdir_names:
        # The directory is re-read on every pass because the previous pass
        # may already have moved one level up.
        if os.path.basename(os.getcwd()) == name:
            os.chdir("..")
    return os.getcwd()


# Last expression of the cell: shows the resulting working directory.
leave_notebook_subdir()

'/home/jordi/Documents/GitHub/zebra_fish'

In [2]:
from src.dataset import register_default_datasets
from src.LrFinder import LRFinder
from src.hooks.LossEvalHook import LossEvalHook
from src.hooks.PredictionVisualHook import PredictionVisualHook
from src.hooks.ConfusionHook import ConfusionHook
from src.RandomZoom import RandomZoom

#detectron
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.logger import setup_logger
from detectron2.data import DatasetCatalog, build_detection_test_loader, build_detection_train_loader, DatasetMapper, detection_utils as utils, transforms as T
from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
from detectron2.engine import DefaultTrainer
from detectron2.evaluation import COCOEvaluator
from detectron2.config import get_cfg
from detectron2 import model_zoo
from detectron2.engine import CallbackHook
from detectron2.engine.hooks import HookBase
from detectron2.utils.events import EventStorage
import detectron2.utils.comm as comm
from detectron2.utils.logger import log_every_n_seconds
from detectron2.utils.visualizer import Visualizer
import matplotlib.pyplot as plt
from pathlib import Path

import time
import datetime
import torch
import numpy as np
import gc

# Register the project's datasets before any config refers to them by name
# (the config cell looks up "train"/"val" or their "_collapsed" variants in
# MetadataCatalog).
# NOTE(review): clear=True presumably drops earlier registrations so this cell
# can be re-run on a live kernel -- confirm in src/dataset.py.
register_default_datasets(clear=True)

In [3]:
class CustomTrainer(DefaultTrainer):
    """``DefaultTrainer`` subclass carrying this notebook's training recipe.

    What is customised, as visible in the methods below:
      * evaluation goes through ``COCOEvaluator`` and defaults to writing
        into ``<cfg.OUTPUT_DIR>/inference``;
      * ``LossEvalHook`` and ``PredictionVisualHook`` are added to the
        default hook list;
      * training images pass through rotation, zoom, flip and resize
        augmentations;
      * the learning rate follows ``torch``'s ``OneCycleLR`` schedule;
      * ``find_lr`` runs the project's ``LRFinder`` on the trainer's model
        and optimizer.
    """

    def __init__(self, cfg):
        """Build the trainer from ``cfg``; no state beyond the base class is added."""
        super().__init__(cfg)

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """Return a ``COCOEvaluator`` for ``dataset_name``.

        Args:
            cfg: Config node; passed on to the evaluator and used for the
                default output location.
            dataset_name: Name of the dataset to evaluate.
            output_folder: Where the evaluator writes its files. When
                ``None``, ``<cfg.OUTPUT_DIR>/inference`` is created (if
                missing) and used.
        """
        if output_folder is not None:
            # Caller chose the location; it is used as given (not created here).
            return COCOEvaluator(dataset_name, cfg, True, output_folder)

        default_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        os.makedirs(default_folder, exist_ok=True)
        return COCOEvaluator(dataset_name, cfg, True, default_folder)

    def build_hooks(self):
        """Return the default hooks with the project hooks spliced in.

        Each extra hook is inserted at index ``-1``, i.e. directly before
        whatever hook the base class put last, so the final order is
        ``[..., LossEvalHook, PredictionVisualHook, <last default hook>]``.
        """
        hook_list = super().build_hooks()
        for hook_factory in (LossEvalHook, PredictionVisualHook):
            # Create and insert one at a time so creation order matches
            # insertion order.
            hook_list.insert(-1, hook_factory.create(self.cfg))

        # Currently disabled:
        #hooks.insert(-1, ConfusionHook.create(self.cfg, threshold=0.75))
        return hook_list

    @classmethod
    def build_train_loader(cls, cfg):
        """Return the training data loader with the augmentation pipeline below.

        Augmentations, applied in this order:
          1. rotation by an angle drawn from [0, 360] without expanding the
             canvas, about a centre drawn from the relative box
             (0.45, 0.45)-(0.55, 0.55);
          2. ``RandomZoom`` (project class) with probability 0.25, given ten
             ``[v, v]`` pairs for ``v`` evenly spaced from 0.5 to 0.95
             -- NOTE(review): how RandomZoom interprets the pairs is defined
             in src/RandomZoom.py;
          3. horizontal flip with probability 0.5;
          4. shortest-edge resize driven by the ``cfg.INPUT.*_TRAIN`` settings.
        """
        rotate = T.RandomRotation(
            [0, 360],
            expand=False,
            center=[[0.45, 0.45], [0.55, 0.55]],
            sample_style="range",
        )
        zoom_levels = [[level, level] for level in np.linspace(0.5, 0.95, 10)]
        zoom = RandomZoom(zoom_levels, prob=0.25)
        flip = T.RandomFlip(prob=0.5, horizontal=True)
        resize = T.ResizeShortestEdge(
            short_edge_length=cfg.INPUT.MIN_SIZE_TRAIN,
            max_size=cfg.INPUT.MAX_SIZE_TRAIN,
            sample_style=cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
        )

        mapper_options = dict(
            is_train=True,
            augmentations=[rotate, zoom, flip, resize],
            image_format=cfg.INPUT.FORMAT,
            use_instance_mask=cfg.MODEL.MASK_ON,
            instance_mask_format=cfg.INPUT.MASK_FORMAT,
            use_keypoint=cfg.MODEL.KEYPOINT_ON,
            # Per the detectron2 DatasetMapper docs: overwrite the annotated
            # boxes with tight boxes computed from the instance masks.
            recompute_boxes=True,
        )
        return build_detection_train_loader(cfg, mapper=DatasetMapper(**mapper_options))

    @classmethod
    def build_lr_scheduler(cls, cfg, optimizer):
        """Return a one-cycle learning-rate schedule for ``optimizer``.

        The peak rate is ``cfg.SOLVER.BASE_LR`` and the cycle spans
        ``cfg.SOLVER.MAX_ITER`` steps. Per the PyTorch documentation the
        starting rate is ``max_lr / div_factor`` and the minimum rate is
        that starting rate divided by ``final_div_factor``; both factors
        are 25 here.
        """
        peak_lr = cfg.SOLVER.BASE_LR
        cycle_length = cfg.SOLVER.MAX_ITER
        return torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=peak_lr,
            total_steps=cycle_length,
            div_factor=25,
            final_div_factor=25,
        )

    def find_lr(self):
        """Run the project's ``LRFinder`` and return whatever ``find`` returns.

        The search runs inside a fresh ``EventStorage`` starting at
        iteration 0, which is also bound to ``self.storage``.
        NOTE(review): presumably needed because the model logs to the
        current event storage during forward passes -- confirm.
        """
        lr_finder = LRFinder()

        with EventStorage(0) as self.storage:
            outcome = lr_finder.find(
                self.model,
                self.optimizer,
                self.build_train_loader(self.cfg),
            )

        return outcome

In [4]:
# Default config
use_collapsed = True

# Start from detectron2's defaults, then layer the model-zoo Mask R-CNN
# (ResNet-101 FPN) config on top. The ResNet-50 alternative is kept for reference:
#cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg = get_cfg()
cfg.merge_from_file(
    model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml")
)

# Dataset split names; the "_collapsed" variants are used when use_collapsed is set.
if use_collapsed:
    cfg.DATASETS.TRAIN = ("train_collapsed",)
    cfg.DATASETS.TEST = ("val_collapsed",)
else:
    cfg.DATASETS.TRAIN = ("train",)
    cfg.DATASETS.TEST = ("val",)

# Size the ROI heads from the class list registered for the training split.
meta_dataset = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
cfg.MODEL.ROI_HEADS.NUM_CLASSES = len(meta_dataset.thing_classes)

print(meta_dataset.thing_classes)

# Run set-up: initial weights come from an earlier run's final checkpoint;
# this run writes to its own output directory.
cfg.MODEL.WEIGHTS = str(Path("./outputs/new_dataset/res101_fr2_no_problamatic_10000_random_zoom/model_final.pth"))
cfg.OUTPUT_DIR = "./outputs/new_dataset/res101_fr2_no_problamatic_5000_random_zoom_with_Yolk"
cfg.SOLVER.MAX_ITER = 5000
cfg.SOLVER.CHECKPOINT_PERIOD = 250
cfg.TEST.EVAL_PERIOD = 40
cfg.MODEL.BACKBONE.FREEZE_AT = 2

# Warm-up is switched off; the factor is only derived when warm-up is enabled.
cfg.SOLVER.WARMUP_ITERS = 0
if cfg.SOLVER.WARMUP_ITERS > 0:
    cfg.SOLVER.WARMUP_FACTOR = 1.0 / cfg.SOLVER.WARMUP_ITERS

# Hyper-params
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR =  0.0012328467394420682
cfg.SOLVER.WEIGHT_DECAY = 0.001
cfg.MODEL.RPN.NMS_THRESH = 0.70
cfg.MODEL.RPN.LOSS_WEIGHT = 2.0

# Disabled experiments, kept for reference:
#cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 0.5, 1.0, 2, 4]]
#cfg.INPUT.MIN_SIZE_TRAIN = (1333, 1333)
#cfg.INPUT.MAX_SIZE_TRAIN = 1333
#cfg.INPUT.MIN_SIZE_TEST = (1333, 1333)
#cfg.INPUT.MAX_SIZE_TEST = 1333
#cfg.MODEL.RESNETS.NORM = "BN"

# NOTE(review): the reason for the 20-iteration granularity is not visible in
# this cell -- presumably tied to how often metrics are flushed; confirm
# against the hooks in src/hooks.
assert cfg.TEST.EVAL_PERIOD % 20 == 0, "EVAL_PERIOD must be a multiple of 20"

['Healthy', 'Deformed', 'Death', 'Yolk']


In [5]:
resume = True

# Make sure the run directory exists. A fresh (non-resumed) run must start
# from an empty directory; when resuming, the directory is not inspected.
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
if not resume:
    assert len(os.listdir(cfg.OUTPUT_DIR))  == 0, "Output dir is not empty!"

# Per the detectron2 docs, resume_or_load(resume=True) continues from the last
# checkpoint found in cfg.OUTPUT_DIR and otherwise falls back to cfg.MODEL.WEIGHTS.
trainer = CustomTrainer(cfg)
trainer.resume_or_load(resume=resume)

# Store the full configuration used for this run inside the output directory.
config_snapshot_path = Path(cfg.OUTPUT_DIR) / "config.yaml"
with open(config_snapshot_path, "w") as snapshot_file:
    snapshot_file.write(cfg.dump())

[32m[08/18 14:29:39 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (4, 1024) in the checkpoint but (5, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (4,) in the checkpoint but (5,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (12, 1024) in the checkpoint but (16, 1024) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (12,) in the checkpoint but (16,) in the model! You might want to double check if this is expected.
Skip loading parameter 'roi_heads.mask_head.predictor.weight' to the model due to incompatible shapes: (3, 256, 1, 1) in the checkpoint but (4, 256, 1, 1) in th

In [6]:
# Start training with the trainer built in the previous cell; the run length
# (cfg.SOLVER.MAX_ITER) and evaluation/checkpoint periods come from the config cell.
trainer.train()

[32m[08/18 14:29:42 d2.engine.train_loop]: [0mStarting training from iteration 0
[32m[08/18 14:29:52 d2.utils.events]: [0m eta: 0:37:10  iter: 19  total_loss: 3.722  loss_cls: 1.178  loss_box_reg: 0.870  loss_mask: 0.635  loss_rpn_cls: 0.735  loss_rpn_loc: 0.346  time: 0.4491  data_time: 0.0181  lr: 0.000050  max_mem: 3856M
[32m[08/18 14:30:01 d2.data.common]: [0mSerializing 14 elements to byte tensors and concatenating them all ...
[32m[08/18 14:30:01 d2.data.common]: [0mSerialized dataset takes 0.14 MiB
[32m[08/18 14:30:01 d2.data.dataset_mapper]: [0mAugmentations used in training: [ResizeShortestEdge(short_edge_length=(800, 800), max_size=1333, sample_style='choice')]
[32m[08/18 14:30:01 d2.evaluation.coco_evaluation]: [0m'val_collapsed' is not registered by `register_coco_instances`. Therefore trying to convert it to COCO format ...
[32m[08/18 14:30:01 d2.data.datasets.coco]: [0mConverting annotations of dataset 'val_collapsed' to COCO format ...)
[32m[08/18 14:30:01

In [7]:
# Bare expression: the cell output is the model's repr (its module tree).
trainer.model

GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
      (res2): Sequential(
        (0): BottleneckBlock