In [1]:
%load_ext autoreload
%autoreload 2

from collections import OrderedDict
import cv2
import copy
import os
import matplotlib.pyplot as plt
import numpy as np
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.config import CfgNode as CN
from detectron2.data import (
    MetadataCatalog,
    build_detection_test_loader,
    build_detection_train_loader,
)
from detectron2.engine import DefaultTrainer, DefaultPredictor, default_argument_parser, default_setup, launch
from detectron2.evaluation import inference_on_dataset, COCOEvaluator
from detectron2.modeling import build_model
from detectron2.utils.events import EventStorage
from detectron2.utils.logger import setup_logger
from detectron2.utils.visualizer import Visualizer

import scene_generation.inverse_graphics.synthetic_scene_database_loader as data
import scene_generation.inverse_graphics.roi_heads as roi_heads

# Print numpy floats with two digits of precision in cell output.
np.set_printoptions(precision=2)

# Machine-specific absolute paths: edit these before running on another machine.
# DATA_ROOT is joined with "<split>.json" by load_dataset() and is also passed on
# as data_root; DETECTRON_ROOT is joined with a relative config path when the
# cfg object is built.
DATA_ROOT = "/home/gizatt/data/generated_cardboard_envs/"
DETECTRON_ROOT = "/home/gizatt/tools/detectron2/"

%matplotlib inline
def cv2_imshow(im):
    """Display an OpenCV-style image with matplotlib.

    The image is converted with ``cv2.COLOR_BGR2RGB`` before being handed to
    ``plt.imshow``, and the axis decorations are switched off.

    Args:
        im: Image array accepted by ``cv2.cvtColor`` for a BGR -> RGB conversion.
    """
    rgb_image = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_image)
    plt.axis('off')

In [2]:
from detectron2.data import DatasetCatalog, MetadataCatalog
from scene_generation.inverse_graphics.synthetic_scene_database_loader import load_xencoco_json

# Drop every previously registered dataset before the registrations below.
# NOTE(review): presumably this is what lets the cell be re-run in a live kernel
# without hitting an "already registered" error — confirm against DatasetCatalog.
DatasetCatalog.clear()
def load_dataset(d):
    """Load one split of the synthetic dataset through ``load_xencoco_json``.

    Args:
        d: Split name; it selects the annotation file ``<DATA_ROOT>/<d>.json``
            and the dataset name ``synthetic_<d>``.

    Returns:
        Whatever ``load_xencoco_json`` returns for that annotation file.
    """
    annotation_path = os.path.join(DATA_ROOT, "%s.json" % (d))
    registered_name = "synthetic_%s" % d
    return load_xencoco_json(
        annotation_path,
        data_root=DATA_ROOT,
        dataset_name=registered_name)
def load_real_dataset():
    """Load the real cardboard-box images through detectron2's COCO JSON loader.

    Returns:
        Whatever ``load_coco_json`` returns for the hard-coded annotation file
        and image directory, loaded under the dataset name "prime_boxes_real".
    """
    # Imported here because the notebook's imports bind names such as `comm`,
    # `get_cfg` and `DatasetCatalog`, but never the bare name `detectron2`;
    # referring to `detectron2.data.datasets.load_coco_json` directly would
    # raise NameError the first time this loader is actually invoked.
    from detectron2.data.datasets import load_coco_json

    return load_coco_json(
        "/home/gizatt/data/coco/cardboard_boxes_in_wild/coco.json",
        image_root="/home/gizatt/data/coco/cardboard_boxes_in_wild/images",
        dataset_name="prime_boxes_real", extra_annotation_keys=None)
# Register both synthetic splits. The split name is bound as a lambda default so
# each registered loader keeps its own value rather than the loop's final one.
for split in ("train", "test"):
    DatasetCatalog.register("synthetic_" + split, lambda split=split: load_dataset(split))

# The real-image dataset is registered with its loader function directly.
DatasetCatalog.register("prime_boxes_real", load_real_dataset)

# Fetch the metadata entries and attach the annotation file path to the real one.
synthetic_train_metadata = MetadataCatalog.get("synthetic_train")
real_prime_boxes_metadata = MetadataCatalog.get("prime_boxes_real")
real_prime_boxes_metadata.set(json_file="/home/gizatt/data/coco/cardboard_boxes_in_wild/coco.json")

print("Metadata train: ", synthetic_train_metadata)
print("Metadata real: ", real_prime_boxes_metadata)

Metadata train:  Metadata(name='synthetic_train')
Metadata real:  Metadata(json_file='/home/gizatt/data/coco/cardboard_boxes_in_wild/coco.json', name='prime_boxes_real')


In [3]:
# Start from detectron2's default config and overlay the Mask R-CNN R50-FPN 3x
# baseline, then specialise it for the single-class synthetic box dataset.
cfg = get_cfg()
cfg.merge_from_file(os.path.join(DETECTRON_ROOT, "configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))

# --- Input / data loading ---------------------------------------------------
# One-element tuple of candidate shortest-edge sizes. The previous `(480)` was
# just the int 480 (parentheses without a trailing comma do not make a tuple),
# which did not match the tuple type this key uses in the base config.
cfg.INPUT.MIN_SIZE_TRAIN = (480,)
cfg.DATALOADER.ASPECT_RATIO_GROUPING = False

cfg.DATASETS.TRAIN = ("synthetic_train",)
cfg.DATASETS.TEST = ("synthetic_test",)
cfg.DATALOADER.NUM_WORKERS = 4

# --- Model ------------------------------------------------------------------
cfg.MODEL.WEIGHTS = "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"  # initialize from model zoo
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512   # default
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only has one class (prime box)
cfg.MODEL.ROI_HEADS.NAME = "XenRCNNROIHeads"

# Loss settings for the extra shape and pose outputs.
cfg.MODEL.ROI_HEADS.SHAPE_LOSS_WEIGHT = 1.0
cfg.MODEL.ROI_HEADS.POSE_LOSS_WEIGHT = 1.0
cfg.MODEL.ROI_HEADS.SHAPE_LOSS_NORM = 'l1'
cfg.MODEL.ROI_HEADS.POSE_LOSS_NORM = 'l1'

# Switches for the mask, shape and pose branches.
cfg.MODEL.MASK_ON = True
cfg.MODEL.SHAPE_ON = True
cfg.MODEL.POSE_ON = True

# Pooler settings grouped under a new ROI_SHARED_HEAD node.
cfg.MODEL.ROI_SHARED_HEAD = CN()
cfg.MODEL.ROI_SHARED_HEAD.POOLER_RESOLUTION = 14
cfg.MODEL.ROI_SHARED_HEAD.POOLER_SAMPLING_RATIO = 2
cfg.MODEL.ROI_SHARED_HEAD.POOLER_TYPE = "ROIAlign"

# Shape head: three shape parameters, each with 64 bins over its (min, max) range.
cfg.MODEL.ROI_SHAPE_HEAD = CN()
cfg.MODEL.ROI_SHAPE_HEAD.NAME = "RCNNShapeHead"
cfg.MODEL.ROI_SHAPE_HEAD.NUM_CONV = 0
cfg.MODEL.ROI_SHAPE_HEAD.CONV_DIM = 128 # formerly 3
cfg.MODEL.ROI_SHAPE_HEAD.NUM_FC = 3
cfg.MODEL.ROI_SHAPE_HEAD.FC_DIM = 256 # formerly 100
cfg.MODEL.ROI_SHAPE_HEAD.NORM = ""
cfg.MODEL.ROI_SHAPE_HEAD.NUM_SHAPE_PARAMS = 3
cfg.MODEL.ROI_SHAPE_HEAD.NUM_SHAPE_BINS = 64
cfg.MODEL.ROI_SHAPE_HEAD.SHAPE_BIN_RANGES = ((0., 0.5),
                                             (0., 0.5),
                                             (0., 0.5))

# Translation (xyz) pose head: 64 bins per axis over the listed (min, max) ranges.
cfg.MODEL.ROI_POSE_XYZ_HEAD = CN()
cfg.MODEL.ROI_POSE_XYZ_HEAD.NAME = "RCNNPoseXyzHead"
cfg.MODEL.ROI_POSE_XYZ_HEAD.NUM_CONV = 0
cfg.MODEL.ROI_POSE_XYZ_HEAD.CONV_DIM = 128 # formerly 3
cfg.MODEL.ROI_POSE_XYZ_HEAD.NUM_FC = 3
cfg.MODEL.ROI_POSE_XYZ_HEAD.FC_DIM = 256 # formerly 100
cfg.MODEL.ROI_POSE_XYZ_HEAD.NORM = ""
cfg.MODEL.ROI_POSE_XYZ_HEAD.NUM_BINS = 64
cfg.MODEL.ROI_POSE_XYZ_HEAD.XYZ_BIN_RANGES = ((-2., 2.),
                                              (-2., 2.),
                                              (0., 4.))

# Rotation (rpy) pose head: same layer sizes as the xyz head, no explicit ranges.
cfg.MODEL.ROI_POSE_RPY_HEAD = CN()
cfg.MODEL.ROI_POSE_RPY_HEAD.NAME = "RCNNPoseRpyHead"
cfg.MODEL.ROI_POSE_RPY_HEAD.NUM_CONV = 0
cfg.MODEL.ROI_POSE_RPY_HEAD.CONV_DIM = 128 # formerly 3
cfg.MODEL.ROI_POSE_RPY_HEAD.NUM_FC = 3
cfg.MODEL.ROI_POSE_RPY_HEAD.FC_DIM = 256 # formerly 100
cfg.MODEL.ROI_POSE_RPY_HEAD.NORM = ""
cfg.MODEL.ROI_POSE_RPY_HEAD.NUM_BINS = 64

#cfg.DEVICE = 'cpu'
#cfg.MODEL.DEVICE = 'cpu'
#cfg.freeze()

# Loader over the synthetic test split, consumed by the visualization cell.
# NOTE(review): the mapper's second argument is True here but False in
# Trainer.build_test_loader; presumably it keeps ground-truth annotations on each
# example, which the visualization cell reads as "gt_shape_params" — confirm.
test_loader = build_detection_test_loader(cfg, dataset_name="synthetic_test", mapper=data.XenRCNNMapper(cfg, True))

In [4]:
# Build the network described by cfg and switch it to inference mode; eval()
# hands back the module itself, so the cell still ends by displaying its layers.
model = build_model(cfg).eval()
model

GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
      (res2): Sequential(
        (0): BottleneckBlock

In [5]:
class Trainer(DefaultTrainer):
    """DefaultTrainer variant whose data loaders use ``data.XenRCNNMapper``.

    The train loader builds the mapper with ``True`` as its second argument and
    the test loader with ``False``; evaluation goes through ``COCOEvaluator``.
    """

    @classmethod
    def build_test_loader(cls, cfg, dataset_name):
        """Return a detection test loader for ``dataset_name`` using the Xen mapper."""
        eval_mapper = data.XenRCNNMapper(cfg, False)
        return build_detection_test_loader(cfg, dataset_name, mapper=eval_mapper)

    @classmethod
    def build_train_loader(cls, cfg):
        """Return a detection train loader using the Xen mapper."""
        train_mapper = data.XenRCNNMapper(cfg, True)
        return build_detection_train_loader(cfg, mapper=train_mapper)

    @classmethod
    def build_evaluator(cls, cfg, dataset_name):
        """Return a ``COCOEvaluator`` for ``dataset_name`` writing to ``cfg.OUTPUT_DIR``."""
        evaluator = COCOEvaluator(dataset_name, cfg, True, output_dir=cfg.OUTPUT_DIR)
        return evaluator
            
# Solver settings and output location for this training run.
cfg.SOLVER.IMS_PER_BATCH = 4
cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
cfg.SOLVER.MAX_ITER = 100    # short run; the note this was copied from suggested 300 iterations for a toy dataset — confirm the intended length
cfg.OUTPUT_DIR = "output/box_and_mask_and_shape_4/"

# Create the output directory if it is missing, then build the trainer from cfg.
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = Trainer(cfg)
# NOTE(review): the saved output of this cell ends with
# "ValueError: loaded state dict has a different number of parameter groups".
# Presumably resume=True picked up a checkpoint in OUTPUT_DIR whose optimizer
# state was written for a different model configuration — confirm, and either
# point OUTPUT_DIR at a fresh directory or pass resume=False.
trainer.resume_or_load(resume=True)
trainer.train()

[32m[03/25 16:34:36 d2.engine.defaults]: [0mModel:
GeneralizedRCNN(
  (backbone): FPN(
    (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
    (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (top_block): LastLevelMaxPool()
    (bottom_up): ResNet(
      (stem): BasicStem(
        (conv1): Conv2d(
          3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
          (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05)
        )
      )
 

[32m[03/25 16:34:36 d2.data.detection_utils]: [0mTransformGens used in training: [ResizeShortestEdge(short_edge_length=(480, 480), max_size=1333, sample_style='choice'), RandomFlip()]
[32m[03/25 16:34:37 d2.data.build]: [0mRemoved 0 images with no usable annotations. 1000 images left.
[32m[03/25 16:34:37 d2.data.build]: [0mDistribution of instances among all 1 categories:
[36m|  category  | #instances   |
|:----------:|:-------------|
| prime_box  | 5516         |
|            |              |[0m
[32m[03/25 16:34:37 d2.data.common]: [0mSerializing 1000 elements to byte tensors and concatenating them all ...
[32m[03/25 16:34:37 d2.data.common]: [0mSerialized dataset takes 59.99 MiB
[32m[03/25 16:34:37 d2.data.build]: [0mUsing training sampler TrainingSampler
Loaded a target with gt pose quatxyz:  tensor([[ 3.5377e-07, -8.5821e-01,  5.1394e-01,  5.7196e-06, -4.2632e-01,
          3.1339e-03,  2.2718e-01],
        [ 6.1789e-01, -1.7405e-05,  2.7083e-07, -7.8672e-01, -3.4064e

          3.9597e-01,  2.0040e-01]])  and rpy  tensor([[ 1.5709e+00, -1.4588e-05,  2.6945e-01],
        [-3.0018e+00, -4.0069e-05,  5.3130e-01],
        [-1.5709e+00, -1.5857e-04, -3.1314e+00],
        [ 3.0015e+00, -1.5708e+00,  2.9501e+00],
        [-2.6182e+00, -2.6822e-07,  1.5461e+00],
        [-3.1208e+00,  1.5708e+00, -2.7923e+00],
        [ 2.3804e+00, -1.5708e+00, -9.0914e-01],
        [ 3.1416e+00, -2.1716e-05, -1.1955e+00],
        [ 3.1415e+00, -2.0864e-05,  1.5505e+00]])
Loaded a target with gt pose quatxyz:  tensor([[ 3.9889e-01, -5.8415e-01, -3.9886e-01, -5.8418e-01, -8.1067e-02,
         -2.3815e-01,  1.4477e-01],
        [-1.1820e-05, -9.8035e-01, -1.9891e-01,  2.9698e-04,  2.6030e-01,
         -5.1611e-02,  2.2545e-01],
        [-8.6521e-02, -7.0182e-01, -8.6517e-02,  7.0180e-01, -2.0673e-02,
          1.5269e-01,  1.7292e-01]])  and rpy  tensor([[-3.1339e+00, -1.5708e+00, -3.0606e+00],
        [-3.1415e+00,  5.8700e-04,  4.0062e-01],
        [ 3.0381e+00,  1.5708e+00

        [ 3.1416,  1.0620,  1.9037]])
Loaded a target with gt pose quatxyz:  tensor([[ 3.7069e-01, -6.0215e-01, -3.7071e-01, -6.0215e-01, -2.0300e-01,
         -2.0314e-01,  2.4956e-01],
        [ 6.3024e-01, -3.2105e-01,  6.3022e-01,  3.2104e-01,  3.4417e-01,
         -2.1727e-01,  2.2100e-01],
        [ 6.7602e-01,  2.0773e-01,  6.7603e-01, -2.0770e-01,  3.9757e-01,
          1.5823e-01,  1.7042e-01],
        [ 4.2764e-01,  4.8254e-01,  3.2832e-01, -6.9029e-01,  7.7244e-02,
          1.3079e-02,  8.3075e-01],
        [ 1.9282e-01, -1.9266e-01, -6.8029e-01,  6.8038e-01, -8.1315e-02,
         -2.1879e-01,  6.0901e-01],
        [ 8.8788e-01,  1.8576e-05, -1.5970e-06,  4.6009e-01,  4.3274e-02,
          4.0067e-01,  1.8497e-01],
        [ 6.9065e-01, -1.5157e-01,  6.9071e-01,  1.5150e-01, -1.2115e-01,
          3.6096e-01,  5.1177e-01],
        [ 6.9827e-01,  6.9825e-01, -1.1275e-01, -1.1278e-01, -4.9411e-01,
         -6.0597e-01,  1.9063e-01]])  and rpy  tensor([[ 1.8328e+00, -1.5708e+0

          2.6954e-01,  5.1299e-01]])  and rpy  tensor([[ 1.8437e+00, -1.2375e-01,  3.3076e-01],
        [-1.5711e+00,  3.0980e-05, -3.6721e-01],
        [-2.2431e-05, -2.1731e-05,  2.7758e+00],
        [ 3.1416e+00, -9.7123e-05, -7.0527e-01],
        [-1.5710e+00,  2.6822e-07, -2.5082e+00],
        [-3.1416e+00,  1.3127e-05, -2.3278e+00],
        [-2.4423e+00,  1.5708e+00, -2.6650e+00],
        [-3.0916e+00,  1.5708e+00, -3.1301e+00],
        [ 1.5748e+00,  1.4491e+00, -2.1166e+00]])
Loaded a target with gt pose quatxyz:  tensor([[ 0.8440,  0.1441, -0.4342,  0.2800,  0.3151,  0.2020,  0.2248],
        [ 0.4070, -0.6234,  0.5669,  0.3527, -0.1229,  0.5152,  0.2805],
        [ 0.4600, -0.5522,  0.6933, -0.0529,  0.2067, -0.0592,  0.4866],
        [ 0.4648,  0.5328, -0.4648,  0.5329, -0.2535, -0.2778,  0.5051],
        [ 0.4899,  0.5186, -0.6465, -0.2705, -0.2367,  0.1735,  0.3438],
        [ 0.9466,  0.3196,  0.0134,  0.0397,  0.2391, -0.3317,  0.2507],
        [-0.0543,  0.7051, -0.0543

ValueError: loaded state dict has a different number of parameter groups

In [None]:
# Evaluate from a copy of the config so the training cfg is left as it was.
test_cfg = cfg.clone()
test_cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
test_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # score threshold applied at test time
test_cfg.DATASETS.TEST = ("synthetic_test", )

# Build the network in inference mode, then load the trained weights into it.
test_model = build_model(test_cfg)
test_model.eval()
test_metadata = MetadataCatalog.get(test_cfg.DATASETS.TEST[0])

checkpointer = DetectionCheckpointer(test_model)
checkpointer.load(test_cfg.MODEL.WEIGHTS)

# Grid of examples to visualize: height_to_show rows by width_to_show columns.
height_to_show = 2
width_to_show = 2
num_examples = height_to_show * width_to_show
figure = plt.figure(dpi=100)
figure.set_size_inches(12, 12)

# Take the first element of each batch the test loader yields.
test_loader_iterator = iter(test_loader)
examples = []
for _ in range(num_examples):
    batch = next(test_loader_iterator)
    examples.append(batch[0])

with torch.no_grad():
    predictions = test_model(examples)
    for k, (example, pred) in enumerate(zip(examples, predictions)):
        plt.subplot(height_to_show, width_to_show, k + 1)
        instances = pred["instances"]

        # Move the tensor's first axis last before the colour conversion
        # (presumably channels-first -> channels-last; confirm against the mapper).
        im = example["image"].detach().cpu().numpy().transpose([1, 2, 0])
        visualizer = Visualizer(
            cv2.cvtColor(im, cv2.COLOR_BGR2RGB),
            metadata=test_metadata,
            scale=1.2,
        )
        drawn = visualizer.draw_instance_predictions(instances.to("cpu"))

        # Only compare shape parameters when something was detected.
        if len(instances) > 0:
            pred_shapes = instances.get("pred_shape_params").cpu().detach().numpy()
            actual_shapes = example["instances"].get("gt_shape_params").cpu().detach().numpy()
            print("Pred shapes: ", pred_shapes)
            print("Actual shapes: ", actual_shapes)

        # cv2_imshow converts BGR -> RGB, so reverse the last axis of the
        # visualizer's image before handing it over.
        cv2_imshow(drawn.get_image()[:, :, ::-1])

In [None]:
# Run the evaluation through the Trainer classmethod; the results it returns are
# the cell's displayed value.
Trainer.test(test_cfg, test_model)