# Imports

The predictor and visualization classes below are adapted from the Detectron2 sources:

- https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
- https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/defaults.py



In [None]:
# Detectron2 from the prebuilt wheel index for CPU-only builds against torch 1.5.
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.5/index.html
# Cython, then the COCO Python API installed straight from its Git repository.
!pip install cython 
!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
# The base config is written one directory up.
# NOTE(review): presumably because the model config below refers to it through a
# relative "../Base-RCNN-FPN.yaml" base path - confirm against the YAML file.
!wget https://raw.githubusercontent.com/facebookresearch/detectron2/master/configs/Base-RCNN-FPN.yaml -O ../Base-RCNN-FPN.yaml
# Faster R-CNN R50-FPN 3x config and pretrained weights, saved in the working directory.
!wget https://raw.githubusercontent.com/facebookresearch/detectron2/master/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
!wget https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl
# addict provides the `Dict` class imported later in this notebook; install output is discarded.
!pip install addict > /dev/null 2>&1


In [28]:
import pandas as pd
import numpy as np

from PIL import Image
from IPython.display import display, HTML, clear_output
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import sys
import os
# Working directory at the time this cell runs.
DIR = os.getcwd()
import torch

def print_code(func):
    """Print the source code of ``func`` with terminal syntax highlighting.

    Args:
        func: an object whose source ``inspect.getsourcelines`` can retrieve
            (for example a function or a method).
    """
    import inspect
    from pygments import highlight
    from pygments.formatters import TerminalFormatter
    from pygments.lexers import PythonLexer

    # getsourcelines returns (lines, starting line number); only the lines are needed.
    source_lines, _first_lineno = inspect.getsourcelines(func)
    source_text = "".join(source_lines)
    colored = highlight(source_text, PythonLexer(), TerminalFormatter())
    print(colored)



In [6]:
# Use the first CUDA GPU when PyTorch reports one as available; otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# DETECTRON 2

In [56]:
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
from detectron2.modeling import build_model
import detectron2.data.transforms as T

from detectron2.data import MetadataCatalog
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.checkpoint import DetectionCheckpointer


class DefaultPredictor:
    """
    Create a simple end-to-end predictor with the given config that runs on
    single device for a single input image.
    Compared to using the model directly, this class does the following additions:
    1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
    2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`.
    3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
    4. Take one input image and produce a single output, instead of a batch.
    If you'd like to do anything more fancy, please refer to its source code
    as examples to build and use the model manually.

    The callable that actually runs the batch is resolved on every call, in
    this order: the ``inference_fn`` given to the constructor, a module-level
    function named ``inference`` (the notebook's step-through hook) if one is
    defined, and finally ``self.model`` itself.

    Attributes:
        metadata (Metadata): the metadata of the underlying dataset, obtained from
            cfg.DATASETS.TEST (or the "__unused" placeholder entry when that
            list is empty).
        inference_fn (callable or None): optional replacement for the model call.
    Examples:
    ::
        pred = DefaultPredictor(cfg)
        inputs = cv2.imread("input.jpg")
        outputs = pred(inputs)
    """

    def __init__(self, cfg, inference_fn=None):
        """
        Args:
            cfg (CfgNode): the config to build the model from.
            inference_fn (callable or None): if given, called with the
                one-element batch ``[inputs]`` instead of the model and must
                return a list whose first item is the prediction.
        """
        self.cfg = cfg.clone()  # cfg can be modified by model
        self.model = build_model(self.cfg)
        self.model.eval()
        # Tolerate configs without a test dataset instead of raising IndexError;
        # VisualizationDemo uses the same "__unused" fallback.
        dataset_name = cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        self.metadata = MetadataCatalog.get(dataset_name)

        checkpointer = DetectionCheckpointer(self.model)
        checkpointer.load(cfg.MODEL.WEIGHTS)

        self.transform_gen = T.ResizeShortestEdge(
            [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
        )

        self.input_format = cfg.INPUT.FORMAT
        assert self.input_format in ["RGB", "BGR"], self.input_format

        self.inference_fn = inference_fn

    def _resolve_inference(self):
        """Return the callable that maps ``[inputs]`` to a list of predictions."""
        if self.inference_fn is not None:
            return self.inference_fn
        # Keep the notebook workflow: use the module-level `inference` hook when
        # a cell has defined it, but do not fail with NameError when none has.
        return globals().get("inference", self.model)

    def __call__(self, original_image):
        """
        Args:
            original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
        Returns:
            predictions (dict):
                the output of the model for one image only.
                See :doc:`/tutorials/models` for details about the format.
        """
        run_inference = self._resolve_inference()
        with torch.no_grad():
            if self.input_format == "RGB":
                # The input is BGR; flip the channel axis when the model expects RGB.
                original_image = original_image[:, :, ::-1]
            # Original size is passed along with the resized image.
            height, width = original_image.shape[:2]
            image = self.transform_gen.get_transform(original_image).apply_image(original_image)
            # (H, W, C) array -> (C, H, W) float32 tensor.
            image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))

            inputs = {"image": image, "height": height, "width": width}
            # The batch holds one image, so only the first result is returned.
            return run_inference([inputs])[0]



class VisualizationDemo(object):
    """Run a ``DefaultPredictor`` on an image and draw its predictions."""

    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode): config handed to ``DefaultPredictor``.
            instance_mode (ColorMode): forwarded to the ``Visualizer``.
            parallel (bool): accepted in the signature but never read by this
                class; the predictor is always built in the current process.
        """
        # Fall back to a placeholder metadata entry when no test dataset is set.
        dataset_name = cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        self.metadata = MetadataCatalog.get(dataset_name)
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode
        self.predictor = DefaultPredictor(cfg)

    def run_on_image(self, image):
        """
        Predict on one image and render whichever outputs are present.

        Args:
            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
                This is the format used by OpenCV.
        Returns:
            predictions (dict): the output of the model.
            vis_output (VisImage): the visualized image output, or None when
                the predictions contain none of the keys handled here.
        """
        predictions = self.predictor(image)
        # The visualizer is given the image in RGB channel order.
        rgb_image = image[:, :, ::-1]
        visualizer = Visualizer(rgb_image, self.metadata, instance_mode=self.instance_mode)

        # Panoptic output takes precedence over the other prediction types.
        if "panoptic_seg" in predictions:
            panoptic_seg, segments_info = predictions["panoptic_seg"]
            drawn = visualizer.draw_panoptic_seg_predictions(
                panoptic_seg.to(self.cpu_device), segments_info
            )
            return predictions, drawn

        vis_output = None
        if "sem_seg" in predictions:
            class_map = predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
            vis_output = visualizer.draw_sem_seg(class_map)
        if "instances" in predictions:
            # Drawn after sem_seg, so this result is the one returned when both exist.
            cpu_instances = predictions["instances"].to(self.cpu_device)
            vis_output = visualizer.draw_instance_predictions(predictions=cpu_instances)
        return predictions, vis_output
    
    
    

In [71]:
import argparse
import glob
import multiprocessing as mp
import os
import time
import cv2
import tqdm

from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from addict import Dict



def setup_cfg(args):
    """Build a frozen Detectron2 config from ``args``.

    Reads ``args.config_file``, ``args.opts`` and ``args.confidence_threshold``.
    The target device is taken from the module-level ``device``.

    Returns:
        The config, frozen so it can no longer be modified.
    """
    cfg = get_cfg()
    # Settings from the file first, then the overrides listed in ``args.opts``.
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # The same confidence threshold is applied to all three settings below.
    threshold = args.confidence_threshold
    model_cfg = cfg.MODEL
    model_cfg.RETINANET.SCORE_THRESH_TEST = threshold
    model_cfg.ROI_HEADS.SCORE_THRESH_TEST = threshold
    model_cfg.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = threshold
    model_cfg.DEVICE = str(device)

    cfg.freeze()
    return cfg


# Hard-coded stand-in for the command-line arguments of the upstream demo script.
args = {
    "config_file": "faster_rcnn_R_50_FPN_3x.yaml",
    "input": ["../data/img/08291.png"],
    "output": "out.png",
    "confidence_threshold": 0.2,
    "opts": ["MODEL.WEIGHTS", "model_final_280758.pkl"],
}

# addict's Dict allows attribute access (args.input, args.output, ...).
args = Dict(args)

setup_logger(name="fvcore")
logger = setup_logger()
logger.info("Arguments: " + str(args))

cfg = setup_cfg(args)

demo = VisualizationDemo(cfg)

if args.input:
    if len(args.input) == 1:
        # A single entry may be a glob pattern (with "~"); expand it to real paths.
        args.input = glob.glob(os.path.expanduser(args.input[0]))
        assert args.input, "The input path(s) was not found"

    # Several inputs written to one file name would overwrite each other, so a
    # file-style output is only accepted for a single input. Checked before the
    # loop so the run fails before any inference work is done.
    output_is_dir = bool(args.output) and os.path.isdir(args.output)
    assert (
        not args.output or output_is_dir or len(args.input) == 1
    ), "Please specify a directory with args.output"

    for path in tqdm.tqdm(args.input, disable=not args.output):
        # use PIL, to be consistent with evaluation
        img = read_image(path, format="BGR")
        start_time = time.time()
        predictions, visualized_output = demo.run_on_image(img)
        logger.info(
            "{}: {} in {:.2f}s".format(
                path,
                "detected {} instances".format(len(predictions["instances"]))
                if "instances" in predictions
                else "finished",
                time.time() - start_time,
            )
        )

        if not args.output:
            continue
        if visualized_output is None:
            # run_on_image returns None when there was nothing it could draw.
            logger.warning("{}: no visualization produced, nothing saved".format(path))
            continue
        if output_is_dir:
            out_filename = os.path.join(args.output, os.path.basename(path))
        else:
            out_filename = args.output
        visualized_output.save(out_filename)





<Logger fvcore (DEBUG)>

[32m[06/16 17:06:33 detectron2]: [0mArguments: {'config_file': 'faster_rcnn_R_50_FPN_3x.yaml', 'input': ['../data/img/08291.png'], 'output': 'out.png', 'confidence_threshold': 0.2, 'opts': ['MODEL.WEIGHTS', 'model_final_280758.pkl']}
[32m[06/16 17:06:34 fvcore.common.checkpoint]: [0mLoading checkpoint from model_final_280758.pkl
[32m[06/16 17:06:34 fvcore.common.checkpoint]: [0mReading a file from 'Detectron2 Model Zoo'


  0%|          | 0/1 [00:00<?, ?it/s]

Proposals =  torch.Size([1000]) tensor([[ 359.0942,  104.0869,  730.7505,  800.0000],
        [  77.8388,    0.0000,  455.4151,  744.6434],
        [ 679.4937,  143.2169, 1156.5720,  696.7388],
        ...,
        [  67.5205,   26.8047,  226.5770,  435.4660],
        [ 727.7982,  393.4505,  808.8560,  544.7234],
        [ 946.8929,   91.1367, 1158.0629,  199.5205]])
[Instances(num_instances=20, image_height=800, image_width=1202, fields=[pred_boxes: Boxes(tensor([[7.2101e+02, 1.2948e+02, 1.1464e+03, 7.9710e+02],
        [3.1940e+02, 8.4073e+01, 7.2568e+02, 7.8666e+02],
        [1.0960e+03, 2.0056e+00, 1.2011e+03, 2.5725e+02],
        [7.4143e+02, 7.0263e+00, 1.1175e+03, 2.8232e+02],
        [1.0280e+02, 0.0000e+00, 4.5777e+02, 7.1402e+02],
        [6.1678e+02, 8.5077e+01, 6.7205e+02, 1.2839e+02],
        [1.1870e+03, 7.6405e+02, 1.2020e+03, 7.8767e+02],
        [3.5829e+02, 7.0170e+02, 6.2234e+02, 7.9588e+02],
        [6.8281e+02, 2.6243e+02, 8.9620e+02, 3.6334e+02],
        [1.4678e+

100%|██████████| 1/1 [00:01<00:00,  1.40s/it]


In [None]:
# Image.open("../data/img/08291.png")
# Image.open("out.png")


In [70]:
from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN

# Module-level handle on the demo predictor's model; `inference` below reads it.
dt2m = demo.predictor.model
def inference(batched_inputs, do_postprocess=True):
    """
    Run detection on the module-level ``dt2m`` model one stage at a time,
    printing the intermediate results of each stage.

    Args:
        batched_inputs (list[dict]): one dict per image; passed to
            ``dt2m.preprocess_image`` and, when post-processing, to
            ``GeneralizedRCNN._postprocess``.
        do_postprocess (bool): whether to apply post-processing on the outputs.

    Returns:
        The value of ``GeneralizedRCNN._postprocess`` when ``do_postprocess``
        is true, otherwise the first value returned by ``dt2m.roi_heads``.
    """
    image_list = dt2m.preprocess_image(batched_inputs)
    feature_maps = dt2m.backbone(image_list.tensor)

    # Stage 1: region proposals (None is passed in the ground-truth slot).
    proposals, _ = dt2m.proposal_generator(image_list, feature_maps, None)
    first_image = proposals[0]
    print("Proposals = ", first_image.objectness_logits.size(), first_image.proposal_boxes.tensor)

    # Stage 2: ROI heads; the second return value is only printed for inspection.
    results, extra = dt2m.roi_heads(image_list, feature_maps, proposals, None)
    print(results)
    print(extra)

    if not do_postprocess:
        return results
    return GeneralizedRCNN._postprocess(results, batched_inputs, image_list.image_sizes)

In [36]:
print_code(GeneralizedRCNN._postprocess)

    [90m@staticmethod[39;49;00m
    [34mdef[39;49;00m [32m_postprocess[39;49;00m(instances, batched_inputs, image_sizes):
        [33m"""[39;49;00m
[33m        Rescale the output instances to the target size.[39;49;00m
[33m        """[39;49;00m
        [37m# note: private function; subject to changes[39;49;00m
        processed_results = []
        [34mfor[39;49;00m results_per_image, input_per_image, image_size [35min[39;49;00m [36mzip[39;49;00m(
            instances, batched_inputs, image_sizes
        ):
            height = input_per_image.get([33m"[39;49;00m[33mheight[39;49;00m[33m"[39;49;00m, image_size[[34m0[39;49;00m])
            width = input_per_image.get([33m"[39;49;00m[33mwidth[39;49;00m[33m"[39;49;00m, image_size[[34m1[39;49;00m])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({[33m"[39;49;00m[33minstances[39;49;00m[33m"[39;49;00m: r})
        [34mreturn[39;49;00m pro

In [29]:
print_code(demo.predictor.model.inference)

    [34mdef[39;49;00m [32minference[39;49;00m([36mself[39;49;00m, batched_inputs, detected_instances=[34mNone[39;49;00m, do_postprocess=[34mTrue[39;49;00m):
        [33m"""[39;49;00m
[33m        Run inference on the given inputs.[39;49;00m
[33m[39;49;00m
[33m        Args:[39;49;00m
[33m            batched_inputs (list[dict]): same as in :meth:`forward`[39;49;00m
[33m            detected_instances (None or list[Instances]): if not None, it[39;49;00m
[33m                contains an `Instances` object per image. The `Instances`[39;49;00m
[33m                object contains "pred_boxes" and "pred_classes" which are[39;49;00m
[33m                known boxes in the image.[39;49;00m
[33m                The inference will then skip the detection of bounding boxes,[39;49;00m
[33m                and only predict other per-ROI outputs.[39;49;00m
[33m            do_postprocess (bool): whether to apply post-processing on the outputs.[39;49;00m
[33m[39;49;00m
[33

In [30]:
print_code(demo.predictor.model.forward)

    [34mdef[39;49;00m [32mforward[39;49;00m([36mself[39;49;00m, batched_inputs):
        [33m"""[39;49;00m
[33m        Args:[39;49;00m
[33m            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .[39;49;00m
[33m                Each item in the list contains the inputs for one image.[39;49;00m
[33m                For now, each item in the list is a dict that contains:[39;49;00m
[33m[39;49;00m
[33m                * image: Tensor, image in (C, H, W) format.[39;49;00m
[33m                * instances (optional): groundtruth :class:`Instances`[39;49;00m
[33m                * proposals (optional): :class:`Instances`, precomputed proposals.[39;49;00m
[33m[39;49;00m
[33m                Other information that's included in the original dicts, such as:[39;49;00m
[33m[39;49;00m
[33m                * "height", "width" (int): the output resolution of the model, used in inference.[39;49;00m
[33m                  See :meth:`postprocess` for detail