inference_detection.py
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import backend as K
import utils


def refine_detections(rois, probs, deltas, window, config):
    """Refine classified proposals for one image and return the final detections
    as a [N, (y1, x1, y2, x2, class_id, score)] tensor in image coordinates,
    zero-padded to DETECTION_MAX_INSTANCES rows."""
    # Class IDs per ROI
    class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
    # Class probability of the top class of each ROI
    indices = tf.stack([tf.range(tf.shape(probs)[0]), class_ids], axis=1)
    class_scores = tf.gather_nd(probs, indices)
    # Class-specific bounding box deltas
    deltas_specific = tf.gather_nd(deltas, indices)
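    # Illustration (not executed): a small hand-worked example of the gathers above.
    #   probs        = [[0.1, 0.7, 0.2],
    #                   [0.8, 0.1, 0.1]]
    #   class_ids    -> [1, 0]
    #   indices      -> [[0, 1], [1, 0]]
    #   class_scores -> [0.7, 0.8]
    # deltas_specific picks the bounding box delta row of the top class for each
    # ROI in the same way.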
    # Apply bounding box deltas
    refined_rois = utils.apply_bbox_offset(
        rois, deltas_specific * config.BBOX_STD_DEV)
    # Convert coordinates to image domain
    # TODO: better to keep them normalized until later
    height, width = config.IMAGE_SHAPE[:2]
    refined_rois *= tf.constant([height, width, height, width], dtype=tf.float32)
    # Clip boxes to image window
    refined_rois = utils.clip_boxes(refined_rois, window)
    # Round and cast to int since we're dealing with pixels now
    refined_rois = tf.cast(tf.math.rint(refined_rois), tf.int32)
    # TODO: Filter out boxes with zero area

    # Filter out background boxes
    keep = tf.where(class_ids > 0)[:, 0]
    # Filter out low confidence boxes
    if config.DETECTION_MIN_CONFIDENCE:
        conf_keep = tf.where(class_scores >= config.DETECTION_MIN_CONFIDENCE)[:, 0]
        keep = tf.sets.intersection(tf.expand_dims(keep, 0),
                                    tf.expand_dims(conf_keep, 0))
        keep = tf.sparse.to_dense(keep)[0]

    # Apply per-class NMS
    # 1. Prepare variables
    pre_nms_class_ids = tf.gather(class_ids, keep)
    pre_nms_scores = tf.gather(class_scores, keep)
    pre_nms_rois = tf.gather(refined_rois, keep)
    unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]
    def nms_keep_map(class_id):
        """Apply Non-Maximum Suppression on ROIs of the given class."""
        # Indices of ROIs of the given class
        ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
        # Apply NMS
        class_keep = tf.image.non_max_suppression(
            tf.cast(tf.gather(pre_nms_rois, ixs), tf.float32),
            tf.gather(pre_nms_scores, ixs),
            max_output_size=config.DETECTION_MAX_INSTANCES,
            iou_threshold=config.DETECTION_NMS_THRESHOLD)
        # Map indices back to indices of the original ROIs
        class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
        # Pad with -1 so returned tensors have the same shape
        gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0]
        class_keep = tf.pad(class_keep, [(0, gap)],
                            mode='CONSTANT', constant_values=-1)
        # Set shape so map_fn() can infer result shape
        class_keep.set_shape([config.DETECTION_MAX_INSTANCES])
        return class_keep

    # 2. Map over class IDs
    nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids, dtype=tf.int64)
    # 3. Merge results into one list, and remove -1 padding
    nms_keep = tf.reshape(nms_keep, [-1])
    nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
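    # Illustration (not executed): with DETECTION_MAX_INSTANCES = 4 and two classes,
    #   nms_keep from map_fn -> [[3, 7, -1, -1],
    #                            [5, -1, -1, -1]]
    #   after reshape        -> [3, 7, -1, -1, 5, -1, -1, -1]
    #   after removing -1    -> [3, 7, 5]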
    # 4. Compute intersection between keep and nms_keep
    keep = tf.sets.intersection(tf.expand_dims(keep, 0),
                                tf.expand_dims(nms_keep, 0))
    keep = tf.sparse.to_dense(keep)[0]
    # Keep top detections
    roi_count = config.DETECTION_MAX_INSTANCES
    class_scores_keep = tf.gather(class_scores, keep)
    num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
    top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
    keep = tf.gather(keep, top_ids)

    # Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
    # Coordinates are in image domain.
    detections = tf.concat([
        tf.cast(tf.gather(refined_rois, keep), tf.float32),
        tf.cast(tf.gather(class_ids, keep), tf.float32)[..., tf.newaxis],
        tf.gather(class_scores, keep)[..., tf.newaxis]
    ], axis=1)

    # Pad with zeros if detections < DETECTION_MAX_INSTANCES
    gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
    detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
    return detections

class InferenceDetectionLayer(layers.Layer):
    """
    Takes classified proposal boxes and their bounding box deltas and
    returns the final detection boxes.
    """

    def __init__(self, config, **kwargs):
        super(InferenceDetectionLayer, self).__init__(**kwargs)
        self.config = config

    def call(self, inputs):
        rois = inputs[0]
        rcnn_class = inputs[1]
        rcnn_bbox = inputs[2]
        image_meta = inputs[3]
        window = utils.parse_image_meta(image_meta)['window']
        detections = utils.batch_slice(
            [rois, rcnn_class, rcnn_bbox, window],
            lambda x, y, z, w: refine_detections(x, y, z, w, self.config),
            self.config.IMAGES_PER_GPU
        )
        # [N, (y1, x1, y2, x2, class_id, score)]
        return tf.reshape(
            detections,
            [self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6])

    def compute_output_shape(self, input_shape):
        return (None, self.config.DETECTION_MAX_INSTANCES, 6)
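
# -----------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module). The SimpleConfig
# stub, NUM_CLASSES, and META_SIZE below are assumptions for the example; the
# real config object, class count, and image-meta layout come from the rest of
# this codebase.
#
#   class SimpleConfig:
#       BATCH_SIZE = 1
#       IMAGES_PER_GPU = 1
#       IMAGE_SHAPE = (1024, 1024, 3)
#       BBOX_STD_DEV = [0.1, 0.1, 0.2, 0.2]
#       DETECTION_MIN_CONFIDENCE = 0.7
#       DETECTION_MAX_INSTANCES = 100
#       DETECTION_NMS_THRESHOLD = 0.3
#
#   rois = layers.Input(shape=(None, 4))                    # normalized proposals
#   rcnn_class = layers.Input(shape=(None, NUM_CLASSES))    # classifier scores
#   rcnn_bbox = layers.Input(shape=(None, NUM_CLASSES, 4))  # per-class deltas
#   image_meta = layers.Input(shape=(META_SIZE,))
#   detections = InferenceDetectionLayer(SimpleConfig())(
#       [rois, rcnn_class, rcnn_bbox, image_meta])
#   # detections: [BATCH_SIZE, DETECTION_MAX_INSTANCES, (y1, x1, y2, x2, class_id, score)]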