# Object Detection


### import dependencies

In [None]:
import os
import re
import zipfile
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt



### Downloading the COCO2017 dataset
-
Training on the entire COCO2017 dataset, which has around 118k images, takes a lot of time. Hence, we will be using a smaller subset of ~500 images for training in this example.

In [None]:
# Location of the reduced COCO2017 archive and where to store it locally.
url = "https://github.com/srihari-humbarwadi/datasets/releases/download/v0.1.0/data.zip"
filename = os.path.join(os.getcwd(), "data.zip")
# `filename` is an absolute path, so the archive is saved in the current
# working directory.
keras.utils.get_file(fname=filename, origin=url)

In [None]:
# Unpack the downloaded archive into the current working directory.
with zipfile.ZipFile("data.zip", mode="r") as f:
  f.extractall(path="./")

### Implementing utility functions
Bounding boxes can be represented in multiple ways, the most common formats are:

- Storing the coordinates of the corners [xmin, ymin, xmax, ymax]
- Storing the coordinates of the center and the box dimensions [x, y, width, height]

Since we require both formats, we will implement functions for converting between them.

In [None]:
def swap_xy(boxes):
  """Swap the order of the x and y coordinates of each box.

  Args:
    boxes: A tensor indexed as `boxes[:, k]`; columns 0-3 are read.

  Returns:
    A tensor whose last axis holds columns `[1, 0, 3, 2]` of `boxes`.
  """
  col0, col1, col2, col3 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
  return tf.stack([col1, col0, col3, col2], axis=-1)

def convert_to_xywh(boxes):
  """Convert corner-format boxes to center/size format.

  Args:
    boxes: A tensor whose last axis is `[xmin, ymin, xmax, ymax]`.

  Returns:
    A tensor whose last axis is `[x, y, width, height]`, where `(x, y)` is
    the box center.
  """
  min_corner = boxes[..., :2]
  max_corner = boxes[..., 2:]
  centers = (max_corner + min_corner) / 2.0
  sizes = max_corner - min_corner
  return tf.concat([centers, sizes], axis=-1)

def convert_to_corners(boxes):
  """Convert center/size-format boxes to corner format.

  Args:
    boxes: A tensor whose last axis is `[x, y, width, height]`, where
      `(x, y)` is the box center.

  Returns:
    A tensor whose last axis is `[xmin, ymin, xmax, ymax]`.
  """
  centers = boxes[..., :2]
  half_sizes = boxes[..., 2:] / 2.0
  # Corners lie half a box-size away from the center on each axis.
  return tf.concat([centers - half_sizes, centers + half_sizes], axis=-1)

### IOU

In [None]:
def compute_iou(boxes1, boxes2):
    """Compute the pairwise intersection-over-union of two sets of boxes.

    Args:
        boxes1: A tensor indexed as `[N, 4]`; columns 2 and 3 are multiplied
            to obtain the box area, i.e. `[x, y, width, height]` format.
        boxes2: A tensor indexed as `[M, 4]` in the same format.

    Returns:
        A tensor of IoU values clipped to `[0.0, 1.0]`, with one row per box
        in `boxes1` and one column per box in `boxes2`.
    """
    corners_a = convert_to_corners(boxes1)
    corners_b = convert_to_corners(boxes2)

    # Inserting an axis into the first operand broadcasts every box of
    # `boxes1` against every box of `boxes2`.
    overlap_min = tf.maximum(corners_a[:, None, :2], corners_b[:, :2])
    overlap_max = tf.minimum(corners_a[:, None, 2:], corners_b[:, 2:])

    # Non-overlapping pairs produce negative extents; clamp them to zero.
    overlap_wh = tf.maximum(0.0, overlap_max - overlap_min)
    overlap_area = overlap_wh[:, :, 0] * overlap_wh[:, :, 1]

    area_a = boxes1[:, 2] * boxes1[:, 3]
    area_b = boxes2[:, 2] * boxes2[:, 3]

    # The lower bound guards against division by zero.
    union = tf.maximum(area_a[:, None] + area_b - overlap_area, 1e-8)
    return tf.clip_by_value(overlap_area / union, 0.0, 1.0)

In [None]:
def visualize_detections(
    image, boxes, classes, scores, figsize=(7, 7), linewidth=1, color=[0, 0, 1]
):
    """Draw detection boxes with their labels on top of an image.

    Args:
        image: Image data; converted to a `uint8` NumPy array for display.
        boxes: Iterable of 4-element boxes unpacked as `x1, y1, x2, y2`.
        classes: Iterable of class labels, one per box.
        scores: Iterable of scores, one per box; shown with two decimals.
        figsize: Size passed to `plt.figure`.
        linewidth: Line width of each rectangle outline.
        color: Colour used for the outline and the label background.

    Returns:
        The matplotlib axes the detections were drawn on.
    """
    pixels = np.array(image, dtype=np.uint8)
    plt.figure(figsize=figsize)
    plt.axis("off")
    plt.imshow(pixels)
    axes = plt.gca()

    for (x1, y1, x2, y2), label, score in zip(boxes, classes, scores):
        outline = plt.Rectangle(
            [x1, y1],
            x2 - x1,
            y2 - y1,
            fill=False,
            edgecolor=color,
            linewidth=linewidth,
        )
        axes.add_patch(outline)
        axes.text(
            x1,
            y1,
            "{}: {:.2f}".format(label, score),
            bbox={"facecolor": color, "alpha": 0.4},
            clip_box=axes.clipbox,
            clip_on=True,
        )

    plt.show()
    return axes

### Implementing the anchor generator
-
Anchor boxes are fixed sized boxes that the model uses to predict the bounding box for an object. It does this by regressing the offset between the location of the object's center and the center of an anchor box, and then uses the width and height of the anchor box to predict a relative scale of the object. In the case of RetinaNet, each location on a given feature map has nine anchor boxes (at three scales and three ratios).

In [None]:
class AnchorBox:
    """Generates anchor boxes for feature-pyramid levels 3 through 7.

    Each anchor is stored as `[x, y, width, height]`, where `(x, y)` is the
    anchor center.

    Attributes:
        aspect_ratios: Aspect ratios used at every feature-map location.
        scales: Scale multipliers applied to each anchor's dimensions.
        num_anchor: Number of anchors per feature-map location
            (`len(aspect_ratios) * len(scales)`).
        strides: Stride of each pyramid level; index 0 is level 3.
        areas: Base anchor area for each pyramid level; index 0 is level 3.
    """

    def __init__(self):
        self.aspect_ratios = [0.5, 1.0, 2.0]
        self.scales = [2 ** x for x in [0, 1 / 3, 2 / 3]]
        self.num_anchor = len(self.aspect_ratios) * len(self.scales)
        self.strides = [2 ** i for i in range(3, 8)]
        self.areas = [x ** 2 for x in [32.0, 64.0, 128.0, 256.0, 512.0]]
        self._anchor_dims = self._compute_dims()

    def _compute_dims(self):
        """Compute the anchor dimensions for every pyramid level.

        Returns:
            A list with one tensor per entry of `self.areas`, each of shape
            `[1, 1, num_anchor, 2]` holding `(width, height)` pairs.
        """
        anchor_dims_all = []
        for area in self.areas:
            anchor_dims = []
            for ratio in self.aspect_ratios:
                # Solve width * height == area for the given aspect ratio.
                anchor_height = tf.math.sqrt(area / ratio)
                anchor_width = area / anchor_height
                dims = tf.reshape(
                    tf.stack([anchor_width, anchor_height], axis=-1), [1, 1, 2]
                )
                for scale in self.scales:
                    anchor_dims.append(scale * dims)
            anchor_dims_all.append(tf.stack(anchor_dims, axis=-2))
        return anchor_dims_all

    def _get_anchors(self, feature_height, feature_width, level):
        """Generate the anchors for one feature map.

        Args:
            feature_height: Height of the feature map.
            feature_width: Width of the feature map.
            level: Pyramid level of the feature map; `level - 3` indexes
                `self.strides` and the precomputed anchor dimensions.

        Returns:
            A tensor of shape
            `[feature_height * feature_width * num_anchor, 4]`.
        """
        # Offset by 0.5 so centers sit in the middle of each feature cell.
        rx = tf.range(feature_width, dtype=tf.float32) + 0.5
        ry = tf.range(feature_height, dtype=tf.float32) + 0.5
        centers = tf.stack(tf.meshgrid(rx, ry), axis=-1) * self.strides[level - 3]
        centers = tf.expand_dims(centers, axis=-2)
        centers = tf.tile(centers, [1, 1, self.num_anchor, 1])
        dims = tf.tile(
            self._anchor_dims[level - 3], [feature_height, feature_width, 1, 1]
        )
        anchors = tf.concat([centers, dims], axis=-1)
        return tf.reshape(
            anchors, [feature_height * feature_width * self.num_anchor, 4]
        )

    def get_anchors(self, image_height, image_width):
        """Generate the anchors of all pyramid levels for an image size.

        Args:
            image_height: Height of the input image.
            image_width: Width of the input image.

        Returns:
            A tensor of shape `[total_anchors, 4]` with the anchors of levels
            3 through 7 concatenated in that order.
        """
        anchors = [
            self._get_anchors(
                tf.math.ceil(image_height / 2 ** i),
                tf.math.ceil(image_width / 2 ** i),
                i,
            )
            for i in range(3, 8)
        ]
        return tf.concat(anchors, axis=0)

### Preprocessing data
Preprocessing the images involves two steps:

- Resizing the image: Images are resized such that the shortest side is equal to 800 px. After resizing, if the longest side of the image exceeds 1333 px, the image is resized such that the longest side is capped at 1333 px.
- Applying augmentation: Random scale jittering and random horizontal flipping
are the only augmentations applied to the images.

Along with the images, bounding boxes are rescaled and flipped if required.

In [None]:
def random_flip_horizontal(image, boxes):
  """Mirror an image and its boxes left-to-right with 50% probability.

  Args:
    image: The image to flip.
    boxes: A tensor indexed as `boxes[:, k]`; columns 0 and 2 are mirrored as
      `1 - value` and swapped (assumes x coordinates normalized to [0, 1] -
      TODO confirm against the dataset).

  Returns:
    An `(image, boxes)` pair, flipped or unchanged.
  """
  draw = tf.random.uniform(())
  if draw > 0.5:
    image = tf.image.flip_left_right(image)
    # Mirroring exchanges the roles of the left and right x coordinates.
    boxes = tf.stack(
        [
            1 - boxes[:, 2],
            boxes[:, 1],
            1 - boxes[:, 0],
            boxes[:, 3],
        ],
        axis=-1,
    )
  return image, boxes

def resize_and_pad_image(
      image, min_side=800.0, max_side=1333.0, jitter=[640, 1024], stride=128.0
  ):
  """Resize an image preserving aspect ratio, then pad it to a stride multiple.

  Args:
    image: The image; its first two dimensions are taken as its size.
    min_side: Target length of the shorter side when `jitter` is None.
    max_side: Upper bound for the longer side after resizing.
    jitter: Two-element range from which the shorter-side target is sampled
      uniformly; pass None to use `min_side` as given.
    stride: The padded image dimensions are rounded up to a multiple of this.

  Returns:
    A tuple `(image, image_shape, ratio)`: the resized and zero-padded image,
    the size after resizing but before padding, and the scale factor used.
  """
  original_size = tf.cast(tf.shape(image)[:2], dtype=tf.float32)
  if jitter is not None:
    min_side = tf.random.uniform((), jitter[0], jitter[1], dtype=tf.float32)

  shorter = tf.reduce_min(original_size)
  longer = tf.reduce_max(original_size)
  ratio = min_side / shorter
  # Fall back to scaling by the longer side when it would exceed `max_side`.
  if ratio * longer > max_side:
    ratio = max_side / longer

  image_shape = ratio * original_size
  image = tf.image.resize(image, tf.cast(image_shape, dtype=tf.int32))

  padded_size = tf.cast(
      tf.math.ceil(image_shape / stride) * stride, dtype=tf.int32
  )
  # Padding is added only on the bottom and right (offsets are 0, 0).
  image = tf.image.pad_to_bounding_box(
      image, 0, 0, padded_size[0], padded_size[1]
  )
  return image, image_shape, ratio

def preprocess_data(sample):
  """Apply the preprocessing steps to a single dataset sample.

  Args:
    sample: A mapping with an "image" entry and an "objects" entry that
      itself holds "bbox" and "label".

  Returns:
    A tuple `(image, bbox, class_id)`: the augmented, resized and padded
    image; the boxes scaled to the resized image and converted with
    `convert_to_xywh`; and the labels cast to `int32`.
  """
  objects = sample["objects"]
  image = sample["image"]
  bbox = swap_xy(objects["bbox"])
  class_id = tf.cast(objects["label"], dtype=tf.int32)

  image, bbox = random_flip_horizontal(image, bbox)
  image, image_shape, _ = resize_and_pad_image(image)

  # `image_shape` is (height, width) of the resized, unpadded image.
  height, width = image_shape[0], image_shape[1]
  scaled = [
      bbox[:, 0] * width,
      bbox[:, 1] * height,
      bbox[:, 2] * width,
      bbox[:, 3] * height,
  ]
  bbox = convert_to_xywh(tf.stack(scaled, axis=-1))
  return image, bbox, class_id


### Building the ResNet50 backbone

In [1]:
def get_backbone():
    """Build a ResNet50 backbone that exposes three intermediate feature maps.

    Returns:
        A `keras.Model` mapping an image batch to the outputs of the
        "conv3_block4_out", "conv4_block6_out" and "conv5_block3_out"
        layers, in that order.
    """
    # `input_shape` with unknown height/width lets the backbone accept
    # variable-sized images.
    backbone = keras.applications.ResNet50(
        include_top=False, input_shape=[None, None, 3]
    )
    c3_output, c4_output, c5_output = [
        backbone.get_layer(layer_name).output
        for layer_name in ["conv3_block4_out", "conv4_block6_out", "conv5_block3_out"]
    ]
    return keras.Model(
        inputs=[backbone.input], outputs=[c3_output, c4_output, c5_output]
    )

### Building Feature Pyramid Network as a custom layer

In [None]:
from tensorflow import keras
class FeaturePyramid(keras.layers.Layer):
    """Builds a five-level feature pyramid on top of a backbone.

    Args:
        backbone: Model returning the `(c3, c4, c5)` feature maps for an
            image batch. When omitted, `get_backbone()` is used.
        **kwargs: Forwarded to `keras.layers.Layer`. The layer name defaults
            to "FeaturePyramid" unless `name` is given.
    """

    def __init__(self, backbone=None, **kwargs):
        # setdefault avoids a duplicate-keyword error if the caller passes
        # `name` explicitly.
        kwargs.setdefault("name", "FeaturePyramid")
        super().__init__(**kwargs)
        self.backbone = backbone if backbone else get_backbone()
        # 1x1 convolutions bring every backbone output to 256 channels.
        self.conv_c3_1x1 = keras.layers.Conv2D(256, 1, 1, "same")
        self.conv_c4_1x1 = keras.layers.Conv2D(256, 1, 1, "same")
        self.conv_c5_1x1 = keras.layers.Conv2D(256, 1, 1, "same")
        # 3x3 convolutions applied to the merged feature maps.
        self.conv_c3_3x3 = keras.layers.Conv2D(256, 3, 1, "same")
        self.conv_c4_3x3 = keras.layers.Conv2D(256, 3, 1, "same")
        self.conv_c5_3x3 = keras.layers.Conv2D(256, 3, 1, "same")
        # Stride-2 convolutions produce the two additional, coarser levels.
        self.conv_c6_3x3 = keras.layers.Conv2D(256, 3, 2, "same")
        self.conv_c7_3x3 = keras.layers.Conv2D(256, 3, 2, "same")
        self.upsample_2x = keras.layers.UpSampling2D(2)

    def call(self, image, training=False):
        """Compute the pyramid feature maps for a batch of images.

        Args:
            image: Input passed to the backbone.
            training: Forwarded to the backbone call.

        Returns:
            A tuple `(p3, p4, p5, p6, p7)` of feature maps.
        """
        c3_output, c4_output, c5_output = self.backbone(image, training=training)
        p3_output = self.conv_c3_1x1(c3_output)
        p4_output = self.conv_c4_1x1(c4_output)
        p5_output = self.conv_c5_1x1(c5_output)
        # Top-down pathway: add the upsampled coarser level to the finer one.
        p4_output = p4_output + self.upsample_2x(p5_output)
        p3_output = p3_output + self.upsample_2x(p4_output)
        p3_output = self.conv_c3_3x3(p3_output)
        p4_output = self.conv_c4_3x3(p4_output)
        p5_output = self.conv_c5_3x3(p5_output)
        p6_output = self.conv_c6_3x3(c5_output)
        p7_output = self.conv_c7_3x3(tf.nn.relu(p6_output))
        return p3_output, p4_output, p5_output, p6_output, p7_output

NameError: name 'keras' is not defined