This is a companion notebook for the book [Deep Learning with Python, Third Edition](https://www.manning.com/books/deep-learning-with-python-third-edition). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.

**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**

In [0]:
!pip install keras-nightly --upgrade -q

In [0]:
import os
# Select JAX as the Keras backend. This assignment sits above every
# `import keras` in the notebook; per the Keras documentation the backend has
# to be configured before Keras is imported.
os.environ["KERAS_BACKEND"] = "jax"

### Two families of object detection models

##### The R-CNN architecture

##### Single-stage detectors

#### Object detection with a pretrained model: RetinaNet

##### Introducing the dataset: Pascal VOC

In [0]:
# Download the Pascal VOC 2007 trainval and test archives into the current
# working directory.
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar

# Unpack both archives. The path constants defined in the next cell expect a
# VOCdevkit/VOC2007 directory under the working directory.
!tar -xf VOCtrainval_06-Nov-2007.tar
!tar -xf VOCtest_06-Nov-2007.tar

In [0]:
import os
import numpy as np
import xml.etree.ElementTree as ET
import tensorflow as tf

# Directories of the extracted Pascal VOC 2007 devkit, resolved against the
# current working directory.
BASE_DIR = os.path.join(os.getcwd(), "VOCdevkit", "VOC2007")
IMAGE_DIR = os.path.join(BASE_DIR, "JPEGImages")
ANNOTATION_DIR = os.path.join(BASE_DIR, "Annotations")
IMAGESET_DIR = os.path.join(BASE_DIR, "ImageSets", "Main")

# Integer class index -> Pascal VOC class name. The index of a name is its
# position in the list below.
CLASSES = dict(
    enumerate(
        [
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        ]
    )
)

def parse_annotation(path):
    """Parse one Pascal VOC XML annotation file into boxes and class labels.

    Objects flagged as ``difficult`` are skipped. Box corners are divided by
    the image width/height stored in the file's ``<size>`` element, so the
    returned coordinates are relative and ordered
    ``[ymin, xmin, ymax, xmax]``.

    Args:
        path: Path to the ``.xml`` annotation file.

    Returns:
        A ``(bboxes, labels)`` tuple. ``bboxes`` is a ``tf.float32`` tensor of
        shape ``(num_objects, 4)``; ``labels`` is a ``tf.int64`` tensor of
        shape ``(num_objects,)`` holding keys of ``CLASSES``. Both have a
        leading dimension of 0 when every object in the file is skipped.

    Raises:
        KeyError: If an object's name is not one of the ``CLASSES`` values.
    """
    root = ET.parse(path).getroot()

    # The image size belongs to the file, not to each object: read it once.
    size = root.find("size")
    width = float(size.find("width").text)
    height = float(size.find("height").text)

    # Reverse lookup (class name -> index), built once per file instead of
    # scanning CLASSES for every object.
    class_index_by_name = {name: index for index, name in CLASSES.items()}

    bboxes = []
    labels = []
    for obj in root.findall("object"):
        if int(obj.find("difficult").text):
            continue

        bbox = obj.find("bndbox")
        xmin = float(bbox.find("xmin").text) / width
        ymin = float(bbox.find("ymin").text) / height
        xmax = float(bbox.find("xmax").text) / width
        ymax = float(bbox.find("ymax").text) / height
        bboxes.append([ymin, xmin, ymax, xmax])
        labels.append(class_index_by_name[obj.find("name").text])

    # Reshape so that a file with no usable objects still yields a (0, 4)
    # tensor rather than a rank-1 empty tensor.
    bboxes = tf.reshape(tf.constant(bboxes, dtype=tf.float32), (-1, 4))
    # Class indices are integers; int64 is the dtype that get_dataset declares
    # for the labels in its tf.py_function call.
    labels = tf.constant(labels, dtype=tf.int64)
    return bboxes, labels

def process_example(image_id):
    """Load the image and the parsed annotation belonging to one image id.

    Args:
        image_id: Eager tensor holding the image id; its ``.numpy()`` value is
            converted to ``str`` and trailing whitespace is removed.

    Returns:
        ``(image, bboxes, labels)``: the JPEG decoded with 3 channels, followed
        by the two tensors returned by ``parse_annotation``.
    """
    stem = tf.compat.as_str_any(image_id.numpy()).rstrip()
    jpeg_path = os.path.join(IMAGE_DIR, f"{stem}.jpg")
    annotation_path = os.path.join(ANNOTATION_DIR, f"{stem}.xml")

    # Same order as before: read and decode the image first, then parse.
    image = tf.io.decode_jpeg(tf.io.read_file(jpeg_path), channels=3)
    bboxes, labels = parse_annotation(annotation_path)
    return image, bboxes, labels

In [0]:
def get_dataset(split, shuffle_files=True, shuffle_buffer_size=1000):
    """Build a ``tf.data`` pipeline over one Pascal VOC image set.

    Args:
        split: Name of the image set; ``{split}.txt`` is read from
            ``IMAGESET_DIR`` and must list one image id per line.
        shuffle_files: Whether to shuffle the image ids before loading.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.

    Returns:
        A prefetched dataset of dicts with an ``"image"`` entry and an
        ``"objects"`` entry holding ``"bbox"`` and ``"label"``.
    """
    ids_path = os.path.join(IMAGESET_DIR, f"{split}.txt")
    with open(ids_path, "r") as id_file:
        image_ids = [line.strip() for line in id_file]

    def load_example(image_id):
        # process_example runs as eager Python code, so it is wrapped in
        # tf.py_function.
        # NOTE(review): Tout declares the labels as tf.int64 -- confirm that
        # process_example returns labels with that dtype.
        return tf.py_function(
            func=process_example,
            inp=[image_id],
            Tout=[tf.uint8, tf.float32, tf.int64],
        )

    def to_record(image, bbox, label):
        # tf.py_function outputs carry no static shape; pin the known ranks.
        return {
            "image": tf.ensure_shape(image, [None, None, 3]),
            "objects": {
                "bbox": tf.ensure_shape(bbox, [None, 4]),
                "label": tf.ensure_shape(label, [None]),
            },
        }

    ds = tf.data.Dataset.from_tensor_slices(image_ids)
    if shuffle_files:
        ds = ds.shuffle(shuffle_buffer_size)
    ds = ds.map(load_example, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.map(to_record)
    return ds.prefetch(tf.data.AUTOTUNE)

# Training ids are shuffled. The evaluation split keeps the order of its
# image-set file, so validation batches (and the evaluation batch plotted
# later in the notebook) are the same on every run.
train_ds = get_dataset("trainval", shuffle_files=True)
eval_ds = get_dataset("test", shuffle_files=False)

In [0]:
# Imported in this cell so the plot works on a fresh top-to-bottom run: the
# notebook's other import of this name only appears in a later cell.
from keras.visualization import plot_bounding_box_gallery

# Pull one raw example and draw its ground-truth boxes. The boxes are in
# relative [ymin, xmin, ymax, xmax] form, hence "rel_yxyx".
example = next(iter(train_ds))

plot_bounding_box_gallery(
    np.array([example["image"]]),
    bounding_box_format="rel_yxyx",
    y_true={
        "boxes": np.array([example["objects"]["bbox"]]),
        "labels": np.array([example["objects"]["label"]]),
    },
    scale=8,
    class_mapping=CLASSES,
)

##### Bounding box formats

##### Setting up an image preprocessing and augmentation pipeline

In [0]:
import keras

# Box format shared by the preprocessing layers and the plots of prepared
# data: parse_record converts the dataset's "rel_yxyx" boxes into this format
# using each image's height and width.
BBOX_FORMAT = "yxyx"

def parse_record(record):
    """Convert a raw dataset record into the layout used for preprocessing.

    Args:
        record: Dict with an ``"image"`` entry and an ``"objects"`` entry that
            holds ``"bbox"`` (boxes in ``"rel_yxyx"`` format) and ``"label"``.

    Returns:
        Dict with ``"images"`` (the unchanged image) and ``"bounding_boxes"``,
        which holds ``"boxes"`` converted to ``BBOX_FORMAT`` and ``"labels"``
        cast to ``tf.int32``.
    """
    image = record["image"]
    objects = record["objects"]

    # The conversion away from relative coordinates needs the image size.
    image_shape = tf.shape(image)
    converted_boxes = keras.utils.bounding_boxes.convert_format(
        objects["bbox"],
        source="rel_yxyx",
        target=BBOX_FORMAT,
        height=image_shape[0],
        width=image_shape[1],
    )

    return {
        "images": image,
        "bounding_boxes": {
            "boxes": converted_boxes,
            "labels": tf.cast(objects["label"], dtype=tf.int32),
        },
    }

In [0]:
from keras.visualization import plot_bounding_box_gallery

# Target (height, width) handed to the Resizing layer, and the batch size used
# when the prepared datasets are built below.
IMAGE_SIZE = (640, 640)
BATCH_SIZE = 4

# Resizes every example to IMAGE_SIZE; the layer is told the box format via
# bounding_box_format.
# NOTE(review): pad_to_aspect_ratio=True presumably pads instead of stretching
# the image -- confirm against the Keras Resizing documentation.
resizing = keras.layers.Resizing(
    height=IMAGE_SIZE[0],
    width=IMAGE_SIZE[1],
    interpolation="bilinear",
    pad_to_aspect_ratio=True,
    bounding_box_format=BBOX_FORMAT,
)

# NOTE(review): presumably brings every example to exactly 100 boxes so that
# examples with different object counts can be batched -- confirm.
max_box_layer = keras.layers.MaxNumBoundingBoxes(
    max_number=100,
    bounding_box_format=BBOX_FORMAT,
)

# Augmentation layers that prepare_dataset applies, in list order.
data_augmentation_layers = [
    keras.layers.RandomFlip(mode="horizontal", bounding_box_format=BBOX_FORMAT),
]

def prepare_dataset(ds, batch_size=4):
    """Apply parsing, resizing, augmentation and batching to a raw dataset.

    Args:
        ds: Dataset of records accepted by ``parse_record``.
        batch_size: Number of examples per batch; an incomplete final batch is
            dropped (``drop_remainder=True``).

    Returns:
        The batched dataset with prefetching enabled.
    """
    pipeline = ds.map(parse_record).map(lambda record: resizing(record))
    for augmentation in data_augmentation_layers:
        pipeline = pipeline.map(lambda record: augmentation(record))
    pipeline = pipeline.map(max_box_layer)
    batched = pipeline.batch(batch_size, drop_remainder=True)
    return batched.prefetch(tf.data.AUTOTUNE)

# Build the batched training and evaluation pipelines.
train_ds_prepared = prepare_dataset(train_ds, batch_size=BATCH_SIZE)
eval_ds_prepared = prepare_dataset(eval_ds, batch_size=BATCH_SIZE)

# One raw example, with boxes still in relative coordinates.
first_images_unprepared = next(iter(train_ds.take(1)))

plot_bounding_box_gallery(
    np.array([first_images_unprepared["image"]]),
    bounding_box_format="rel_yxyx",
    y_true={
        "boxes": np.array([first_images_unprepared["objects"]["bbox"]]),
        "labels": np.array([first_images_unprepared["objects"]["label"]]),
    },
    scale=4,
    class_mapping=CLASSES,
)

# One example after the preparation pipeline. train_ds is shuffled, so this is
# not necessarily the same image as the one plotted above.
first_images_prepared = next(iter(train_ds_prepared.unbatch().take(1)))

# The prepared boxes are in BBOX_FORMAT; use the constant rather than a
# repeated string literal so the plot stays in sync with the pipeline.
plot_bounding_box_gallery(
    np.array([first_images_prepared["images"]]),
    bounding_box_format=BBOX_FORMAT,
    y_true={
        "boxes": np.array([first_images_prepared["bounding_boxes"]["boxes"]]),
        "labels": np.array([first_images_prepared["bounding_boxes"]["labels"]]),
    },
    scale=4,
    class_mapping=CLASSES,
)

##### Fine-tuning the RetinaNet object detection model

In [0]:
import keras_hub

# Load the "retinanet_resnet50_fpn_coco" preset as an object detector, using
# the preset's own configuration (no num_classes override).
model = keras_hub.models.ImageObjectDetector.from_preset(
    "retinanet_resnet50_fpn_coco"
)

In [0]:
# Same preset, but with num_classes set to the number of Pascal VOC classes.
# NOTE(review): going by the variable name, this presumably gives the model a
# freshly initialized classification head -- confirm. This model is not
# referenced by the training cell that follows, which fits `model`.
model_with_random_head = keras_hub.models.ImageObjectDetector.from_preset(
    "retinanet_resnet50_fpn_coco",
    num_classes=len(CLASSES),
)

In [0]:
def split_labels(x):
    """Split a prepared example into an ``(inputs, targets)`` pair.

    Args:
        x: Dict with ``"images"`` and ``"bounding_boxes"``; the latter holds
            ``"boxes"`` and ``"labels"``.

    Returns:
        A 2-tuple: the images, and a dict with ``"boxes"`` plus the labels
        stored under the key ``"classes"``.
    """
    images = x["images"]
    annotations = x["bounding_boxes"]
    targets = {
        "boxes": annotations["boxes"],
        "classes": annotations["labels"],
    }
    return images, targets

# Turn each prepared batch into the (inputs, targets) pair passed to fit().
# NOTE: the dataset names are rebound in place, so this cell should run only
# once per kernel session; a second run would apply split_labels to tuples.
train_ds_prepared, eval_ds_prepared = (
    train_ds_prepared.map(split_labels),
    eval_ds_prepared.map(split_labels),
)

# Write the model to disk only when the monitored validation loss improves.
best_checkpoint = keras.callbacks.ModelCheckpoint(
    "pascal_voc_detection.keras",
    monitor="val_loss",
    save_best_only=True,
)
callbacks = [best_checkpoint]

# NOTE(review): this fits `model` (the preset loaded without num_classes);
# `model_with_random_head` built earlier is not used here -- confirm which
# model is meant to be fine-tuned.
history = model.fit(
    train_ds_prepared,
    epochs=10,
    validation_data=eval_ds_prepared,
    callbacks=callbacks,
)

##### Metrics, evaluation, and inference

In [0]:
import matplotlib.pyplot as plt

# Reload the file written by the ModelCheckpoint callback in the training
# cell; this rebinds `model`.
model = keras.models.load_model("pascal_voc_detection.keras")
# Take one (images, targets) batch from the evaluation set and run inference.
images, gt_boxes = next(iter(eval_ds_prepared))
predictions = model.predict(images)

# Draw ground truth and predictions for the batch.
# NOTE(review): assumes `predictions` is a dict with "boxes" and "classes"
# keys -- confirm against the installed keras_hub version.
plot_bounding_box_gallery(
    images,
    bounding_box_format=BBOX_FORMAT,
    y_true={
        "boxes": gt_boxes["boxes"],
        "labels": gt_boxes["classes"],
    },
    y_pred={
        "boxes": predictions["boxes"],
        "labels": predictions["classes"],
    },
    scale=8,
    class_mapping=CLASSES,
)

### Chapter summary