# Exploring COCO & DETR for Object Detection

Let's first make sure that a GPU is available.

In [None]:
!nvidia-smi

## Prep imports and helper functions

In [None]:
# Imports
import sys
sys.path.append('/')
sys.path.append('/detr')
import os
import json
from attrdict import AttrDict

import numpy as np
import torch
from torch import nn
import torchvision.transforms as T


import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from determined.experimental import Determined
from determined.pytorch import PyTorchTrialContext

sys.path.append('detr_coco_pytorch')
from data import CocoDetection, build_dataset
from model import build_model

In [None]:
# Utility functions
# COCO class names, ordered so that a label id can be used directly as a list
# index (see `CLASSES[cl]` in plot_results). The 'N/A' entries are placeholders
# that keep every real name at the position matching its id.
CLASSES = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]

# Colors for visualization: six RGB triples with components in [0, 1]. The
# plotting code repeats this list so consecutive boxes get different colors.
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

def plot_results(pil_img, class_ids, boxes):
    """Show an image with labelled bounding boxes drawn on top of it.

    Args:
        pil_img: Image handed to ``plt.imshow``.
        class_ids: Iterable of label ids; each one indexes ``CLASSES``.
        boxes: Object with a ``tolist()`` method yielding one
            ``(xmin, ymin, xmax, ymax)`` entry per box, in pixel coordinates.
    """
    plt.figure(figsize=(16,10))
    plt.imshow(pil_img)
    axes = plt.gca()

    # Repeat the palette so that zip() does not stop after six boxes.
    palette = COLORS * 100
    for label, corners, edge_color in zip(class_ids, boxes.tolist(), palette):
        left, top, right, bottom = corners
        outline = plt.Rectangle((left, top), right - left, bottom - top,
                                fill=False, color=edge_color, linewidth=3)
        axes.add_patch(outline)

        # Class name anchored at the box's (xmin, ymin) corner.
        caption = f'{CLASSES[label]}'
        axes.text(left, top, caption, fontsize=15,
                  bbox=dict(facecolor='yellow', alpha=0.5))

    plt.axis('off')
    plt.show()

# Standard preprocessing pipeline: resize to 800, convert to a tensor, then
# normalize each channel with the mean / std values below.
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    T.Resize(800),
    T.ToTensor(),
    T.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD),
]
transform = T.Compose(_preprocessing_steps)

def box_cxcywh_to_xyxy(x):
    """Convert boxes from center/size format to corner format.

    Args:
        x: Tensor whose last dimension has size 4 and holds
            ``(center_x, center_y, width, height)``. Any number of leading
            dimensions is accepted: a single box ``(4,)``, a list of boxes
            ``(N, 4)``, a batch ``(B, N, 4)``, ...

    Returns:
        Tensor with the same shape as ``x`` holding
        ``(xmin, ymin, xmax, ymax)`` in its last dimension.
    """
    # Work on the last dimension so the leading (batch) layout does not matter.
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)

def rescale_bboxes(out_bbox, size):
    """Turn relative center/size boxes into corner boxes in pixel units.

    Args:
        out_bbox: Tensor of ``(center_x, center_y, width, height)`` boxes,
            multiplied here by the image width / height (so the values are
            expected to be fractions of the image size).
        size: ``(width, height)`` pair of the target image.

    Returns:
        Tensor of ``(xmin, ymin, xmax, ymax)`` boxes scaled by ``size``.
    """
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    # Build the scale on the boxes' own device; a CPU-only scale tensor would
    # raise a device-mismatch error for boxes that live on a GPU.
    scale = torch.tensor([img_w, img_h, img_w, img_h],
                         dtype=torch.float32, device=b.device)
    return b * scale

# COCO Dataset

We will be working with the COCO 2017 dataset, so let's first get familiar with what the data looks like. Since the notebook you're using is launched and managed from the Determined cluster, it has already been configured with access to a Google Cloud Storage bucket containing the COCO 2017 data. We can just point the dataset at the right bucket to load the image files.

In [None]:
# Build the COCO validation dataset. No transforms are applied and mask
# loading is turned off, since this notebook only plots images and boxes.
# NOTE(review): the positional arguments look like (storage backend, bucket
# name, image directory inside the bucket, local annotation file) — confirm
# against CocoDetection in detr_coco_pytorch/data.py.
dataset_val = CocoDetection(
    'gcs',
    'determined-ai-coco-dataset',
    'determined-ai-coco-dataset/val2017',
    '/tmp/instances_val2017.json',
    transforms=None,
    return_masks=False,
)

The COCO 2017 detection dataset has 80 object classes (category ids run up to 90 because some ids are unused), and each class is grouped under a broader supercategory. Our task is to predict bounding boxes and class labels for the fine-grained classes.

In [None]:
dataset_val.coco.cats

Let's take a look at one of the images along with the associated target bounding boxes and classes.  As you can see, boxes can overlap and there can be multiple instances of the same object in one image.

In [None]:
datapoint = dataset_val[237]

In [None]:
plot_results(datapoint[0], datapoint[1]['labels'], datapoint[1]['boxes'])    


# Overview of DETR architecture (from FAIR's [DETR Demo](https://colab.research.google.com/github/facebookresearch/detr/blob/colab/notebooks/detr_demo.ipynb#scrollTo=h91rsIPl7tVl))

![](detr_coco_pytorch/imgs/detr_architecture.png)

In [None]:
class DETRdemo(nn.Module):
    """
    Demo DETR implementation.

    Demo implementation of DETR in minimal number of lines, with the
    following differences wrt DETR in the paper:
    * learned positional encoding (instead of sine)
    * positional encoding is passed at input (instead of attention)
    * fc bbox predictor (instead of MLP)
    The model achieves ~40 AP on COCO val5k and runs at ~28 FPS on Tesla V100.

    Inputs are a plain image tensor of shape (batch_size, 3, height, width);
    all images in a batch therefore share one size. The learned positional
    embeddings have 50 rows / columns, so the backbone feature map must not
    exceed 50 x 50.
    """
    def __init__(self, num_classes, hidden_dim=256, nheads=8,
                 num_encoder_layers=6, num_decoder_layers=6):
        """Build the backbone, transformer, prediction heads and embeddings.

        Args:
            num_classes: Number of object classes; the classifier gets one
                additional output for empty slots.
            hidden_dim: Transformer feature width (must be even, it is split
                in half between the row and column embeddings).
            nheads: Number of attention heads.
            num_encoder_layers: Number of transformer encoder layers.
            num_decoder_layers: Number of transformer decoder layers.
        """
        super().__init__()

        # Imported locally so the class does not depend on a file-level
        # import of resnet50 being present.
        from torchvision.models import resnet50

        # create ResNet-50 backbone; the classification layer is unused
        self.backbone = resnet50()
        del self.backbone.fc

        # create conversion layer (1x1 conv: 2048 backbone channels -> hidden_dim)
        self.conv = nn.Conv2d(2048, hidden_dim, 1)

        # create a default PyTorch transformer
        self.transformer = nn.Transformer(
            hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

        # prediction heads, one extra class for empty ("no object") slots
        # note that in baseline DETR linear_bbox layer is 3-layer MLP
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
        self.linear_bbox = nn.Linear(hidden_dim, 4)

        # output positional encodings (object queries)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))

        # spatial positional encodings
        # note that in baseline DETR we use sine positional encodings
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))

    def forward(self, inputs):
        """Run detection on a batch of images.

        Args:
            inputs: Image tensor of shape (batch_size, 3, height, width).

        Returns:
            Dict with ``'pred_logits'`` of shape
            (batch_size, num_queries, num_classes + 1) and ``'pred_boxes'`` of
            shape (batch_size, num_queries, 4); boxes go through a sigmoid,
            so their values lie in [0, 1].
        """
        # propagate inputs through ResNet-50 up to avg-pool layer
        x = self.backbone.conv1(inputs)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)

        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)

        # convert from 2048 to hidden_dim feature planes for the transformer
        h = self.conv(x)  # batch_size x hidden_dim x H x W
        batch_size = h.shape[0]

        # construct positional encodings: each location concatenates its
        # column embedding with its row embedding
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)  # HW x 1 x hidden_dim

        # propagate through the transformer
        # add positional encoding to cnn features (pos broadcasts over batch)
        tr_enc_input = pos + 0.1 * h.flatten(2).permute(2, 0, 1)  # HW x batch_size x hidden_dim
        # Get the object queries ready to input into the decoder. They are
        # repeated per image because nn.Transformer requires the source and
        # target batch sizes to match.
        tr_dec_input = self.query_pos.unsqueeze(1).repeat(1, batch_size, 1)  # num_queries x batch_size x hidden_dim
        h = self.transformer(tr_enc_input, tr_dec_input).transpose(0, 1)  # batch_size x num_queries x hidden_dim

        # finally project transformer outputs to class labels and bounding boxes
        return {'pred_logits': self.linear_class(h),
                'pred_boxes': self.linear_bbox(h).sigmoid()}

In [None]:
# DETR in Determined
from model_def import DETRTrial

In [None]:
??DETRTrial

# Submit experiment to Determined Cluster

Once the trial definition is ready to go, we can configure an experiment with the desired hyperparameters and resources with a `.yaml` config file.  This separation between config and code promotes reproducibility down the road.  We also track the environment and code used for each experiment so that users can easily replicate their results in the future.  

The magic of Determined is that users need to change very few things to run advanced ML workflows such as multi-node distributed training and advanced hyperparameter search.  

In [None]:
! cat detr_coco_pytorch/finetune_gcs.yaml

### You can test this out by running `det e create finetune_gcs.yaml .` from the `detr_coco_pytorch` folder using the terminal.

# Loading models from registry

After an experiment finishes running, we can optionally track the resulting checkpoints in the model registry. This can be done through the command line (see below) or programmatically with Python.

`det model create <model_name>`

`det model register-version <model_name> <checkpoint_uuid>`

Once a checkpoint has been registered under a model, it can be easily accessed for future use.

In [None]:
# Look up the model registered as "detr" and download its checkpoint files.
registry_model = Determined().get_model("detr")
model_version = registry_model.get_version()
ckpt_path = model_version.download()

# map_location='cpu' lets a checkpoint that was saved from a GPU load on any
# machine; the inference cell below feeds the model CPU tensors.
ckpt = torch.load(os.path.join(ckpt_path, 'state_dict.pth'), map_location='cpu')

# Read the hyperparameters stored next to the weights; the context manager
# closes the file handle once it has been parsed.
with open(os.path.join(ckpt_path, 'metadata.json'), 'r') as metadata_file:
    metadata = json.load(metadata_file)
hparams = AttrDict(metadata['hparams'])

# Rebuild the network from those hyperparameters and restore its weights.
model, _, postprocessor = build_model(hparams, 1)
model.load_state_dict(ckpt['models_state_dict'][0])

In [None]:
model.eval()
raw_img = dataset_val[772][0]
# Preprocess and add a leading batch dimension of size 1.
img = transform(raw_img).unsqueeze(0)

# Inference only: skip autograd bookkeeping so the outputs are plain tensors
# (less memory, and safe to hand to matplotlib).
with torch.no_grad():
    outputs = model(img)

# keep only predictions with 0.9+ confidence
# ([0, :, :-1] takes the single image in the batch and drops the last class
# column — presumably DETR's extra "no object" class, which has no CLASSES name)
probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
keep = probas.max(-1).values > 0.9

# Boxes are rescaled to the size of the original (un-resized) image.
bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], raw_img.size)
plot_results(raw_img, probas[keep].argmax(-1), bboxes_scaled)

## Visualizing decoder attention weights (modified from [DETR notebook](https://colab.research.google.com/github/facebookresearch/detr/blob/colab/notebooks/detr_attention.ipynb#scrollTo=PcxWAOzOYTEn))

In [None]:
def visualize_decoder_weights(model, raw_img, img, threshold=0.9):
    """Plot the decoder attention map and box of every confident detection.

    For each kept query the top row shows the attention weights of the last
    decoder layer over the backbone feature map, and the bottom row shows the
    image with the predicted box. Everything is derived from one forward pass
    on ``img`` inside this function.

    Args:
        model: DETR model exposing ``backbone`` and ``transformer.decoder``.
        raw_img: Original image; shown in the plot and its ``size``
            (width, height) is used to rescale the boxes.
        img: Preprocessed image tensor with a leading batch dimension of 1.
        threshold: Minimum class probability for a query to be plotted.
    """
    conv_features, dec_attn_weights = [], []

    # Capture the backbone feature maps and the cross-attention weights of the
    # last decoder layer while the model runs.
    hooks = [
        model.backbone[-2].register_forward_hook(
            lambda module, inputs, output: conv_features.append(output)
        ),
        model.transformer.decoder.layers[-1].multihead_attn.register_forward_hook(
            lambda module, inputs, output: dec_attn_weights.append(output[1])
        ),
    ]

    # propagate through the model; always detach the hooks afterwards so a
    # failed forward pass does not leave them registered on the model
    try:
        with torch.no_grad():
            outputs = model(img)
    finally:
        for hook in hooks:
            hook.remove()

    # don't need the list anymore
    conv_features = conv_features[0]
    dec_attn_weights = dec_attn_weights[0]

    # Select the confident queries from this forward pass (the last class
    # column is dropped before taking the maximum probability).
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1].cpu()
    keep = probas.max(-1).values > threshold
    query_ids = keep.nonzero().flatten().tolist()
    if not query_ids:
        # plt.subplots cannot create a grid with zero columns.
        print(f'No predictions above confidence {threshold}; nothing to plot.')
        return

    pred_boxes = outputs['pred_boxes'][0].cpu()
    bboxes_scaled = rescale_bboxes(pred_boxes[keep], raw_img.size)

    # get the feature map shape
    h, w = conv_features['0'].tensors.shape[-2:]

    # squeeze=False keeps axs two-dimensional even for a single detection.
    fig, axs = plt.subplots(ncols=len(query_ids), nrows=2, figsize=(22, 7),
                            squeeze=False)
    for idx, ax_i, (xmin, ymin, xmax, ymax) in zip(query_ids, axs.T,
                                                   bboxes_scaled.tolist()):
        ax = ax_i[0]
        ax.imshow(dec_attn_weights[0, idx].view(h, w).detach().cpu())
        ax.axis('off')
        ax.set_title(f'query id: {idx}')
        ax = ax_i[1]
        ax.imshow(raw_img)
        ax.add_patch(plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                                   fill=False, color='blue', linewidth=3))
        ax.axis('off')
        ax.set_title(CLASSES[probas[idx].argmax().item()])
    fig.tight_layout()

In [None]:
visualize_decoder_weights(model, raw_img, img)

# Configure HP search experiment

To tune hyperparameters of our finetune experiment, we simply need to modify a few fields in the experiment config.

In [None]:
! cat detr_coco_pytorch/finetune_adaptive.yaml

### You can test this out by running `det e create finetune_adaptive.yaml .` from the `detr_coco_pytorch` folder using the terminal.