In [1]:
import torch
import torchvision
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import ToTensor



In [2]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

num_classes = 6
in_features = model.roi_heads.box_predictor.cls_score.in_features

model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

## model

In [3]:
root_path = "data/images"
ann_path = 'data/train.json'
coco_det = datasets.CocoDetection(
    root=root_path, annFile=ann_path, transform=ToTensor()
)

loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


## labels

In [4]:
# from tqdm import tqdm
#
# labels = set()
# for img, targets in tqdm(iter(coco_det)):
#     for target in targets:
#         labels.add(target['category_id'])
# labels

labels = {87, 131, 318, 588, 1034}
labels = sorted(list(labels))
label_idx = {l: i for i, l in enumerate(sorted(list(labels)))}
label_idx

{87: 0, 131: 1, 318: 2, 588: 3, 1034: 4}

In [5]:
def collate_fn_coco(batch):
    return tuple(zip(*batch))


data_loader = torch.utils.data.DataLoader(
    coco_det, batch_size=2, shuffle=True, num_workers=0, collate_fn=collate_fn_coco)
# For Training
images, targets = next(iter(data_loader))



In [6]:
# - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
#   ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
# - labels (Int64Tensor[N]): the class label for each ground-truth box
# print(targets)


def process_targets(targets):
    out = []
    for target in targets:
        print(target)
        boxes = []
        for t in target:
            x1, y1, w, h, = t['bbox']
            boxes.append([x1, y1, x1 + w, y1 + h])
        boxes = torch.tensor(boxes)
        labels = torch.tensor([label_idx[t['category_id']] for t in target], dtype=torch.int64)
        out.append({'boxes': boxes,
                    'labels': labels})
    return out


targets = process_targets(targets)

[{'id': 4569006, 'image_id': 5482560941588709616, 'freebase_id': '/m/017ftj', 'category_id': 1034, 'iscrowd': False, 'bbox': [442.24, 274.69, 154.88, 56.35], 'area': 8726.89}]
[{'id': 6155903, 'image_id': 7471380254599643684, 'freebase_id': '/m/017ftj', 'category_id': 1034, 'iscrowd': False, 'bbox': [144.0, 264.32, 111.36, 52.48], 'area': 5844.17}]


In [9]:
# images = list(image for image in images)
# targets = [{k: v for k, v in t.items()} for t in targets]
output = model(images, targets)  # Returns losses and detections
output
# For inference
# model.eval()
# x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
# predictions = model(images)
# predictions

{'loss_classifier': tensor(2.0073, grad_fn=<NllLossBackward>),
 'loss_box_reg': tensor(0.0119, grad_fn=<DivBackward0>),
 'loss_objectness': tensor(0.0722, grad_fn=<BinaryCrossEntropyWithLogitsBackward>),
 'loss_rpn_box_reg': tensor(0.0082, grad_fn=<DivBackward0>)}