In [1]:
import torch
import torch.nn as nn
import torch.utils.data
import torchvision
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import models, datasets, tv_tensors
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights 
from torchvision.models.detection import maskrcnn_resnet50_fpn
from transformers import pipeline

from torchvision.models.detection.rpn import concat_box_prediction_layers
from torchvision.models.detection.roi_heads import fastrcnn_loss
from torchvision.transforms.functional import to_pil_image, pil_to_tensor

In [2]:
class AddDepth(torch.nn.Module):
    def forward(self, image, label):
        pipe = pipeline(task="depth-estimation",
                    model="LiheYoung/depth-anything-small-hf")
        depth = pipe(to_pil_image(image))["depth"]
        depth_tensor = pil_to_tensor(depth)
        rgbd = torch.concatenate([image, depth_tensor], dim=0)
        return rgbd, label

In [3]:
class ToDepth:
    def __call__(self, image):
        pipe = pipeline(task="depth-estimation",
                        model="LiheYoung/depth-anything-small-hf")
        depth = pipe(image)["depth"]
        depth_tensor = pil_to_tensor(depth)
        rgbd = torch.concatenate([pil_to_tensor(image), depth_tensor], dim=0)
        return rgbd

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"

In [4]:
# transform = weights.transforms()
transform = transforms.Compose([
    # AddDepth()
    ToDepth()
])

In [6]:
BATCH_SIZE = 2

In [7]:
train_dataset = datasets.CocoDetection(root="../data/coco/train2017", 
                                       annFile="../data/coco/annotations/instances_train2017.json",
                                       transform=transform)
train_dataset = datasets.wrap_dataset_for_transforms_v2(train_dataset, target_keys=["boxes", "labels", "masks"])
print("\nTrain samples size:", len(train_dataset), end='\n\n')

val_dataset = datasets.CocoDetection(root="../data/coco/val2017", 
                                     annFile="../data/coco/annotations/instances_val2017.json",
                                     transform=transform)
val_dataset = datasets.wrap_dataset_for_transforms_v2(val_dataset, target_keys=["boxes", "labels", "masks"])                                     
print("\nValidation samples size:", len(val_dataset), end='\n\n')

train_loader = DataLoader(dataset=train_dataset, 
                          batch_size=BATCH_SIZE,
                          shuffle=False,
                          collate_fn=lambda batch: tuple(zip(*batch)))

val_loader = DataLoader(dataset=val_dataset, 
                        batch_size=BATCH_SIZE,
                        shuffle=False,
                        collate_fn=lambda batch: tuple(zip(*batch)))

loading annotations into memory...
Done (t=9.35s)
creating index...
index created!

Train samples size: 118287

loading annotations into memory...
Done (t=0.27s)
creating index...
index created!

Validation samples size: 5000



In [8]:
def eval_forward(model, images, targets):
    # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
    """
    Args:
        images (list[Tensor]): images to be processed
        targets (list[Dict[str, Tensor]]): ground-truth boxes present in the image (optional)
    Returns:
        result (list[BoxList] or dict[Tensor]): the output from the model.
            It returns list[BoxList] contains additional fields
            like `scores`, `labels` and `mask` (for Mask R-CNN models).
    """
    model.eval()

    original_image_sizes: List[Tuple[int, int]] = []
    for img in images:
        val = img.shape[-2:]
        assert len(val) == 2
        original_image_sizes.append((val[0], val[1]))

    images, targets = model.transform(images, targets)

    # Check for degenerate boxes
    # TODO: Move this to a function
    if targets is not None:
        for target_idx, target in enumerate(targets):
            boxes = target["boxes"]
            degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
            if degenerate_boxes.any():
                # print the first degenerate box
                bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                degen_bb: List[float] = boxes[bb_idx].tolist()
                raise ValueError(
                    "All bounding boxes should have positive height and width."
                    f" Found invalid box {degen_bb} for target at index {target_idx}."
                )

    features = model.backbone(images.tensors)
    if isinstance(features, torch.Tensor):
        features = OrderedDict([("0", features)])
    model.rpn.training = True
    # model.roi_heads.training=True

    # proposals, proposal_losses = model.rpn(images, features, targets)
    features_rpn = list(features.values())
    objectness, pred_bbox_deltas = model.rpn.head(features_rpn)
    anchors = model.rpn.anchor_generator(images, features_rpn)

    num_images = len(anchors)
    num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
    num_anchors_per_level = [s[0] * s[1] * s[2]
                             for s in num_anchors_per_level_shape_tensors]
    objectness, pred_bbox_deltas = concat_box_prediction_layers(
        objectness, pred_bbox_deltas)
    # apply pred_bbox_deltas to anchors to obtain the decoded proposals
    # note that we detach the deltas because Faster R-CNN do not backprop through
    # the proposals
    proposals = model.rpn.box_coder.decode(pred_bbox_deltas.detach(), anchors)
    proposals = proposals.view(num_images, -1, 4)
    proposals, scores = model.rpn.filter_proposals(
        proposals, objectness, images.image_sizes, num_anchors_per_level)

    proposal_losses = {}
    assert targets is not None
    labels, matched_gt_boxes = model.rpn.assign_targets_to_anchors(
        anchors, targets)
    regression_targets = model.rpn.box_coder.encode(matched_gt_boxes, anchors)
    loss_objectness, loss_rpn_box_reg = model.rpn.compute_loss(
        objectness, pred_bbox_deltas, labels, regression_targets
    )
    proposal_losses = {
        "loss_objectness": loss_objectness,
        "loss_rpn_box_reg": loss_rpn_box_reg,
    }

    # detections, detector_losses = model.roi_heads(features, proposals, images.image_sizes, targets)
    image_shapes = images.image_sizes
    proposals, matched_idxs, labels, regression_targets = model.roi_heads.select_training_samples(
        proposals, targets)
    box_features = model.roi_heads.box_roi_pool(
        features, proposals, image_shapes)
    box_features = model.roi_heads.box_head(box_features)
    class_logits, box_regression = model.roi_heads.box_predictor(box_features)

    result: List[Dict[str, torch.Tensor]] = []
    detector_losses = {}
    loss_classifier, loss_box_reg = fastrcnn_loss(
        class_logits, box_regression, labels, regression_targets)
    detector_losses = {"loss_classifier": loss_classifier,
                       "loss_box_reg": loss_box_reg}
    boxes, scores, labels = model.roi_heads.postprocess_detections(
        class_logits, box_regression, proposals, image_shapes)
    num_images = len(boxes)
    for i in range(num_images):
        result.append(
            {
                "boxes": boxes[i],
                "labels": labels[i],
                "scores": scores[i],
            }
        )
    detections = result
    detections = model.transform.postprocess(
        detections, images.image_sizes, original_image_sizes)  # type: ignore[operator]
    model.rpn.training = False
    model.roi_heads.training = False
    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses, detections

In [9]:
# def filter_images_with_labels(dataset):
#     filtered_indices = []
#     for idx in range(len(dataset)):
#         _, ann = dataset[idx]
#         if ('boxes' in ann) and ('labels' in ann):  # If annotations exist
#             filtered_indices.append(idx)
#     return filtered_indices

# len(filter_images_with_labels(train_dataset_m))

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("\nAvailable device:", device)

# weights = ResNet50_Weights.IMAGENET1K_V1
# model = resnet50(weights=weights)
# model = fasterrcnn_resnet50_fpn_v2(weights=FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1)
model = models.get_model("maskrcnn_resnet50_fpn_v2", weights=None, weights_backbone=None).train()

model_depth = models.get_model("maskrcnn_resnet50_fpn_v2", weights=None, weights_backbone=None).train()
model_depth.backbone.body.conv1 = torch.nn.Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)


Available device: cuda


In [11]:
params = [p for p in model_depth.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

num_epochs = 2
model_depth = model_depth.to(device)

for epoch in range(num_epochs):
    model_depth.train()
    total_train_loss = 0
    for images, targets in train_loader:
        if 'boxes' not in targets[0].keys():
            break
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        images = [image.to(device) for image in images]
        
        optimizer.zero_grad()
        loss_dict = model_depth(images, targets)

        del images
        del targets

        losses = sum(loss for loss in loss_dict.values())
        losses.backward()
        optimizer.step()

        total_train_loss += losses.item()
    avg_train_loss = total_train_loss / len(train_loader)
    
    model_depth.eval()
    total_val_loss = 0
    for images, targets in val_loader:
        if 'boxes' not in targets[0].keys():
            break
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        images = [image.to(device) for image in images]
        # loss_dict = model_depth(images, targets)
        losses, predictions = eval_forward(model_depth, images, targets)
        losses = sum(loss for loss in loss_dict.values())
        total_val_loss += losses.item()

        del images
        del targets
        
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch + 1} || Train Loss: {avg_train_loss} || Val Loss: {avg_val_loss}")
    lr_scheduler.step()

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [62]:
for images, targets in val_loader:
    images = [image.to(device) for image in images]
    if 'boxes' not in targets[0].keys():
        break
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
    break

In [67]:
losses, predictions = eval_forward(model, images, targets)
losses = sum(loss for loss in loss_dict.values())

({'loss_classifier': tensor(0.2373, device='cuda:0', grad_fn=<NllLossBackward0>),
  'loss_box_reg': tensor(0.0131, device='cuda:0', grad_fn=<DivBackward0>),
  'loss_objectness': tensor(1.8196, device='cuda:0',
         grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
  'loss_rpn_box_reg': tensor(5.6260, device='cuda:0', grad_fn=<DivBackward0>)},
 [{'boxes': tensor([], device='cuda:0', size=(0, 4), grad_fn=<StackBackward0>),
   'labels': tensor([], device='cuda:0', dtype=torch.int64),
   'scores': tensor([], device='cuda:0', grad_fn=<IndexBackward0>)},
  {'boxes': tensor([], device='cuda:0', size=(0, 4), grad_fn=<StackBackward0>),
   'labels': tensor([], device='cuda:0', dtype=torch.int64),
   'scores': tensor([], device='cuda:0', grad_fn=<IndexBackward0>)}])

In [8]:
def eval_forward(model, images, targets):
    # type: (List[Tensor], Optional[List[Dict[str, Tensor]]]) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]
    """
    Args:
        images (list[Tensor]): images to be processed
        targets (list[Dict[str, Tensor]]): ground-truth boxes present in the image (optional)
    Returns:
        result (list[BoxList] or dict[Tensor]): the output from the model.
            It returns list[BoxList] contains additional fields
            like `scores`, `labels` and `mask` (for Mask R-CNN models).
    """
    model.eval()

    original_image_sizes: List[Tuple[int, int]] = []
    for img in images:
        val = img.shape[-2:]
        assert len(val) == 2
        original_image_sizes.append((val[0], val[1]))

    images, targets = model.transform(images, targets)

    # Check for degenerate boxes
    # TODO: Move this to a function
    if targets is not None:
        for target_idx, target in enumerate(targets):
            boxes = target["boxes"]
            degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
            if degenerate_boxes.any():
                # print the first degenerate box
                bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                degen_bb: List[float] = boxes[bb_idx].tolist()
                raise ValueError(
                    "All bounding boxes should have positive height and width."
                    f" Found invalid box {degen_bb} for target at index {target_idx}."
                )

    features = model.backbone(images.tensors)
    if isinstance(features, torch.Tensor):
        features = OrderedDict([("0", features)])
    model.rpn.training=True
    #model.roi_heads.training=True


    #####proposals, proposal_losses = model.rpn(images, features, targets)
    features_rpn = list(features.values())
    objectness, pred_bbox_deltas = model.rpn.head(features_rpn)
    anchors = model.rpn.anchor_generator(images, features_rpn)

    num_images = len(anchors)
    num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
    num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
    objectness, pred_bbox_deltas = concat_box_prediction_layers(objectness, pred_bbox_deltas)
    # apply pred_bbox_deltas to anchors to obtain the decoded proposals
    # note that we detach the deltas because Faster R-CNN do not backprop through
    # the proposals
    proposals = model.rpn.box_coder.decode(pred_bbox_deltas.detach(), anchors)
    proposals = proposals.view(num_images, -1, 4)
    proposals, scores = model.rpn.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)

    proposal_losses = {}
    assert targets is not None
    labels, matched_gt_boxes = model.rpn.assign_targets_to_anchors(anchors, targets)
    regression_targets = model.rpn.box_coder.encode(matched_gt_boxes, anchors)
    loss_objectness, loss_rpn_box_reg = model.rpn.compute_loss(
        objectness, pred_bbox_deltas, labels, regression_targets
    )
    proposal_losses = {
        "loss_objectness": loss_objectness,
        "loss_rpn_box_reg": loss_rpn_box_reg,
    }

    #####detections, detector_losses = model.roi_heads(features, proposals, images.image_sizes, targets)
    image_shapes = images.image_sizes
    proposals, matched_idxs, labels, regression_targets = model.roi_heads.select_training_samples(proposals, targets)
    box_features = model.roi_heads.box_roi_pool(features, proposals, image_shapes)
    box_features = model.roi_heads.box_head(box_features)
    class_logits, box_regression = model.roi_heads.box_predictor(box_features)

    result: List[Dict[str, torch.Tensor]] = []
    detector_losses = {}
    loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
    detector_losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
    boxes, scores, labels = model.roi_heads.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
    num_images = len(boxes)
    for i in range(num_images):
        result.append(
            {
                "boxes": boxes[i],
                "labels": labels[i],
                "scores": scores[i],
            }
        )
    detections = result
    detections = model.transform.postprocess(detections, images.image_sizes, original_image_sizes)  # type: ignore[operator]
    model.rpn.training=False
    model.roi_heads.training=False
    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses, detections

In [7]:
loss_dict

{'loss_classifier': tensor(0.2635, grad_fn=<NllLossBackward0>),
 'loss_box_reg': tensor(0.0336, grad_fn=<DivBackward0>),
 'loss_objectness': tensor(10.7953, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(45.5220, grad_fn=<DivBackward0>)}

In [10]:
for images, targets in train_loader:
    images = images.to(device)
    # images = tuple(image.to(device) for image in images)

    # Extract bounding boxes and labels for each image in the batch
    boxes = [target['bbox'].to(device) for target in targets]
    labels = [target['category_id'].to(device) for target in targets]

TypeError: list indices must be integers or slices, not str

In [12]:
# for images, targets in train_loader:
#     images = list(image.to(device) for image in images)
#     targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
#     print(type(images))
#     print(images[0].shape)
#     print(type(targets))
#     break

In [23]:
images[[0],:,:,:].shape

torch.Size([1, 3, 224, 224])

In [None]:
model = model.to(device).eval()
for images, targets in train_loader:
    images = images.to(device)
    # targets = targets.to(device)

    # Forward pass
    outputs = model(images)
    loss = criterion(outputs, targets)
    break


In [None]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model = model.to(device)

# Training loop
for epoch in range(EPOCH):
    for images, targets in train_loader:
        images = images.to(device)
        targets = targets.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, targets)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # ... (Calculate and print metrics, etc.) ...

    # ... (Validation step) ...

In [48]:
for data in train_loader:
    print(data[1][3][0]['bbox'])
    print(len(data[1][0]))
    break

[120.07, 71.83, 134.49, 153.08]
20


In [4]:
model.fc

Linear(in_features=2048, out_features=1000, bias=True)

[]