In [1]:
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn_v2(weights="DEFAULT")

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_features_mask,
        hidden_layer,
        num_classes
    )

    return model

In [2]:
from torchvision.transforms import v2 as T


def get_transform(train):
    transforms = []
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(T.ToPureTensor())
    return T.Compose(transforms)

In [3]:
import numpy as np
import os
import torch
import torch.utils.data
import torchvision
from PIL import Image
from pycocotools.coco import COCO

class CurrentDataset(torch.utils.data.Dataset):
    def __init__(self, root, annotation, transforms=None):
        self.root = root
        self.transforms = transforms
        self.coco = COCO(annotation)
        self.ids = list(sorted(self.coco.imgs.keys()))

    def __getitem__(self, index):
        # Own coco file
        coco = self.coco
        # Image ID
        img_id = self.ids[index]
        # List: get annotation id from coco
        ann_ids = coco.getAnnIds(imgIds=img_id)
        # Dictionary: target coco_annotation file for an image
        coco_annotation = coco.loadAnns(ann_ids)
        # path for input image
        path = coco.loadImgs(img_id)[0]['file_name']
        # open the input image
        img = Image.open(os.path.join(self.root, path))

        # number of objects in the image
        num_objs = len(coco_annotation)

        # Bounding boxes for objects
        # In coco format, bbox = [xmin, ymin, width, height]
        # In pytorch, the input should be [xmin, ymin, xmax, ymax]
        boxes = [0, 0, 0, 0]
        for i in range(num_objs):
            xmin = coco_annotation[i]['bbox'][0]
            ymin = coco_annotation[i]['bbox'][1]
            xmax = xmin + coco_annotation[i]['bbox'][2]
            ymax = ymin + coco_annotation[i]['bbox'][3]
            boxes.append([xmin, ymin, xmax, ymax])
        

        # Masks
        masks = [0, 0, 0, 0]
        for i in range(num_objs):
            if i >= len(coco_annotation):
                break
            try:
                masks.append(coco.annToMask(coco_annotation[i]))
            except:
                # remove the image from current lists because of the error
                num_objs -= 1
                boxes.pop(i)
                coco_annotation.pop(i)
                

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        # Labels (In my case, I only one class: target class or background)
        labels = [1 for _ in range(num_objs)]
        labels = torch.as_tensor(labels, dtype=torch.int64)
        # count = num_objs
        # while count < 4:
        #     labels
        #     count += 1
        # # Tensorise img_id
        # img_id = torch.tensor([img_id])
        # # Size of bbox (Rectangular)
        # areas = []
        # for i in range(num_objs):
        #     areas.append(coco_annotation[i]['area'])
        # areas = torch.as_tensor(areas, dtype=torch.float32)
        # # Iscrowd
        # iscrowd = torch.zeros((num_objs,), dtype=torch.int64)



        # # Assuming boxes, labels, and masks are 1D tensors
        # boxes = boxes.view(-1, 1)
        # labels = labels.view(-1, 1)
        # masks = masks.view(-1, 1)
        # # Padding boxes
        # if len(boxes) < 4:
        #     padding = 4 - len(boxes)
        #     boxes = torch.nn.functional.pad(boxes, (0, 0, 0, padding), 'constant', 0)

        # # Padding labels
        # if len(labels) < 4:
        #     padding = 4 - len(labels)
        #     labels = torch.nn.functional.pad(labels, (0, padding), 'constant', 0)

        # # Padding masks
        # if len(masks) < 4:
        #     padding = 4 - len(masks)
        #     masks = torch.nn.functional.pad(masks, (0, 0, 0, padding), 'constant', 0)

        # Annotation is in dictionary format
        my_annotation1 = {}
        my_annotation2 = {}
        my_annotation3 = {}
        my_annotation1["boxes"] = boxes
        my_annotation2["labels"] = labels
        # my_annotation["image_id"] = img_id
        # my_annotation["area"] = areas
        # my_annotation["iscrowd"] = iscrowd
        my_annotation3["masks"] = masks

        import torch.nn.functional as F

        #if self.transforms is not None:
        #    img = self.transforms(img)

        return torch.from_numpy(np.array(img)), my_annotation1, my_annotation2, my_annotation3

    def __len__(self):
        return len(self.ids)

In [4]:
#from currentdataset import CurrentDataset
import numpy as np
# path to your own data and coco file
train_data_dir = 'data/train'
train_coco = 'data/_trainannotations.coco.json'

# create own Dataset
my_dataset = CurrentDataset(root=train_data_dir,
                          annotation=train_coco,
                          transforms=get_transform(train=True)
                          )
print(np.array(my_dataset[0][0]).shape)

# collate_fn needs for batch
def collate_fn(batch):
    return tuple(zip(*batch))

# Batch size
train_batch_size = 100

# own DataLoader
data_loader = torch.utils.data.DataLoader(my_dataset,
                                          batch_size=train_batch_size,
                                          shuffle=True,
                                          num_workers=0,
                                        #  collate_fn=collate_fn
                                        )

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


TypeError: must be real number, not list

In [None]:
max_b = float('-inf')
max_c = float('-inf')
max_d = float('-inf')
for i in range(len(my_dataset)):
    a, b, c, d = my_dataset[i]
    max_b = max(len(b['boxes']), max_b)
    max_c = max(len(c['labels']), max_c)
    max_d = max(len(d['masks']), max_d)
    
print(max_b, max_c, max_d)

KeyboardInterrupt: 

In [None]:
for i, (imgs, a, b, c) in enumerate(data_loader, 1):
    print(i)
    break

RuntimeError: stack expects each tensor to be equal size, but got [4, 1] at entry 0 and [12, 1] at entry 5

In [None]:
my_dataset[0]

(tensor([[[ 28,  44,  15],
          [ 29,  45,  16],
          [ 30,  46,  17],
          ...,
          [ 62,  89,  34],
          [ 62,  86,  34],
          [100, 124,  74]],
 
         [[ 61,  77,  48],
          [ 49,  65,  36],
          [ 34,  50,  21],
          ...,
          [ 77, 105,  47],
          [ 73,  98,  43],
          [ 89, 114,  59]],
 
         [[ 74,  90,  61],
          [ 67,  83,  54],
          [ 59,  75,  46],
          ...,
          [130, 158,  97],
          [108, 134,  73],
          [ 96, 122,  61]],
 
         ...,
 
         [[179, 187, 114],
          [180, 188, 115],
          [180, 188, 115],
          ...,
          [ 25,  37,  37],
          [ 20,  32,  32],
          [ 16,  28,  28]],
 
         [[187, 195, 122],
          [188, 196, 123],
          [188, 196, 123],
          ...,
          [ 28,  40,  40],
          [ 28,  40,  40],
          [ 28,  40,  40]],
 
         [[187, 195, 122],
          [188, 196, 123],
          [189, 197, 124],
   

In [None]:
flag = 0
for i in range(len(my_dataset)):
    a, b, c = my_dataset[i][0].shape
    if a != 300 or b != 300 or c != 3:
        print('hello')

hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello


KeyboardInterrupt: 

In [None]:
my_dataset[1][0].shape

torch.Size([1080, 1920, 3])

In [None]:
# import torchvision.transforms as transforms
# from torchvision.transforms.functional import to_tensor
# transform = transforms.Compose([transforms.PILToTensor()])

# 2 classes; Only target class or background
num_classes = 2
num_epochs = 1
model = get_model_instance_segmentation(num_classes)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# move model to the right device
model.to(device)
    
# parameters
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

len_dataloader = len(data_loader)

for epoch in range(num_epochs):
    model.train()
    i = 0
    
    for i, (imgs, a, b, c) in enumerate(data_loader, 1):
        i += 1
        # imgs = list(to_tensor(img).to(device) for img in imgs)
        #imgs = list(img.to(device) for img in imgs)
        #annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
        imgs = imgs.to(device)
        haha = torch.tensor([a, b, c]).to(device)
        # print(transform(imgs[0]))
        #(annotations['boxes'][0])
        loss_dict = model(imgs, haha)
        losses = sum(loss for loss in loss_dict.values())
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        print(f'Iteration: {i}/{len_dataloader}, Loss: {losses}')

RuntimeError: stack expects each tensor to be equal size, but got [4, 1] at entry 0 and [8, 1] at entry 1

In [None]:
# import matplotlib.pyplot as plt

# from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks


# image = torchvision.io.read_image("data/train/gss14_jpg.rf.c4e60d0c2e4b36c452fac9e49c3ac43e.jpg")
# eval_transform = get_transform(train=False)

# model.eval()
# with torch.no_grad():
#     x = eval_transform(image)
#     # convert RGBA -> RGB and move to device
#     x = x[:3, ...].to(device)
#     predictions = model([x, ])
#     pred = predictions[0]


# image = (255.0 * (image - image.min()) / (image.max() - image.min())).to(torch.uint8)
# image = image[:3, ...]
# pred_labels = [f"person: {score:.3f}" for label, score in zip(pred["labels"], pred["scores"])]
# print(pred_labels)
# pred_boxes = pred["boxes"].long()
# output_image = draw_bounding_boxes(image, pred_boxes, pred_labels, colors="red")

# masks = (pred["masks"] > 0.7).squeeze(1)
# output_image = draw_segmentation_masks(output_image, masks, alpha=0.5, colors="blue")


# plt.figure(figsize=(12, 12))
# plt.imshow(output_image.permute(1, 2, 0))