In [0]:
from torch.utils.data import Dataset
import os
from PIL import Image
import xml.etree.ElementTree as ET
import torch


class CustomDataSet(Dataset):

    def __init__(self, root_dir, classes=[], is_train=True, transforms=None):
        self.root_dir = root_dir
        self.annotations_path = os.path.join(root_dir, 'Annotations')
        self.images_path = os.path.join(root_dir, 'JPEGImages')
        if is_train:
            self.image_sets = open(os.path.join(root_dir, 'ImageSets/Main/train.txt'), 'r')
        else:
            self.image_sets = open(os.path.join(root_dir, 'ImageSets/Main/validation.txt'), 'r')
        self.ids = sorted([each_value.strip() for each_value in self.image_sets.readlines()])
        self.image_sets.close()
        self.classes = classes
        self.transforms = transforms

    def __getitem__(self, index):
        image_id = self.ids[index] + '.jpg'
        image = Image.open(os.path.join(self.images_path, image_id))
        if self.transforms is not None:
            image = self.transforms(image)
        xml_file = self.ids[index] + '.xml'
        annotations = ET.parse(os.path.join(self.annotations_path, xml_file))
        root = annotations.getroot()
        boxes = []
        labels = []
        is_crowd = []
        for each_object in root.iter('object'):
            xmin = float(each_object.find('bndbox/xmin').text)
            ymin = float(each_object.find('bndbox/ymin').text)
            xmax = float(each_object.find('bndbox/xmax').text)
            ymax = float(each_object.find('bndbox/ymax').text)
            label = each_object.find('name').text
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.classes.index(label))
            is_crowd.append(0)

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        image_id = torch.tensor([index])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        is_crowd = torch.as_tensor(is_crowd, dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        # target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = is_crowd

        return image, target

    def __len__(self):
        return len(self.ids)


In [0]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torchvision


def get_model_object_detection(num_classes):
    # load an instance segmentation model pre-trained pre-trained on COCO
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=False)
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    return model


In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
torch.cuda.is_available()

True

In [19]:
from torch.utils.data import DataLoader
import torch
import torchvision.transforms as T
import sys

transforms = T.Compose([T.Resize(256),
                        T.CenterCrop(224),
                        T.ToTensor(),
                        T.Normalize(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225])])

root_dir = '/content/drive/My Drive/Colab Notebooks/aeroplane/Datasets/VOC2007/'

print(root_dir)


# collate_fn needs for batch
def collate_fn(batch):
    return tuple(zip(*batch))


train_dataset = CustomDataSet(root_dir=root_dir, classes=['aeroplane'], is_train=True, transforms=transforms)
test_dataset = CustomDataSet(root_dir=root_dir, classes=['aeroplane'], is_train=False, transforms=transforms)


data_loader = DataLoader(train_dataset, batch_size=1, shuffle=False, num_workers=1, collate_fn=collate_fn)
test_data_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=1, collate_fn=collate_fn)

# 2 classes; Only target class or background
num_classes = 2
num_epochs = 1
model = get_model_object_detection(num_classes)

if torch.cuda.is_available():
  model.cuda()
# move model to the right device

# parameters
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

len_dataloader = len(data_loader)

for epoch in range(num_epochs):
    model.train()
    i = 0
    for images, annotations in data_loader:
        i += 1
        imgs = list(img.cuda() for img in images)

        model_inputs = [{k: v.cuda() for k, v in t.items()} for t in annotations]
        loss_dict = model(imgs, model_inputs)
        print(loss_dict)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        print(f'Iteration: {i}/{len_dataloader}, Loss: {losses}')

#model.eval()

#for images, annotations in test_data_loader:
    #test_dict = model(images)
    #print(test_dict)
test_img = test_dataset[0]
test_dict = model(images)
print(test_dict)

/content/drive/My Drive/Colab Notebooks/aeroplane/Datasets/VOC2007/
{'loss_classifier': tensor(0.7156, device='cuda:0', grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0., device='cuda:0', grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.6976, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(0.0194, device='cuda:0', grad_fn=<DivBackward0>)}
Iteration: 1/168, Loss: 1.4325106143951416
{'loss_classifier': tensor(0.4329, device='cuda:0', grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0., device='cuda:0', grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.6986, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>), 'loss_rpn_box_reg': tensor(29.3007, device='cuda:0', grad_fn=<DivBackward0>)}
Iteration: 2/168, Loss: 30.432092666625977
{'loss_classifier': tensor(0.2305, device='cuda:0', grad_fn=<NllLossBackward>), 'loss_box_reg': tensor(0., device='cuda:0', grad_fn=<DivBackward0>), 'loss_objectness': tensor(0.6921, device='cu