In [4]:
import os
import numpy as np
import torch 
import torchvision
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import utils
import torch.nn as nn
import torch.nn.functional as nn
import copy
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

In [15]:
# Input image size, handed to PIL's resize as (width, height)
ISIZE = (800, 800)

# ImageNet statistics, one value per RGB channel
imagenet_stats = np.array([
    [0.485, 0.456, 0.406],   # row 0: mean (subtracted by normalize)
    [0.229, 0.224, 0.225],   # row 1: std  (divided by in normalize)
])


In [16]:
# helper functions

def normalize(im, stats=None):
    """Scale a 0-255 image to [0, 1] and standardize it per channel.

    Args:
        im: NumPy array holding the image with the channel axis last, so
            that it broadcasts against the per-channel statistics.
        stats: Optional array whose row 0 is the per-channel mean and whose
            row 1 is the per-channel standard deviation. When omitted, the
            module-level ``imagenet_stats`` is used.

    Returns:
        Floating-point array equal to ``(im / 255 - mean) / std``.
    """
    if stats is None:
        # Resolved at call time so the default always tracks the module constant.
        stats = imagenet_stats
    mean, std = stats[0], stats[1]
    # Dividing by a float promotes integer (e.g. uint8) pixels to floating point.
    scaled = im / 255.
    return (scaled - mean) / std

In [None]:
# load dataset and transforms

class PennFudanDataset(torch.utils.data.Dataset):
    """Penn-Fudan pedestrian dataset yielding ``(image, target)`` pairs.

    Images are read from ``<root>/PNGImages`` and instance masks from
    ``<root>/PedMasks``. Both directory listings are sorted so that entry
    ``idx`` of one corresponds to entry ``idx`` of the other.

    Args:
        root: Dataset directory containing ``PNGImages`` and ``PedMasks``.
        transforms: Stored as ``self.transforms``. NOTE: ``__getitem__`` does
            not apply it; the image is resized, normalized and converted to a
            tensor inside ``__getitem__`` itself.
    """

    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # load all the image files, sorting them to ensure they are aligned
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A tuple ``(img, target)`` where ``img`` is a float32 tensor in
            channel-first layout (3, height, width) resized to ``ISIZE`` and
            normalized with ``normalize``, and ``target`` is a dict with:

            * ``"boxes"``    - float32 (N, 4) tensor of [xmin, ymin, xmax, ymax]
            * ``"labels"``   - int64 (N,) tensor of ones (single class)
            * ``"masks"``    - uint8 (N, height, width) binary instance masks
            * ``"image_id"`` - tensor holding ``idx``
            * ``"area"``     - (N,) tensor of box areas
            * ``"iscrowd"``  - int64 (N,) tensor of zeros

            ``N`` is the number of non-background ids found in the mask and
            may be zero.
        """
        # load image and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])

        img = Image.open(img_path).convert("RGB")
        img = img.resize(ISIZE)
        img = normalize(np.array(img))
        img = img.transpose(2, 0, 1)          # HWC -> CHW
        img = torch.as_tensor(img, dtype=torch.float32)

        # The mask is deliberately NOT converted to RGB: each pixel value is an
        # instance id (0 is background). Nearest-neighbour resampling keeps the
        # ids intact; an interpolating filter would blend neighbouring ids and
        # invent instances that do not exist.
        mask = Image.open(mask_path)
        mask = mask.resize(ISIZE, resample=Image.NEAREST)
        mask = np.array(mask)

        obj_ids = np.unique(mask)
        # Drop the background id by value rather than by position, so a mask
        # without any background pixel does not lose a real instance.
        obj_ids = obj_ids[obj_ids != 0]
        # split the id-encoded mask into a set of binary masks (one per instance)
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            rows, cols = np.where(masks[i])
            boxes.append([np.min(cols), np.min(rows), np.max(cols), np.max(rows)])

        # convert to torch tensors; the reshape keeps ``boxes`` two-dimensional
        # even when the mask contains no instances (shape (0, 4))
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        # there is only one foreground class (person), so every label is 1
        labels = torch.ones((num_objs,), dtype=torch.int64)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        return img, target

    def __len__(self):
        """Return the number of image files found under ``PNGImages``."""
        return len(self.imgs)