In [104]:
import os
import numpy as np
import torch 
import torchvision
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import utils
import torch.nn as nn
import torch.nn.functional as F
import copy
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

In [105]:
#Inpyt image size
ISIZE = (800, 800)

# Imagenet statistics
imagenet_stats = np.array([[0.485, 0.456, 0.406] , [0.229, 0.224, 0.225]])

## Anchor Box Specs
ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]

In [106]:
# helper functions

def normalize(im):
    # convert image to float 
    im = im / 255.
    """ Normalize with image net stats"""
    return (im - imagenet_stats[0])/imagenet_stats[1]

def train_val_dataset(dataset, val_split = 0.1):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size = val_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['val']   = Subset(dataset, val_idx)
    return datasets

def collate_fn(batch): 
    return tuple(zip(*batch))                     # without this there is a error while looping dataloader

In [107]:
# load data set and tranforms

class PennFudanDataset(object):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # load all the image files, sorting them to ensure they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))
        
    def __getitem__(self, idx):
        # load image and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks" , self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        img = img.resize(ISIZE)
        img = np.array(img)
        img = normalize(img)
        img = img.transpose(2,0,1)
        img = torch.as_tensor(img, dtype = torch.float32)
        
        mask = Image.open(mask_path)
        mask = mask.resize(ISIZE)
        mask = np.array(mask)
        obj_ids = np.unique(mask)            # instances are encoded as different colors (0--backhroung)
        obj_ids = obj_ids[1:]                # first id is background remove it
        # split the color-encoded mask into a set of binary masks (i.e true or false)
        masks = mask == obj_ids[:, None, None]
        
        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []

        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])
            
        # convert to torch tensor
        boxes = torch.as_tensor(boxes, dtype = torch.float32)   # box dims
        masks = torch.as_tensor(masks, dtype=torch.uint8)       # true or false
        labels = torch.ones((num_objs,) , dtype = torch.int64)  # no of persons

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])   # area
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
        
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        
        return img, target
    
    def __len__(self):
        return len(self.imgs)

In [108]:
dataset = PennFudanDataset('./Data/PennFudanPed/', None)

datasets = train_val_dataset(dataset)
print(len(datasets['train']))
print(len(datasets['val']))

data_loader = torch.utils.data.DataLoader(datasets['train'], batch_size=15, shuffle=True, collate_fn=collate_fn)
data_loader_val = torch.utils.data.DataLoader(datasets['val'], batch_size=15, shuffle=True, collate_fn=collate_fn)

153
17


In [109]:
is_cuda= False
if torch.cuda.is_available():
    is_cuda = True
print(is_cuda)

model = torchvision.models.vgg16(pretrained = True)
if is_cuda:
    model.cuda()
model.eval()

for param in model.features.parameters():
    param.requires_grad = False
    
fe = list(model.features)
req_features = []
for j, i in enumerate(fe[0:30]):
    req_features.append(i)                          # Remove last pooling layer
    print(i)
    
#print(req_features)

False
Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplac

In [110]:
idx =0
epoch =0
for images, targets in data_loader:
    idx += 1
    num_batch = len(images)
    print(f'batch --> {idx} , epoch --> {epoch}')
    imgs_torch_all = torch.stack([item for item in images])
    imgs_clone     = imgs_torch_all.clone()   # copies to a new tensor

    for feature in req_features:
        imgs_clone = feature(imgs_clone)
        
    print(imgs_clone)
    break;

batch --> 1 , epoch --> 0
tensor([[[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]],

         [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0729e+00,
           9.5813e-01, 7.2522e-01],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 2.4301e+00,
           1.7030e+00, 9.7506e-01],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 2.3024e+00,
           1.4392e+00, 7.1420e-01],
          ...,
          [0.0000e+00, 0.0000e+00

In [111]:
print(imgs_clone.shape)
print(imgs_clone.shape[2 : ])

torch.Size([15, 512, 50, 50])
torch.Size([50, 50])


In [112]:
def bbox_generation(images, targets, X_FM, Y_FM):
    global ratios
    global anchor_scales
    num_batch = len(images)
    X_IMG, Y_IMG = images[0].shape[1:]                 # 800
    bbox_all = [item['boxes'] for item in targets]
    labels_all = [item['labels'] for item in targets]
    
    sub_sampling_x = int(X_IMG/X_FM)     # 16
    sub_sampling_y = int(Y_IMG/Y_FM)
    
    anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4), dtype = np.float32)
    ctr_x = np.arange(sub_sampling_x, (X_FM+1) * sub_sampling_x, sub_sampling_x)
    ctr_y = np.arange(sub_sampling_y, (Y_FM+1) * sub_sampling_y, sub_sampling_y)
    ctr = np.zeros( (len(ctr_x)*len(ctr_y) , 2), dtype = np.float32)  # 2500 * 2
    
    index = 0
    for x in range(len(ctr_x)):
        for y in range(len(ctr_y)):
            ctr[index, 1] = ctr_x[x] - 8
            ctr[index, 0] = ctr_y[y] - 8
            index += 1
            
    anchors = np.zeros((X_FM * Y_FM * 9 , 4 ))                # because anchors are rectangles and need 4 things to calculate
    index = 0
    for ctr_y, ctr_x in ctr:
        for i in range(len(ratios)):
            for j in range(len(anchor_scales)):
                h = sub_sampling_x * anchor_scales[j] * np.sqrt(ratios[i])
                w = sub_sampling_y * anchor_scales[j] * np.sqrt(1./ ratios[i])
                anchors[index, 0] = ctr_y - h / 2
                anchors[index, 1] = ctr_x - w / 2.
                anchors[index, 2] = ctr_y + h / 2.
                anchors[index, 3] = ctr_x + w / 2.
                index += 1
                
    index_inside = np.where(
           (anchors[:, 0] >= 0) &
           (anchors[:, 1] >= 0) &
           (anchors[:, 2] >= Y_IMG) &
           (anchors[:, 3] <= X_IMG))[0]         # 2380 anchors
    
    label = np.empty((len(index_inside), ), dtype = np.int32)
    label.fill(-1)
    valid_anchors = anchors[index_inside]
    
    # intersection over union
    ious_all = []
    for bx in bbox_all:
        ious = np.empty((len(label), bx.size()[0]), dtype = np.float32)
        ious.fill(0)
        for num1, i in enumerate(valid_anchors):
            ya1, xa1, ya2, xa2 = i
            anchor_area = (ya2 - ya1) * (xa2 - xa1)
            for num2, j in enumerate(bx):
                yb1, xb1, yb2, xb2 = j
                box_area = (yb2 - yb1) * (xb2 - xb1)
                inter_x1 = max([xb1, xa1])
                inter_y1 = max([yb1, ya1])
                inter_x2 = min([xb2, xa2])
                inter_y2 = min([yb2, ya2])
                if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
                    iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
                    iou = iter_area / (anchor_area + box_area - iter_area)
                else :
                    iou = 0
                ious[num1, num2] = iou
        ious_all.append(ious) 
        
    
    gt_argmax_ious_all= []
    gt_max_ious_all = []
    for ious_ in ious_all:
        gt_argmax_ious = ious_.argmax(axis = 0)
        gt_max_ious    = ious_[gt_argmax_ious, np.arange(ious_.shape[1])]
        gt_argmax_ious_all.append(gt_argmax_ious)
        gt_max_ious_all.append(gt_max_ious)
        
    argmax_ious_all = []
    max_ious_all = []
    for ious_ in ious_all:
        argmax_ious = ious_.argmax(axis = 1)
        max_ious = ious_[np.arange(len(label)), argmax_ious]
        argmax_ious_all.append(argmax_ious)
        max_ious_all.append(max_ious)
        
    gt_argmax_ious_all = []       # there will be batch_size number of items in gt_max_ious_all
    for gt_max_ious_, ious_ in zip(gt_max_ious_all, ious_all):
        gt_argmax_ious = np.where(ious_ == gt_max_ious_)[0]
        gt_argmax_ious_all.append(gt_argmax_ious)
        
    pos_iou_threshold = 0.7
    neg_iou_threshold = 0.3
    
    label_all = []
    for n in range(num_batch):
        l = copy.deepcopy(label)
        l[max_ious_all[n] < neg_iou_threshold] = 0
        l[gt_argmax_ious_all[n]] = 1
        l[max_ious_all[n] >= pos_iou_threshold] = 1
        label_all.append(l)
        
        
    pos_ratio = 0.5
    n_sample  = 256
    n_pos     = int(pos_ratio * n_sample) # 128
    
    for n in range(num_batch):
        pos_index = np.where(label_all[n] == 1)[0]
        if len(pos_index) > n_pos:
            disable_index = np.random.choice(pos_index, size = (len(pos_index) - n_pos), replace = False)
            label_all[n][disable_index] = -1
            
        n_neg = n_sample - np.sum(label_all[n] == 1)
        neg_index = np.where(label_all[n] == 0)[0]
        if len(neg_index) > n_neg:
            disable_index = np.random.choice(neg_index, size = (len(neg_index) - n_neg), replace = False)
            label_all[n][disable_index] = -1
             
    max_iou_bbox_all = []
    for n in range(num_batch):
        max_iou_bbox_all.append(bbox_all[n][argmax_ious_all[n]])
        
    
    # Anchor Box
    height = valid_anchors[:, 2] - valid_anchors[:, 0]
    width = valid_anchors[:, 3] - valid_anchors[:, 1]
    ctr_y = valid_anchors[:, 0] + 0.5 * height
    ctr_x = valid_anchors[:, 1] + 0.5 * width
    
    # Ground Truth
    base_height_all = []
    base_width_all = []
    base_ctr_y_all = []
    base_ctr_x_all = []
    
    for n in range(num_batch):
        base_height = max_iou_bbox_all[n][:, 2] - max_iou_bbox_all[n][:, 0]
        base_width = max_iou_bbox_all[n][:, 3] - max_iou_bbox_all[n][:, 1]
        base_ctr_y = max_iou_bbox_all[n][:, 0] + 0.5 * base_height
        base_ctr_x = max_iou_bbox_all[n][:, 1] + 0.5 * base_width
        base_height_all.append(base_height)
        base_width_all.append(base_width)
        base_ctr_y_all.append(base_ctr_y)
        base_ctr_x_all.append(base_ctr_x)
        
    #Prevent devide by 0
    eps = np.finfo(height.dtype).eps
    height = np.maximum(height, eps)
    width = np.maximum(width, eps)    
    
    # d_{} calculation
    anchor_locs_all = []
    for n in range(num_batch):
        dy = (base_ctr_y_all[n].numpy() - ctr_y) / height
        dx = (base_ctr_x_all[n].numpy() - ctr_x) / width
        dh = np.log(base_height_all[n].numpy()/ height)
        dw = np.log(base_width_all[n].numpy() / width)
        anchor_locs_all.append(np.vstack((dy, dx,dh, dw)).transpose())
        
    anchor_labels_all = []
    for n in range(num_batch):
        anchor_labels = np.empty((len(anchors),), dtype=label_all[n].dtype)
        anchor_labels.fill(-1)
        anchor_labels[index_inside] = label_all[n]
        anchor_labels_all.append(anchor_labels)
    anchor_labels_all_merge = np.stack(anchor_labels_all, 0)  
    
    anchor_locations_all = []
    for n in range(num_batch):
        anchor_locations = np.empty((len(anchors), anchors.shape[1]), dtype=anchor_locs_all[n].dtype)
        anchor_locations.fill(0)
        anchor_locations[index_inside, :] = anchor_locs_all[n]
        anchor_locations_all.append(anchor_locations)
        
    anchor_locations_all_merge = np.stack(anchor_locations_all, 0) 
    return anchor_locations_all_merge, anchor_labels_all_merge, anchors
                

In [113]:
"""for i, t in data_loader:
    #print(i[0].shape[1:])
    anchor_locations_all_merge, anchor_labels_all_merge, anchors = bbox_generation(i, t, 50, 50)
    print(anchor_locations_all_merge)
    print('-- -- ')
    print(anchor_labels_all_merge)
    break;
"""    

"for i, t in data_loader:\n    #print(i[0].shape[1:])\n    anchor_locations_all_merge, anchor_labels_all_merge, anchors = bbox_generation(i, t, 50, 50)\n    print(anchor_locations_all_merge)\n    print('-- -- ')\n    print(anchor_labels_all_merge)\n    break;\n"

In [114]:
def train_epochs(req_features, model, optimizer, train_dl, val_dl, epochs = 10, rpn_lambda = 10):
    for epoch in range(epochs):
        model.train()   # https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch
        total = 0
        sum_loss = 0
        sum_loss_cls = 0
        sum_loss_loc = 0
        idx = 0
        for images, targets in train_dl:
            idx += 1
            num_batch = len(images)
            print(f'batch --> {idx} , epoch --> {epoch}')
            imgs_torch_all = torch.stack([item for item in images])
            imgs_clone     = imgs_torch_all.clone()   # copies to a new tensor
            
            for feature in req_features:
                imgs_clone = feature(imgs_clone)
                
            X_FM, Y_FM = imgs.clone.shape[2:]
            anchor_locations_all_merge, anchor_labels_all_merge, anchors = bbox_generation(images, targets, X_FM, Y_FM)
            pred_anchor_locs, pred_cls_scores, objectness_score = model(k)
            
         

In [115]:
class RPN(nn.Module):
    
    def __init__(self, in_channels = 512, mid_channels = 512, n_anchor = 9):
        super(RNN, self).__init__()
        self.in_channels  = in_channels   # depends on output of feature map in vgg 16 its 512
        self.mid_channels = mid_channels
        self.n_anchor     = n_anchor      # no of anchors in each loacation
        
        self.conv1 = nn.Conv2d(self.in_channels, self.mid_channels, kernel_size = 3, stride = 1, padding =1)
        self.reg_layer = nn.Conv2d(self.mid_channels, n_anchor * 4 , 1 , 1 , 0)
        self.cls_layer = nn.Conv2d(self.mid_channels, n_anchor * 2 , 1 , 1 , 0)        # see the paper
        
        
        # Conv sliding layer
        self.conv1.weight.data.normal_(0,0.01)
        self.conv1.bias.data.zero_()
        # Regression Layer
        self.ref_layer.weight.data.normal_(0,0.01)
        self.ref_layer.bias.data.zero_()
        # Classification Layer = RPN
        self.cls_layer.weight.data.normal_(0,0.01)
        self.cls_layer.bias.data.zero_()
            
    def forward(self, k): 
        bat_num = k.shape[0]
        x = self.conv1(k)
        pred_anchor_boxes = self.reg_layer(x)
        pred_cls_scores   = self.cls_layer(x)
        
        pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(bat_num, -1, 4)
        pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
        objectness_score = pred_cls_scores.view(bat_num, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(bat_num, -1)
        pred_cls_scores  = pred_cls_scores.view(bat_num, -1, 2)
        
        return pred_anchor_locs, pred_cls_scores, objectness_score

In [116]:
learning_rate = 0.0015
num_epochs    = 20
model = RPN()

parameters = model.parameters()
optimizer = torch.optim.Adam(parameters , lr = learning_rate)

NameError: name 'RNN' is not defined

In [None]:
model = train_epochs(req_features, model, optimizer, data_loader, None, epochs = num_epochs)

In [None]:
# About dilation max pooling https://datascience.stackexchange.com/questions/28881/what-is-dilated-pooling-and-how-it-works-mathematically