In [1]:
import copy
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F  # must not share the `nn` alias with torch.nn
import torchvision
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

import utils

In [35]:
# Input image size (images and masks are resized to this)
ISIZE = (800, 800)

# ImageNet statistics: row 0 is subtracted, row 1 is divided by (see normalize)
imagenet_stats = np.array(
    [
        [0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225],
    ]
)

## Anchor box specs
ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]

In [25]:
# helper functions

def normalize(im):
    """Scale an 8-bit image to [0, 1] and standardize it with the ImageNet stats.

    Args:
        im: numeric array of pixel values. The visible caller passes
            ``np.array`` of an RGB PIL image, so the last axis presumably has
            three channels that broadcast against the per-channel statistics.

    Returns:
        ``(im / 255 - imagenet_stats[0]) / imagenet_stats[1]`` as a float array.
    """
    # convert image to float in the 0..1 range
    scaled = im / 255.
    return (scaled - imagenet_stats[0]) / imagenet_stats[1]

def train_val_dataset(dataset, val_split = 0.1, random_state = None):
    """Split a dataset into training and validation subsets.

    Args:
        dataset: any object supporting ``len()``; it is wrapped in ``Subset``.
        val_split: passed to ``train_test_split`` as ``test_size``.
        random_state: optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        dict with keys ``'train'`` and ``'val'``, each a ``Subset`` of ``dataset``.
    """
    all_indices = list(range(len(dataset)))
    train_idx, val_idx = train_test_split(
        all_indices, test_size = val_split, random_state = random_state
    )
    return {
        'train': Subset(dataset, train_idx),
        'val': Subset(dataset, val_idx),
    }

def collate_fn(batch):
    """Regroup a batch of per-sample tuples into one tuple per field.

    For a batch of ``(img, target)`` pairs this returns ``(imgs, targets)``.
    Per the original author's note, looping over the DataLoader errors
    without this custom collate function.
    """
    per_field = zip(*batch)
    return tuple(per_field)

In [26]:
# Load the dataset and transforms

class PennFudanDataset(object):
    """Penn-Fudan pedestrian dataset yielding ``(image, target)`` pairs.

    ``root`` must contain a ``PNGImages`` and a ``PedMasks`` directory. Both
    listings are sorted so that image ``i`` is paired with mask ``i``.
    """

    def __init__(self, root, transforms):
        """Record the root directory and index the image and mask file names."""
        self.root = root
        # NOTE(review): stored but never applied in __getitem__ - confirm intent.
        self.transforms = transforms
        # load all the image files, sorting them to ensure they are aligned
        self.imgs = sorted(os.listdir(os.path.join(root, "PNGImages")))
        self.masks = sorted(os.listdir(os.path.join(root, "PedMasks")))

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            img: float32 tensor in channels-first layout, resized to ``ISIZE``
                and passed through ``normalize``.
            target: dict with ``boxes`` (N, 4) as [xmin, ymin, xmax, ymax],
                ``labels`` (all ones), ``masks`` (one binary mask per
                instance), ``image_id``, ``area`` and ``iscrowd`` (all zeros).
        """
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks" , self.masks[idx])

        img = Image.open(img_path).convert("RGB").resize(ISIZE)
        img = normalize(np.array(img))
        img = img.transpose(2, 0, 1)                 # HWC -> CHW
        img = torch.as_tensor(img, dtype = torch.float32)

        # Each pixel value is an instance id, so the mask must be resized with
        # nearest-neighbour sampling: an interpolating filter would blend ids
        # along object borders and invent instances that do not exist.
        mask = Image.open(mask_path).resize(ISIZE, resample = Image.NEAREST)
        mask = np.array(mask)
        obj_ids = np.unique(mask)            # instances are encoded as different values (0 -- background)
        obj_ids = obj_ids[1:]                # first id is background, remove it
        # split the id-encoded mask into a set of binary masks (i.e. true or false)
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for instance_mask in masks:
            rows, cols = np.where(instance_mask)
            boxes.append([np.min(cols), np.min(rows), np.max(cols), np.max(rows)])

        # convert to torch tensors; the reshape keeps an (0, 4) shape when the
        # mask holds no instance, so the column indexing below stays valid
        boxes = torch.as_tensor(boxes, dtype = torch.float32).reshape(-1, 4)
        masks = torch.as_tensor(masks, dtype = torch.uint8)       # true or false
        labels = torch.ones((num_objs,), dtype = torch.int64)     # single class: person

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype = torch.int64)

        target = {
            "boxes": boxes,
            "labels": labels,
            "masks": masks,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        return img, target

    def __len__(self):
        """Number of images found under ``PNGImages``."""
        return len(self.imgs)

In [27]:
# Build the dataset, split it into train/val, and wrap each split in a DataLoader.
dataset = PennFudanDataset('./Data/PennFudanPed/', None)
datasets = train_val_dataset(dataset)

for split_name in ('train', 'val'):
    print(len(datasets[split_name]))

# Both loaders share the same batching options.
loader_options = dict(batch_size=15, shuffle=True, collate_fn=collate_fn)
data_loader = torch.utils.data.DataLoader(datasets['train'], **loader_options)
data_loader_val = torch.utils.data.DataLoader(datasets['val'], **loader_options)

153
17


In [28]:
# Run on the GPU only when one is available.
is_cuda = torch.cuda.is_available()
print(is_cuda)

# Pretrained VGG16 in eval mode, moved to the GPU when possible.
model = torchvision.models.vgg16(pretrained = True)
if is_cuda:
    model.cuda()
model.eval()

# Freeze the convolutional feature extractor.
for frozen_param in model.features.parameters():
    frozen_param.requires_grad = False

# Keep feature layers 0..29 only (this removes the last pooling layer).
fe = list(model.features)
req_features = fe[:30]
for layer in req_features:
    print(layer)
    
#print(req_features)

False
Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplac

In [32]:
# Push one training batch through the truncated backbone and show the feature map.
idx = 0
epoch = 0

# The backbone layers may have been moved to the GPU; the batch must live on
# the same device or the first convolution raises a device-mismatch error.
feature_device = next(req_features[0].parameters()).device

for images, targets in data_loader:
    idx += 1
    num_batch = len(images)
    print(f'batch --> {idx} , epoch --> {epoch}')
    imgs_torch_all = torch.stack(list(images))
    imgs_clone     = imgs_torch_all.clone().to(feature_device)   # copy, on the backbone's device

    for feature in req_features:
        imgs_clone = feature(imgs_clone)

    print(imgs_clone)
    break

batch --> 1 , epoch --> 0
tensor([[[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

         [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

         [[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.000

In [34]:
# Full feature-map shape, then only its trailing two dimensions.
feature_map_shape = imgs_clone.shape
print(feature_map_shape)
print(feature_map_shape[2:])

torch.Size([15, 512, 50, 50])
torch.Size([50, 50])


In [53]:
def bbox_generation(images, targets, X_FM, Y_FM):
    """Build the anchor grid for a batch and select the anchors inside the image.

    Anchors are laid out as ``[c0 - h/2, c1 - w/2, c0 + h/2, c1 + w/2]`` where
    ``c0`` runs along the ``Y_*`` axis and ``c1`` along the ``X_*`` axis. One
    anchor is generated per (ratio, scale) pair at the centre of every
    feature-map cell, using the module-level ``ratios`` and ``anchor_scales``.

    Args:
        images: sequence of image tensors; only ``images[0].shape[1:]`` is read.
        targets: sequence of dicts providing ``'boxes'`` and ``'labels'``.
        X_FM, Y_FM: feature-map size along the two spatial axes.

    Returns:
        None. NOTE(review): target assignment is not implemented yet, while
        ``train_epochs`` unpacks three values from this call - the function
        needs a return value before the training loop can run.

    NOTE(review): ``images[0].shape[1:]`` of a channels-first tensor is
    (height, width), so the X/Y names here may be swapped relative to image
    axes. This is harmless for square inputs; confirm before using
    non-square images.
    """
    X_IMG, Y_IMG = images[0].shape[1:]                 # 800 for the 800x800 input
    # Ground-truth data per image; gathered for the target-assignment step
    # that is still to be written.
    bbox_all = [item['boxes'] for item in targets]
    labels_all = [item['labels'] for item in targets]

    sub_sampling_x = int(X_IMG / X_FM)     # 16 for 800 / 50
    sub_sampling_y = int(Y_IMG / Y_FM)

    # Anchor centres sit in the middle of each feature-map cell, i.e. at
    # (k + 0.5) * stride. The offset follows the stride, not a fixed 8.
    ctr_x = (np.arange(X_FM) + 0.5) * sub_sampling_x
    ctr_y = (np.arange(Y_FM) + 0.5) * sub_sampling_y
    # "ij" indexing + ravel enumerates x in the outer loop and y in the inner one.
    grid_x, grid_y = np.meshgrid(ctr_x, ctr_y, indexing = "ij")
    ctr = np.stack([grid_y.ravel(), grid_x.ravel()], axis = 1).astype(np.float32)

    # One (h, w) pair per anchor shape: ratios vary slowest, scales fastest.
    ratio_per_shape = np.repeat(np.asarray(ratios, dtype = np.float64), len(anchor_scales))
    scale_per_shape = np.tile(np.asarray(anchor_scales, dtype = np.float64), len(ratios))
    # h extends along the Y axis and w along the X axis, so each uses the
    # stride of its own axis.
    h = sub_sampling_y * scale_per_shape * np.sqrt(ratio_per_shape)
    w = sub_sampling_x * scale_per_shape * np.sqrt(1. / ratio_per_shape)
    half_extent = np.stack([h, w], axis = 1) / 2.

    # (n_centres, 1, 2) -/+ (n_shapes, 2) -> (n_centres, n_shapes, 2); joining
    # both corners gives 4 numbers per anchor, centre-major then shape-major.
    centres = ctr[:, None, :]
    anchors = np.concatenate(
        [centres - half_extent, centres + half_extent], axis = 2
    ).reshape(-1, 4)

    # Keep only anchors lying fully inside the image on all four sides.
    index_inside = np.where(
           (anchors[:, 0] >= 0) &
           (anchors[:, 1] >= 0) &
           (anchors[:, 2] <= Y_IMG) &
           (anchors[:, 3] <= X_IMG))[0]

    # Every valid anchor starts with label -1.
    label = np.full((len(index_inside), ), -1, dtype = np.int32)
    valid_anchors = anchors[index_inside]
    
    

In [54]:
# Run anchor generation once, on the first training batch, for a 50 x 50 feature map.
for batch_images, batch_targets in data_loader:
    bbox_generation(batch_images, batch_targets, 50, 50)
    break

In [41]:
# Multiples of 16, from 16 up to and including 50 * 16 = 800.
print(np.arange(start=16, stop=(50 + 1) * 16, step=16))

[ 16  32  48  64  80  96 112 128 144 160 176 192 208 224 240 256 272 288
 304 320 336 352 368 384 400 416 432 448 464 480 496 512 528 544 560 576
 592 608 624 640 656 672 688 704 720 736 752 768 784 800]


In [46]:
# A 25 x 2 float32 array of zeros.
print(np.zeros(shape=(25, 2), dtype=np.float32))

[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [None]:
def train_epochs(req_features, model, optimizer, train_dl, val_dl, epochs = 10, rpn_lambda = 10):
    """Run the (still incomplete) RPN training loop and return the model.

    For every batch the images are stacked, pushed through the backbone layers
    in ``req_features``, and the feature-map size is passed to
    ``bbox_generation``. No loss is computed and no optimizer step is taken yet.

    Args:
        req_features: iterable of backbone layers applied in order.
        model: module being trained; put into train mode every epoch.
        optimizer, val_dl, rpn_lambda: accepted but not used yet.
        train_dl: iterable yielding ``(images, targets)`` batches.
        epochs: number of passes over ``train_dl``.

    Returns:
        ``model``, so that ``model = train_epochs(...)`` keeps a usable object.
    """
    # Device of the backbone layers (None when no layer has parameters).
    # Batches are moved there so the forward pass does not mix devices.
    feature_device = next(
        (p.device for layer in req_features for p in layer.parameters()), None
    )

    for epoch in range(epochs):
        model.train()   # https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch
        # Running statistics; not updated yet because no loss is computed.
        total = 0
        sum_loss = 0
        sum_loss_cls = 0
        sum_loss_loc = 0
        idx = 0
        for images, targets in train_dl:
            idx += 1
            num_batch = len(images)
            print(f'batch --> {idx} , epoch --> {epoch}')
            imgs_torch_all = torch.stack(list(images))
            imgs_clone     = imgs_torch_all.clone()   # copies to a new tensor
            if feature_device is not None:
                imgs_clone = imgs_clone.to(feature_device)

            for feature in req_features:
                imgs_clone = feature(imgs_clone)

            X_FM, Y_FM = imgs_clone.shape[2:]
            # NOTE(review): bbox_generation currently has no return statement,
            # so this unpacking raises TypeError until it returns three values.
            anchor_locations_all_merge, anchor_labels_all_merge, anchors = bbox_generation(images, targets, X_FM, Y_FM)

    return model
         

In [None]:
class RPN(torch.nn.Module):
    """Region Proposal Network head: a 3x3 conv followed by two 1x1 conv branches.

    Layers are referenced through ``torch.nn`` explicitly, so the class does
    not depend on which module the short ``nn`` alias is bound to.

    Args:
        in_channels: channels of the incoming feature map (512 for the VGG16
            features used in this notebook).
        mid_channels: output channels of the shared 3x3 convolution.
        n_anchor: number of anchors per feature-map location.
    """

    def __init__(self, in_channels = 512, mid_channels = 512, n_anchor = 9):
        super().__init__()
        self.in_channels  = in_channels   # depends on the feature map; 512 for VGG16
        self.mid_channels = mid_channels
        self.n_anchor     = n_anchor      # number of anchors at each location

        # Shared sliding-window conv; padding 1 keeps the spatial size.
        self.conv1 = torch.nn.Conv2d(self.in_channels, self.mid_channels, kernel_size = 3, stride = 1, padding = 1)
        # Regression branch: 4 values per anchor.
        self.reg_layer = torch.nn.Conv2d(self.mid_channels, n_anchor * 4, kernel_size = 1, stride = 1, padding = 0)
        # Classification branch: 2 scores per anchor (see the paper).
        self.cls_layer = torch.nn.Conv2d(self.mid_channels, n_anchor * 2, kernel_size = 1, stride = 1, padding = 0)

        # Same initialisation for all three layers: N(0, 0.01) weights, zero bias.
        for layer in (self.conv1, self.reg_layer, self.cls_layer):
            layer.weight.data.normal_(0, 0.01)
            layer.bias.data.zero_()

    def forward(self, k):
        """Apply the head to a feature map ``k`` of shape (N, in_channels, H, W).

        Returns:
            tuple ``(pred_anchor_boxes, pred_cls_scores)`` holding the raw
            outputs of the two branches, with shapes (N, n_anchor * 4, H, W)
            and (N, n_anchor * 2, H, W). Any reshaping to per-anchor layout is
            left to the caller.
        """
        x = self.conv1(k)
        pred_anchor_boxes = self.reg_layer(x)
        pred_cls_scores   = self.cls_layer(x)
        return pred_anchor_boxes, pred_cls_scores

In [None]:
# Optimisation hyper-parameters.
learning_rate, num_epochs = 0.0015, 20

# Fresh RPN head and an Adam optimizer over all of its parameters.
model = RPN()
parameters = model.parameters()
optimizer = torch.optim.Adam(params=parameters, lr=learning_rate)

In [None]:
# Train on the training loader only; no validation loader is passed.
model = train_epochs(
    req_features,
    model,
    optimizer,
    train_dl=data_loader,
    val_dl=None,
    epochs=num_epochs,
)

In [None]:
# Reference on dilated max pooling: https://datascience.stackexchange.com/questions/28881/what-is-dilated-pooling-and-how-it-works-mathematically