In [13]:
import torch
import numpy as np
from torchvision.ops.roi_pool import RoIPool
from torchvision.ops.roi_align import RoIAlign

In [6]:
class TorchROIPool(object):
    """Pure-PyTorch reference implementation of RoI max pooling.

    Each proposal is scaled onto the feature map, the covered region is cut
    out, and that region is max-pooled down to a fixed
    ``output_size x output_size`` grid, independently per channel.
    """

    def __init__(self, output_size, scaling_factor):
        """ROI max pooling works by dividing the hxw RoI window into an HxW grid of
           approximately size h/H x w/W and then max-pooling the values in each
           sub-window. Pooling is applied independently to each feature map channel.

        Args:
            output_size (int): side length H (= W) of the pooled output grid.
            scaling_factor (float): factor the proposal coordinates are
                multiplied by to map them onto the feature map.
        """
        self.output_size = output_size
        self.scaling_factor = scaling_factor

    def _roi_pool(self, features):
        """Max-pool one extracted region to ``output_size x output_size``.

        Args:
            features (torch.Tensor): region cut out of the feature map, of
                shape (num_channels, region_height, region_width).

        Returns:
            tuple(torch.Tensor, torch.Tensor):
                - pooled maxima, shape (num_channels, output_size, output_size),
                  with the dtype and device of ``features``;
                - int64 index of each maximum inside its flattened bin (local
                  to the bin, not to the feature map). Bins without any
                  element keep a pooled value of 0 and an index of -1.
        """
        num_channels, h, w = features.shape
        out_size = self.output_size

        w_stride = w / out_size
        h_stride = h / out_size

        # allocate next to the input so dtype/device never mismatch
        res = features.new_zeros((num_channels, out_size, out_size))
        res_idx = torch.full((num_channels, out_size, out_size), -1,
                             dtype=torch.long, device=features.device)

        for i in range(out_size):
            # floor the start and ceil the end so neighbouring bins overlap
            # rather than leave gaps, then clamp to the region limits
            h_start = min(max(int(np.floor(i * h_stride)), 0), h)
            h_end = min(max(int(np.ceil((i + 1) * h_stride)), 0), h)

            for j in range(out_size):
                w_start = min(max(int(np.floor(j * w_stride)), 0), w)
                w_end = min(max(int(np.ceil((j + 1) * w_stride)), 0), w)

                patch = features[:, h_start:h_end, w_start:w_end]
                if patch.numel() == 0:
                    # empty bin (degenerate region): torch.max would raise,
                    # so leave the 0 / -1 defaults in place
                    continue

                max_val, max_idx = torch.max(patch.reshape(num_channels, -1), dim=1)
                res[:, i, j] = max_val
                res_idx[:, i, j] = max_idx

        return res, res_idx

    def __call__(self, feature_layer, proposals):
        """Given a feature layer and a set of proposals, return the pooled
        representations of the proposals. Proposals are scaled by the scaling
        factor before pooling.

        Args:
            feature_layer (torch.Tensor): feature map of shape
                (batch, num_channels, height, width). Only the first batch
                element is read.
            proposals (torch.Tensor): shape (num_proposals, 4); each row is a
                box given by its corner coordinates (x1, y1, x2, y2), with the
                end coordinates inclusive.

        Returns:
            torch.Tensor: shape (len(proposals), num_channels, output_size,
            output_size), with the dtype and device of ``feature_layer``.
        """
        _, num_channels, feat_h, feat_w = feature_layer.shape

        # Scale onto the feature map and round every coordinate up.
        # NOTE(review): torchvision's roi_pool appears to round to nearest
        # instead; ceil agrees with that for the .5 fractions produced by
        # integer boxes at scale 0.5 -- confirm before using other scales.
        scaled_proposals = torch.ceil(proposals[:, :4] * self.scaling_factor)

        # Plain Python ints: the previous int8 cast overflowed for any
        # coordinate above 127 and silently sliced the wrong region.
        boxes = scaled_proposals.to(torch.int64).tolist()

        res = feature_layer.new_zeros((len(boxes), num_channels,
                                       self.output_size, self.output_size))

        for idx, (x1, y1, x2, y2) in enumerate(boxes):
            # +1 makes the end coordinate inclusive; clamping to the map keeps
            # negative values from wrapping around via negative indexing
            row_start = min(max(y1, 0), feat_h)
            row_end = min(max(y2 + 1, 0), feat_h)
            col_start = min(max(x1, 0), feat_w)
            col_end = min(max(x2 + 1, 0), feat_w)

            extracted_feat = feature_layer[0, :, row_start:row_end, col_start:col_end]
            pooled, _ = self._roi_pool(extracted_feat)
            res[idx] = pooled

        return res

In [3]:
# Seed the generator so the random feature map and proposals are reproducible
torch.manual_seed(0)

# Work in float64; create tensors on the GPU when one is available and fall
# back to the CPU otherwise, so the notebook also runs on CPU-only machines
if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.DoubleTensor)
else:
    torch.set_default_tensor_type(torch.DoubleTensor)

# create feature layer and proposals
num_proposals = 10
feat_layer = torch.randn(1, 64, 32, 32)

# each row is (x1, y1, x2, y2): start corner drawn from [0, 16), end corner
# from [16, 32), so every box has end >= start
proposals = torch.zeros((num_proposals, 4))
proposals[:, 0] = torch.randint(0, 16, (num_proposals,))
proposals[:, 1] = torch.randint(0, 16, (num_proposals,))
proposals[:, 2] = torch.randint(16, 32, (num_proposals,))
proposals[:, 3] = torch.randint(16, 32, (num_proposals,))

In [14]:
# Hand-written RoI pooling: 3x3 output, proposal coordinates multiplied by 0.5
my_roi_pool = TorchROIPool(output_size=3, scaling_factor=0.5)
roi_pool1 = my_roi_pool(feat_layer, proposals)

# torchvision's RoIPool on the same inputs, with the boxes wrapped in a list
roi_pool = RoIPool(output_size=3, spatial_scale=0.5)
roi_pool2 = roi_pool(feat_layer, [proposals])

In [46]:
# Pooled map of the first proposal, first channel
roi_pool1[0, 0]

tensor([[2.4133, 2.7335, 1.6860],
        [2.6391, 2.1293, 2.1205],
        [3.0989, 2.8901, 2.8901]])

torch.Size([1, 64, 32, 32])

In [33]:
# Repeat the proposal scaling done in TorchROIPool.__call__ (scale by 0.5,
# then round every coordinate up) so the intermediate values can be inspected
scaled_proposals = torch.ceil(proposals * 2**-1)

In [35]:
# Inspect the first scaled proposal
scaled_proposals[0]

tensor([ 0.,  8., 15., 16.])

In [36]:
# Zero-filled buffer of shape (10, 64, 3, 3)
res = torch.zeros(10, 64, 3, 3)

In [42]:
# Slice rows 0:15 and columns 8:16 of every channel of the first batch element.
# NOTE(review): TorchROIPool.__call__ slices rows proposal[1]:proposal[3]+1 and
# columns proposal[0]:proposal[2]+1, which for the proposal [0, 8, 15, 16]
# displayed above would be rows 8:17 and columns 0:16 -- confirm which region
# was intended here.
feat_layer[0, :, 0:15 , 8:16]

torch.Size([64, 15, 8])