In [14]:
import torch as t
import numpy as np

In [32]:
features = t.randint(0, 255, (1, 3, 13, 13)).float()
rois = t.randint(0, 416, (2, 5)).float()
rois[:, 0] = 0
rois

tensor([[  0., 387.,  74., 142., 337.],
        [  0.,  62.,  71., 244., 409.]])

In [34]:
rois[0][1] = rois[0][3]
rois[0][3] += 100
rois

tensor([[  0., 142.,  74., 242., 337.],
        [  0.,  62.,  71., 244., 409.]])

In [53]:
class RoIPool(t.nn.Module):
    def __init__(self, spacial_scale=1. / 32, pooled_height=7, pooled_width=7):
        super(RoIPool, self).__init__()
        
        self.spacial_scale = spacial_scale
        self.pooled_height = pooled_height
        self.pooled_width = pooled_width

    def forward(self, features, rois):
        """
        RoIPooling 2D.
        :param features: (N, C, H, W) -> a batch of feature maps
        :param rois: (num_roi, 5) -> each roi is (feature_index_of_batch, x_min, y_min, x_max, y_max)
        :param spacial_scale: scale ratio -- size_feature / size_roi
        :param pooled_height: target height after pooling
        :param pooled_width: target width after pooling
        :return: pooled target with shape (num_roi, C, pooled_height, pooled_width)
        """

        _, C, H, W = features.shape
        num_roi = rois.shape[0]
        # pooled target, project each roi to the corresponding feature map
        pooled = t.zeros(num_roi, C, self.pooled_height, self.pooled_width)

        for idx, roi in enumerate(rois):
            # 1st quantization
            bbox = np.round(roi[1:].numpy() * self.spacial_scale).astype(int)
#             print("bounding boxes:", bbox)
            
            bbox_w, bbox_h = max(bbox[2] - bbox[0] + 1, 1), max(bbox[3] - bbox[1] + 1, 1)
            bin_w, bin_h = float(bbox_w) / self.pooled_width, float(bbox_h) / self.pooled_height

            for i in range(self.pooled_height):
                # 2nd quantization
                y_start = bbox[1] + int(np.floor(i * bin_h))
                y_end = bbox[1] + int(np.ceil((i + 1) * bin_h))
#                 print("y_start:", y_start)
#                 print("y_end:", y_end)
                
                
                # clip to range [0, H - 1]
                y_start = np.clip(y_start, 0, H - 1)
                y_end = np.clip(y_end, 0, H - 1)

                for j in range(self.pooled_width):
                    x_start = bbox[0] + int(np.float(j * bin_w))
                    x_end = bbox[0] + int(np.ceil((j + 1) * bin_w))
#                     print("x_start:", x_start)
#                     print("x_end:", x_end)

                    # clip to range [0, W - 1]
                    x_start = np.clip(x_start, 0, W - 1)
                    x_end = np.clip(x_end, 0, W - 1)

                    if y_end > y_start and x_end > x_start:
                        feature_idx = int(roi[0].item())
                        f = features[feature_idx]
                        p = pooled[idx]
                        p[:, i, j] = t.max(t.max(f[:, y_start:y_end, x_start:x_end], dim=1)[0], dim=1)[0]

        return pooled

In [54]:
roi_pooling = RoIPool(pooled_height=2, pooled_width=2)
out = roi_pooling(features, rois)
out

tensor([[[[252., 252.],
          [241., 241.]],

         [[242., 247.],
          [247., 247.]],

         [[225., 199.],
          [245., 245.]]],


        [[[240., 252.],
          [239., 239.]],

         [[249., 247.],
          [243., 247.]],

         [[253., 245.],
          [245., 245.]]]])

In [49]:
class RoIPyTorchMaxPooling(t.nn.Module):
    def __init__(self, spacial_scale=1. / 32, pooled_height=7, pooled_width=7):
        super(RoIPyTorchMaxPooling, self).__init__()
        
        self.spacial_scale = spacial_scale
        self.pooled_height = pooled_height
        self.pooled_width = pooled_width

    def forward(self, features, rois):
        """
        RoIPooling 2D.
        :param features: (N, C, H, W) -> a batch of feature maps
        :param rois: (num_roi, 5) -> each roi is (feature_index_of_batch, x_min, y_min, x_max, y_max)
        :param spacial_scale: scale ratio -- size_feature / size_roi
        :param pooled_height: target height after pooling
        :param pooled_width: target width after pooling
        :return: pooled target with shape (num_roi, C, pooled_height, pooled_width)
        """
        _, C, H, W = features.shape
        num_roi = rois.shape[0]
        # pooled target, project each roi to the corresponding feature map
        pooled = t.zeros(num_roi, C, self.pooled_height, self.pooled_width)

        for idx, roi in enumerate(rois):
            # quantization
            bbox = np.round(roi[1:].numpy() * self.spacial_scale).astype(np.int)
            # clip to the range of feature size
            np.clip(bbox[0::2], 0, W - 1, out=bbox[0::2])
            np.clip(bbox[1::2], 0, H - 1, out=bbox[1::2])
            
#             print("bounding box:", bbox)

            if bbox[0] < bbox[2] and bbox[1] < bbox[3]:
                feature_idx = int(roi[0].item())
                f = features[feature_idx]
                projected_roi = f[:, bbox[0]:bbox[2], bbox[1]:bbox[3]]
                # Max Pooling
                pooled_projected_roi = t.nn.functional.adaptive_max_pool2d(
                    projected_roi,
                    (self.pooled_height, self.pooled_height)
                )
                pooled[idx] = pooled_projected_roi

        return pooled

In [51]:
roi_pooling_pytorch = RoIPyTorchMaxPooling(pooled_height=2, pooled_width=2)
out = roi_pooling_pytorch(features, rois)
out

bounding box: [ 4  2  8 11]
bounding box: [ 2  2  8 13]


tensor([[[[252., 252.],
          [241., 241.]],

         [[223., 155.],
          [249., 247.]],

         [[198., 225.],
          [245., 245.]]],


        [[[198., 244.],
          [252., 220.]],

         [[247., 247.],
          [249., 247.]],

         [[253., 225.],
          [245., 235.]]]])

In [86]:
class RoIAlign(t.nn.Module):
    def __init__(self, spacial_scale=1. / 32, pooled_height=7, pooled_width=7, num_sample=4):
        super(RoIAlign, self).__init__()
        
        self.spacial_scale = spacial_scale
        self.pooled_height = pooled_height
        self.pooled_width = pooled_width
        self.num_sample = num_sample

    def forward(self, features, rois):
        _, C, H, W = features.shape
        num_roi = rois.shape[0]
        pooled = t.zeros(num_roi, C, self.pooled_height, self.pooled_width)

        for idx, roi in enumerate(rois):
            # no quantization
            bbox = roi[1:].numpy() * self.spacial_scale
            # clip to the range of the size of feature map
            np.clip(bbox[0::2], 0, W - 1, out=bbox[0::2])
            np.clip(bbox[1::2], 0, H - 1, out=bbox[1::2])
            print("bbox:", bbox)
            
            # size of each bin
            bin_w = (bbox[2] - bbox[0]) / self.pooled_width
            bin_h = (bbox[3] - bbox[1]) / self.pooled_height
            print("bin width:", bin_w)
            print("bin_height:", bin_h)

            for i in range(self.pooled_height):
                # no quantization
                bin_start_y = bbox[1] + i * bin_h
                bin_end_y = bbox[1] + (i + 1) * bin_h

                # clip to [0, H - 1]
                bin_start_y = np.clip(bin_start_y, 0, H - 1)
                bin_end_y = np.clip(bin_end_y, 0, H - 1)

                for j in range(self.pooled_width):
                    # No quantization
                    bin_start_x = bbox[0] + j * bin_w
                    bin_end_x = bbox[0] + (j + 1) * bin_w

                    # clip to [0, W - 1]
                    bin_start_x = np.clip(bin_start_x, 0, W - 1)
                    bin_end_x = np.clip(bin_end_x, 0, W - 1)

                    if bin_start_y < bin_end_y and bin_start_x < bin_end_x:
                        k = int(np.sqrt(self.num_sample))
                        sub_bin_w, sub_bin_h = bin_w / k, bin_h / k
                        # center point position of top left sub bin
                        sub_bin_tl_xc = bin_start_x + sub_bin_w / 2
                        sub_bin_tl_yc = bin_start_y + sub_bin_h / 2
                        # center point position of each sub bin
                        sub_bin_c = np.zeros((k, k, 2))

                        for m in range(k):
                            yc = sub_bin_tl_yc + m * sub_bin_h

                            for n in range(k):
                                xc = sub_bin_tl_xc + n * sub_bin_w
                                sub_bin_c[m, n] = [xc, yc]

                        batch_index = int(roi[0].item())
                        f = features[batch_index]
                        # interpolated value on target position
                        # (C, k, k)
#                         print("feature:", f)
#                         print("sub bin center:", sub_bin_c)
                        interpolated_f = self.interpolation(f, sub_bin_c)
#                         print("interpolated:", interpolated_f)
                        p = pooled[idx]
                        # max pooling
                        p[:, i, j] = t.nn.functional.adaptive_max_pool2d(interpolated_f, 1).squeeze()

        return pooled

    @staticmethod
    def interpolation(feature, pos):
        out_c, H, W = feature.shape
        out_h, out_w = pos.shape[:2]
        out = t.zeros((out_c, out_h, out_w))

        for i in range(out_h):
            for j in range(out_w):
                x, y = pos[i][j][0], pos[i][j][1]
                x0, y0 = int(np.floor(x)), int(np.floor(y))
                x0, y0 = np.clip(x0, 0, W - 2), np.clip(y0, 0, H - 2)
                x1, y1 = x0 + 1 , y0 + 1

                # pixel value of 4 adjacent point
                # left top, right top, left bottom, right bottom
                lt, rt, lb, rb = feature[:, y0, x0], feature[:, y0, x1], feature[:, y1, x0], feature[:, y1, x1]
                # interpolation in x direction
                # middle top, middle bottom
                mt, mb = (x - x0) * rt + (x1 - x) * lt, (x - x0) * rb + (x1- x) * lb
                # interpolation in y direction
                pixel_xy = (y - y0) * mb + (y1 - y) * mt
                out[:, i, j] = pixel_xy

        return out

In [87]:
roi_align = RoIAlign(pooled_height=2, pooled_width=2)
out = roi_align(features, rois)
out

bbox: [ 4.4375   2.3125   7.5625  10.53125]
bin width: 1.5625
bin_height: 4.109375
sub bin center: [[[4.828125   3.33984375]
  [5.609375   3.33984375]]

 [[4.828125   5.39453125]
  [5.609375   5.39453125]]]
interpolated: tensor([[[ 79.2077,  58.1603],
         [101.0943, 134.1624]],

        [[176.6879, 178.0975],
         [182.2933, 162.4406]],

        [[145.8712, 111.7143],
         [ 59.3114,  41.9276]]])
sub bin center: [[[6.390625   3.33984375]
  [7.171875   3.33984375]]

 [[6.390625   5.39453125]
  [7.171875   5.39453125]]]
interpolated: tensor([[[ 43.4799,  48.2321],
         [142.4601, 119.0259]],

        [[188.4308, 192.5371],
         [145.9031, 138.9342]],

        [[ 76.4543,  63.5789],
         [ 50.1445,  91.2532]]])
sub bin center: [[[4.828125   7.44921875]
  [5.609375   7.44921875]]

 [[4.828125   9.50390625]
  [5.609375   9.50390625]]]
interpolated: tensor([[[141.5564, 153.1356],
         [120.1500, 128.6876]],

        [[154.3686, 112.7633],
         [ 72.0949, 170.

tensor([[[[134.1624, 142.4601],
          [153.1356, 126.5182]],

         [[182.2933, 192.5371],
          [170.3421, 174.9823]],

         [[145.8712,  91.2532],
          [178.7250, 200.1117]]],


        [[[145.9168, 102.6183],
          [128.7469, 201.6103]],

         [[219.4224, 190.7277],
          [195.0827, 197.0193]],

         [[153.1397, 107.9158],
          [184.9332, 146.1621]]]])