/
deformable_convolution_2d_sampler.py
149 lines (117 loc) · 5.58 KB
/
deformable_convolution_2d_sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import numpy
from chainer import backend
from chainer.functions.array import broadcast
from chainer.functions.array import concat
from chainer.functions.array import pad as pad_module
from chainer.functions.array import spatial_transformer_sampler
from chainer.functions.math import matmul
def deformable_convolution_2d_sampler(x, offset, W, b=None, stride=1, pad=0):
"""Two-dimensional deformable convolution function using computed offset.
This is an implementation of two-dimensional deformable convolution from
`Deformable Convolutional Networks <https://arxiv.org/abs/1703.06211>`_.
It takes four variables: the input image ``x``, the offset image
``offset``, the filter weight ``W``, and the bias vector ``b``.
Notation: here is the notation for the dimensionalities.
- :math:`n` is the batch size.
- :math:`c_I` and :math:`c_O` are the number of the input and output,
respectively.
- :math:`h` and :math:`w` are the height and width of the input image,
respectively.
- :math:`k_H` and :math:`k_W` are the height and width of the filters,
respectively.
- :math:`s_Y` and :math:`s_X` are the strides of the filter.
- :math:`p_H` and :math:`p_W` are the spatial padding sizes.
The output size :math:`(h_O, w_O)` is determined by the following
equations:
.. math::
h_O &= (h + 2p_H - k_H) / s_Y + 1,\\\\
w_O &= (w + 2p_W - k_W) / s_X + 1.
Args:
x (:class:`~chainer.Variable` or :ref:`ndarray`):
Input variable of shape :math:`(n, c_I, h, w)`.
offset (:class:`~chainer.Variable` or :ref:`ndarray`):
Offset variable of shape
:math:`(n, 2 \\cdot k_H \\cdot k_W, h_O, w_O)`. The first
:math:`k_H \\cdot k_W` index of the second axis corresponds to
the offsets in the horizontal direction. The last
:math:`k_H \\cdot k_W` index of the second axis corresponds to
the offsets in the vertical direction.
W (:class:`~chainer.Variable` or :ref:`ndarray`):
Weight variable of shape :math:`(c_O, c_I, k_H, k_W)`.
b (:class:`~chainer.Variable` or :ref:`ndarray`):
Bias variable of length :math:`c_O` (optional).
stride (int or pair of ints): Stride of filter applications.
``stride=s`` and ``stride=(s, s)`` are equivalent.
pad (int or pair of ints): Spatial padding width for input arrays.
``pad=p`` and ``pad=(p, p)`` are equivalent.
Returns:
~chainer.Variable: Output variable.
Deformable convolution adds 2D offsets to the regular grid sampling
locations in the standard convolution. It enables free form deformation of
the sampling grid.
See `Jifeng Dai, Haozhi Qi, Yuwen Xiong, Yi Li, Guodong Zhang, Han Hu,
Yichen Wei. Deformable Convolutional Networks
<https://arxiv.org/abs/1703.06211>`_
If the bias vector is given, then it is added to all spatial locations of
the output of convolution.
.. seealso::
:class:`~chainer.links.DeformableConvolution2D` to manage the model
parameters ``W`` and ``b``.
.. admonition:: Example
>>> x = np.random.uniform(0, 1, (2, 3, 4, 7)).astype(np.float32)
>>> offset = np.random.uniform(
... 0, 1, (2, 2 * 3 * 3, 2, 5)).astype(np.float32)
>>> W = np.random.uniform(0, 1, (4, 3, 3, 3)).astype(np.float32)
>>> b = np.random.uniform(0, 1, (4,)).astype(np.float32)
>>> y = F.deformable_convolution_2d_sampler(x, offset, W, b)
>>> y.shape
(2, 4, 2, 5)
"""
sy, sx = _pair(stride)
ph, pw = _pair(pad)
out_c, _, kh, kw = W.shape
n, c, h, w = x.shape
_, khkw2, out_h, out_w = offset.shape
if khkw2 != 2 * kh * kw:
raise ValueError(
'The shape of the offset does not match the kernel size')
grid = _offset2grid(offset, kh, kw, sy, sx, ph, pw, h, w)
grid = grid.reshape(n, 2, kh * kw, out_h * out_w)
x_pad = pad_module.pad(x, ((0, 0), (0, 0), (ph, ph), (pw, pw)), 'constant')
x_st = spatial_transformer_sampler.spatial_transformer_sampler(
x_pad, grid)
x_st = x_st.transpose(0, 3, 1, 2).reshape(n * out_h * out_w, c * kh * kw)
W = W.transpose(1, 2, 3, 0).reshape(c * kh * kw, out_c)
y = matmul.matmul(x_st, W)
y = y.reshape(n, out_h, out_w, out_c).transpose(0, 3, 1, 2)
if b is not None:
b = broadcast.broadcast_to(b[None, :, None, None], y.shape)
y += b
return y
def _offset2grid(offset, kh, kw, sy, sx, ph, pw, h, w):
n, khkw2, out_h, out_w = offset.shape
khkw = int(khkw2 / 2)
xp = backend.get_array_module(offset)
ys, xs = xp.meshgrid(
xp.arange(0, sy * out_h, sy, dtype=numpy.float32),
xp.arange(0, sx * out_w, sx, dtype=numpy.float32), indexing='ij',
copy=False
)
filter_offset_x = xp.tile(xp.arange(kw, dtype=numpy.float32), kh)
filter_offset_y = xp.repeat(xp.arange(kh, dtype=numpy.float32), kw)
x_coord = (offset[:, :khkw] + xs[None, None] +
filter_offset_x[None, :, None, None])
y_coord = (offset[:, khkw:] + ys[None, None] +
filter_offset_y[None, :, None, None])
# The values of this variable is clipped in range [-1, 1].
# The coordinate (-1, -1) corresponds to the upper-left
# corner of the input image.
x_coord = (x_coord / (w + 2 * pw - 1) - 0.5) * 2
y_coord = (y_coord / (h + 2 * ph - 1) - 0.5) * 2
# Shape of `coord` is (n, 2 * kh * kw, out_h, out_w)
coord = concat.concat([x_coord, y_coord], axis=1)
return coord
def _pair(x):
if hasattr(x, '__getitem__'):
return x
return x, x