simplified_dropconnect.py
import numpy

from chainer import cuda
from chainer import function
from chainer.utils import type_check
from chainer import variable


def _as_mat(x):
    # Flatten any trailing dimensions so ``x`` becomes a 2-D matrix of
    # shape ``(batch_size, in_size)``.
    if x.ndim == 2:
        return x
    return x.reshape(len(x), -1)


def _matmul(a, b, xp):
    if xp is numpy:
        # numpy 1.9 does not support matmul.
        # So we use numpy.einsum instead of numpy.matmul.
        return xp.einsum('...jk,...kl->...jl', a, b)
    else:
        return xp.matmul(a, b)
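
# Illustrative shapes for ``_matmul``: given ``a`` with shape (n, M, N) and
# ``b`` with shape (n, N, 1), both the einsum fallback and ``xp.matmul``
# return an (n, M, 1) array, one (M, N) x (N, 1) product per batch element;
# plain 2-D inputs behave like ``xp.dot``.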


class SimplifiedDropconnect(function.Function):

    """Linear unit regularized by simplified dropconnect."""

    def __init__(self, ratio, mask=None, use_batchwise_mask=True):
        self.ratio = ratio
        self.mask = mask
        self.use_batchwise_mask = use_batchwise_mask

    def check_type_forward(self, in_types):
        n_in = in_types.size()
        type_check.expect(2 <= n_in, n_in <= 3)
        x_type, w_type = in_types[:2]

        type_check.expect(
            x_type.dtype.kind == 'f',
            w_type.dtype.kind == 'f',
            x_type.ndim >= 2,
            w_type.ndim == 2,
            type_check.prod(x_type.shape[1:]) == w_type.shape[1],
        )

        if type_check.eval(n_in) == 3:
            b_type = in_types[2]
            type_check.expect(
                b_type.dtype == x_type.dtype,
                b_type.ndim == 1,
                b_type.shape[0] == w_type.shape[0],
            )

        if self.mask is not None:
            if self.use_batchwise_mask:
                type_check.expect(
                    self.mask.shape[0] == x_type.shape[0],
                    self.mask.shape[1:] == w_type.shape,
                )
            else:
                type_check.expect(self.mask.shape == w_type.shape)

    def forward(self, inputs):
        scale = inputs[1].dtype.type(1. / (1 - self.ratio))
        xp = cuda.get_array_module(*inputs)

        if self.mask is None:
            # Sample a fresh mask: each weight element is kept with
            # probability ``1 - ratio``.
            if self.use_batchwise_mask:
                mask_shape = (inputs[0].shape[0], inputs[1].shape[0],
                              inputs[1].shape[1])
            else:
                mask_shape = (inputs[1].shape[0], inputs[1].shape[1])
            if xp is numpy:
                self.mask = xp.random.rand(*mask_shape) >= self.ratio
            else:
                self.mask = xp.random.rand(*mask_shape,
                                           dtype=numpy.float32) >= self.ratio
        elif isinstance(self.mask, variable.Variable):
            self.mask = self.mask.data

        x = _as_mat(inputs[0])
        W = inputs[1] * scale * self.mask

        # (i)jk,ik->ij
        y = _matmul(W, x[:, :, None], xp)
        y = y.reshape(y.shape[0], y.shape[1]).astype(x.dtype, copy=False)

        if len(inputs) == 3:
            b = inputs[2]
            y += b
        return y,
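
    # Gradient sketch: with the effective weight Wm = W * mask * scale held
    # fixed, y = x.dot(Wm.T) + b implies gx = gy.dot(Wm) for each sample,
    # while gW accumulates the outer product of gy_i and x_i over the batch,
    # masked and rescaled. The einsum-style comments below spell this out.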

    def backward(self, inputs, grad_outputs):
        scale = inputs[1].dtype.type(1. / (1 - self.ratio))
        x = _as_mat(inputs[0])
        W = inputs[1] * scale * self.mask
        gy = grad_outputs[0]
        xp = cuda.get_array_module(*inputs)

        # ij,(i)jk->ik
        gx = _matmul(gy[:, None, :], W, xp).reshape(inputs[0].shape)
        gx = gx.astype(x.dtype, copy=False)

        # ij,ik,ijk->jk
        gW = (gy[:, :, None] * x[:, None, :] * self.mask).sum(0) * scale
        gW = gW.astype(W.dtype, copy=False)

        if len(inputs) == 3:
            gb = gy.sum(0)
            return gx, gW, gb
        else:
            return gx, gW
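

# Illustrative mask shapes: with ``use_batchwise_mask=True`` the mask has
# shape (batch_size, out_size, in_size), so every sample drops its own set
# of connections; with ``False`` it has shape (out_size, in_size) and the
# whole mini-batch shares a single pattern.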


def simplified_dropconnect(x, W, b=None, ratio=.5, train=True, mask=None,
                           use_batchwise_mask=True):
    """Linear unit regularized by simplified dropconnect.

    Simplified dropconnect drops weight matrix elements randomly with
    probability ``ratio`` and scales the remaining elements by factor
    ``1 / (1 - ratio)``.

    It accepts two or three arguments: an input minibatch ``x``, a weight
    matrix ``W``, and optionally a bias vector ``b``. It computes
    :math:`Y = xW^\\top + b`.

    In testing mode, zero is used as the simplified dropconnect ratio
    instead of ``ratio``.

    Notice:
        This implementation cannot be used to reproduce the results of the
        original paper, because it differs from the original method.
        The original version samples from a Gaussian distribution before
        applying the activation function, whereas this implementation
        averages before the activation.

    Args:
        x (chainer.Variable or :class:`numpy.ndarray` or cupy.ndarray):
            Input variable. Its first dimension ``n`` is assumed
            to be the *minibatch dimension*. The other dimensions are
            treated as one concatenated dimension whose size must be ``N``.
        W (~chainer.Variable): Weight variable of shape ``(M, N)``.
        b (~chainer.Variable): Bias variable (optional) of shape ``(M,)``.
        ratio (float):
            Dropconnect ratio.
        train (bool):
            If ``True``, executes simplified dropconnect.
            Otherwise, the simplified dropconnect function works as a
            linear function.
        mask (None or chainer.Variable or numpy.ndarray or cupy.ndarray):
            If ``None``, a randomized dropconnect mask is generated.
            Otherwise, the mask must be an ``(n, M, N)`` or ``(M, N)``
            shaped array, and ``use_batchwise_mask`` is ignored.
            The main purpose of this option is debugging.
            The ``mask`` array is used as the dropconnect mask.
        use_batchwise_mask (bool):
            If ``True``, dropped connections depend on each sample in
            the mini-batch.

    Returns:
        ~chainer.Variable: Output variable.

    .. seealso:: :class:`~chainer.links.Dropconnect`

    .. seealso::
        Wan, L., Zeiler, M., Zhang, S., LeCun, Y., Fergus, R. (2013).
        Regularization of Neural Networks using DropConnect.
        International Conference on Machine Learning.
        `URL <http://cs.nyu.edu/~wanli/dropc/>`_

    """
    if not train:
        ratio = 0
    if b is None:
        return SimplifiedDropconnect(ratio, mask, use_batchwise_mask)(x, W)
    else:
        return SimplifiedDropconnect(ratio, mask, use_batchwise_mask)(x, W, b)
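

if __name__ == '__main__':
    # Minimal usage sketch; the shapes, seed, and tolerance below are
    # illustrative assumptions, not reference values.
    n, in_size, out_size = 4, 3, 2
    rng = numpy.random.RandomState(0)
    x = rng.randn(n, in_size).astype(numpy.float32)
    W = rng.randn(out_size, in_size).astype(numpy.float32)
    b = numpy.zeros(out_size, dtype=numpy.float32)

    # Training mode: a fresh batchwise mask is sampled and the kept
    # weights are rescaled by 1 / (1 - ratio).
    y_train = simplified_dropconnect(x, W, b, ratio=0.5, train=True)
    print('train output shape:', y_train.data.shape)

    # Test mode: ratio is forced to 0, so the result equals the plain
    # linear unit x.dot(W.T) + b.
    y_test = simplified_dropconnect(x, W, b, ratio=0.5, train=False)
    assert numpy.allclose(y_test.data, x.dot(W.T) + b, atol=1e-5)

    # An explicit all-ones mask with ratio=0 also reproduces the linear
    # unit; passing a mask is mainly useful for debugging.
    ones = numpy.ones((n, out_size, in_size), dtype=numpy.float32)
    y_fixed = simplified_dropconnect(x, W, b, ratio=0., train=True,
                                     mask=ones)
    assert numpy.allclose(y_fixed.data, x.dot(W.T) + b, atol=1e-5)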