import numpy
import six

import chainer
from chainer import backend
from chainer.backends import cuda
from chainer import function_node
from chainer.functions.activation import log_softmax
from chainer.utils import type_check
from chainer import variable
import chainerx


def _broadcast_to(array, shape):
    # numpy.broadcast_to is only available in NumPy >= 1.10; emulate it
    # with broadcast_arrays otherwise.
    if hasattr(numpy, 'broadcast_to'):
        return numpy.broadcast_to(array, shape)
    dummy = numpy.empty(shape, array.dtype)
    return numpy.broadcast_arrays(array, dummy)[0]


def _check_class_weight_option(class_weight):
    if class_weight is not None:
        if class_weight.ndim != 1:
            raise ValueError('class_weight.ndim should be 1')
        if class_weight.dtype.kind != 'f':
            raise ValueError('The dtype of class_weight should be \'f\'')
        if isinstance(class_weight, variable.Variable):
            raise ValueError('class_weight should be a numpy.ndarray or '
                             'cupy.ndarray, not a chainer.Variable')


def _check_reduce_option(reduce):
    if reduce not in ('mean', 'no'):
        raise ValueError(
            'only \'mean\' and \'no\' are valid for \'reduce\', but \'%s\' '
            'is given' % reduce)


def _check_input_values(x, t, ignore_label):
    # Extract the raw ndarray, as Variable.__ge__ is not implemented.
    # We assume that t is already an ndarray.
    if isinstance(x, variable.Variable):
        x = x.data

    if not (((0 <= t) &
             (t < x.shape[1])) |
            (t == ignore_label)).all():
        msg = ('Each label `t` needs to satisfy '
               '`0 <= t < x.shape[1] or t == %d`' % ignore_label)
        raise ValueError(msg)


def _reduction_dtype(x_dtype):
    # Returns the dtype for accumulation and output of reduction.
    # For float16 input, float32 is used.
    # Otherwise the same dtype as the input is used.
    if x_dtype == numpy.float16:
        return numpy.float32
    return x_dtype
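

# Illustrative note (not part of the library): long float16 reductions can
# accumulate rounding error, so the sums below are carried out in float32
# and cast back to the input dtype. A minimal sketch of the pattern,
# assuming NumPy inputs:
#
#     x = numpy.ones((8, 4), dtype=numpy.float16)
#     acc_dtype = _reduction_dtype(x.dtype)  # numpy.float32 for float16
#     total = x.sum(dtype=acc_dtype)         # accumulate in float32
#     total = total.astype(x.dtype)          # cast back to float16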


class SoftmaxCrossEntropy(function_node.FunctionNode):

    """Softmax activation followed by a cross entropy loss."""

    normalize = True
    y = None
    # Coefficient of normalization. Only used if reduce='mean'.
    _coeff = None
    soft_target = False
    eps = 1e-7

    def __init__(self, normalize=True, cache_score=True, class_weight=None,
                 ignore_label=-1, reduce='mean',
                 soft_target_loss='cross-entropy'):
        self.normalize = normalize
        self.cache_score = cache_score
        _check_class_weight_option(class_weight)
        self.class_weight = class_weight
        self.ignore_label = ignore_label
        _check_reduce_option(reduce)
        self.reduce = reduce
        self.soft_target_loss = soft_target_loss

    def check_type_forward(self, in_types):
        type_check._argname(in_types, ('x', 't'))
        x_type, t_type = in_types

        if t_type.dtype.kind == 'i':
            type_check.expect(
                x_type.dtype.kind == 'f',
                t_type.dtype.kind == 'i',
                t_type.ndim == x_type.ndim - 1,

                x_type.shape[0] == t_type.shape[0],
                x_type.shape[2:] == t_type.shape[1:],
            )
        else:
            # assume t is soft_target
            type_check.expect(
                x_type.dtype.kind == 'f',
                t_type.dtype.kind == 'f',
                x_type.shape == t_type.shape,
            )

    def _is_chainerx_supported(self, input_arrays):
        # Determines if the specified configuration of inputs and parameters
        # is supported by the `forward_chainerx` implementation.

        # TODO(niboshi): Support these conditions.
        if self.class_weight is not None:
            return False
        if self.ignore_label != -1:
            return False
        if self.reduce != 'mean':
            return False
        x, t = input_arrays
        if x.ndim != 2:
            return False
        return True

    def forward_chainerx(self, inputs):
        # TODO(niboshi): Current implementation is only intended to support
        # MNIST example.
        x, t = inputs
        num_classes = x.shape[1]
        score = chainerx.log_softmax(x, axis=1)
        # Build a one-hot mask by comparing the labels against the class
        # indices, then average the selected negative log-probabilities.
        mask = (t[:, chainerx.newaxis] == chainerx.arange(
            num_classes, dtype=t.dtype, device=x.device)).astype(score.dtype)
        # TODO(beam2d): implement mean
        y = -(score * mask).sum() * (1 / x.shape[0])
        return y,
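
    # For reference, the ChainerX forward above matches this NumPy sketch
    # (a hypothetical helper, not used by the library): select the
    # log-probabilities of the true classes with a one-hot mask and average
    # their negations over the batch.
    #
    #     def _reference_mean_nll(x, t):
    #         z = x - x.max(axis=1, keepdims=True)  # stabilized logits
    #         log_p = z - numpy.log(numpy.exp(z).sum(axis=1, keepdims=True))
    #         mask = t[:, None] == numpy.arange(x.shape[1])
    #         return -(log_p * mask).sum() / x.shape[0]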

    def forward_cpu(self, inputs):
        class_weight = backend.from_chx(self.class_weight)

        self.retain_inputs((0, 1))
        x, t = inputs
        if x.ndim == t.ndim and x.shape == t.shape:
            self.soft_target = True
        if chainer.is_debug() and not self.soft_target:
            _check_input_values(x, t, self.ignore_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        if self.soft_target:
            return self._soft_target_loss(numpy, x, t, log_y)
        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= _broadcast_to(class_weight.reshape(shape), x.shape)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        t_valid = t != self.ignore_label
        t = t * t_valid
        log_p = log_yd[t.ravel(), numpy.arange(t.size)]

        log_p *= t_valid.ravel()
        if self.reduce == 'mean':
            # deal with the case where the SoftmaxCrossEntropy is
            # unpickled from the old version
            if self.normalize:
                count = t_valid.sum()
            else:
                count = len(x)
            self._coeff = 1.0 / max(count, 1)

            # Perform the reduction in a promoted dtype.
            reduc_dtype = _reduction_dtype(x.dtype)
            y = log_p.sum(keepdims=True, dtype=reduc_dtype)
            y = y * (-self._coeff)
            y = y.astype(x.dtype, copy=False)
            return y.reshape(()),
        else:
            return -log_p.reshape(t.shape),

    def forward_gpu(self, inputs):
        class_weight = backend.from_chx(self.class_weight)

        self.retain_inputs((0, 1))
        x, t = inputs
        if x.ndim == t.ndim and x.shape == t.shape:
            self.soft_target = True
        cupy = cuda.cupy
        if chainer.is_debug() and not self.soft_target:
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.soft_target:
            return self._soft_target_loss(cupy, x, t, log_y)
        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)
        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            # Reduction is performed in a promoted dtype
            reduc_dtype = _reduction_dtype(x.dtype)
            if self.normalize:
                count = (t != self.ignore_label).sum(dtype=reduc_dtype)
                count = cupy.maximum(1, count)
                coeff = 1. / count
            else:
                coeff = cupy.array(1. / max(1, len(t)), dtype=reduc_dtype)
            self._coeff = coeff

            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw U coeff, '
                'S ignore_label',
                'U out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = static_cast<U>(a * -coeff[0])', '0',
                'crossent_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1],
              self._coeff, self.ignore_label)
            ret = ret.astype(log_y.dtype, copy=False)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
                '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''',
                'softmax_crossent_no_reduce_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,

    def _soft_target_loss(self, xp, x, t, log_y):
        if self.soft_target_loss == 'kl-divergence':
            ret = xp.sum(t * (xp.log(t + self.eps) - log_y), axis=1)
        else:
            ret = -xp.sum(t * log_y, axis=1)
        if self.reduce == 'mean':
            self._coeff = 1.0 / (x.size / x.shape[1])
            ret = ret.sum(keepdims=True) * self._coeff
            return ret.reshape(()),
        else:
            return ret,
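
    # Relation between the two soft-target losses above: for a label
    # distribution t and predicted probabilities y = softmax(x),
    #     cross entropy:  H(t, y)  = -sum_i t_i * log(y_i)
    #     KL divergence:  KL(t||y) =  sum_i t_i * (log(t_i) - log(y_i))
    # They differ only by the entropy of t, which is constant in x, so both
    # produce the same gradient y - t with respect to the logits.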

    def backward(self, input_indexes, grad_outputs):
        func_grad = _SoftmaxCrossEntropyGrad_NoDoubleBackprop(
            self.ignore_label, self.class_weight, self.y, self._coeff,
            self.soft_target)
        inputs = self.get_retained_inputs()
        return func_grad.apply(inputs + grad_outputs) + (None,)


class _SoftmaxCrossEntropyGrad_NoDoubleBackprop(function_node.FunctionNode):

    # A backward implementation which does not support double-backprop.

    def __init__(self, ignore_label, class_weight, y, coeff, soft_target):
        self.ignore_label = ignore_label
        self.class_weight = class_weight
        self.y = y
        self.coeff = coeff
        self.soft_target = soft_target

    def forward_cpu(self, inputs_and_grad_outputs):
        x, t, gloss = inputs_and_grad_outputs
        if x.size == 0:
            return numpy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y.copy()
        else:
            y = log_softmax._log_softmax(x)
            numpy.exp(y, out=y)
        t_valid = t != self.ignore_label
        t = t * t_valid
        if self.soft_target:
            gx = y - t
        elif y.ndim == 2:
            gx = y
            gx[numpy.arange(len(t)), t] -= 1
            if self.class_weight is not None:
                shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
                c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
                c = c[numpy.arange(len(t)), t]
                gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
            gx *= t_valid.reshape((len(t), 1))
        else:
            # When y.ndim is higher than 2, the current implementation is
            # inefficient because it creates two temporary index arrays.
            n_unit = t.size // len(t)
            gx = y.reshape(y.shape[0], y.shape[1], -1)
            fst_index = numpy.arange(t.size) // n_unit
            trd_index = numpy.arange(t.size) % n_unit
            gx[fst_index, t.ravel(), trd_index] -= 1
            if self.class_weight is not None:
                shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
                c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
                c = c.reshape(gx.shape)
                c = c[fst_index, t.ravel(), trd_index]
                c = c.reshape(y.shape[0], 1, -1)
                gx *= _broadcast_to(c, gx.shape)
            gx *= t_valid.reshape((len(t), 1, -1))
            gx = gx.reshape(y.shape)
        if self.coeff is not None:
            gx *= gloss * self.coeff
        else:
            gx *= gloss[:, None]
        return gx,
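
    # The gradient computed above follows from the standard identity: for
    # L = -log(softmax(x)[t]), the derivative with respect to logit c is
    #     dL/dx_c = softmax(x)_c - [c == t]
    # i.e. gx = y - onehot(t), scaled by the incoming gradient gloss and,
    # under reduce='mean', by the normalization coefficient.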

    def forward_gpu(self, inputs_and_grad_outputs):
        class_weight = cuda.to_gpu(self.class_weight)
        cupy = cuda.cupy
        x, t, gloss = inputs_and_grad_outputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        n_unit = t.size // len(t)
        if self.coeff is not None:
            coeff = self.coeff
        else:
            gloss = gloss[:, None, ...]
            coeff = cupy.array(1, dtype=gloss.dtype)  # dtype does not matter

        if self.soft_target:
            gx = gloss * coeff * (y - t)
        elif self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T gloss, U coeff, S n_channel, S n_unit, '
                'S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    if (t == ignore_label) {
                      gx = T(0);
                    } else {
                      gx = static_cast<T>(gloss * coeff * (y - (c == t)));
                    }
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), gloss, coeff, x.shape[1],
                    n_unit, self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T gloss, U coeff, '
                'S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    if (t == ignore_label) {
                      gx = T(0);
                    } else {
                      gx = static_cast<T>(
                          gloss * coeff * (y - (c == t)) * w[t]);
                    }
                ''',
                'softmax_crossent_weight_bwd')(
                    y, class_weight, cupy.expand_dims(t, 1), gloss, coeff,
                    x.shape[1], n_unit, self.ignore_label)

        return gx,

    def backward(self, input_indexes, grad_outputs):
        raise RuntimeError(
            'F.softmax_cross_entropy was called with '
            '\'enable_double_backprop=False\' argument, but double-backprop '
            'is actually being performed. Please specify '
            '\'enable_double_backprop=True\' explicitly.')


def _double_backward_softmax_cross_entropy(x, t, normalize, class_weight,
                                           ignore_label, reduce, is_chainerx):
    if isinstance(t, variable.Variable):
        t = t.data

    F = chainer.functions

    _check_class_weight_option(class_weight)
    _check_reduce_option(reduce)
    if chainer.is_debug():
        _check_input_values(x, t, ignore_label)

    loss = -chainer.functions.log_softmax(x)

    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        class_weight = F.broadcast_to(class_weight.reshape(shape), x.shape)
        # TODO(niboshi): Remove this workaround after ChainerX supports
        # type promotion.
        if is_chainerx:
            class_weight = F.cast(class_weight, x.dtype)
        loss = loss * class_weight

    in_use = (t != ignore_label).astype(x.dtype)

    loss = F.rollaxis(loss, 1, loss.ndim)
    loss = F.reshape(loss, (-1, loss.shape[-1]))

    # Replace the ignore_label value with one valid for F.select_item below;
    # the contribution of those entries is zeroed afterwards by multiplying
    # with in_use.
    t = t.clip(0, loss.shape[1] - 1)

    loss = F.select_item(loss, t.ravel())
    loss = F.reshape(loss, t.shape)

    loss = loss * in_use

    if reduce == 'mean':
        reduc_dtype = _reduction_dtype(x.dtype)
        if normalize:
            # TODO(niboshi): Use in_use.sum(dtype=reduc_dtype) once chainerx
            # supports dtype argument.
            count = in_use.astype(reduc_dtype, copy=False).sum()
        else:
            count = len(x)
        count = max(count, 1.)

        if reduc_dtype == loss.dtype:
            loss = F.sum(loss / count)
        else:
            # Sum in a promoted dtype
            loss = F.cast(loss, reduc_dtype)
            loss = F.sum(loss / count)
            loss = F.cast(loss, x.dtype)

    return loss


def softmax_cross_entropy(
        x, t, normalize=True, cache_score=True, class_weight=None,
        ignore_label=-1, reduce='mean', enable_double_backprop=False,
        soft_target_loss='cross-entropy'):
    """Computes cross entropy loss for pre-softmax activations.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`):
            Variable holding a multidimensional array whose element indicates
            unnormalized log probability: the first axis of the variable
            represents the number of samples, and the second axis represents
            the number of classes. While this function computes a usual
            softmax cross entropy if the number of dimensions is equal to 2,
            it computes a cross entropy of the replicated softmax if the
            number of dimensions is greater than 2.
        t (:class:`~chainer.Variable` or :ref:`ndarray`):
            Variable holding a signed integer vector of ground truth
            labels. If ``t[i] == ignore_label``, the corresponding ``x[i]``
            is ignored.
            When the dtype is float, this function treats ``t`` as an array
            holding a probability distribution over labels, in other words,
            soft targets. In this case, the shape of ``t`` must be the same
            as the shape of ``x``, and the loss is calculated using cross
            entropy or KL divergence (see ``soft_target_loss``).
        normalize (bool): If ``True``, this function normalizes the cross
            entropy loss across all instances. If ``False``, it only
            normalizes along the batch size.
        cache_score (bool): When it is ``True``, the function stores the
            result of the forward computation to use it on the backward
            computation. It reduces computational cost though consumes more
            memory. If ``enable_double_backprop`` is ``True``, this option
            is forcibly turned off and the function does not cache the
            intermediate value.
        class_weight (:ref:`ndarray`):
            An array that contains constant weights that will be multiplied
            with the loss values along the second dimension. The shape of
            this array should be ``(x.shape[1],)``. If this is not ``None``,
            each class weight ``class_weight[i]`` is multiplied into
            ``y[:, i]``, the corresponding log-softmax output of ``x``,
            before the actual loss value is calculated.
        ignore_label (int): Label value you want to ignore. Its default value
            is ``-1``. See the description of the argument ``t``.
        reduce (str): A string that determines whether to reduce the loss
            values. If it is ``'mean'``, it computes the sum of the
            individual cross entropies and normalizes it according to the
            ``normalize`` option. If it is ``'no'``, this function computes
            the cross entropy for each instance and does not normalize it
            (the ``normalize`` option is ignored). In this case, the loss
            value of an ignored instance, which has ``ignore_label`` as its
            target value, is set to ``0``.
        enable_double_backprop (bool): If ``True``, this function uses an
            implementation that supports higher order differentiation.
            If ``False``, it uses a single-backprop implementation.
            This function defaults to the single-backprop version because
            we expect it to be faster. So, if you need second or higher
            derivatives, you need to turn it on explicitly.
        soft_target_loss (str): A string that determines what type of
            method is used to calculate the soft target loss. If
            ``'cross-entropy'``, cross entropy is used; if
            ``'kl-divergence'``, KL divergence is used.

    Returns:
        ~chainer.Variable: A variable holding the cross entropy loss.
        If ``reduce`` is ``'mean'``, it is a scalar array.
        If ``reduce`` is ``'no'``, the shape is the same as that of ``t``.

    .. note::

       This function is differentiable only by ``x``.

    .. admonition:: Example

        >>> x = np.array([[-1, 0, 1, 2], [2, 0, 1, -1]]).astype(np.float32)
        >>> x
        array([[-1.,  0.,  1.,  2.],
               [ 2.,  0.,  1., -1.]], dtype=float32)
        >>> t = np.array([3, 0]).astype(np.int32)
        >>> t
        array([3, 0], dtype=int32)
        >>> y = F.softmax_cross_entropy(x, t)
        >>> y
        variable(0.44018972)
        >>> log_softmax = -F.log_softmax(x)
        >>> expected_loss = np.mean([log_softmax[row, column].data \
for row, column in enumerate(t)])
        >>> y.array == expected_loss
        True

    """
    is_chainerx = (
        chainerx.is_available() and backend.get_array_module(x) is chainerx)

    if soft_target_loss not in ('cross-entropy', 'kl-divergence'):
        raise ValueError('soft_target_loss must be \'cross-entropy\' or '
                         '\'kl-divergence\'.')

    if is_chainerx or not enable_double_backprop:
        # Optimized implementation.
        # For non-ChainerX, forward and backward are supported but
        # double-backprop is not supported.
        # For ChainerX, even forward is supported for only specific
        # configurations of inputs and parameters, which is tested with
        # `SoftmaxCrossEntropy._is_chainerx_supported()`.
        func = SoftmaxCrossEntropy(
            normalize, cache_score, class_weight, ignore_label, reduce,
            soft_target_loss)

        if not is_chainerx or func._is_chainerx_supported((x, t)):
            loss, = func.apply((x, t))
            return loss

    # Generic double-backprop-enabled but unoptimized implementation
    return _double_backward_softmax_cross_entropy(
        x, t, normalize, class_weight, ignore_label, reduce, is_chainerx)
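

# Usage sketch (illustrative only; the shapes and values below are
# arbitrary assumptions, not taken from the library):
#
#     import numpy as np
#     import chainer.functions as F
#
#     x = np.random.randn(4, 3).astype(np.float32)  # unnormalized logits
#     t = np.array([0, 2, 1, 2], dtype=np.int32)    # hard integer labels
#
#     loss = F.softmax_cross_entropy(x, t)                # scalar ('mean')
#     per_sample = F.softmax_cross_entropy(x, t, reduce='no')  # shape (4,)
#
#     # Soft targets: a float `t` with the same shape as `x`.
#     t_soft = np.full((4, 3), 1.0 / 3, dtype=np.float32)
#     loss_soft = F.softmax_cross_entropy(
#         x, t_soft, soft_target_loss='kl-divergence')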