$$ l(X, \mathbf{y}) = L = (l_1, l_2, \cdots, l_N)^T \qquad \text{if  reduction='none'}$$

1. $$ l_n = - \mathbf{w}_{\mathbf{y}_n} X_{n, \mathbf{y}_n} $$

2. $$ \mathbf{w}_c = \mathrm{weight}[c] $$

where $X$ is the input, $\mathbf{y}$ is the target, $\mathbf{w}$ is the weight, and $N$ is the batch size.

$$
l(X, \mathbf{y}) = \begin{cases}
    \sum_{n=1}^N \frac{1}{\sum_{m=1}^N \mathbf{w}_{\mathbf{y}_m}} l_n, & \text{if reduction='mean'} \\
    \sum_{n=1}^N l_n, & \text{if reduction='sum'}
\end{cases}
$$

* Input: $(N, C)$ where $C$ = number of classes

* Target: $(N)$ where each value is $ 0 \leq \text{targets}[i] \leq C-1 $

* Output: If reduction is 'none', shape $(N)$; otherwise (reduction is 'mean' or 'sum') a scalar

In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn

In [2]:
# Toy input: 4 samples (rows), each with a score for 6 classes (columns).
X = torch.reshape(torch.arange(24, dtype=torch.float32), (4, 6))
X

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [ 6.,  7.,  8.,  9., 10., 11.],
        [12., 13., 14., 15., 16., 17.],
        [18., 19., 20., 21., 22., 23.]])

In [3]:
# Target class index for each of the 4 samples in X.
y = torch.tensor([2, 1, 4, 0])

# `weight` holds one rescaling factor per class, so its size must equal the
# number of classes in X, i.e. weight.shape == torch.Size([6]).
# From the PyTorch documentation:
#   weight (Tensor, optional) - a manual rescaling weight given to each class.
#   If given, it has to be a Tensor of size C. Otherwise, it is treated as if
#   having all ones.
weight = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.float32)

# weight defaults to None (treated as a tensor of all ones) and reduction
# defaults to 'mean'; both are passed explicitly here for clarity.
print(F.nll_loss(X, y, weight=weight, reduction='mean'))
print(F.nll_loss(X, y, weight=weight, reduction='sum'))
print(F.nll_loss(X, y, weight=weight, reduction='none'))

tensor(-10.7273)
tensor(-118.)
tensor([ -6., -14., -80., -18.])


In [4]:
# With ignore_index=0, every sample whose target is 0 gets a loss of exactly 0
# (in NLP tasks ignore_index can be set to the padding index).
# From the PyTorch documentation:
#   ignore_index (int, optional) - Specifies a target value that is ignored and
#   does not contribute to the input gradient. When size_average is True, the
#   loss is averaged over non-ignored targets.
nn.NLLLoss(reduction='none', ignore_index=0)(X, y)

tensor([ -2.,  -7., -16.,   0.])

In [5]:
# Module form of the same loss: 'sum' first, then 'mean' (the default reduction).
for reduction in ('sum', 'mean'):
    print(nn.NLLLoss(weight=weight, reduction=reduction)(X, y))

tensor(-118.)
tensor(-10.7273)


### 上式计算步骤如下:

1. 计算$ X_{n, \mathbf{y}_n} $

In [6]:
# Pick, for every row n, the score of its target class: X[n, y[n]].
# Row indices are derived from y instead of being hard-coded, so this is
# X[0, 2], X[1, 1], X[2, 4], X[3, 0] for the y defined above.
step1 = X[torch.arange(y.numel()), y]
step1

tensor([ 2.,  7., 16., 18.])

3. 若reduction='none'

In [7]:
# Per-sample loss l_n = -w[y_n] * X[n, y_n]; weight[y] looks up each sample's class weight.
re_none = -(weight[y] * step1)
re_none

tensor([ -6., -14., -80., -18.])

4. 若reduction='sum'

In [8]:
# NOTE: despite its name, `re_mean` holds the reduction='sum' result
# (the plain sum of the per-sample losses).
re_mean = re_none.sum()
re_mean

tensor(-118.)

5. 若reduction='mean'

In [9]:
# Normalizer for reduction='mean': reciprocal of the summed weights of the target classes.
weight_1 = 1 / weight[y].sum()
weight_1

tensor(0.0909)

In [10]:
# reduction='mean': scale every per-sample loss by the normalizer, then add them up.
(re_none * weight_1).sum()

tensor(-10.7273)