In [1]:
import torch
import torch.nn.functional as F
import numpy as np
print(torch.__version__)
print(np.__version__)

1.6.0
1.19.1


In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [3]:
var = torch.tensor([[10,3], [2,8], [10,2], [3,7]], dtype=torch.float32)
print(var)
n,d = var.size()
print('n=', n, ', d=', d)

print('torch.sigmoid(tensor):')
print(torch.sigmoid(var))

print('F.log_softmax(tensor, dim=1):')
print(F.log_softmax(var, dim=1))

print('F.log_softmax(torch.sigmoid(tensor), dim=1):')
print(F.log_softmax(torch.sigmoid(var), dim=1))

tensor([[10.,  3.],
        [ 2.,  8.],
        [10.,  2.],
        [ 3.,  7.]])
n= 4 , d= 2
torch.sigmoid(tensor):
tensor([[1.0000, 0.9526],
        [0.8808, 0.9997],
        [1.0000, 0.8808],
        [0.9526, 0.9991]])
F.log_softmax(tensor, dim=1):
tensor([[-9.1142e-04, -7.0009e+00],
        [-6.0025e+00, -2.4757e-03],
        [-3.3540e-04, -8.0003e+00],
        [-4.0181e+00, -1.8150e-02]])
F.log_softmax(torch.sigmoid(tensor), dim=1):
tensor([[-0.6697, -0.7171],
        [-0.7543, -0.6355],
        [-0.6353, -0.7545],
        [-0.7167, -0.6702]])


In [4]:
rho_val = 1/var.size(1)
print('rho_val = ', rho_val, '. because d=', d)
rho_mat = torch.tensor([rho_val] * np.ones(var.size())).to(device)
print('rho_mat')
print(rho_mat)

rho_val =  0.5 . because d= 2
rho_mat
tensor([[0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000]], dtype=torch.float64)


In [5]:
vr = torch.sigmoid(var).to(device)  # sigmoid because we need the probability distributions
#https://discuss.pytorch.org/t/kl-divergence-produces-negative-values/16791/13
#KLDLoss(p, q), sum(q) needs to equal one
#p = log_softmax(tensor)
loss_ret_sum = torch.nn.functional.kl_div(F.log_softmax(vr, dim=1), rho_mat, reduction='sum')
print(loss_ret_sum)

loss_ret_batchmean = torch.nn.functional.kl_div(F.log_softmax(vr, dim=1), rho_mat, reduction='batchmean')
print(loss_ret_batchmean)

tensor(0.0041, dtype=torch.float64)
tensor(0.0010, dtype=torch.float64)


In [6]:
def kl_divergence(bottleneck, reduction):
    bt = torch.sigmoid(bottleneck)  # sigmoid because we need the probability distributions
    rho_val = 1/bt.size(1)
    rho_mat = torch.tensor([rho_val] * np.ones(bt.size())).to(device)
    #https://discuss.pytorch.org/t/kl-divergence-produces-negative-values/16791/13
    #KLDLoss(p, q), sum(q) needs to equal one
    #p = log_softmax(tensor)
    loss_ret_1 = torch.nn.functional.kl_div(F.log_softmax(bt, dim=1).to(device), rho_mat, reduction=reduction)
    # torch.sum(rho * torch.log(rho / bottleneck) + (1 - rho) * torch.log((1 - rho) / (1 - bottleneck)))
    return loss_ret_1

In [7]:
loss_ret_sum = kl_divergence(var, reduction='sum')
print(loss_ret_sum)
loss_ret_batchmean = kl_divergence(var, reduction='batchmean')
print(loss_ret_batchmean)

tensor(0.0041, dtype=torch.float64)
tensor(0.0010, dtype=torch.float64)


In [8]:
def kl_divergence_ones(bt, reduction, apply_sigmoid, apply_log_softmax):
    if apply_sigmoid:
        bt = torch.sigmoid(bt)  # sigmoid because we need the probability distributions
    rho_mat = torch.zeros(bt.size(), dtype=torch.float32).to(device)
    _, preds = torch.max(bt, 1)
    rho_mat[range(bt.size(0)), preds] = 1
    if apply_log_softmax:
        loss_ret_1 = F.kl_div(F.log_softmax(bt, dim=1).to(device), rho_mat, reduction=reduction)
    else:
        loss_ret_1 = F.kl_div(bt, rho_mat, reduction=reduction)
    return loss_ret_1

In [9]:
var = torch.tensor([[10,3,11], [15,2,8], [7,10,2], [2,3,7]], dtype=torch.float32)
print(var)

tensor([[10.,  3., 11.],
        [15.,  2.,  8.],
        [ 7., 10.,  2.],
        [ 2.,  3.,  7.]])


In [10]:
bt = torch.sigmoid(var)  # sigmoid because we need the probability distributions
print(bt)

tensor([[1.0000, 0.9526, 1.0000],
        [1.0000, 0.8808, 0.9997],
        [0.9991, 1.0000, 0.8808],
        [0.8808, 0.9526, 0.9991]])


In [11]:
rho_mat = torch.zeros(bt.size(), dtype=torch.float32)
print(rho_mat)
_, preds = torch.max(bt, 1)
print(preds)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])
tensor([2, 0, 1, 2])


In [12]:
rho_mat[range(bt.size(0)), preds] = 1
print(rho_mat)

tensor([[0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])


In [13]:
reduction='sum'
apply_sigmoid=False
apply_log_softmax=False
loss_ret_sum = kl_divergence_ones(var, reduction=reduction, apply_sigmoid=apply_sigmoid, apply_log_softmax=apply_log_softmax)
print('reduction(', reduction, '),apply_sigmoid(', apply_sigmoid ,'), apply_log_softmax(',apply_log_softmax,'), loss_ret_sum(',loss_ret_sum, ')')

reduction='sum'
apply_sigmoid=False
apply_log_softmax=True
loss_ret_sum = kl_divergence_ones(var, reduction=reduction, apply_sigmoid=apply_sigmoid, apply_log_softmax=apply_log_softmax)
print('reduction(', reduction, '),apply_sigmoid(', apply_sigmoid ,'), apply_log_softmax(',apply_log_softmax,'), loss_ret_sum(',loss_ret_sum, ')')

reduction='sum'
apply_sigmoid=True
apply_log_softmax=False
loss_ret_sum = kl_divergence_ones(var, reduction=reduction, apply_sigmoid=apply_sigmoid, apply_log_softmax=apply_log_softmax)
print('reduction(', reduction, '),apply_sigmoid(', apply_sigmoid ,'), apply_log_softmax(',apply_log_softmax,'), loss_ret_sum(',loss_ret_sum, ')')

reduction='sum'
apply_sigmoid=True
apply_log_softmax=True
loss_ret_sum = kl_divergence_ones(var, reduction=reduction, apply_sigmoid=apply_sigmoid, apply_log_softmax=apply_log_softmax)
print('reduction(', reduction, '),apply_sigmoid(', apply_sigmoid ,'), apply_log_softmax(',apply_log_softmax,'), loss_ret_sum(',loss_ret_sum, ')')




reduction='batchmean'
apply_sigmoid=False
apply_log_softmax=False
loss_ret_sum = kl_divergence_ones(var, reduction=reduction, apply_sigmoid=apply_sigmoid, apply_log_softmax=apply_log_softmax)
print('reduction(', reduction, '),apply_sigmoid(', apply_sigmoid ,'), apply_log_softmax(',apply_log_softmax,'), loss_ret_sum(',loss_ret_sum, ')')

reduction='batchmean'
apply_sigmoid=False
apply_log_softmax=True
loss_ret_sum = kl_divergence_ones(var, reduction=reduction, apply_sigmoid=apply_sigmoid, apply_log_softmax=apply_log_softmax)
print('reduction(', reduction, '),apply_sigmoid(', apply_sigmoid ,'), apply_log_softmax(',apply_log_softmax,'), loss_ret_sum(',loss_ret_sum, ')')

reduction='batchmean'
apply_sigmoid=True
apply_log_softmax=False
loss_ret_sum = kl_divergence_ones(var, reduction=reduction, apply_sigmoid=apply_sigmoid, apply_log_softmax=apply_log_softmax)
print('reduction(', reduction, '),apply_sigmoid(', apply_sigmoid ,'), apply_log_softmax(',apply_log_softmax,'), loss_ret_sum(',loss_ret_sum, ')')

reduction='batchmean'
apply_sigmoid=True
apply_log_softmax=True
loss_ret_sum = kl_divergence_ones(var, reduction=reduction, apply_sigmoid=apply_sigmoid, apply_log_softmax=apply_log_softmax)
print('reduction(', reduction, '),apply_sigmoid(', apply_sigmoid ,'), apply_log_softmax(',apply_log_softmax,'), loss_ret_sum(',loss_ret_sum, ')')


reduction( sum ),apply_sigmoid( False ), apply_log_softmax( False ), loss_ret_sum( tensor(-43.) )
reduction( sum ),apply_sigmoid( False ), apply_log_softmax( True ), loss_ret_sum( tensor(0.3881) )
reduction( sum ),apply_sigmoid( True ), apply_log_softmax( False ), loss_ret_sum( tensor(-3.9990) )
reduction( sum ),apply_sigmoid( True ), apply_log_softmax( True ), loss_ret_sum( tensor(4.2484) )
reduction( batchmean ),apply_sigmoid( False ), apply_log_softmax( False ), loss_ret_sum( tensor(-10.7500) )
reduction( batchmean ),apply_sigmoid( False ), apply_log_softmax( True ), loss_ret_sum( tensor(0.0970) )
reduction( batchmean ),apply_sigmoid( True ), apply_log_softmax( False ), loss_ret_sum( tensor(-0.9998) )
reduction( batchmean ),apply_sigmoid( True ), apply_log_softmax( True ), loss_ret_sum( tensor(1.0621) )
