In [1]:
import torch
import torch.nn.functional as F
import numpy as np
print(torch.__version__)
print(np.__version__)

1.4.0
1.17.2


In [2]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:0


In [3]:
var = torch.tensor([[10,3], [2,8], [10,2], [3,7]], dtype=torch.float32)
print(var)
n,d = var.size()
print('n=', n, ', d=', d)

print('torch.sigmoid(tensor):')
print(torch.sigmoid(var))

print('F.log_softmax(tensor, dim=1):')
print(F.log_softmax(var, dim=1))

print('F.log_softmax(torch.sigmoid(tensor), dim=1):')
print(F.log_softmax(torch.sigmoid(var), dim=1))

tensor([[10.,  3.],
        [ 2.,  8.],
        [10.,  2.],
        [ 3.,  7.]])
n= 4 , d= 2
torch.sigmoid(tensor):
tensor([[1.0000, 0.9526],
        [0.8808, 0.9997],
        [1.0000, 0.8808],
        [0.9526, 0.9991]])
F.log_softmax(tensor, dim=1):
tensor([[-9.1142e-04, -7.0009e+00],
        [-6.0025e+00, -2.4757e-03],
        [-3.3540e-04, -8.0003e+00],
        [-4.0181e+00, -1.8150e-02]])
F.log_softmax(torch.sigmoid(tensor), dim=1):
tensor([[-0.6697, -0.7171],
        [-0.7543, -0.6355],
        [-0.6353, -0.7545],
        [-0.7167, -0.6702]])


In [4]:
rho_val = 1/var.size(1)
print('rho_val = ', rho_val, '. because d=', d)
rho_mat = torch.tensor([rho_val] * np.ones(var.size())).to(device)
print('rho_mat')
print(rho_mat)

rho_val =  0.5 . because d= 2
rho_mat
tensor([[0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000],
        [0.5000, 0.5000]], device='cuda:0', dtype=torch.float64)


In [5]:
vr = torch.sigmoid(var).to(device)  # sigmoid because we need the probability distributions
#https://discuss.pytorch.org/t/kl-divergence-produces-negative-values/16791/13
#KLDLoss(p, q), sum(q) needs to equal one
#p = log_softmax(tensor)
loss_ret_sum = torch.nn.functional.kl_div(F.log_softmax(vr, dim=1), rho_mat, reduction='sum')
print(loss_ret_sum)

loss_ret_batchmean = torch.nn.functional.kl_div(F.log_softmax(vr, dim=1), rho_mat, reduction='batchmean')
print(loss_ret_batchmean)

tensor(0.0041, device='cuda:0', dtype=torch.float64)
tensor(0.0010, device='cuda:0', dtype=torch.float64)


In [6]:
def kl_divergence(bottleneck, reduction):
    bt = torch.sigmoid(bottleneck)  # sigmoid because we need the probability distributions
    rho_val = 1/bt.size(1)
    rho_mat = torch.tensor([rho_val] * np.ones(bt.size())).to(device)
    #https://discuss.pytorch.org/t/kl-divergence-produces-negative-values/16791/13
    #KLDLoss(p, q), sum(q) needs to equal one
    #p = log_softmax(tensor)
    loss_ret_1 = torch.nn.functional.kl_div(F.log_softmax(bt, dim=1).to(device), rho_mat, reduction=reduction)
    # torch.sum(rho * torch.log(rho / bottleneck) + (1 - rho) * torch.log((1 - rho) / (1 - bottleneck)))
    return loss_ret_1

In [7]:
loss_ret_sum = kl_divergence(var, reduction='sum')
print(loss_ret_sum)
loss_ret_batchmean = kl_divergence(var, reduction='batchmean')
print(loss_ret_batchmean)

tensor(0.0041, device='cuda:0', dtype=torch.float64)
tensor(0.0010, device='cuda:0', dtype=torch.float64)
