In [11]:
using StatsFuns
"""
    gkl_gen(Q, P)

Return `sum(p * log(p / q)) + sum(Q) - sum(P)`, where `p` and `q` run over the paired
entries of `P` and `Q`. This is the generalized Kullback–Leibler divergence of `P`
from `Q`. Note that `Q` is the *first* argument.

The `p * log(p / q)` term is evaluated with `xlogy`, which returns zero when its first
argument is zero, so entries with `p == 0` and `q > 0` contribute nothing.

# Throws
- `DimensionMismatch` if `P` and `Q` do not have the same length. Without this check
  `zip` would silently stop at the shorter input while `sum(Q) - sum(P)` still used
  every entry, producing a meaningless value.
"""
function gkl_gen(Q, P)
    length(P) == length(Q) || throw(DimensionMismatch(
        "P and Q must have the same length, got $(length(P)) and $(length(Q))"))
    return sum(xlogy(p, p / q) for (p, q) in zip(P, Q)) + sum(Q) - sum(P)
end

gkl_gen (generic function with 1 method)

In [2]:
# Test data: two non-negative vectors with ten million entries each.
n = 10_000_000
P = rand(n)               # uniform draws
Q = map(abs, randn(n))    # magnitudes of standard normal draws
# Evaluate with the arguments in both orders; the displayed tuple shows the two
# values differ, i.e. the measure is not symmetric.
(gkl_gen(Q, P), gkl_gen(P, Q))

(3.65515495541793e6, 5.469150841913857e6)

In [3]:
# Sanity check: comparing a vector with itself. Every log term is log(1) = 0 and
# sum(Q) - sum(Q) cancels, so the result should be zero.
gkl_gen(Q, Q)

0.0

In [4]:
# Compare Q with its own reversal: the two arguments hold the same entries (so their
# sums match) but in a different order, so only the log terms contribute.
gkl_gen(reverse(Q), Q)

5.527102509252604e6

In [12]:
using BenchmarkTools
# Interpolate the global arrays with `$` so the benchmark times gkl_gen itself and not
# the dynamic lookup of untyped global variables.
@benchmark gkl_gen($Q, $P)

BenchmarkTools.Trial: 38 samples with 1 evaluations.
 Range (min … max):  125.456 ms … 154.701 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     128.830 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   132.459 ms ±   8.811 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  [39m█[39m▄[39m [39m▄[39m▄[39m [39m█[34m▁[39m[39m [39m [39m▁[39m▄[39m [39m [39m [32m▄[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m▆[39m█

In [6]:
using LoopVectorization

"""
    unsafe_xlogy(x, y)

Return `x * log(y)` with no special handling of `x == 0`; for example
`unsafe_xlogy(0.0, 0.0)` is `NaN` rather than zero.
"""
function unsafe_xlogy(x, y)
    logy = log(y)
    return x * logy
end
"""
    safe_xlogy(x, y)

Return `x * log(y)`, except that a zero `x` always yields zero and `log(y)` is not
evaluated in that case (so `safe_xlogy(0.0, 0.0)` is `0.0`, not `NaN`).

Both branches return the same type. In the zero branch the result is built as
`x * log(one(y))`, an exact zero of the type `x * log(y)` would have, instead of
returning `x` unchanged; `safe_xlogy(0, 2.0)` is therefore `0.0`, not the `Int` `0`.
"""
function safe_xlogy(x, y)
    # log(one(y)) is zero and cannot raise a DomainError, whatever the value of y.
    return iszero(x) ? x * log(one(y)) : x * log(y)
end

# gkl_lv(Q, P): LoopVectorization counterpart of gkl_gen. Returns
# sum(Q) - sum(P) + Σ_i P[i] * log(P[i] / Q[i]), accumulated in an explicit loop.
function gkl_lv(Q, P)
    # Seed the accumulator with the sum(Q) - sum(P) part of the expression.
    s = sum(Q) - sum(P)
    # eachindex(P, Q) yields indices valid for both arrays.
    # @tturbo comes from LoopVectorization — presumably its threaded SIMD loop macro;
    # see the package documentation for the exact guarantees.
    @tturbo for i in eachindex(P, Q)
        # NOTE(review): unsafe_xlogy is x*log(y) with no zero guard, so an entry with
        # P[i] == 0 would evaluate 0 * log(0), expected to be NaN — confirm the inputs
        # are strictly positive.
        s += unsafe_xlogy(P[i], P[i]/Q[i])
    end
    s
end

# Evaluate the LoopVectorization version on the same inputs as the earlier
# gkl_gen(Q, P) call so the two printed values can be compared.
gkl_lv(Q, P)



3.655154955417989e6

In [7]:
# Interpolate the global arrays with `$` so the benchmark times gkl_lv itself and not
# the dynamic lookup of untyped global variables.
@benchmark gkl_lv($Q, $P)

BenchmarkTools.Trial: 286 samples with 1 evaluations.
 Range (min … max):  15.188 ms …  22.806 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     17.371 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   17.477 ms ± 887.245 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂[39m [39m [39m▂[39m [39m▂[39m▇[39m▅[34m█[39m[32m▅[39m[39m▃[39m▄[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▃[39m▁[39m▃[39m▁[

In [8]:
# Report how many threads this Julia session was started with; relevant context for
# the @tturbo timing of gkl_lv.
Threads.nthreads()

12