https://discourse.julialang.org/t/fast-performance-of-array-comprehension-without-allocations/65352

https://github.com/JuliaSIMD/LoopVectorization.jl/issues/307

In [1]:
using BenchmarkTools
using LoopVectorization

"""
    assign!(u, x, y, z)

Overwrite `u[i, j, k]` with `sin(x[i]) + sin(y[j]) + sin(z[k])` for every index `i` of
`x`, `j` of `y` and `k` of `z`, using a `LoopVectorization.@turbo` loop nest.

# Arguments
- `u`: three-dimensional destination array (a view is fine). It must contain at least
  the index ranges of `x`, `y` and `z` along dimensions 1, 2 and 3; entries outside
  those ranges are left untouched.
- `x`, `y`, `z`: coordinate vectors for dimensions 1, 2 and 3 respectively.

# Returns
`nothing`; the result is written into `u`.

# Throws
- `BoundsError` if `u` does not cover the index ranges of `x`, `y` and `z`.
"""
function assign!(u, x, y, z)
    # `@turbo` does not bounds-check, so an undersized `u` would be written out of
    # bounds silently. Validate the whole index box once, outside the hot loop.
    checkbounds(u, eachindex(x), eachindex(y), eachindex(z))
    # Loop order k, j, i keeps the first (contiguous, column-major) index innermost.
    @turbo for k in eachindex(z)
        for j in eachindex(y)
            for i in eachindex(x)
                u[i, j, k] = sin(x[i]) + sin(y[j]) + sin(z[k])
            end
        end
    end
    return nothing
end

# Problem size: number of grid points per dimension and the uniform spacing between them.
itot = 384
dx = 1.0 / itot

# Coordinates 0, dx, ..., (itot - 1) * dx along each axis, as three independent vectors.
x = [dx * i for i in 0:itot-1]
y = [dx * j for j in 0:itot-1]
z = [dx * k for k in 0:itot-1]

# Storage with 8 extra cells per dimension; `uv` is a view of the itot^3 block that
# starts at index 5 in every dimension.
u = zeros(Float64, itot + 8, itot + 8, itot + 8)
uv = view(u, 5:itot+4, 5:itot+4, 5:itot+4)

# The same coordinate data reshaped to lie along dimension 1, 2 and 3 respectively,
# so that a broadcast over (xx, yy, zz) spans the full 3-D grid.
xx = reshape(x, :, 1, 1)
yy = reshape(y, 1, :, 1)
zz = reshape(z, 1, 1, :)

# Fill `uv` with each of the three implementations in turn and snapshot the result, so
# the benchmarks that follow are known to compute the same values.
# `copy` (rather than `deepcopy`) is used for the snapshots: it materialises only the
# viewed block, whereas `deepcopy` of a view also duplicates the whole padded parent.

# 1. LoopVectorization kernel.
assign!(uv, x, y, z)
a = copy(uv)

# 2. Array comprehension (allocates a temporary, then copies it into the view).
#    The comprehension variables are named distinctly from the global vectors.
uv[:, :, :] = [ sin(xi) + sin(yj) + sin(zk) for xi=x, yj=y, zk=z ]
b = copy(uv)

# 3. Fused in-place broadcast over the reshaped coordinate arrays.
@. uv = sin(xx) + sin(yy) + sin(zz)
c = copy(uv)

# Chained comparison: true only if a ≈ b and b ≈ c.
@show a ≈ b ≈ c

# Time the three implementations with BenchmarkTools. Every global is interpolated
# with `$` so that `@btime` measures the computation itself rather than the cost of
# accessing untyped global variables. The labels are printed without a newline so each
# timing appears on the same line as its label.

# Explicit triple loop accelerated by `@turbo`.
print("LoopVectorization.@turbo:")
@btime assign!($uv, $x, $y, $z)
# Comprehension builds a temporary 3-D array which is then copied into the view.
print("comprehension:           ")
@btime $uv[:, :, :] = [ sin(x) + sin(y) + sin(z) for x=$x, y=$y, z=$z ]
# Fused broadcast written directly into the view.
print("broadcast:               ")
@btime @. $uv = sin($xx) + sin($yy) + sin($zz);

a ≈ b ≈ c = true
LoopVectorization.@turbo:  39.741 ms (0 allocations: 0 bytes)
comprehension:             782.135 ms (2 allocations: 432.00 MiB)
broadcast:                 707.889 ms (0 allocations: 0 bytes)


In [2]:
using Pkg
# Print the installed version of SLEEFPirates and of LoopVectorization, one status
# report per package.
for pkgname in ("SLEEFPirates", "LoopVectorization")
    Pkg.status(pkgname)
end

[32m[1m      Status[22m[39m `D:\.julia\environments\v1.6\Project.toml`
 [90m [476501e8] [39mSLEEFPirates v0.6.23 `https://github.com/JuliaSIMD/SLEEFPirates.jl.git#master`
[32m[1m      Status[22m[39m `D:\.julia\environments\v1.6\Project.toml`
 [90m [bdcacae8] [39mLoopVectorization v0.12.56 `https://github.com/JuliaSIMD/LoopVectorization.jl.git#master`
