In [1]:
using Tullio, LoopVectorization

I, J, K = 400, 200, 100
f1 = rand(I, 1, K)
f2 = rand(I, J, 1)
dev = rand(I, 1, 1)

function make_R(f1, f2, dev)
    I, J, K = axes(dev, 1), axes(f2, 2), axes(f1, 3)
    R = similar(dev, I, J, K)
    for i in I, j in J, k in K
        @inbounds R[i, j, k] = f1[i, 1, k] * f2[i, j, 1] / dev[i, 1, 1]
    end
    R
end

R1 = make_R(f1, f2, dev)
@tullio R2[i, j, k] := f1[i, 1, k] * f2[i, j, 1] / dev[i, 1, 1]
R3 = @. f1 * f2 / dev
R4 = @turbo @. f1 * f2 / dev
R5 = @tturbo @. f1 * f2 / dev

@show R1 ≈ R2 ≈ R3 ≈ R4 ≈ R5
@show typeof(R1) size(R1);

R1 ≈ R2 ≈ R3 ≈ R4 ≈ R5 = true
typeof(R1) = Array{Float64, 3}
size(R1) = (400, 200, 100)




In [2]:
using BenchmarkTools
@btime R1 = make_R($f1, $f2, $dev)
@btime @tullio R2[i, j, k] := $f1[i, 1, k] * $f2[i, j, 1] / $dev[i, 1, 1]
@btime R3 = @. $f1 * $f2 / $dev
@btime R4 = @turbo @. $f1 * $f2 / $dev
@btime R5 = @tturbo @. $f1 * $f2 / $dev
;

  38.469 ms (2 allocations: 61.04 MiB)
  13.323 ms (152 allocations: 61.05 MiB)
  10.020 ms (2 allocations: 61.04 MiB)
  10.425 ms (2 allocations: 61.04 MiB)
  7.464 ms (2 allocations: 61.04 MiB)
