In [1]:
# Report how many threads this Julia session has available.
Threads.nthreads()

12

In [2]:
using LoopVectorization
using BenchmarkTools

"""
    L!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Baseline variant: bounds-checked, second index `j` outermost and first index `i`
innermost. Returns `nothing`.
"""
function L!(foo, bar)
    rows = axes(foo, 1)[begin+1:end-1]
    cols = axes(foo, 2)[begin+1:end-1]
    for j in cols, i in rows
        # Neighbours are read in the same order as they are summed.
        prev_j, next_j = bar[i, j-1], bar[i, j+1]
        prev_i, next_i = bar[i-1, j], bar[i+1, j]
        foo[i, j] = prev_j + next_j + prev_i + next_i - 4bar[i, j]
    end
    return nothing
end

"""
    M!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Baseline variant: bounds-checked, first index `i` outermost and second index `j`
innermost (the opposite nesting to `L!`). Returns `nothing`.
"""
function M!(foo, bar)
    rows = axes(foo, 1)[begin+1:end-1]
    cols = axes(foo, 2)[begin+1:end-1]
    for i in rows, j in cols
        # Neighbours are read in the same order as they are summed.
        prev_j, next_j = bar[i, j-1], bar[i, j+1]
        prev_i, next_i = bar[i-1, j], bar[i+1, j]
        foo[i, j] = prev_j + next_j + prev_i + next_i - 4bar[i, j]
    end
    return nothing
end

"""
    L_inbounds!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: `@inbounds` on the loop nest, second index `j` outermost and first index `i`
innermost. Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function L_inbounds!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@inbounds` drops the per-access checks, and the stencil reaches every row and
    # column index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    @inbounds for j in J
        for i in I
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    M_inbounds!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: `@inbounds` on the loop nest, first index `i` outermost and second index `j`
innermost. Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function M_inbounds!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@inbounds` drops the per-access checks, and the stencil reaches every row and
    # column index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    @inbounds for i in I
        for j in J
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    L_turbo!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: `@turbo` applied to the whole loop nest, written with `j` outermost and `i`
innermost. Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function L_turbo!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@turbo` performs no bounds checks, and the stencil reaches every row and column
    # index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    @turbo for j in J
        for i in I
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    M_turbo!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: `@turbo` applied to the whole loop nest, written with `i` outermost and `j`
innermost. Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function M_turbo!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@turbo` performs no bounds checks, and the stencil reaches every row and column
    # index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    @turbo for i in I
        for j in J
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    L_turbo2!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: plain outer loop over `j`, with `@turbo` applied only to the inner `i` loop.
Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function L_turbo2!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@turbo` performs no bounds checks, and the stencil reaches every row and column
    # index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    for j in J
        @turbo for i in I
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    M_turbo2!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: plain outer loop over `i`, with `@turbo` applied only to the inner `j` loop.
Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function M_turbo2!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@turbo` performs no bounds checks, and the stencil reaches every row and column
    # index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    for i in I
        @turbo for j in J
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    L_tturbo!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: `@tturbo` applied to the whole loop nest, written with `j` outermost and `i`
innermost. Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function L_tturbo!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@tturbo` performs no bounds checks, and the stencil reaches every row and column
    # index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    @tturbo for j in J
        for i in I
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    M_tturbo!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: `@tturbo` applied to the whole loop nest, written with `i` outermost and `j`
innermost. Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function M_tturbo!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@tturbo` performs no bounds checks, and the stencil reaches every row and column
    # index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    @tturbo for i in I
        for j in J
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    L_tturbo2!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: plain outer loop over `j`, with `@tturbo` applied only to the inner `i` loop.
Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function L_tturbo2!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@tturbo` performs no bounds checks, and the stencil reaches every row and column
    # index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    for j in J
        @tturbo for i in I
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

"""
    M_tturbo2!(foo, bar)

Fill the interior of `foo` with the five-point stencil of `bar`:
`bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]`.

The first and last index of `foo` along dimensions 1 and 2 are left untouched.
Variant: plain outer loop over `i`, with `@tturbo` applied only to the inner `j` loop.
Returns `nothing`.

# Throws
- `BoundsError` if the interior of `foo` is non-empty and `bar` cannot be indexed over
  the full `axes(foo, 1)` × `axes(foo, 2)` rectangle.
"""
function M_tturbo2!(foo, bar)
    I, J = axes(foo, 1)[begin+1:end-1], axes(foo, 2)[begin+1:end-1]
    # `@tturbo` performs no bounds checks, and the stencil reaches every row and column
    # index of `foo` inside `bar`; validate that rectangle once up front.
    isempty(I) || isempty(J) || checkbounds(bar, axes(foo, 1), axes(foo, 2))
    for i in I
        @tturbo for j in J
            foo[i, j] = bar[i, j-1] + bar[i, j+1] + bar[i-1, j] + bar[i+1, j] - 4bar[i, j]
        end
    end
    return nothing
end

M_tturbo2! (generic function with 1 method)

In [3]:
# Input field: a Gaussian bump sampled on an n-by-n grid covering [-1, 1] in both
# directions.
n = 100
x = y = range(-1, 1; length=n)
f(x, y) = exp(-2(x^2+y^2))
bar = f.(x', y)

# One independent zeroed output per kernel: lowercase names pair with the `L*` kernels,
# capitalised names with the `M*` kernels.
foo, foo_inbounds, foo_turbo, foo_turbo2, foo_tturbo, foo_tturbo2 =
    ntuple(_ -> zero(bar), 6)
Foo, Foo_inbounds, Foo_turbo, Foo_turbo2, Foo_tturbo, Foo_tturbo2 =
    ntuple(_ -> zero(bar), 6)

# Run every kernel once on its own output (this also triggers compilation).
for (kernel!, out) in (
    (L!, foo),
    (L_inbounds!, foo_inbounds),
    (L_turbo!, foo_turbo),
    (L_turbo2!, foo_turbo2),
    (L_tturbo!, foo_tturbo),
    (L_tturbo2!, foo_tturbo2),
    (M!, Foo),
    (M_inbounds!, Foo_inbounds),
    (M_turbo!, Foo_turbo),
    (M_turbo2!, Foo_turbo2),
    (M_tturbo!, Foo_tturbo),
    (M_tturbo2!, Foo_tturbo2),
)
    kernel!(out, bar)
end

# All variants must agree exactly with the bounds-checked baseline `foo`.
all(
    ==(foo),
    (
        foo_inbounds, foo_turbo, foo_turbo2, foo_tturbo, foo_tturbo2,
        Foo, Foo_inbounds, Foo_turbo, Foo_turbo2, Foo_tturbo, Foo_tturbo2,
    ),
)

true

In [4]:
# Benchmark `L!`: bounds-checked, `j` outer / `i` inner loops.
@btime L!($foo, $bar)

  14.100 μs (0 allocations: 0 bytes)


In [5]:
# Benchmark `M!`: bounds-checked, `i` outer / `j` inner loops.
@btime M!($foo, $bar)

  17.700 μs (0 allocations: 0 bytes)


In [6]:
# Benchmark `L_inbounds!`: `@inbounds`, `j` outer / `i` inner loops.
@btime L_inbounds!($foo, $bar)

  3.225 μs (0 allocations: 0 bytes)


In [7]:
# Benchmark `M_inbounds!`: `@inbounds`, `i` outer / `j` inner loops.
@btime M_inbounds!($foo, $bar)

  8.367 μs (0 allocations: 0 bytes)


In [8]:
# Benchmark `L_turbo!`: `@turbo` on the whole nest, `j` outer / `i` inner loops.
@btime L_turbo!($foo, $bar)

  2.811 μs (0 allocations: 0 bytes)


In [9]:
# Benchmark `M_turbo!`: `@turbo` on the whole nest, `i` outer / `j` inner loops.
@btime M_turbo!($foo, $bar)

  2.922 μs (0 allocations: 0 bytes)


In [10]:
# Benchmark `L_turbo2!`: plain outer `j` loop, `@turbo` on the inner `i` loop only.
@btime L_turbo2!($foo, $bar)

  2.644 μs (0 allocations: 0 bytes)


In [11]:
# Benchmark `M_turbo2!`: plain outer `i` loop, `@turbo` on the inner `j` loop only.
@btime M_turbo2!($foo, $bar)

  14.700 μs (0 allocations: 0 bytes)


In [12]:
# Benchmark `L_tturbo!`: `@tturbo` on the whole nest, `j` outer / `i` inner loops.
@btime L_tturbo!($foo, $bar)

  1.820 μs (0 allocations: 0 bytes)


In [13]:
# Benchmark `M_tturbo!`: `@tturbo` on the whole nest, `i` outer / `j` inner loops.
@btime M_tturbo!($foo, $bar)

  1.580 μs (0 allocations: 0 bytes)


In [14]:
# Benchmark `L_tturbo2!`: plain outer `j` loop, `@tturbo` on the inner `i` loop only.
@btime L_tturbo2!($foo, $bar)

  3.212 μs (0 allocations: 0 bytes)


In [15]:
# Benchmark `M_tturbo2!`: plain outer `i` loop, `@tturbo` on the inner `j` loop only.
@btime M_tturbo2!($foo, $bar)

  22.200 μs (0 allocations: 0 bytes)
