https://discourse.julialang.org/t/is-there-a-maximum-f-op-itrs-in-julia/63868/11

In [1]:
# Record the Julia build, platform and environment that the timings below were taken on.
versioninfo()

Julia Version 1.6.1
Commit 6aaedecc44 (2021-04-23 05:59 UTC)
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-11.0.1 (ORCJIT, skylake)
Environment:
  JULIA_DEPOT_PATH = D:\.julia
  JULIA_NUM_THREADS = 12
  JULIA_PYTHONCALL_EXE = D:\.julia\conda\3\python.exe


In [2]:
using BenchmarkTools

"""
    max_abs(a, b)

Return the largest value of `abs(a[i] - b[i])` over the shared indices of `a` and `b`.

The running maximum starts at zero, so empty inputs yield zero. A difference for which
`tmp > m` is `false` (for example `NaN`) is skipped rather than propagated.
"""
function max_abs(a, b)
    T = promote_type(eltype(a), eltype(b))
    # Seed with the type that `abs(x - y)` actually produces: `abs` of a complex
    # difference is real, so a complex zero here would make the `>` below throw.
    m = abs(zero(T) - zero(T))
    for i in eachindex(a, b)
        tmp = abs(a[i] - b[i])
        tmp > m && (m = tmp)
    end
    return m
end

# Benchmark inputs: two N×N matrices filled by `randn`.
N = 500
a = randn(N, N)
b = randn(N, N)

# Echo the Julia version, thread count and problem size next to the timings.
@show VERSION
@show Threads.nthreads()
println()
@show N
# Each `print` emits a label without a newline; the following `@btime` prints its
# timing after it. `$a`/`$b` interpolate the globals into the benchmarked expression.
# Hand-written loop.
print("simple for loop:     ")
@btime max_abs($a, $b)
# Generator over zipped element pairs.
print("maximum(generator):  ")
@btime maximum(abs(i - j) for (i, j) in zip($a, $b))
# `a - b` is evaluated inside the timed expression before `maximum(abs, ...)` reduces it.
print("maximum(abs, a - b): ")
@btime maximum(abs, $a - $b)
# Same zipped reduction, with each pair splatted into `abs∘-`.
print("maximum splat(abs∘-):")
@btime maximum(Base.splat(abs∘-), zip($a, $b))
# Two-array `mapreduce` with `max` as the reduction operator.
print("mapreduce abs∘- max: ")
@btime mapreduce(abs∘-, max, $a, $b)
# The identical Tullio expression is timed twice: once before and once after
# `using LoopVectorization`, so the order of these statements is significant.
# NOTE(review): presumably Tullio picks up LoopVectorization once it is loaded — confirm.
using Tullio
print("Tullio:              ")
@btime @tullio (max) _ := abs($a[i] - $b[i])
print("Tullio (LoopVect.):  ")
using LoopVectorization
@btime @tullio (max) _ := abs($a[i] - $b[i])

"""
    max_abs_turbo(a, b)

Return the maximum of `abs(a[i] - b[i])` over the shared indices of `a` and `b`, with the
reduction loop wrapped in LoopVectorization's `@turbo`. The running maximum starts at
zero of the promoted element type.
"""
function max_abs_turbo(a, b)
    T = promote_type(eltype(a), eltype(b))
    acc = zero(T)
    @turbo for k in eachindex(a, b)
        acc = max(acc, abs(a[k] - b[k]))
    end
    return acc
end

"""
    max_abs_tturbo(a, b)

Return the maximum of `abs(a[i] - b[i])` over the shared indices of `a` and `b`, with the
reduction loop wrapped in LoopVectorization's `@tturbo` (the threaded counterpart of
`@turbo`). The running maximum starts at zero of the promoted element type.
"""
function max_abs_tturbo(a, b)
    T = promote_type(eltype(a), eltype(b))
    acc = zero(T)
    @tturbo for k in eachindex(a, b)
        acc = max(acc, abs(a[k] - b[k]))
    end
    return acc
end

# Time the two LoopVectorization loops defined above on the same inputs.
print("LoopVect. @turbo:    ")
@btime max_abs_turbo($a, $b)
print("LoopVect. @tturbo:   ")
# The trailing `;` keeps the notebook from also displaying the returned value.
@btime max_abs_tturbo($a, $b);

VERSION = v"1.6.1"
Threads.nthreads() = 12

N = 500
simple for loop:       213.500 μs (0 allocations: 0 bytes)
maximum(generator):    428.900 μs (0 allocations: 0 bytes)
maximum(abs, a - b):   431.600 μs (2 allocations: 1.91 MiB)
maximum splat(abs∘-):  428.800 μs (0 allocations: 0 bytes)
mapreduce abs∘- max:   460.300 μs (2 allocations: 1.91 MiB)
Tullio:                204.600 μs (1 allocation: 16 bytes)
Tullio (LoopVect.):    52.200 μs (1 allocation: 16 bytes)
LoopVect. @turbo:      51.600 μs (0 allocations: 0 bytes)
LoopVect. @tturbo:     12.800 μs (0 allocations: 0 bytes)


In [3]:
# Sanity check: every variant benchmarked above must return exactly the same maximum.
# The chained `==` is `true` only if each adjacent pair compares equal.
max_abs(a, b) ==
maximum(abs(i - j) for (i, j) in zip(a, b)) ==
maximum(abs, a - b) ==
maximum(Base.splat(abs∘-), zip(a, b)) ==
mapreduce(abs∘-, max, a, b) ==
(@tullio (max) _ := abs(a[i] - b[i])) ==
max_abs_turbo(a, b) ==
max_abs_tturbo(a, b)

true

In [6]:
# Re-run the Tullio reduction with one option switched off at a time
# (`avx=false`, then `threads=false`) to see what each contributes to the timing.
print("Tullio avx=false:    ")
@btime @tullio avx=false (max) _ := abs($a[i] - $b[i])
print("Tullio threads=false:")
@btime @tullio threads=false (max) _ := abs($a[i] - $b[i]);

Tullio avx=false:      208.900 μs (1 allocation: 16 bytes)
Tullio threads=false:  54.300 μs (1 allocation: 16 bytes)


6.2930224704881645