https://twitter.com/yujitach/status/1424030835771023363

In [1]:
VERSION

v"1.8.0-DEV.310"

In [2]:
]st

[32m[1m      Status[22m[39m `D:\OneDrive\public\0015\Untitled Arpack\Project.toml`
 [90m [7d9fca2a] [39mArpack v0.5.3 `https://github.com/JuliaLinearAlgebra/Arpack.jl.git#master`
 [90m [bdcacae8] [39mLoopVectorization v0.12.63


In [3]:
Threads.nthreads()

12

In [4]:
"""
    Original

* https://gist.github.com/yujitach/c30d7a174bbc3d3d3e40a3c0f9f9d47f
* Replaced TABs with "    " (four spaces)
"""
module Original

using LinearAlgebra,LinearMaps
import Arpack

# Number of bits in a state index; every vector in this module has 2^L entries.
const L = 20

# Zero-initialised storage for the diagonal entries; `prepareDiag` fills it in place.
diag_ = zeros(Float64, 1 << L)

"""
    prepareDiag(diag)

Accumulate the diagonal entries into `diag`, in place.

For every index `state` in `1:2^L` and every bit position `i` in `1:L`, bit `i` of
`state` is compared with its cyclic successor (position `i+1`, wrapping from `L` back
to `1`). `diag[state]` is decreased by 1 when the two bits agree and increased by 1
when they differ. The updates are added to whatever `diag` already contains, so the
caller is expected to pass a zero-filled vector. Uses the module constant `L` and
returns `nothing`.
"""
function prepareDiag(diag)
    for state in 1:2^L, i in 1:L
        j = i == L ? 1 : i + 1
        bit_i = (state >> (i - 1)) & 1
        bit_j = (state >> (j - 1)) & 1
        diag[state] -= (bit_i == bit_j ? 1 : -1)
    end
end
    
"""
    Hfunc!(C, B, diag)

Overwrite `C` with the operator applied to `B`.

`C[s]` is first set to `diag[s] * B[s]` for every `s` in `1:2^L`. Then, for every
index `state` and every bit position `i` in `1:L`, `B[state]` is subtracted from the
entry of `C` whose index differs from `state` in bit `i` only. Index `2^L` stands in
for the all-zero bit pattern, because the arrays are indexed from 1. Uses the module
constant `L` and returns `nothing`.
"""
function Hfunc!(C, B, diag)
    for state in 1:2^L
        C[state] = diag[state] * B[state]
    end
    for state in 1:2^L, i in 1:L
        # Drop bit L (so that index 2^L reads as pattern 0), then flip bit i.
        flipped = (state & ~(2^L)) ⊻ (1 << (i - 1))
        # Pattern 0 is stored at index 2^L.
        target = flipped == 0 ? 2^L : flipped
        C[target] -= B[state]
    end
end


println("preparing...")
prepareDiag(diag_)

println("computing the lowest eigenvalue...")
# Wrap the in-place kernel as a matrix-free 2^L x 2^L linear operator.
H = LinearMap(2^L; ismutating=true, issymmetric=true, isposdef=false) do C, B
    Hfunc!(C, B, diag_)
end
# The solve is timed twice so that the second figure is free of compilation time.
@time e, v = Arpack.eigs(H; nev=1, which=:SR)
@time e, v = Arpack.eigs(H; nev=1, which=:SR)

println("obtained:")
println(first(e))

println("theoretical:")
println(-2 * sum(map(n -> abs(sin((n - 1/2) * pi / L)), 1:L)))

end;

preparing...
computing the lowest eigenvalue...
 24.283829 seconds (3.91 M allocations: 415.084 MiB, 0.69% gc time, 4.06% compilation time)
 23.558853 seconds (1.11 k allocations: 200.048 MiB, 0.23% gc time)
obtained:
-25.49098968636477
theoretical:
-25.49098968636475


In [5]:
"""
    Rev0

* This revision is almost equivalent to the original.
* Stop using constants.
* Always pass global variables to functions as arguments.
* Swap the order of the for loop.
* Add @inbounds macro.
* Revise sum([f(x) for x in X]) to sum(f(x) for x in X).
"""
module Rev0

using LinearAlgebra, LinearMaps
import Arpack
    
"""
    prepareDiag(L)

Return a `Vector{Float64}` of length `2^L` holding the diagonal entries.

For each index `state` in `1:2^L`, every bit position `i` in `1:L` is compared with
its cyclic successor (`i+1`, wrapping from `L` to `1`). The entry is decreased by 1
when the two bits agree and increased by 1 when they differ.
"""
function prepareDiag(L)
    nstates = 2^L
    diag = zeros(nstates)
    for state in 1:nstates, i in 1:L
        j = i == L ? 1 : i + 1
        agree = ((state >> (i - 1)) & 1) == ((state >> (j - 1)) & 1)
        # `state` never leaves 1:nstates, which is exactly the index range of `diag`.
        @inbounds diag[state] -= (agree ? 1 : -1)
    end
    return diag
end
    
"""
    Hfunc!(C, B, diag, L)

Overwrite `C` with the operator applied to `B`.

`C[s]` is first set to `diag[s] * B[s]`. Then, for each bit position `i` in `1:L` and
each index `state` in `1:2^L`, `B[state]` is subtracted from the entry of `C` whose
index differs from `state` in bit `i` only. Index `2^L` stands in for the all-zero
bit pattern, because the arrays are indexed from 1.

# Arguments
- `C`: output vector of length `2^L`; its previous contents are discarded.
- `B`: input vector of length `2^L`.
- `diag`: diagonal entries, length `2^L`.
- `L`: number of bits in a state index.

# Throws
- `DimensionMismatch` if `C`, `B` or `diag` does not have length `2^L`.

Returns `nothing`.
"""
function Hfunc!(C, B, diag, L)
    N = 2^L
    # The loops below run without bounds checks, so the sizes are validated once here.
    if !(length(C) == length(B) == length(diag) == N)
        throw(DimensionMismatch(
            "C, B and diag must all have length 2^L = $N; got lengths " *
            "$(length(C)), $(length(B)) and $(length(diag))"))
    end
    @inbounds for state in 1:N
        C[state] = diag[state] * B[state]
    end
    keep = ~N  # clears bit L, so that index 2^L reads as the all-zero pattern
    for i in 1:L
        flip = 1 << (i - 1)
        @inbounds for state in 1:N
            newstate = (state & keep) ⊻ flip
            if newstate == 0
                newstate = N  # pattern 0 is stored at index 2^L
            end
            C[newstate] -= B[state]
        end
    end
    return nothing
end
"""
    prepareHfunc!(diag, L)

Return a two-argument function `(C, B)` that forwards to `Hfunc!(C, B, diag, L)`,
with `diag` and `L` fixed to the values passed here.
"""
function prepareHfunc!(diag, L)
    return (C, B) -> Hfunc!(C, B, diag, L)
end

# Number of bits in a state index; the operator acts on vectors of length 2^L.
L = 20

println("preparing...")
diag_ = prepareDiag(L)

println("computing the lowest eigenvalue...")
# Wrap the in-place kernel as a matrix-free 2^L x 2^L linear operator.
H = LinearMap(
    prepareHfunc!(diag_, L),
    2^L;
    ismutating=true,
    issymmetric=true,
    isposdef=false,
)
# The solve is timed twice so that the second figure is free of compilation time.
@time e, v = Arpack.eigs(H; nev=1, which=:SR)
@time e, v = Arpack.eigs(H; nev=1, which=:SR)

println("obtained:")
println(first(e))

println("theoretical:")
println(-2 * sum(abs(sin((n - 1/2) * π / L)) for n in 1:L))

end;

preparing...
computing the lowest eigenvalue...
 23.481924 seconds (659.00 k allocations: 235.908 MiB, 0.11% gc time, 1.12% compilation time)
 21.677027 seconds (806 allocations: 200.036 MiB, 0.08% gc time)
obtained:
-25.490989686364742
theoretical:
-25.49098968636475


In [6]:
"""
    Rev1

* Use Threads.@threads macro.
"""
module Rev1

using LinearAlgebra, LinearMaps
import Arpack
    
"""
    prepareDiag(L)

Build the vector of diagonal entries, one `Float64` per index in `1:2^L`.

Each bit position `i` in `1:L` of an index is compared with the next position,
taken cyclically so that position `L` is paired with position `1`. A matching pair
lowers the entry by 1 and a differing pair raises it by 1.
"""
function prepareDiag(L)
    diag = zeros(2^L)
    for state in eachindex(diag)
        for i in 1:L
            j = mod1(i + 1, L)  # cyclic successor of i within 1:L
            same = ((state >> (i - 1)) & 1) == ((state >> (j - 1)) & 1)
            @inbounds diag[state] -= ifelse(same, 1, -1)
        end
    end
    return diag
end
    
"""
    Hfunc!(C, B, diag, L)

Overwrite `C` with the operator applied to `B`, using `Threads.@threads` for the
loops over state indices.

`C[s]` is first set to `diag[s] * B[s]`. Then, for each bit position `i` in `1:L` and
each index `state` in `1:2^L`, `B[state]` is subtracted from the entry of `C` whose
index differs from `state` in bit `i` only. Index `2^L` stands in for the all-zero
bit pattern, because the arrays are indexed from 1.

# Arguments
- `C`: output vector of length `2^L`; its previous contents are discarded.
- `B`: input vector of length `2^L`.
- `diag`: diagonal entries, length `2^L`.
- `L`: number of bits in a state index.

# Throws
- `DimensionMismatch` if `C`, `B` or `diag` does not have length `2^L`.

Returns `nothing`.
"""
function Hfunc!(C, B, diag, L)
    N = 2^L
    # The loops below run without bounds checks, so the sizes are validated once here.
    if !(length(C) == length(B) == length(diag) == N)
        throw(DimensionMismatch(
            "C, B and diag must all have length 2^L = $N; got lengths " *
            "$(length(C)), $(length(B)) and $(length(diag))"))
    end
    Threads.@threads for state in 1:N
        @inbounds C[state] = diag[state] * B[state]
    end
    keep = ~N  # clears bit L, so that index 2^L reads as the all-zero pattern
    for i in 1:L
        flip = 1 << (i - 1)
        # For a fixed bit i the map state -> newstate is one-to-one on 1:N, so no two
        # iterations of the threaded loop write to the same element of C.
        Threads.@threads for state in 1:N
            newstate = (state & keep) ⊻ flip
            if newstate == 0
                newstate = N  # pattern 0 is stored at index 2^L
            end
            @inbounds C[newstate] -= B[state]
        end
    end
    return nothing
end
"""
    prepareHfunc!(diag, L)

Bind `diag` and `L` and return the resulting two-argument function, which calls
`Hfunc!(C, B, diag, L)` when invoked as `(C, B)`.
"""
function prepareHfunc!(diag, L)
    return (C, B) -> Hfunc!(C, B, diag, L)
end

# Number of bits in a state index; the operator acts on vectors of length 2^L.
L = 20

println("preparing...")
diag_ = prepareDiag(L)

println("computing the lowest eigenvalue...")
# Wrap the threaded in-place kernel as a matrix-free 2^L x 2^L linear operator.
H = LinearMap(
    prepareHfunc!(diag_, L),
    2^L;
    ismutating=true,
    issymmetric=true,
    isposdef=false,
)
# The solve is timed twice so that the second figure has less one-off overhead.
@time e, v = Arpack.eigs(H; nev=1, which=:SR)
@time e, v = Arpack.eigs(H; nev=1, which=:SR)

println("obtained:")
println(first(e))

println("theoretical:")
println(-2 * sum(abs(sin((n - 1/2) * π / L)) for n in 1:L))

end;

preparing...
computing the lowest eigenvalue...
 11.949328 seconds (1.00 M allocations: 266.665 MiB, 0.16% gc time, 2.82% compilation time)
 10.844770 seconds (185.66 k allocations: 221.331 MiB, 0.15% gc time)
obtained:
-25.490989686364742
theoretical:
-25.49098968636475


In [7]:
"""
    Rev2

* Use LoopVectorization.@tturbo macro.
"""
module Rev2

using LinearAlgebra, LinearMaps
import Arpack
using LoopVectorization

"""
    prepareDiag(L)

Compute the diagonal entries for all `2^L` state indices and return them as a
`Vector{Float64}`.

For an index `state`, each of the `L` cyclically adjacent bit pairs (`i` with `i+1`,
and `L` with `1`) contributes -1 when the two bits are equal and +1 when they are not.
"""
function prepareDiag(L)
    # Value (0 or 1) of the k-th bit of s, counting positions from 1.
    bitat(s, k) = (s >> (k - 1)) & 1
    diag = zeros(2^L)
    for state in 1:2^L
        for i in 1:L
            j = i == L ? 1 : i + 1
            step = bitat(state, i) == bitat(state, j) ? 1 : -1
            @inbounds diag[state] -= step
        end
    end
    return diag
end
    
"""
    Hfunc!(C, B, diag, L)

Overwrite `C` with the operator applied to `B`, using `LoopVectorization.@tturbo`
for the loops over state indices.

`C[s]` is first set to `diag[s] * B[s]`. Then, for each bit position `i` in `1:L` and
each index `state`, `B[state]` is subtracted from the entry of `C` whose index
differs from `state` in bit `i` only. Index `2^L` stands in for the all-zero bit
pattern, because the arrays are indexed from 1.

# Arguments
- `C`: output vector of length `2^L`; its previous contents are discarded.
- `B`: input vector of length `2^L`.
- `diag`: diagonal entries, length `2^L`.
- `L`: number of bits in a state index.

# Throws
- `DimensionMismatch` if `C`, `B` or `diag` does not have length `2^L`.
"""
function Hfunc!(C, B, diag, L)
    N = length(diag)
    # The LoopVectorization documentation states that `@turbo`/`@tturbo` loops are not
    # bounds-checked, so the sizes are validated once before entering them.
    if !(length(C) == length(B) == N == 2^L)
        throw(DimensionMismatch(
            "C, B and diag must all have length 2^L = $(2^L); got lengths " *
            "$(length(C)), $(length(B)) and $N"))
    end
    @tturbo for state = 1:N
        C[state] = diag[state] * B[state]
    end
    for i = 1:L
        @tturbo for state = 1:N
            # Drop bit L (so that index 2^L reads as pattern 0), then flip bit i.
            newstate = (state&(~(2^L))) ⊻ (1<<(i-1))
            c = newstate == 0
            # Branch-free form of "if newstate == 0 then newstate = N".
            newstate = !c*newstate + c*N # remove if statement
            C[newstate] -= B[state]
        end
    end
end
"""
    prepareHfunc!(diag, L)

Return a function of `(C, B)` that applies `Hfunc!` with the supplied `diag` and `L`.
"""
function prepareHfunc!(diag, L)
    return (C, B) -> Hfunc!(C, B, diag, L)
end

# Number of bits in a state index; the operator acts on vectors of length 2^L.
L = 20

println("preparing...")
diag_ = prepareDiag(L)

println("computing the lowest eigenvalue...")
# Wrap the `@tturbo` in-place kernel as a matrix-free 2^L x 2^L linear operator.
H = LinearMap(
    prepareHfunc!(diag_, L),
    2^L;
    ismutating=true,
    issymmetric=true,
    isposdef=false,
)
# The solve is timed twice so that the second figure has less one-off overhead.
@time e, v = Arpack.eigs(H; nev=1, which=:SR)
@time e, v = Arpack.eigs(H; nev=1, which=:SR)

println("obtained:")
println(first(e))

println("theoretical:")
println(-2 * sum(abs(sin((n - 1/2) * π / L)) for n in 1:L))

end;

preparing...
computing the lowest eigenvalue...
 17.706952 seconds (12.31 M allocations: 875.821 MiB, 1.24% gc time)
  8.895560 seconds (889 allocations: 200.039 MiB, 0.21% gc time)
obtained:
-25.490989686364763
theoretical:
-25.49098968636475


In [8]:
using BenchmarkTools
using LinearAlgebra
using Arpack: Arpack

# Operators under test, one per module defined above.
H = Original.H
H0 = Rev0.H
H1 = Rev1.H
H2 = Rev2.H

# Benchmark input and output. `B` is filled with random numbers instead of being left
# uninitialised, so the timings do not depend on whatever a fresh allocation happens
# to contain. `C` is only written to, so `similar` is enough for it.
B = rand(Float64, length(Original.diag_))
C = similar(B)
println("Hamiltonian bench")
print("  Original:                ")
@btime mul!($C, $H, $B)
print("  Rev0 (almost original):  ")
@btime mul!($C, $H0, $B)
print("  Rev1 (Threads.@threads): ")
@btime mul!($C, $H1, $B)
print("  Rev2 (LoopVectorization):")
@btime mul!($C, $H2, $B);

Hamiltonian bench
  Original:                  99.818 ms (0 allocations: 0 bytes)
  Rev0 (almost original):    85.333 ms (0 allocations: 0 bytes)
  Rev1 (Threads.@threads):   25.261 ms (1301 allocations: 153.84 KiB)
  Rev2 (LoopVectorization):  11.808 ms (0 allocations: 0 bytes)


In [9]:
println("Arpack.eigs bench")
# Label/operator pairs, listed in the order in which the timings are reported.
for (label, op) in (
    ("  Original:                ", H),
    ("  Rev0 (almost original):  ", H0),
    ("  Rev1 (Threads.@threads): ", H1),
    ("  Rev2 (LoopVectorization):", H2),
)
    print(label)
    @btime e, v = Arpack.eigs($op, nev=1, which=:SR)
end;

Arpack.eigs bench
  Original:                  22.863 s (1106 allocations: 200.05 MiB)
  Rev0 (almost original):    21.502 s (806 allocations: 200.04 MiB)
  Rev1 (Threads.@threads):   11.504 s (198068 allocations: 222.83 MiB)
  Rev2 (LoopVectorization):  8.535 s (878 allocations: 200.04 MiB)


In [10]:
@show a = 2             # the global `a` starts at 2
@show foo = x -> a*x    # the closure body refers to the global `a`; it stores no copy of its value
@show foo(3)            # uses the current value of `a`: 3 * 2
@show a = 10            # rebinding the global ...
@show foo(3);           # ... changes what `foo` computes: now 3 * 10

a = 2 = 2
foo = (x->begin
            #= In[10]:2 =#
            a * x
        end) = var"#1#2"()
foo(3) = 6
a = 10 = 10
foo(3) = 30


In [11]:
@show a = 2                    # the global `a` starts at 2
@show makebar(a) = x -> a*x    # here `a` is the argument of `makebar`, so the returned closure captures that value
@show bar = makebar(a)         # the closure object stores the captured Int64 value 2
@show bar(3)                   # 3 * 2
@show a = 10                   # rebinding the global does not touch the captured value
@show bar(3);                  # still 3 * 2

a = 2 = 2
makebar(a) = begin
        #= In[11]:2 =#
        x->begin
                #= In[11]:2 =#
                a * x
            end
    end = makebar
bar = makebar(a) = var"#3#4"{Int64}(2)
bar(3) = 6
a = 10 = 10
bar(3) = 6
