* https://twitter.com/kame_no_mori/status/1734888762914820124
* https://twitter.com/genkuroki/status/1739876470682697877

>\#Julia言語 https://kamemori.com/research/fortran/speed_harmonic_oscillator_1d_ja.html のJuliaのコードに18文字追加して比較し直してみました。3回実行して最短の値を採用。<br>
><br>
>1.64秒 ← gfortran -O3<br>
>3.50秒 ← julia newton.jl<br>
>1.63秒 ← julia newton_inline_fastmath.jl (18文字追加)<br>
>1.18秒 ← julia> include("newton_inline_fastmath.jl")<br>

* https://x.com/genkuroki/status/1739999710998618530?s=20

>\#Julia言語 さらなる訂正<br>
><br>
>Julia版では34文字追加しないとかなり遅くなるという結果<br>
><br>
>1.64秒 ← gfortran -O3<br>
>3.50秒 ← julia newton.jl<br>
>1.41秒 ← julia newton_inline_fastmath_inbounds_simd.jl (34文字追加)<br>
>1.06秒 ← julia> include("newton_inline_fastmath.jl")<br>

__gfortranとの比較__

<img src="https://raw.githubusercontent.com/genkuroki/public/main/0046/runge-kutta%20benchmark/2023-12-27a%20gfortran%20-O3.png">

<img src="https://raw.githubusercontent.com/genkuroki/public/main/0046/runge-kutta%20benchmark/2023-12-27b%20julia%20newton.jl.png">

<img src="https://raw.githubusercontent.com/genkuroki/public/main/0046/runge-kutta%20benchmark/2023-12-27c%20julia%20newton_inline_fastmath.jl.png">

<img src="https://raw.githubusercontent.com/genkuroki/public/main/0046/runge-kutta%20benchmark/2023-12-27d%20julia%20include.png">

<img src="https://raw.githubusercontent.com/genkuroki/public/main/0046/runge-kutta%20benchmark/2023-12-27e%20julia%20newton_inline_fastmath_inbounds_simd.jl.png">

<img src="https://raw.githubusercontent.com/genkuroki/public/main/0046/runge-kutta%20benchmark/2023-12-27f%20julia%20include%20newton_inline_fastmath_inbounds_simd.jl.png">

In [1]:
"""
    main(nt = 100000000)

Integrate the 1D harmonic oscillator (`mass = 1.0`, `k = 1.0`, `dt = 1e-2`, starting
from `x = 0.0`, `v = 1.0`) with `Runge_Kutta_4th` and write the tail of the trajectory
to `result_julia.out`.

`xt[it]` / `vt[it]` hold the state before the `it`-th step, for `it = 1, …, nt + 1`.
One line `it*dt xt[it] vt[it]` is written for each of the last (at most 1000) indices
`it ≤ nt`.

# Arguments
- `nt`: number of steps; the arrays are allocated with `nt + 1` elements.
"""
function main(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th(x, v, dt, mass, k)
    end

    # Clamp the start of the output window: `nt - 999` is ≤ 0 when `nt < 1000`,
    # which used to raise a BoundsError on `xt[it]`.
    first_row = max(1, nt - 999)

    # NOTE(review): `xt[it]` is the state after `it - 1` steps, so the printed time
    # `it*dt` is one step ahead of it. Left unchanged so that the output for
    # `nt ≥ 1000` stays identical to the original — confirm intent.
    open("result_julia.out", "w") do file
        for it in first_row:nt
            println(file, "$(it*dt) $(xt[it]) $(vt[it])")
        end
    end
end

"""
    Runge_Kutta_4th(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size
`dt`. The derivative of `x` is `v`; the derivative of `v` is `force(x, mass, k)`.
Return the updated pair `(x, v)`.
"""
function Runge_Kutta_4th(x, v, dt, mass, k)
    # Stage 1: slopes at the current state.
    dx1 = v
    dv1 = force(x, mass, k)

    # Stages 2 and 3: slopes at the half step, using the previous stage's slopes.
    dx2 = v + 0.5 * dt * dv1
    dv2 = force(x + 0.5 * dx1 * dt, mass, k)

    dx3 = v + 0.5 * dt * dv2
    dv3 = force(x + 0.5 * dx2 * dt, mass, k)

    # Stage 4: slopes at the full step.
    dx4 = v + dt * dv3
    dv4 = force(x + dx3 * dt, mass, k)

    # Weighted average of the four slopes (weights 1, 2, 2, 1 over 6).
    x_next = x + (dx1 + 2 * dx2 + 2 * dx3 + dx4) * dt / 6
    v_next = v + (dv1 + 2 * dv2 + 2 * dv3 + dv4) * dt / 6

    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`: the linear restoring force `-k * x` divided by `mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main` with its default `nt`.
for _ in 1:3
    @time main()
end

  3.014927 seconds (12.02 k allocations: 1.492 GiB, 0.30% gc time)
  3.122160 seconds (12.02 k allocations: 1.492 GiB, 4.20% gc time)
  3.233280 seconds (12.02 k allocations: 1.492 GiB, 3.06% gc time)


In [2]:
"""
    main_inline(nt = 100000000)

Same simulation as `main`, but stepping with `Runge_Kutta_4th_inline`.

Integrates the 1D harmonic oscillator (`mass = 1.0`, `k = 1.0`, `dt = 1e-2`, starting
from `x = 0.0`, `v = 1.0`) and writes one line `it*dt xt[it] vt[it]` to
`result_julia.out` for each of the last (at most 1000) stored indices `it ≤ nt`.

# Arguments
- `nt`: number of steps; the arrays are allocated with `nt + 1` elements.
"""
function main_inline(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_inline(x, v, dt, mass, k)
    end

    # Clamp the start of the output window: `nt - 999` is ≤ 0 when `nt < 1000`,
    # which used to raise a BoundsError on `xt[it]`.
    first_row = max(1, nt - 999)

    # NOTE(review): `xt[it]` is the state after `it - 1` steps, so the printed time
    # `it*dt` is one step ahead of it. Left unchanged so that the output for
    # `nt ≥ 1000` stays identical to the original — confirm intent.
    open("result_julia.out", "w") do file
        for it in first_row:nt
            println(file, "$(it*dt) $(xt[it]) $(vt[it])")
        end
    end
end

"""
    Runge_Kutta_4th_inline(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size
`dt`. The derivative of `x` is `v`; the derivative of `v` is `force(x, mass, k)`.
Return the updated pair `(x, v)`. Declared `@inline`.
"""
@inline function Runge_Kutta_4th_inline(x, v, dt, mass, k)
    # Stage 1: slopes at the current state.
    dx1 = v
    dv1 = force(x, mass, k)

    # Stages 2 and 3: slopes at the half step, using the previous stage's slopes.
    dx2 = v + 0.5 * dt * dv1
    dv2 = force(x + 0.5 * dx1 * dt, mass, k)

    dx3 = v + 0.5 * dt * dv2
    dv3 = force(x + 0.5 * dx2 * dt, mass, k)

    # Stage 4: slopes at the full step.
    dx4 = v + dt * dv3
    dv4 = force(x + dx3 * dt, mass, k)

    # Weighted average of the four slopes (weights 1, 2, 2, 1 over 6).
    x_next = x + (dx1 + 2 * dx2 + 2 * dx3 + dx4) * dt / 6
    v_next = v + (dv1 + 2 * dv2 + 2 * dv3 + dv4) * dt / 6

    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`: the linear restoring force `-k * x` divided by `mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main_inline` with its default `nt`.
for _ in 1:3
    @time main_inline()
end

  1.832203 seconds (12.02 k allocations: 1.492 GiB, 0.24% gc time)
  1.895642 seconds (12.02 k allocations: 1.492 GiB, 3.91% gc time)
  1.998617 seconds (12.02 k allocations: 1.492 GiB, 3.53% gc time)


In [3]:
"""
    main_inline_simd(nt = 100000000)

Same simulation as `main_inline`, with the time-stepping loop annotated `@simd`.

Integrates the 1D harmonic oscillator (`mass = 1.0`, `k = 1.0`, `dt = 1e-2`, starting
from `x = 0.0`, `v = 1.0`) with `Runge_Kutta_4th_inline` and writes one line
`it*dt xt[it] vt[it]` to `result_julia.out` for each of the last (at most 1000)
stored indices `it ≤ nt`.

# Arguments
- `nt`: number of steps; the arrays are allocated with `nt + 1` elements.
"""
function main_inline_simd(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    # NOTE(review): `x` and `v` are carried from one iteration to the next, so the
    # iterations are not independent. `@simd` is kept because it is the annotation
    # this variant benchmarks — confirm it is appropriate for such a loop.
    @simd for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_inline(x, v, dt, mass, k)
    end

    # Clamp the start of the output window: `nt - 999` is ≤ 0 when `nt < 1000`,
    # which used to raise a BoundsError on `xt[it]`.
    first_row = max(1, nt - 999)

    # NOTE(review): `xt[it]` is the state after `it - 1` steps, so the printed time
    # `it*dt` is one step ahead of it. Left unchanged so that the output for
    # `nt ≥ 1000` stays identical to the original — confirm intent.
    open("result_julia.out", "w") do file
        for it in first_row:nt
            println(file, "$(it*dt) $(xt[it]) $(vt[it])")
        end
    end
end

"""
    Runge_Kutta_4th_inline(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size
`dt`. The derivative of `x` is `v`; the derivative of `v` is `force(x, mass, k)`.
Return the updated pair `(x, v)`. Declared `@inline`.
"""
@inline function Runge_Kutta_4th_inline(x, v, dt, mass, k)
    # Stage 1: slopes at the current state.
    dx1 = v
    dv1 = force(x, mass, k)

    # Stages 2 and 3: slopes at the half step, using the previous stage's slopes.
    dx2 = v + 0.5 * dt * dv1
    dv2 = force(x + 0.5 * dx1 * dt, mass, k)

    dx3 = v + 0.5 * dt * dv2
    dv3 = force(x + 0.5 * dx2 * dt, mass, k)

    # Stage 4: slopes at the full step.
    dx4 = v + dt * dv3
    dv4 = force(x + dx3 * dt, mass, k)

    # Weighted average of the four slopes (weights 1, 2, 2, 1 over 6).
    x_next = x + (dx1 + 2 * dx2 + 2 * dx3 + dx4) * dt / 6
    v_next = v + (dv1 + 2 * dv2 + 2 * dv3 + dv4) * dt / 6

    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`: the linear restoring force `-k * x` divided by `mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main_inline_simd` with its default `nt`.
for _ in 1:3
    @time main_inline_simd()
end

  1.888045 seconds (12.02 k allocations: 1.492 GiB, 4.70% gc time)
  1.802914 seconds (12.02 k allocations: 1.492 GiB, 4.72% gc time)
  1.792365 seconds (12.02 k allocations: 1.492 GiB, 4.76% gc time)


In [4]:
"""
    main_fastmath(nt = 100000000)

Same simulation as `main`, but stepping with `Runge_Kutta_4th_fastmath`.

Integrates the 1D harmonic oscillator (`mass = 1.0`, `k = 1.0`, `dt = 1e-2`, starting
from `x = 0.0`, `v = 1.0`) and writes one line `it*dt xt[it] vt[it]` to
`result_julia.out` for each of the last (at most 1000) stored indices `it ≤ nt`.

# Arguments
- `nt`: number of steps; the arrays are allocated with `nt + 1` elements.
"""
function main_fastmath(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_fastmath(x, v, dt, mass, k)
    end

    # Clamp the start of the output window: `nt - 999` is ≤ 0 when `nt < 1000`,
    # which used to raise a BoundsError on `xt[it]`.
    first_row = max(1, nt - 999)

    # NOTE(review): `xt[it]` is the state after `it - 1` steps, so the printed time
    # `it*dt` is one step ahead of it. Left unchanged so that the output for
    # `nt ≥ 1000` stays identical to the original — confirm intent.
    open("result_julia.out", "w") do file
        for it in first_row:nt
            println(file, "$(it*dt) $(xt[it]) $(vt[it])")
        end
    end
end

"""
    Runge_Kutta_4th_fastmath(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size
`dt`. The derivative of `x` is `v`; the derivative of `v` is `force(x, mass, k)`.
Return the updated pair `(x, v)`. The definition is wrapped in `@fastmath`.
"""
@fastmath function Runge_Kutta_4th_fastmath(x, v, dt, mass, k)
    # Stage 1: slopes at the current state.
    dx1 = v
    dv1 = force(x, mass, k)

    # Stages 2 and 3: slopes at the half step, using the previous stage's slopes.
    dx2 = v + 0.5 * dt * dv1
    dv2 = force(x + 0.5 * dx1 * dt, mass, k)

    dx3 = v + 0.5 * dt * dv2
    dv3 = force(x + 0.5 * dx2 * dt, mass, k)

    # Stage 4: slopes at the full step.
    dx4 = v + dt * dv3
    dv4 = force(x + dx3 * dt, mass, k)

    # Weighted average of the four slopes (weights 1, 2, 2, 1 over 6).
    x_next = x + (dx1 + 2 * dx2 + 2 * dx3 + dx4) * dt / 6
    v_next = v + (dv1 + 2 * dv2 + 2 * dv3 + dv4) * dt / 6

    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`: the linear restoring force `-k * x` divided by `mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main_fastmath` with its default `nt`.
for _ in 1:3
    @time main_fastmath()
end

  2.788521 seconds (12.02 k allocations: 1.492 GiB, 0.11% gc time)
  2.026241 seconds (12.02 k allocations: 1.492 GiB, 3.94% gc time)
  1.918949 seconds (12.02 k allocations: 1.492 GiB, 4.44% gc time)


In [5]:
"""
    main_inline_fastmath(nt = 100000000)

Same simulation as `main`, but stepping with `Runge_Kutta_4th_inline_fastmath`.

Integrates the 1D harmonic oscillator (`mass = 1.0`, `k = 1.0`, `dt = 1e-2`, starting
from `x = 0.0`, `v = 1.0`) and writes one line `it*dt xt[it] vt[it]` to
`result_julia.out` for each of the last (at most 1000) stored indices `it ≤ nt`.

# Arguments
- `nt`: number of steps; the arrays are allocated with `nt + 1` elements.
"""
function main_inline_fastmath(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)
    end

    # Clamp the start of the output window: `nt - 999` is ≤ 0 when `nt < 1000`,
    # which used to raise a BoundsError on `xt[it]`.
    first_row = max(1, nt - 999)

    # NOTE(review): `xt[it]` is the state after `it - 1` steps, so the printed time
    # `it*dt` is one step ahead of it. Left unchanged so that the output for
    # `nt ≥ 1000` stays identical to the original — confirm intent.
    open("result_julia.out", "w") do file
        for it in first_row:nt
            println(file, "$(it*dt) $(xt[it]) $(vt[it])")
        end
    end
end

"""
    Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size
`dt`. The derivative of `x` is `v`; the derivative of `v` is `force(x, mass, k)`.
Return the updated pair `(x, v)`. Declared `@inline` and wrapped in `@fastmath`.
"""
@inline @fastmath function Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)
    # Stage 1: slopes at the current state.
    dx1 = v
    dv1 = force(x, mass, k)

    # Stages 2 and 3: slopes at the half step, using the previous stage's slopes.
    dx2 = v + 0.5 * dt * dv1
    dv2 = force(x + 0.5 * dx1 * dt, mass, k)

    dx3 = v + 0.5 * dt * dv2
    dv3 = force(x + 0.5 * dx2 * dt, mass, k)

    # Stage 4: slopes at the full step.
    dx4 = v + dt * dv3
    dv4 = force(x + dx3 * dt, mass, k)

    # Weighted average of the four slopes (weights 1, 2, 2, 1 over 6).
    x_next = x + (dx1 + 2 * dx2 + 2 * dx3 + dx4) * dt / 6
    v_next = v + (dv1 + 2 * dv2 + 2 * dv3 + dv4) * dt / 6

    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`: the linear restoring force `-k * x` divided by `mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main_inline_fastmath` with its default `nt`.
for _ in 1:3
    @time main_inline_fastmath()
end

  1.805358 seconds (12.02 k allocations: 1.492 GiB, 4.38% gc time)
  1.219772 seconds (12.02 k allocations: 1.492 GiB, 6.41% gc time)
  1.141317 seconds (12.02 k allocations: 1.492 GiB, 6.81% gc time)


In [6]:
"""
    main_inline_fastmath_inbounds(nt = 100000000)

Same simulation as `main_inline_fastmath`, with the time-stepping loop annotated
`@inbounds`.

Integrates the 1D harmonic oscillator (`mass = 1.0`, `k = 1.0`, `dt = 1e-2`, starting
from `x = 0.0`, `v = 1.0`) with `Runge_Kutta_4th_inline_fastmath` and writes one line
`it*dt xt[it] vt[it]` to `result_julia.out` for each of the last (at most 1000)
stored indices `it ≤ nt`.

# Arguments
- `nt`: number of steps; the arrays are allocated with `nt + 1` elements.
"""
function main_inline_fastmath_inbounds(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    # `it` stays within 1:nt+1, the exact length of `xt` and `vt` allocated above.
    @inbounds for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)
    end

    # Clamp the start of the output window: `nt - 999` is ≤ 0 when `nt < 1000`,
    # which used to raise a BoundsError on `xt[it]`.
    first_row = max(1, nt - 999)

    # NOTE(review): `xt[it]` is the state after `it - 1` steps, so the printed time
    # `it*dt` is one step ahead of it. Left unchanged so that the output for
    # `nt ≥ 1000` stays identical to the original — confirm intent.
    open("result_julia.out", "w") do file
        for it in first_row:nt
            println(file, "$(it*dt) $(xt[it]) $(vt[it])")
        end
    end
end

"""
    Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size
`dt`. The derivative of `x` is `v`; the derivative of `v` is `force(x, mass, k)`.
Return the updated pair `(x, v)`. Declared `@inline` and wrapped in `@fastmath`.
"""
@inline @fastmath function Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)
    # Stage 1: slopes at the current state.
    dx1 = v
    dv1 = force(x, mass, k)

    # Stages 2 and 3: slopes at the half step, using the previous stage's slopes.
    dx2 = v + 0.5 * dt * dv1
    dv2 = force(x + 0.5 * dx1 * dt, mass, k)

    dx3 = v + 0.5 * dt * dv2
    dv3 = force(x + 0.5 * dx2 * dt, mass, k)

    # Stage 4: slopes at the full step.
    dx4 = v + dt * dv3
    dv4 = force(x + dx3 * dt, mass, k)

    # Weighted average of the four slopes (weights 1, 2, 2, 1 over 6).
    x_next = x + (dx1 + 2 * dx2 + 2 * dx3 + dx4) * dt / 6
    v_next = v + (dv1 + 2 * dv2 + 2 * dv3 + dv4) * dt / 6

    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`: the linear restoring force `-k * x` divided by `mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main_inline_fastmath_inbounds` with its default `nt`.
for _ in 1:3
    @time main_inline_fastmath_inbounds()
end

  0.989219 seconds (12.02 k allocations: 1.492 GiB, 0.26% gc time)
  1.070283 seconds (12.02 k allocations: 1.492 GiB, 6.82% gc time)
  1.105354 seconds (12.02 k allocations: 1.492 GiB, 6.49% gc time)


In [7]:
"""
    main_inline_fastmath_inbounds_simd(nt = 100000000)

Same simulation as `main_inline_fastmath`, with the time-stepping loop annotated
`@inbounds @simd`.

Integrates the 1D harmonic oscillator (`mass = 1.0`, `k = 1.0`, `dt = 1e-2`, starting
from `x = 0.0`, `v = 1.0`) with `Runge_Kutta_4th_inline_fastmath` and writes one line
`it*dt xt[it] vt[it]` to `result_julia.out` for each of the last (at most 1000)
stored indices `it ≤ nt`.

# Arguments
- `nt`: number of steps; the arrays are allocated with `nt + 1` elements.
"""
function main_inline_fastmath_inbounds_simd(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    # `it` stays within 1:nt+1, the exact length of `xt` and `vt` allocated above.
    # NOTE(review): `x` and `v` are carried from one iteration to the next, so the
    # iterations are not independent. `@simd` is kept because it is the annotation
    # this variant benchmarks — confirm it is appropriate for such a loop.
    @inbounds @simd for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)
    end

    # Clamp the start of the output window: `nt - 999` is ≤ 0 when `nt < 1000`,
    # which used to raise a BoundsError on `xt[it]`.
    first_row = max(1, nt - 999)

    # NOTE(review): `xt[it]` is the state after `it - 1` steps, so the printed time
    # `it*dt` is one step ahead of it. Left unchanged so that the output for
    # `nt ≥ 1000` stays identical to the original — confirm intent.
    open("result_julia.out", "w") do file
        for it in first_row:nt
            println(file, "$(it*dt) $(xt[it]) $(vt[it])")
        end
    end
end

"""
    Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size
`dt`. The derivative of `x` is `v`; the derivative of `v` is `force(x, mass, k)`.
Return the updated pair `(x, v)`. Declared `@inline` and wrapped in `@fastmath`.
"""
@inline @fastmath function Runge_Kutta_4th_inline_fastmath(x, v, dt, mass, k)
    # Stage 1: slopes at the current state.
    dx1 = v
    dv1 = force(x, mass, k)

    # Stages 2 and 3: slopes at the half step, using the previous stage's slopes.
    dx2 = v + 0.5 * dt * dv1
    dv2 = force(x + 0.5 * dx1 * dt, mass, k)

    dx3 = v + 0.5 * dt * dv2
    dv3 = force(x + 0.5 * dx2 * dt, mass, k)

    # Stage 4: slopes at the full step.
    dx4 = v + dt * dv3
    dv4 = force(x + dx3 * dt, mass, k)

    # Weighted average of the four slopes (weights 1, 2, 2, 1 over 6).
    x_next = x + (dx1 + 2 * dx2 + 2 * dx3 + dx4) * dt / 6
    v_next = v + (dv1 + 2 * dv2 + 2 * dv3 + dv4) * dt / 6

    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`: the linear restoring force `-k * x` divided by `mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main_inline_fastmath_inbounds_simd` with its default `nt`.
for _ in 1:3
    @time main_inline_fastmath_inbounds_simd()
end

  1.036075 seconds (12.02 k allocations: 1.492 GiB, 7.37% gc time)
  1.029063 seconds (12.02 k allocations: 1.492 GiB, 7.45% gc time)
  1.026511 seconds (12.02 k allocations: 1.492 GiB, 8.70% gc time)


In [8]:
using BenchmarkTools

# Benchmark every variant with a smaller step count.
nt = 10^6
println("nt = ", nt)
# `nt` is a non-constant global, so it is interpolated with `$` to keep the cost of
# accessing an untyped global out of the measured expression.
print("main(nt):                              "); @btime main($nt)
print("main_inline(nt):                       "); @btime main_inline($nt)
print("main_inline_simd(nt):                  "); @btime main_inline_simd($nt)
print("main_fastmath(nt):                     "); @btime main_fastmath($nt)
print("main_inline_fastmath(nt):              "); @btime main_inline_fastmath($nt)
print("main_inline_fastmath_inbounds(nt):     "); @btime main_inline_fastmath_inbounds($nt)
print("main_inline_fastmath_inbounds_simd(nt):"); @btime main_inline_fastmath_inbounds_simd($nt)

nt = 1000000
main(nt):                                28.686 ms (12016 allocations: 16.74 MiB)
main_inline(nt):                         18.043 ms (12016 allocations: 16.74 MiB)
main_inline_simd(nt):                    16.508 ms (12016 allocations: 16.74 MiB)
main_fastmath(nt):                       18.360 ms (12016 allocations: 16.74 MiB)
main_inline_fastmath(nt):                11.082 ms (12016 allocations: 16.74 MiB)
main_inline_fastmath_inbounds(nt):       10.490 ms (12016 allocations: 16.74 MiB)
main_inline_fastmath_inbounds_simd(nt):  9.763 ms (12016 allocations: 16.74 MiB)


In [9]:
# Print the Julia version and platform details for the session that ran this notebook.
versioninfo()

Julia Version 1.10.0
Commit 3120989f39 (2023-12-25 18:01 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 12 × Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, skylake)
  Threads: 18 on 12 virtual cores
Environment:
  JULIA_DEPOT_PATH = D:\.julia
  JULIA_NUM_PRECOMPILE_TASKS = 4
  JULIA_NUM_THREADS = 12
  JULIA_PYTHONCALL_EXE = D:\.julia\conda\3\python.exe


In [10]:
"""
    main_inline_fastmath_inbounds_simd2(nt = 100000000)

Same simulation as `main_fastmath`, with the time-stepping loop annotated
`@inline @inbounds @simd`; the stepper `Runge_Kutta_4th_fastmath` itself carries no
`@inline` declaration.

Integrates the 1D harmonic oscillator (`mass = 1.0`, `k = 1.0`, `dt = 1e-2`, starting
from `x = 0.0`, `v = 1.0`) and writes one line `it*dt xt[it] vt[it]` to
`result_julia.out` for each of the last (at most 1000) stored indices `it ≤ nt`.

# Arguments
- `nt`: number of steps; the arrays are allocated with `nt + 1` elements.
"""
function main_inline_fastmath_inbounds_simd2(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    # `it` stays within 1:nt+1, the exact length of `xt` and `vt` allocated above.
    # NOTE(review): `@inline` on the loop presumably acts as a call-site inlining
    # request for the calls in its body (Julia ≥ 1.8) — confirm.
    # NOTE(review): `x` and `v` are carried from one iteration to the next, so the
    # iterations are not independent. `@simd` is kept because it is the annotation
    # this variant benchmarks — confirm it is appropriate for such a loop.
    @inline @inbounds @simd for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_fastmath(x, v, dt, mass, k)
    end

    # Clamp the start of the output window: `nt - 999` is ≤ 0 when `nt < 1000`,
    # which used to raise a BoundsError on `xt[it]`.
    first_row = max(1, nt - 999)

    # NOTE(review): `xt[it]` is the state after `it - 1` steps, so the printed time
    # `it*dt` is one step ahead of it. Left unchanged so that the output for
    # `nt ≥ 1000` stays identical to the original — confirm intent.
    open("result_julia.out", "w") do file
        for it in first_row:nt
            println(file, "$(it*dt) $(xt[it]) $(vt[it])")
        end
    end
end

"""
    Runge_Kutta_4th_fastmath(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size
`dt`. The derivative of `x` is `v`; the derivative of `v` is `force(x, mass, k)`.
Return the updated pair `(x, v)`. The definition is wrapped in `@fastmath`.
"""
@fastmath function Runge_Kutta_4th_fastmath(x, v, dt, mass, k)
    # Stage 1: slopes at the current state.
    dx1 = v
    dv1 = force(x, mass, k)

    # Stages 2 and 3: slopes at the half step, using the previous stage's slopes.
    dx2 = v + 0.5 * dt * dv1
    dv2 = force(x + 0.5 * dx1 * dt, mass, k)

    dx3 = v + 0.5 * dt * dv2
    dv3 = force(x + 0.5 * dx2 * dt, mass, k)

    # Stage 4: slopes at the full step.
    dx4 = v + dt * dv3
    dv4 = force(x + dx3 * dt, mass, k)

    # Weighted average of the four slopes (weights 1, 2, 2, 1 over 6).
    x_next = x + (dx1 + 2 * dx2 + 2 * dx3 + dx4) * dt / 6
    v_next = v + (dv1 + 2 * dv2 + 2 * dv3 + dv4) * dt / 6

    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`: the linear restoring force `-k * x` divided by `mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main_inline_fastmath_inbounds_simd2` with its default `nt`.
for _ in 1:3
    @time main_inline_fastmath_inbounds_simd2()
end

  0.994817 seconds (12.02 k allocations: 1.492 GiB, 0.61% gc time)
  1.161595 seconds (12.02 k allocations: 1.492 GiB, 10.32% gc time)
  1.189117 seconds (12.02 k allocations: 1.492 GiB, 8.33% gc time)


In [11]:
# `nt` is a non-constant global, so it is interpolated with `$` to keep the cost of
# accessing an untyped global out of the measured expression.
print("main_inline_fastmath_inbounds_simd2(nt):"); @btime main_inline_fastmath_inbounds_simd2($nt)

main_inline_fastmath_inbounds_simd2(nt):  9.821 ms (12016 allocations: 16.74 MiB)
