https://x.com/kame_no_mori/status/1734888762914820124?s=61&t=_KnHkB3gSNKRbi3Ce1GncA

In [1]:
"""
    main(nt = 100000000)

Advance the state `(x, v)`, starting from `x = 0.0`, `v = 1.0`, with repeated calls to
`Runge_Kutta_4th!` using `dt = 1e-2`, `mass = 1.0` and `k = 1.0`, recording the state
before every call, and write the tail of the recorded history to `result_julia.out`.

The state after `s` steps (time `s * dt`) is stored at index `s + 1` of the history
arrays. One `time x v` row is written for each of the last (at most 1000) steps up to
and including step `nt`.
"""
function main(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    for it in eachindex(xt, vt)
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th!(x, v, dt, mass, k)
    end

    # Steps are 0-based while the arrays are 1-based: pair the time `step * dt` with
    # index `step + 1` so that every row is self-consistent and the final sample is
    # included. Clamping the first step at 0 keeps runs shorter than 1000 steps from
    # indexing before the start of the arrays.
    first_step = max(0, nt - 999)
    open("result_julia.out", "w") do file
        for step in first_step:nt
            println(file, "$(step * dt) $(xt[step + 1]) $(vt[step + 1])")
        end
    end
end

"""
    Runge_Kutta_4th!(x, v, dt, mass, k)

Advance position `x` and velocity `v` by one fourth-order Runge-Kutta step of size `dt`,
using `dx/dt = v` and `dv/dt = force(x, mass, k)`.

Returns the updated `(x, v)` pair. Despite the trailing `!`, only local bindings are
reassigned; nothing is written into the arguments.
"""
function Runge_Kutta_4th!(x, v, dt, mass, k)
    # Stage 1: slopes at the start of the step.
    dx_a = v
    dv_a = force(x, mass, k)

    # Stage 2: slopes at the half step, predicted with the stage-1 slopes.
    dx_b = v + 0.5 * dt * dv_a
    dv_b = force(x + 0.5 * dx_a * dt, mass, k)

    # Stage 3: slopes at the half step, predicted with the stage-2 slopes.
    dx_c = v + 0.5 * dt * dv_b
    dv_c = force(x + 0.5 * dx_b * dt, mass, k)

    # Stage 4: slopes at the full step, predicted with the stage-3 slopes.
    dx_d = v + dt * dv_c
    dv_d = force(x + dx_c * dt, mass, k)

    # Combine the four stages with weights 1, 2, 2, 1.
    dx_total = dx_a + 2 * dx_b + 2 * dx_c + dx_d
    dv_total = dv_a + 2 * dv_b + 2 * dv_c + dv_d

    x_next = x + dx_total * dt / 6
    v_next = v + dv_total * dt / 6
    return x_next, v_next
end

"""
    force(x, mass, k)

Return `-x * k / mass`.
"""
force(x, mass, k) = -x * k / mass

# Time three consecutive runs of `main` with its default `nt`. Repeating the call makes
# it possible to tell any first-call compilation cost apart from steady-state run time.
@time main()
@time main()
@time main()

  2.962083 seconds (12.02 k allocations: 1.492 GiB, 0.44% gc time)
  3.107633 seconds (12.02 k allocations: 1.492 GiB, 4.14% gc time)
  3.056604 seconds (12.02 k allocations: 1.492 GiB, 3.32% gc time)


In [2]:
"""
    main_inline(nt = 100000000)

Same driver as the non-inlined variant, but each step is taken with
`Runge_Kutta_4th_inline!`: starting from `x = 0.0`, `v = 1.0`, the state is recorded and
then advanced with `dt = 1e-2`, `mass = 1.0` and `k = 1.0`, and the tail of the recorded
history is written to `result_julia.out`.

The state after `s` steps (time `s * dt`) is stored at index `s + 1` of the history
arrays. One `time x v` row is written for each of the last (at most 1000) steps up to
and including step `nt`.
"""
function main_inline(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    for it in eachindex(xt, vt)
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_inline!(x, v, dt, mass, k)
    end

    # Steps are 0-based while the arrays are 1-based: pair the time `step * dt` with
    # index `step + 1` so that every row is self-consistent and the final sample is
    # included. Clamping the first step at 0 keeps runs shorter than 1000 steps from
    # indexing before the start of the arrays.
    first_step = max(0, nt - 999)
    open("result_julia.out", "w") do file
        for step in first_step:nt
            println(file, "$(step * dt) $(xt[step + 1]) $(vt[step + 1])")
        end
    end
end

"""
    Runge_Kutta_4th_inline!(x, v, dt, mass, k)

One fourth-order Runge-Kutta step of size `dt` for `dx/dt = v`,
`dv/dt = force(x, mass, k)`, marked `@inline`.

Returns the new `(x, v)` pair; the arguments themselves are not modified.
"""
@inline function Runge_Kutta_4th_inline!(x, v, dt, mass, k)
    # Slopes of x (kx*) and of v (kv*) for the four stages.
    kx1 = v
    kv1 = force(x, mass, k)

    kx2 = v + 0.5 * dt * kv1
    kv2 = force(x + 0.5 * kx1 * dt, mass, k)

    kx3 = v + 0.5 * dt * kv2
    kv3 = force(x + 0.5 * kx2 * dt, mass, k)

    kx4 = v + dt * kv3
    kv4 = force(x + kx3 * dt, mass, k)

    # Weighted 1-2-2-1 combination of the stage slopes, scaled by dt / 6.
    new_x = x + (kx1 + 2 * kx2 + 2 * kx3 + kx4) * dt / 6
    new_v = v + (kv1 + 2 * kv2 + 2 * kv3 + kv4) * dt / 6

    return (new_x, new_v)
end

"""
    force(x, mass, k)

Return the negated product of `x` and `k`, divided by `mass`.
"""
function force(x, mass, k)
    numerator = -x * k
    return numerator / mass
end

# Time three consecutive runs of `main_inline` with its default `nt`, for comparison
# against the timings of the other driver variants.
@time main_inline()
@time main_inline()
@time main_inline()

  1.891347 seconds (12.02 k allocations: 1.492 GiB, 5.51% gc time)
  1.896394 seconds (12.02 k allocations: 1.492 GiB, 5.30% gc time)
  1.894819 seconds (12.02 k allocations: 1.492 GiB, 5.14% gc time)


In [3]:
"""
    main_inline_simd(nt = 100000000)

Same driver as the other variants, stepping with `Runge_Kutta_4th_inline!` inside a
`@simd`-annotated loop: starting from `x = 0.0`, `v = 1.0`, the state is recorded and
then advanced with `dt = 1e-2`, `mass = 1.0` and `k = 1.0`, and the tail of the recorded
history is written to `result_julia.out`.

The state after `s` steps (time `s * dt`) is stored at index `s + 1` of the history
arrays. One `time x v` row is written for each of the last (at most 1000) steps up to
and including step `nt`.
"""
function main_inline_simd(nt = 100000000)
    mass = 1.0
    k = 1.0
    dt = 1e-2

    xt = zeros(Float64, nt + 1)
    vt = zeros(Float64, nt + 1)

    x = 0.0
    v = 1.0

    # NOTE(review): every iteration consumes the `x`, `v` produced by the previous one,
    # so the iterations are not independent as `@simd` expects. The annotation is kept
    # because measuring its effect is the purpose of this variant - confirm it is wanted.
    @simd for it in 1:nt+1
        xt[it] = x
        vt[it] = v
        x, v = Runge_Kutta_4th_inline!(x, v, dt, mass, k)
    end

    # Steps are 0-based while the arrays are 1-based: pair the time `step * dt` with
    # index `step + 1` so that every row is self-consistent and the final sample is
    # included. Clamping the first step at 0 keeps runs shorter than 1000 steps from
    # indexing before the start of the arrays.
    first_step = max(0, nt - 999)
    open("result_julia.out", "w") do file
        for step in first_step:nt
            println(file, "$(step * dt) $(xt[step + 1]) $(vt[step + 1])")
        end
    end
end

"""
    Runge_Kutta_4th_inline!(x, v, dt, mass, k)

Take a single fourth-order Runge-Kutta step of length `dt`, treating `v` as the rate of
change of `x` and `force(x, mass, k)` as the rate of change of `v`. Marked `@inline`.

Returns `(x, v)` after the step; the inputs are left untouched.
"""
@inline function Runge_Kutta_4th_inline!(x, v, dt, mass, k)
    # First stage: rates evaluated at the current state.
    rate_x1 = v
    rate_v1 = force(x, mass, k)

    # Second stage: half step using the first-stage rates.
    rate_x2 = v + 0.5 * dt * rate_v1
    rate_v2 = force(x + 0.5 * rate_x1 * dt, mass, k)

    # Third stage: half step using the second-stage rates.
    rate_x3 = v + 0.5 * dt * rate_v2
    rate_v3 = force(x + 0.5 * rate_x2 * dt, mass, k)

    # Fourth stage: full step using the third-stage rates.
    rate_x4 = v + dt * rate_v3
    rate_v4 = force(x + rate_x3 * dt, mass, k)

    # Blend the stages with weights 1, 2, 2, 1.
    blended_x = rate_x1 + 2 * rate_x2 + 2 * rate_x3 + rate_x4
    blended_v = rate_v1 + 2 * rate_v2 + 2 * rate_v3 + rate_v4

    return x + blended_x * dt / 6, v + blended_v * dt / 6
end

"""
    force(x, mass, k)

Compute `((-x) * k) / mass`.
"""
force(x, mass, k) = ((-x) * k) / mass

# Time three consecutive runs of `main_inline_simd` with its default `nt`, for
# comparison against the timings of the other driver variants.
@time main_inline_simd()
@time main_inline_simd()
@time main_inline_simd()

  1.735821 seconds (12.02 k allocations: 1.492 GiB, 5.66% gc time)
  1.764188 seconds (12.02 k allocations: 1.492 GiB, 5.74% gc time)
  1.687790 seconds (12.02 k allocations: 1.492 GiB, 5.49% gc time)


In [4]:
using BenchmarkTools

# Benchmark the three driver variants on a smaller problem size.
# `nt` is a non-constant global, so it is interpolated with `$`: the benchmark then
# receives its value directly instead of looking up an untyped global on every sample.
nt = 10^6
@btime main($nt)
@btime main_inline($nt)
@btime main_inline_simd($nt)

  28.581 ms (12016 allocations: 16.74 MiB)
  17.674 ms (12016 allocations: 16.74 MiB)
  15.820 ms (12016 allocations: 16.74 MiB)


In [5]:
# Print the Julia version and platform details of the session that ran the code above.
versioninfo()

Julia Version 1.9.4
Commit 8e5136fa29 (2023-11-14 08:46 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: 12 × Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, skylake)
  Threads: 13 on 12 virtual cores
Environment:
  JULIA_DEPOT_PATH = D:\.julia
  JULIA_NUM_PRECOMPILE_TASKS = 4
  JULIA_NUM_THREADS = 12
  JULIA_PYTHONCALL_EXE = D:\.julia\conda\3\python.exe
