In [1]:
"""
    mcpi(L=10^9)

Estimate π by Monte Carlo sampling on a single thread.

Draw `L` points uniformly from the unit square and count how many fall inside
the quarter unit circle; that fraction approximates π/4, so four times the
fraction is returned.
"""
function mcpi(L=10^9)
    # Each trial draws two uniform numbers (x first, then y) and tests x² + y² ≤ 1.
    hits = count(_ -> rand()^2 + rand()^2 ≤ 1, 1:L)
    return 4hits / L
end

# Time three runs with the default 10^9 samples; repeating the call separates any
# first-call compilation overhead from the steady-state timing.
@time mcpi()
@time mcpi()
@time mcpi()

  2.955247 seconds
  2.954732 seconds
  2.944167 seconds


3.141618232

In [2]:
"""
    mcpi_threads(L=10^9)

Estimate π by Monte Carlo sampling, spreading the `L` trials over the available
Julia threads.

The trial range is split into roughly `Threads.nthreads()` contiguous chunks.
Each chunk is processed by its own task with a task-local hit counter, and the
per-task counts are summed at the end.

Counters are deliberately *not* stored in a shared array indexed by
`Threads.threadid()`: a task may migrate between threads while it runs, and
`threadid()` can exceed `Threads.nthreads()` when an interactive thread pool is
configured, so that pattern can race or go out of bounds. Task-local counters
also avoid false sharing between neighbouring array slots.
"""
function mcpi_threads(L=10^9)
    trials = 1:L
    ntasks = max(1, Threads.nthreads())
    # `length(trials)` is always an `Int`, even when `L` is not; keep chunks non-empty.
    chunklen = max(1, cld(length(trials), ntasks))
    tasks = map(Iterators.partition(trials, chunklen)) do chunk
        Threads.@spawn begin
            hits = 0
            for _ in chunk
                hits += rand()^2 + rand()^2 ≤ 1
            end
            hits
        end
    end
    # `init=0` keeps the reduction defined when there are no chunks (L < 1).
    total = sum(t -> fetch(t)::Int, tasks; init=0)
    return 4total / L
end

# Time three runs of the threaded version; the first run also pays for compilation,
# so the later runs are the ones to compare.
@time mcpi_threads()
@time mcpi_threads()
@time mcpi_threads()

  0.601257 seconds (14.41 k allocations: 963.438 KiB, 49.35% compilation time)
  0.589864 seconds (185 allocations: 18.344 KiB)
  0.582573 seconds (185 allocations: 18.266 KiB)


3.14165014

In [3]:
using LoopVectorization

# Deliberately broken demonstration: the result will be 4.0 or 0.0, not an estimate of π.
# NOTE(review): presumably `@turbo` treats the `rand()` calls as independent of the loop
# index `i` and evaluates the comparison once rather than once per iteration, so every
# iteration adds the same value — confirm against the LoopVectorization documentation.
function mcpi_turbo_incorrect(L=10^9)
    c = 0  # hit counter accumulated by the @turbo loop
    @turbo for i in 1:L
        c += rand()^2 + rand()^2 ≤ 1
    end
    4c/L
end

# Time three runs of the intentionally incorrect @turbo variant; the displayed value
# is what exposes the problem, not the timing.
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()

  0.010046 seconds
  0.009370 seconds
  0.010249 seconds


4.0

In [4]:
using LoopVectorization

"""
    rand_is_in_unit_circle(i)

Draw one random point in the unit square and return `true` when it lies inside
the unit circle (x² + y² ≤ 1).

The argument `i` is accepted but not used in the computation.
"""
function rand_is_in_unit_circle(i)
    x = rand()
    y = rand()
    return x^2 + y^2 ≤ 1
end

# Estimate π from `L` Monte Carlo trials using a `@turbo` loop.
# Sampling is delegated to `rand_is_in_unit_circle(i)`, which receives the loop index;
# the variant that calls `rand()` directly in the loop body (`mcpi_turbo_incorrect`)
# is recorded as returning 4.0.
# NOTE(review): confirm that `@turbo` invokes the helper once per scalar iteration
# rather than once per SIMD batch; otherwise fewer than `L` independent samples are
# drawn and the estimate is noisier than the sample count suggests.
function mcpi_turbo(L=10^9)
    c = 0  # running count of points that landed inside the unit circle
    @turbo for i in 1:L
        c += rand_is_in_unit_circle(i)
    end
    4c/L  # hits / L approximates π/4
end

# Time three runs of the @turbo version that routes sampling through the helper.
@time mcpi_turbo()
@time mcpi_turbo()
@time mcpi_turbo()

  0.505831 seconds
  0.481002 seconds
  0.473454 seconds


3.141540992

In [5]:
using CUDA
using BenchmarkTools

"""
    mcpi_cuda_count(L=10^8, t::Type{T}=Float32) where T

Estimate π by Monte Carlo sampling using CUDA array operations.

Generate a `2 × L` array of random numbers of element type `T` with
`CUDA.rand`, sum the squares down each column, count the columns whose sum is
at most 1, and return four times that count divided by `L`.
"""
function mcpi_cuda_count(L=10^8, t::Type{T}=Float32) where T
    points = CUDA.rand(T, 2, L)                  # one column per sample
    radii2 = sum(x -> x^2, points; dims=1)       # squared distance from the origin
    inside = count(≤(1), radii2)
    return 4inside / L
end

# Benchmark every implementation at 10^8 samples and keep each returned estimate.
# Note: `c` here is a global holding the `mcpi_turbo` result; it is unrelated to the
# local counters named `c` inside the functions above.
a = @btime mcpi(10^8)
b = @btime mcpi_threads(10^8)
c = @btime mcpi_turbo(10^8)
d = @btime mcpi_cuda_count(10^8)
# Display the four estimates side by side.
a, b, c, d

  292.546 ms (0 allocations: 0 bytes)
  56.556 ms (62 allocations: 7.61 KiB)
  46.445 ms (0 allocations: 0 bytes)
  18.374 ms (122 allocations: 6.25 KiB)


(3.14146788, 3.1415758, 3.141968, 3.14149452)