In [1]:
# Monte Carlo estimate of π on a single thread.
#
# Draws `L` random points (two `rand()` calls each) from the unit square,
# counts how many fall inside the unit circle, and returns 4 * hits / L.
function mcpi(L=10^9)
    # The sample index itself is irrelevant; each call just draws a fresh point.
    inside(_) = rand()^2 + rand()^2 ≤ 1
    hits = count(inside, 1:L)
    return 4 * hits / L
end

# Time three back-to-back runs of the serial estimator with its default
# sample count (L = 10^9) so that run-to-run variation is visible.
@time mcpi()
@time mcpi()
@time mcpi()

  3.149878 seconds
  3.112256 seconds
  3.208715 seconds


3.141540284

In [2]:
# Multi-threaded Monte Carlo estimate of π.
#
# Splits the `L` samples into at most `Threads.nthreads()` chunks, runs each
# chunk in its own spawned task with a task-local hit counter, and returns
# 4 * (total hits) / L.
#
# The counters are deliberately NOT kept in a shared array indexed by
# `Threads.threadid()`: a task is not guaranteed to stay on one thread, so the
# id is not a reliable slot index, and neighbouring array slots updated from
# different threads contend for the same cache line.
function mcpi_threads(L=10^9)
    # Never start more tasks than there are samples, but always at least one so
    # that L ≤ 0 flows through the same code path (empty loop, zero hits).
    ntasks = max(1, min(Threads.nthreads(), L))
    base, extra = divrem(L, ntasks)
    tasks = map(1:ntasks) do k
        # The first `extra` tasks take one additional sample so that the chunk
        # sizes add up to exactly L.
        n = base + (k ≤ extra)
        Threads.@spawn begin
            hits = 0
            for _ in 1:n
                hits += rand()^2 + rand()^2 ≤ 1
            end
            hits
        end
    end
    # `fetch` is not inferrable on an untyped Task; assert the counter type so
    # the reduction and the return value stay concretely typed.
    total = sum(t -> fetch(t)::Int, tasks)
    return 4 * total / L
end

# Time three back-to-back runs of the threaded estimator with its default
# sample count (L = 10^9); the first call may include compilation time.
@time mcpi_threads()
@time mcpi_threads()
@time mcpi_threads()

  0.648566 seconds (14.40 k allocations: 963.016 KiB, 45.19% compilation time)
  0.618869 seconds (185 allocations: 18.344 KiB)
  0.626113 seconds (187 allocations: 18.297 KiB)


3.141655004

In [3]:
using LoopVectorization

# Deliberately broken variant: the hit test is written inline inside the
# `@turbo` loop. As the name says, this does NOT estimate π — the result is
# expected to be either 4.0 or 0.0.
# NOTE(review): presumably `@turbo` treats the index-independent `rand()` calls
# as loop-invariant and evaluates them only once, so every iteration sees the
# same point — confirm against the LoopVectorization documentation.
function mcpi_turbo_incorrect(L=10^9)
    c = 0  # hit counter
    @turbo for i in 1:L
        c += rand()^2 + rand()^2 ≤ 1
    end
    4c/L  # 4 * hits / samples
end

# Time three runs of the intentionally incorrect `@turbo` variant with its
# default sample count (L = 10^9).
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()

  0.009977 seconds
  0.010647 seconds
  0.010867 seconds


0.0

In [4]:
using LoopVectorization

# Draw one random point (two `rand()` calls) from the unit square and report
# whether it lies inside the unit circle. The argument `i` is accepted but
# never used in the computation.
function rand_is_in_unit_circle(i)
    x = rand()
    y = rand()
    return x^2 + y^2 ≤ 1
end

# Estimate π with a `@turbo` loop over `L` samples; returns 4 * hits / L.
#
# The hit test is routed through `rand_is_in_unit_circle(i)`, which receives
# the loop index even though it ignores it.
# NOTE(review): presumably the index argument is what keeps `@turbo` from
# treating the call as loop-invariant (compare `mcpi_turbo_incorrect`) —
# confirm against the LoopVectorization documentation.
function mcpi_turbo(L=10^9)
    c = 0  # hit counter
    @turbo for i in 1:L
        c += rand_is_in_unit_circle(i)
    end
    4c/L  # 4 * hits / samples
end

# Time three back-to-back runs of the `@turbo` estimator with its default
# sample count (L = 10^9).
@time mcpi_turbo()
@time mcpi_turbo()
@time mcpi_turbo()

  0.506452 seconds
  0.500448 seconds
  0.500709 seconds


3.14158032

In [5]:
using CUDA
using BenchmarkTools

# Estimate π from `L` random points generated with `CUDA.rand`.
#
# `T` is the element type of the random samples (default `Float32`).
# A 2×L array is drawn so that each column holds one point's two coordinates;
# summing the squares along the first dimension gives the squared radius of
# every point, and the points with squared radius ≤ 1 are counted as hits.
# Returns 4 * hits / L.
function mcpi_cuda_count(L=10^8, t::Type{T}=Float32) where T
    points = CUDA.rand(T, 2, L)
    radii2 = sum(x -> x^2, points; dims=1)
    hits = count(≤(1), radii2)
    return 4 * hits / L
end

# Benchmark the four implementations at L = 10^8 samples each. `@btime`
# passes through the value of the benchmarked expression, so a–d hold the
# corresponding π estimates.
a = @btime mcpi(10^8)
b = @btime mcpi_threads(10^8)
c = @btime mcpi_turbo(10^8)
d = @btime mcpi_cuda_count(10^8)
a, b, c, d  # display the four estimates side by side

  309.031 ms (0 allocations: 0 bytes)
  60.444 ms (62 allocations: 7.61 KiB)
  48.502 ms (0 allocations: 0 bytes)
  18.552 ms (122 allocations: 6.25 KiB)


(3.1418928, 3.14175192, 3.14192928, 3.14139216)