In [1]:
# Single-threaded Monte Carlo estimate of π.
#
# Draws L points uniformly from the unit square (two `rand()` calls per point),
# counts how many satisfy x^2 + y^2 ≤ 1, and returns 4 * hits / L.
function mcpi(L=10^9)
    hits = 0
    for _ in 1:L
        x = rand()
        y = rand()
        hits += (x^2 + y^2 ≤ 1)
    end
    return 4hits / L
end

# Time three consecutive runs of mcpi with its default sample count (10^9).
@time mcpi()
@time mcpi()
@time mcpi()

  3.004477 seconds
  3.082193 seconds
  2.950187 seconds


3.141543924

In [2]:
# Multithreaded Monte Carlo estimate of π from L uniform random points in the
# unit square.
#
# The range 1:L is split into at most Threads.nthreads() contiguous chunks and
# one task is spawned per chunk. Each task counts its hits in a task-local
# variable, so nothing is indexed by Threads.threadid(). Indexing a shared
# buffer by thread id is unsafe when tasks migrate between threads, can go out
# of bounds when thread ids exceed Threads.nthreads(), and makes neighbouring
# counters contend for the same cache line.
#
# Returns 4 * hits / L as a Float64 (NaN for L == 0, because that is 0/0).
function mcpi_threads(L=10^9)
    ntasks = max(1, min(Threads.nthreads(), L))
    # Iterators.partition rejects a length of 0, so clamp for L ≤ 0.
    chunklen = cld(max(L, 1), ntasks)
    tasks = map(Iterators.partition(1:L, chunklen)) do chunk
        Threads.@spawn begin
            hits = 0
            for _ in chunk
                hits += rand()^2 + rand()^2 ≤ 1
            end
            hits
        end
    end
    # init=0 keeps the reduction defined when there are no chunks (L ≤ 0);
    # the ::Int assertion restores the type that fetch cannot infer.
    total = sum(t -> fetch(t)::Int, tasks; init=0)
    return 4total / L
end

# Time three consecutive runs of mcpi_threads with its default sample count.
# In the recorded output only the first run reports compilation time.
@time mcpi_threads()
@time mcpi_threads()
@time mcpi_threads()

  0.570726 seconds (14.41 k allocations: 963.109 KiB, 52.04% compilation time)
  0.575885 seconds (186 allocations: 18.641 KiB)
  0.567838 seconds (185 allocations: 18.266 KiB)


3.14154468

In [3]:
using LoopVectorization

# Deliberately incorrect use of `@turbo`, kept as a cautionary example.
# The result is always either 4.0 or 0.0 (the recorded run printed 4.0 after
# about 0.01 s for 10^9 iterations). Presumably `@turbo` does not re-evaluate
# the argument-free `rand()` calls on every iteration, so each iteration adds
# the same Bool to `c` — TODO confirm against the LoopVectorization docs.
function mcpi_turbo_incorrect(L=10^9)
    c = 0
    @turbo for i in 1:L
        c += rand()^2 + rand()^2 ≤ 1
    end
    4c/L
end

# Time three consecutive runs of the intentionally incorrect @turbo version.
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()

  0.009913 seconds
  0.009352 seconds
  0.009505 seconds


4.0

In [4]:
using LoopVectorization

# Draw two values with `rand()` and report whether the sum of their squares is
# at most 1. The argument `i` is accepted but never used.
function rand_is_in_unit_circle(i)
    x = rand()
    y = rand()
    return x^2 + y^2 ≤ 1
end

# Monte Carlo estimate of π using a `@turbo` loop. The random draw is delegated
# to `rand_is_in_unit_circle(i)`, so the loop body is a function call on the
# loop index rather than bare `rand()` calls.
# L is the number of loop iterations; returns 4c/L, where c sums the helper's
# results.
# NOTE(review): the helper ignores `i`; confirm that `@turbo` invokes it once
# per iteration (not once per SIMD batch), otherwise fewer than L independent
# points are actually sampled.
function mcpi_turbo(L=10^9)
    c = 0
    @turbo for i in 1:L
        c += rand_is_in_unit_circle(i)
    end
    4c/L
end

# Time three consecutive runs of mcpi_turbo with its default sample count.
@time mcpi_turbo()
@time mcpi_turbo()
@time mcpi_turbo()

  0.479439 seconds
  0.479466 seconds
  0.476907 seconds


3.141378784

In [6]:
using CUDA
using Statistics

# Return `x` raised to the second power.
function square(x)
    return x^2
end

# Monte Carlo estimate of π computed from a `CUDA.rand(T, 2, L)` array.
#
# Each column of the 2×L array is one point; `sum(square, ...; dims=1)` gives
# the squared radius per column, and `count(≤(1), ...)` counts the columns that
# fall inside the unit circle. Returns 4 * count / L.
# `T` is the element type of the random array (Float32 by default).
function mcpi_cuda_count(L=10^8, t::Type{T}=Float32) where T
    points = CUDA.rand(T, 2, L)
    radii2 = sum(square, points; dims=1)
    inside = count(≤(1), radii2)
    return 4inside / L
end

# Time, three times over, the mean of ten mcpi_cuda_count estimates made with
# the default arguments.
@time mean(mcpi_cuda_count() for _ in 1:10)
@time mean(mcpi_cuda_count() for _ in 1:10)
@time mean(mcpi_cuda_count() for _ in 1:10)

  0.398141 seconds (195.79 k allocations: 14.303 MiB, 4.28% gc time, 28.84% compilation time)
  0.222639 seconds (15.02 k allocations: 1008.812 KiB, 3.64% gc time, 6.94% compilation time)
  0.222909 seconds (15.03 k allocations: 1008.219 KiB, 3.01% gc time, 7.67% compilation time)


3.14164112