In [1]:
@show VERSION;

VERSION = v"1.7.0-DEV.650"


In [2]:
if isfile("Project.toml")
    #using Pkg
    #Pkg.activate(".")
    using Revise
end

In [3]:
using MyUtils
using Base.Threads
using Distributed
using Random

@show nthreads();

nthreads() = 8


In [4]:
?MyUtils.@my_threads

```
@my_threads
```

A macro to parallelize a `for` loop to run with multiple threads.  It splits the iteration space among multiple tasks with `prebody` and `postbody`. It runs those tasks on threads according to a scheduling policy.

Usage:

```julia
@my_threads [schedule] begin
    prebody
end for ...
    ...
end begin
    postbody
end
```


In [5]:
?MyUtils.@my_distributed

```
@my_distributed
```

A distributed memory, parallel for loop of the form:

```julia
@my_distributed begin
    prebody
end [reducer] for var = range
    body
end
```


In [6]:
"""
    mcpi(N, rng=Random.default_rng())

Estimate π by Monte Carlo sampling.

Draw `N` pairs of uniform samples from `rng`, count the pairs `(x, y)` that satisfy
`x^2 + y^2 ≤ 1`, and return `4 * hits / N`.

# Arguments
- `N`: number of sample pairs to draw.
- `rng`: random number generator to sample from. Defaults to `Random.default_rng()`;
  pass a seeded generator (e.g. `MersenneTwister(seed)`) for a reproducible estimate.

# Returns
The estimate as a floating-point number. `N == 0` yields `NaN` (the result of `0/0`).
"""
function mcpi(N, rng::AbstractRNG=Random.default_rng())
    c = 0
    for _ in 1:N
        # ifelse evaluates both branches, which is fine for the constants 1 and 0.
        c += ifelse(rand(rng)^2 + rand(rng)^2 ≤ 1, 1, 0)
    end
    return 4c / N
end

# Time the serial estimator three times; the first run carries one-time overhead
# (it is the only one reporting allocations), the later runs show steady state.
@time mcpi(10^8)
@time mcpi(10^8)
@time mcpi(10^8)

  0.343664 seconds (12 allocations: 19.656 KiB)
  0.342010 seconds
  0.321657 seconds


3.14157208

In [7]:
# Multithreaded Monte Carlo estimate of π built on `MyUtils.@my_threads`.
# A sample pair counts as a hit when rand(rng)^2 + rand(rng)^2 ≤ 1; the result is
# 4 * hits / N, where hits is accumulated in the atomic counter `a`.
function mcpi_my_threads(N)
    # Shared hit counter; it is only modified through atomic_add! in the postbody.
    a = Atomic{Int}(0)
    MyUtils.@my_threads begin
        # prebody: fetch an RNG handle and start a local hit counter `c`.
        # NOTE(review): presumably this runs once per task, making `rng` and `c`
        # task-local — confirm against the macro implementation.
        rng = Random.default_rng()
        c = 0
    end for i in 1:N
        c += ifelse(rand(rng)^2 + rand(rng)^2 ≤ 1, 1, 0)
    end begin
        # postbody: fold the local count into the shared atomic total.
        atomic_add!(a, c)
    end
    # a[] reads the accumulated hit count out of the atomic.
    4a[]/N
end

# Time the threaded estimator three times; the first run includes compilation
# (reported as "compilation time" in its output), the later runs do not.
@time mcpi_my_threads(10^8)
@time mcpi_my_threads(10^8)
@time mcpi_my_threads(10^8)

  0.146481 seconds (54.86 k allocations: 3.421 MiB, 38.05% compilation time)
  0.088288 seconds (53 allocations: 4.594 KiB)
  0.089784 seconds (51 allocations: 4.391 KiB)


3.14149444

In [8]:
# Start from a clean slate: remove every process listed after the first entry of
# procs(), then launch 8 new worker processes and show their ids.
rmprocs(procs()[2:end])
addprocs(8)
@show workers()

# Load the packages on every process so code run on the workers can use them.
@everywhere begin
    # NOTE(review): project activation is disabled here; this presumably relies on
    # the workers already being able to resolve MyUtils — confirm.
    #if isfile("Project.toml")
    #    using Pkg
    #    Pkg.activate(".")
    #end
    using MyUtils
    using Random
end

workers() = [2, 3, 4, 5, 6, 7, 8, 9]


In [9]:
# Multi-process Monte Carlo estimate of π built on `MyUtils.@my_distributed`.
# Each iteration evaluates to 1 for a hit (rand(rng)^2 + rand(rng)^2 ≤ 1) and 0
# otherwise; the macro's result is bound to `c` and the estimate is 4c / N.
function mcpi_my_distributed(N)
    c = MyUtils.@my_distributed begin
        # prebody: fetch the RNG handle used by the loop body.
        # NOTE(review): presumably evaluated on each worker process — confirm against
        # the macro implementation.
        rng = Random.default_rng()
    end (+) for i in 1:N
        # NOTE(review): the value of this expression is presumably what the `(+)`
        # reducer combines across iterations — confirm against the macro.
        ifelse(rand(rng)^2 + rand(rng)^2 ≤ 1, 1, 0)
    end
    4c/N
end

# Time the distributed estimator three times; the first run includes compilation
# (reported as "compilation time" in its output), the later runs do not.
@time mcpi_my_distributed(10^8)
@time mcpi_my_distributed(10^8)
@time mcpi_my_distributed(10^8)

  1.724397 seconds (950.04 k allocations: 56.819 MiB, 1.66% gc time, 18.32% compilation time)
  0.092918 seconds (659 allocations: 28.953 KiB)
  0.088074 seconds (655 allocations: 28.109 KiB)


3.14167032

In [10]:
using BenchmarkTools

# Benchmark the serial, threaded and distributed estimators on the same
# 10^8-sample workload.
@btime mcpi(10^8)
@btime mcpi_my_threads(10^8)
@btime mcpi_my_distributed(10^8)

  315.588 ms (0 allocations: 0 bytes)
  88.862 ms (49 allocations: 4.27 KiB)
  86.232 ms (647 allocations: 27.86 KiB)


3.14154528

In [11]:
# Remove every process listed after the first entry of procs(), then show
# workers() to confirm what remains.
rmprocs(procs()[2:end])
@show workers();

workers() = [1]
