# MyUtils.jl

* Copyright (c) 2021 Gen Kuroki
* License: https://opensource.org/licenses/MIT
* Repository: https://github.com/genkuroki/MyUtils.jl
* nbviewer: https://nbviewer.jupyter.org/github/genkuroki/MyUtils.jl/blob/main/MyUtils.ipynb

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#@my_threads-and-@my_distributed" data-toc-modified-id="@my_threads-and-@my_distributed-1"><span class="toc-item-num">1&nbsp;&nbsp;</span><code>@my_threads</code> and <code>@my_distributed</code></a></span></li><li><span><a href="#printf-functions" data-toc-modified-id="printf-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>printf functions</a></span></li><li><span><a href="#RemoveOneFrom(A::AbstractVector,-k::Integer)-<:-AbstractVector" data-toc-modified-id="RemoveOneFrom(A::AbstractVector,-k::Integer)-<:-AbstractVector-3"><span class="toc-item-num">3&nbsp;&nbsp;</span><code>RemoveOneFrom(A::AbstractVector, k::Integer) &lt;: AbstractVector</code></a></span></li></ul></div>

In [1]:
@show VERSION
isfile("Project.toml") && using Revise
using MyUtils

VERSION = v"1.7.0-DEV.859"


## `@my_threads` and `@my_distributed`

In [2]:
using Base.Threads
using Distributed
using Random

@show nthreads();

nthreads() = 8


In [3]:
?MyUtils.@my_threads

```
@my_threads
```

A macro to parallelize a `for` loop to run with multiple threads. It splits the iteration space among multiple tasks, each of which runs `prebody` before and `postbody` after its share of the iterations. It runs those tasks on threads according to a scheduling policy.

Usage:

```julia
@my_threads [schedule] begin
    prebody
end for ...
    ...
end begin
    postbody
end
```


In [4]:
?MyUtils.@my_distributed

```
@my_distributed
```

A distributed memory, parallel for loop of the form:

```julia
@my_distributed begin
    prebody
end [reducer] for var = range
    body
end
```


In [5]:
"""
    mcpi(N)

Estimate π by Monte Carlo sampling on a single thread.

Draws `N` points from the unit square using the default random number generator
and returns four times the fraction of points that satisfy `x^2 + y^2 ≤ 1`.
"""
function mcpi(N)
    rng = Random.default_rng()
    hits = 0
    for _ in 1:N
        # Draw the two coordinates in the same order as a single combined expression would.
        x = rand(rng)
        y = rand(rng)
        hits += ifelse(x^2 + y^2 ≤ 1, 1, 0)
    end
    return 4 * hits / N
end

@time mcpi(10^8)
@time mcpi(10^8)
@time mcpi(10^8)

  0.383953 seconds (12 allocations: 19.656 KiB)
  0.377749 seconds
  0.329752 seconds


3.1414724

In [6]:
"""
    mcpi_my_threads(N)

Estimate π by Monte Carlo sampling, with the loop over `1:N` parallelized by
`MyUtils.@my_threads`.

The prebody sets up a random number generator and a private hit counter, the loop
body counts samples satisfying `x^2 + y^2 ≤ 1`, and the postbody adds the private
counter to a shared atomic total. Returns four times the total divided by `N`.
"""
function mcpi_my_threads(N)
    total = Atomic{Int}(0)
    MyUtils.@my_threads begin
        # prebody: generator and counter used by the loop body below
        rng = Random.default_rng()
        hits = 0
    end for i in 1:N
        hits += ifelse(rand(rng)^2 + rand(rng)^2 ≤ 1, 1, 0)
    end begin
        # postbody: fold the private counter into the shared total atomically
        atomic_add!(total, hits)
    end
    return 4 * total[] / N
end

@time mcpi_my_threads(10^8)
@time mcpi_my_threads(10^8)
@time mcpi_my_threads(10^8)

  0.169824 seconds (56.62 k allocations: 3.487 MiB, 32.67% compilation time)
  0.096930 seconds (56 allocations: 4.688 KiB)
  0.100084 seconds (59 allocations: 4.875 KiB)


3.1417468

In [7]:
rmprocs(procs()[2:end])  # remove every process except the first one listed by procs()
addprocs(nthreads())  # add as many worker processes as there are threads
@show workers()

@everywhere begin  # load the packages on all processes, including the new workers
    using MyUtils
    using Random
end

workers() = [2, 3, 4, 5, 6, 7, 8, 9]


In [8]:
"""
    mcpi_my_distributed(N)

Estimate π by Monte Carlo sampling, with the loop over `1:N` run through
`MyUtils.@my_distributed` and its per-iteration results combined by `+`.

The prebody sets up a random number generator; each iteration yields `1` when the
sample satisfies `x^2 + y^2 ≤ 1` and `0` otherwise. Returns four times the reduced
sum divided by `N`.
"""
function mcpi_my_distributed(N)
    hits = MyUtils.@my_distributed begin
        # prebody: generator used by the loop body below
        rng = Random.default_rng()
    end (+) for i in 1:N
        ifelse(rand(rng)^2 + rand(rng)^2 ≤ 1, 1, 0)
    end
    return 4 * hits / N
end

@time mcpi_my_distributed(10^8)
@time mcpi_my_distributed(10^8)
@time mcpi_my_distributed(10^8)

  1.802507 seconds (975.23 k allocations: 57.828 MiB, 4.02% gc time, 16.58% compilation time)
  0.100317 seconds (661 allocations: 29.016 KiB)
  0.102805 seconds (652 allocations: 28.078 KiB)


3.1416204

In [9]:
using BenchmarkTools

@btime mcpi(10^8)
@btime mcpi_my_threads(10^8)
@btime mcpi_my_distributed(10^8)

  324.212 ms (0 allocations: 0 bytes)
  96.287 ms (49 allocations: 4.27 KiB)
  92.173 ms (649 allocations: 27.95 KiB)


3.14164812

In [10]:
rmprocs(procs()[2:end])
@show workers();

workers() = [1]


## printf functions

In [11]:
for k in 0:10
    printf("%.$(k)f\n", π)
end

3
3.1
3.14
3.142
3.1416
3.14159
3.141593
3.1415927
3.14159265
3.141592654
3.1415926536


In [12]:
[sprintf("%.$(k)f", π) for k in 0:10]

11-element Vector{String}:
 "3"
 "3.1"
 "3.14"
 "3.142"
 "3.1416"
 "3.14159"
 "3.141593"
 "3.1415927"
 "3.14159265"
 "3.141592654"
 "3.1415926536"

## `RemoveOneFrom(A::AbstractVector, k::Integer) <: AbstractVector`

In [13]:
@doc MyUtils.RemoveOneFrom

```
RemoveOneFrom(A::AbstractVector, k::Integer) <: AbstractVector
```

Equivalent to `A[[1:k-1; k+1:end]]`, but without unnecessary memory allocation.

Example

```julia
A = [1, 2, 3, 4, 5]
R = RemoveOneFrom(A, 3)
@show collect(R)
# -> collect(R) = [1, 2, 4, 5]
R[3] = 99
@show R
# -> R = [1, 2, 99, 5]
@show A
# -> A = [1, 2, 3, 99, 5]
```


In [14]:
A = [1, 2, 3, 4, 5]
R = MyUtils.RemoveOneFrom(A, 3)
@show collect(R)
R[3] = 99
@show R
@show A;

collect(R) = [1, 2, 4, 5]
R = [1, 2, 99, 5]
A = [1, 2, 3, 99, 5]


In [15]:
A = collect(1:10^6)
k = 5*10^5  # index of the element to leave out
@time A[[1:k-1; k+1:end]]  # plain indexing: builds the index vector and copies the selected elements
@time A[[1:k-1; k+1:end]]
@time A[[1:k-1; k+1:end]]
println()
@time @view A[[1:k-1; k+1:end]]  # view: no copy of the data, but the index vector is still built
@time @view A[[1:k-1; k+1:end]]
@time @view A[[1:k-1; k+1:end]]
println()
@time MyUtils.RemoveOneFrom(A, k)  # wrapper from MyUtils, timed for comparison with the two forms above
@time MyUtils.RemoveOneFrom(A, k)
@time MyUtils.RemoveOneFrom(A, k);

  0.029192 seconds (302 allocations: 15.275 MiB, 28.36% gc time, 42.99% compilation time)
  0.007313 seconds (9 allocations: 15.259 MiB)
  0.014945 seconds (9 allocations: 15.259 MiB, 52.97% gc time)

  0.022089 seconds (22.99 k allocations: 8.949 MiB, 83.95% compilation time)
  0.003430 seconds (8 allocations: 7.630 MiB)
  0.003610 seconds (8 allocations: 7.630 MiB)

  0.000003 seconds (1 allocation: 32 bytes)
  0.000005 seconds (1 allocation: 32 bytes)
  0.000004 seconds (1 allocation: 32 bytes)
