# Parallelize code using native julia methods

This notebook presents an example of a typical parallel problem (counting elements in a big dataset)
and solves it using only native Julia code. The code involves two steps:

- 1) Split data across processes, make independent computations on each process and get partial results
- 2) Join partial results

This notebook will focus on the use of the functions pmap, @spawn, fetch and remotecall.

Some related material:

- http://docs.julialang.org/en/release-0.5/manual/parallel-computing/
- https://github.com/JuliaLang/julia/blob/master/examples/wordcount.jl
- https://blog.ajdecon.org/parallel-word-count-with-julia-an-interesting/



In [1]:
# Start 4 worker processes; the returned array lists their process ids.
addprocs(4) 

4-element Array{Int64,1}:
 2
 3
 4
 5

In [2]:
workers()

4-element Array{Int64,1}:
 2
 3
 4
 5

In [3]:
# Test data: 10^8 random integers drawn from 1:10 (the trailing `;` suppresses the output).
big_array = rand(1:10, 10^8);

In [4]:
"""
    count_elements(array::Array{Int64})

Count how many times each value occurs in `array`.

Return a `Dict{Int64,Int64}` that maps every distinct value to its number of
occurrences. An empty `array` yields an empty dictionary.
"""
function count_elements(array::Array{Int64})
    counts = Dict{Int64,Int64}()
    for value in array
        # Explicit branch: increment a value seen before, otherwise start it at 1.
        if haskey(counts, value)
            counts[value] += 1
        else
            counts[value] = 1
        end
    end
    return counts
end

count_elements (generic function with 1 method)

In [10]:
# Time the single-process baseline and keep its result for comparison with the parallel run.
@time result_sequential = count_elements(big_array);

  4.349756 seconds (3.19 k allocations: 137.636 KB)


#### Faster way to create counts

In [None]:
"""
    count_elements2(array::Array{Int64})

Count how many times each value occurs in `array`, using `get` with a default
of 0 instead of an explicit membership test.

Return a `Dict{Int64,Int64}` that maps every distinct value to its number of
occurrences. An empty `array` yields an empty dictionary.
"""
function count_elements2(array::Array{Int64})
    counts = Dict{Int64,Int64}()
    for value in array
        # `get` supplies 0 for a value not seen yet, so no branch is needed.
        counts[value] = get(counts, value, 0) + 1
    end
    return counts
end

In [None]:
# Time the `get`-based variant on the same data.
@time count_elements2(big_array);

### pmap function

Now we will build a custom reducer to aggregate the partial results. Then we will split the data
into chunks of similar size and distribute the workload across the different processes.

In [6]:
"""
    count_reduce(array_of_count_dicts)

Reducer: merge a collection of partial count dictionaries into a single one.

For every key, the result holds the sum of that key's counts over all the
dictionaries in `array_of_count_dicts`. Return a `Dict{Int64,Int64}`; an empty
collection yields an empty dictionary.
"""
function count_reduce(array_of_count_dicts)
    counts_combined = Dict{Int64,Int64}()

    for partial_counts in array_of_count_dicts
        # Iterate over key/value pairs so each entry is read only once.
        for (key, count) in partial_counts
            counts_combined[key] = get(counts_combined, key, 0) + count
        end
    end
    return counts_combined
end

count_reduce (generic function with 1 method)

In [7]:
# This cell fails on purpose: the workers do not have the `count_elements` function,
# because it has not been defined on them yet (see the UndefVarError below).
@time begin
    n = length(big_array)
    n_processors = length(workers())
    # Integer chunk boundaries from 1 to n + 1. Converting a floating-point range with
    # Int(x) raises an InexactError whenever n is not a multiple of n_processors.
    splits_ind = [div(k * n, n_processors) + 1 for k in 0:n_processors]
    big_array_splits = [big_array[x:y-1]
                        for (x, y) in zip(splits_ind[1:end-1], splits_ind[2:end])]
    res = pmap(count_elements, big_array_splits)
    d = count_reduce(res)
end

LoadError: LoadError: On worker 2:
UndefVarError: #count_elements not defined
 in deserialize_datatype at ./serialize.jl:823
 in handle_deserialize at ./serialize.jl:571
 in deserialize_msg at ./multi.jl:120
 in message_handler_loop at ./multi.jl:1317
 in process_tcp_streams at ./multi.jl:1276
 in #618 at ./event.jl:68
 in #remotecall_fetch#606(::Array{Any,1}, ::Function, ::Function, ::Base.Worker, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./multi.jl:1070
 in remotecall_fetch(::Function, ::Base.Worker, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./multi.jl:1062
 in #remotecall_fetch#609(::Array{Any,1}, ::Function, ::Function, ::Int64, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./multi.jl:1080
 in remotecall_fetch(::Function, ::Int64, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./multi.jl:1080
 in #remotecall_pool#689(::Array{Any,1}, ::Function, ::Function, ::Function, ::WorkerPool, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./workerpool.jl:93
 in remotecall_pool(::Function, ::Function, ::WorkerPool, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./workerpool.jl:91
 in #remotecall_fetch#692(::Array{Any,1}, ::Function, ::Function, ::WorkerPool, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./workerpool.jl:124
 in remotecall_fetch(::Function, ::WorkerPool, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./workerpool.jl:124
 in (::Base.###697#698#700{WorkerPool,#count_elements})(::Array{Any,1}, ::Function, ::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./workerpool.jl:151
 in (::Base.##697#699)(::Array{Int64,1}, ::Vararg{Array{Int64,1},N}) at ./workerpool.jl:151
 in macro expansion at ./asyncmap.jl:63 [inlined]
 in (::Base.##755#757{Base.AsyncCollector,Base.AsyncCollectorState})() at ./task.jl:360

...and 3 other exceptions.

while loading In[7], in expression starting on line 7

In [8]:
# `@everywhere` evaluates the definition on every process, so the workers can run
# `count_elements` too.
#
# count_elements(array) counts how many times each value occurs in `array` and returns
# a Dict{Int64,Int64} mapping each distinct value to its number of occurrences.
@everywhere function count_elements(array::Array{Int64})
    counts = Dict{Int64,Int64}()
    for value in array
        # Explicit branch: increment a value seen before, otherwise start it at 1.
        if haskey(counts, value)
            counts[value] += 1
        else
            counts[value] = 1
        end
    end
    return counts
end



In [12]:
# Split the data into one chunk per worker, count each chunk with `pmap`, then merge
# the partial counts with the reducer.
@time begin
    n = length(big_array)
    n_processors = length(workers())
    # Integer chunk boundaries from 1 to n + 1. Converting a floating-point range with
    # Int(x) raises an InexactError whenever n is not a multiple of n_processors.
    splits_ind = [div(k * n, n_processors) + 1 for k in 0:n_processors]
    big_array_splits = [big_array[x:y-1]
                        for (x, y) in zip(splits_ind[1:end-1], splits_ind[2:end])]
    res = pmap(count_elements, big_array_splits)
    result_paralel = count_reduce(res)
end

  2.302563 seconds (29.68 k allocations: 764.246 MB, 7.40% gc time)


Dict{Int64,Int64} with 10 entries:
  7  => 9995828
  4  => 10003840
  9  => 10001185
  10 => 9996515
  2  => 10000655
  3  => 9998257
  8  => 10002993
  5  => 9999068
  6  => 10003080
  1  => 9998579

In [15]:
# Both computations yield exactly the same result
result_paralel  == result_sequential

true

### @spawn and fetch functions

Using **```@spawn```** and **```fetch```** we can build our own pmap-like function.

- **```@spawn```**: Creates a closure around an expression and runs it on an automatically-chosen process, returning a Future to the result.

- **```fetch```**: Retrieves the result of the computation from the Future object that **```@spawn```** returned.

In [18]:
workers()

4-element Array{Int64,1}:
 2
 3
 4
 5

In [36]:
"""
    parallel_wordcount(big_array, n_processors)

Count the occurrences of each value in `big_array` in parallel.

1. Split `big_array` into `n_processors` contiguous chunks of near-equal length.
2. `@spawn` one `count_elements` call per chunk.
3. `fetch` the partial results and merge them with `count_reduce`.

`n_processors` must be a positive integer; otherwise an `ArgumentError` is thrown.
`count_elements` has to be defined on the workers (e.g. with `@everywhere`).
"""
function parallel_wordcount(big_array, n_processors)
    n_processors > 0 ||
        throw(ArgumentError("n_processors must be positive, got $n_processors"))

    n = length(big_array)
    # Integer chunk boundaries from 1 to n + 1. Converting a floating-point range with
    # Int(x) raises an InexactError whenever n is not a multiple of n_processors.
    splits_ind = [div(k * n, n_processors) + 1 for k in 0:n_processors]
    big_array_splits = [big_array[x:y-1]
                        for (x, y) in zip(splits_ind[1:end-1], splits_ind[2:end])]

    # Launch every chunk before fetching anything, so no chunk waits on another's result.
    partial_res = Future[]
    for subarray in big_array_splits
        push!(partial_res, @spawn count_elements(subarray))
    end
    results = [fetch(r) for r in partial_res]
    return count_reduce(results)
end




parallel_wordcount (generic function with 1 method)

In [46]:
# Time the @spawn/fetch implementation with the data split into 4 chunks.
@time r = parallel_wordcount(big_array, 4);

  2.293853 seconds (965 allocations: 762.993 MB, 4.63% gc time)


In [47]:
r

Dict{Int64,Int64} with 10 entries:
  7  => 9995828
  4  => 10003840
  9  => 10001185
  10 => 9996515
  2  => 10000655
  3  => 9998257
  8  => 10002993
  5  => 9999068
  6  => 10003080
  1  => 9998579

### Let us look at the code piece by piece

In [48]:
workers()

4-element Array{Int64,1}:
 2
 3
 4
 5

In [51]:
# Run `rand(2, 2)` on worker 2. The function comes first, then the worker id, then the
# function's arguments. `remotecall` returns a Future rather than the matrix itself.
rmatrix = remotecall(rand, 2, 2, 2)
print(rmatrix)

Future(2,1,72,Nullable{Any}())

In [52]:
rmatrix

Future(2,1,72,Nullable{Any}())

In [50]:
# Retrieve the matrix computed remotely for the Future `rmatrix`.
fetch(rmatrix)

2×2 Array{Float64,2}:
 0.57138   0.113624
 0.900078  0.490539

In [None]:
# Start one remote `count_elements` call per chunk and keep the returned Futures.
partial_res = Future[]
for subarray in big_array_splits
    # `@spawn` picks the worker itself. `remotecall` would need an explicit worker id,
    # e.g. `remotecall(count_elements, 2, subarray)`.
    push!(partial_res, @spawn count_elements(subarray))
end