In [112]:
using LimberJack
using QuadGK
using NPZ
using FITSIO
using BenchmarkTools
using Trapz

In [113]:
Threads.nthreads()

12

In [114]:
nzs = FITS("../data/DESY1_cls/y1_redshift_distributions_v1.fits")
nz = read(nzs["nz_lens"], "BIN2")
zs = read(nzs["nz_lens"], "Z_MID");
#ls = npzread("../data/DESY1_cls/DESgc_DESwl/cl_DESgc__2_DESwl__3.npz")["ell"];

In [146]:
ls = [Float64(l) for l in 1:1_000];

In [147]:
cosmology = LimberJack.Cosmology(0.3, 0.05, 0.67, 0.96, 0.811,
                                     tk_mode="EisHu",
                                     Pk_mode="Halofit");

In [148]:
tracer = NumberCountsTracer(cosmology, zs, nz, 2.0);

In [149]:
function serial_spec()
    for ell in ls
        quadgk(lk -> Cℓintegrand(cosmology, tracer, tracer, lk, ell),
               log(10^-4), log(10^2), rtol=1E-5)[1]/(ell+0.5)
    end
end

function parallel_spec()
    Threads.@threads for ell in ls
        quadgk(lk -> Cℓintegrand(cosmology, tracer, tracer, lk, ell),
               log(10^-4), log(10^2), rtol=1E-5)[1]/(ell+0.5)
    end
end

parallel_spec (generic function with 1 method)

In [150]:
bench = @benchmarkable serial_spec()
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  751.58 MiB
  allocs estimate:  25504523
  --------------
  minimum time:     1.120 s (6.31% GC)
  median time:      1.161 s (6.62% GC)
  mean time:        1.259 s (6.12% GC)
  maximum time:     1.593 s (5.25% GC)
  --------------
  samples:          4
  evals/sample:     1

In [151]:
bench = @benchmarkable parallel_spec()
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  751.42 MiB
  allocs estimate:  25497095
  --------------
  minimum time:     1.309 s (23.60% GC)
  median time:      1.329 s (23.51% GC)
  mean time:        1.555 s (17.65% GC)
  maximum time:     2.255 s (6.88% GC)
  --------------
  samples:          4
  evals/sample:     1

In [157]:
lks = log(10^-4):0.001:log(10^2);

In [158]:
integrand = zeros(length(lks))
function serial_spec_2(lks, ls)
    Cls = zeros(length(ls))
    for j in 1:length(ls)
        integrand = zeros(length(lks))
        for i in 1:length(lks)
            integrand[i] = Cℓintegrand(cosmology, tracer, tracer, lks[i], ls[j])
        end
        #integral = 0.5 .* (integrand[2:length(lks)].+integrand[1:length(lks)-1]) .- integrand[1]
        integral = trapz(lks, integrand)
        Cls[j] = integral
    end
    return Cls
end

serial_spec_2 (generic function with 1 method)

In [159]:
integrand = zeros(length(lks))
function parallel_spec_2(lks, ls)
    Cls = zeros(length(ls))
    Threads.@threads for j in 1:length(ls)
        integrand = zeros(length(lks))
        for i in 1:length(lks)
            integrand[i] = Cℓintegrand(cosmology, tracer, tracer, lks[i], ls[j])
        end
        #integral = 0.5 .* (integrand[2:length(lks)].+integrand[1:length(lks)-1]) .- integrand[1]
        integral = trapz(lks, integrand)
        Cls[j] = integral
    end
    return Cls
end

parallel_spec_2 (generic function with 1 method)

In [160]:
bench = @benchmarkable serial_spec_2(lks, ls)
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  5.27 GiB
  allocs estimate:  182112566
  --------------
  minimum time:     7.060 s (7.05% GC)
  median time:      7.060 s (7.05% GC)
  mean time:        7.060 s (7.05% GC)
  maximum time:     7.060 s (7.05% GC)
  --------------
  samples:          1
  evals/sample:     1

In [161]:
bench = @benchmarkable parallel_spec_2(lks, ls)
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  5.28 GiB
  allocs estimate:  182170968
  --------------
  minimum time:     8.164 s (22.80% GC)
  median time:      8.164 s (22.80% GC)
  mean time:        8.164 s (22.80% GC)
  maximum time:     8.164 s (22.80% GC)
  --------------
  samples:          1
  evals/sample:     1

In [43]:
function TheoryTest(cosmology, Nuisances, cls_meta, files)
    # OPT: move these loops outside the lkl
    tracers = []
    for tracer in cls_meta.tracers
        tracer_type = tracer[1]
        bin = tracer[2]
        nzs = files[string("nz_", tracer_type, bin)]
        zs = vec(nzs[1:1, :])
        nz = vec(nzs[2:2, :])
        if tracer_type == 1
            bias = string("b", bin)
            tracer = 
            
        elseif tracer_type == 2
            tracer = WeakLensingTracer(cosmology, zs, nz)
        else
            print("Not implemented")
            trancer = nothing
        end
        push!(tracers, tracer)
    end
    #Cls = Vector{Real}[]
    Cls = []
    @inbounds for i in 1:length(cls_meta.pairs)
        pair = cls_meta.pairs[i]
        ids = cls_meta.pairs_ids[i]
        ls = files[string("ls_", pair[1], pair[2], pair[3], pair[4])]
        tracer1 = tracers[ids[1]]
        tracer2 = tracers[ids[2]]
        Cl = zeros(length(ls))
        @inbounds Threads.@threads for i in 1:length(ls)
            Cl[i] = angularCℓ(cosmology, tracer1, tracer2, ls[i]) 
        end
        push!(Cls, Cl)
    end
    return vcat(Cls...)
    
end

TheoryTest (generic function with 1 method)

In [5]:
function th(Ωm, h, s8)
    b0 = 1
    b1 = 1
    b2 = 1
    b3 = 1
    b4 = 1
    
    nuisances = Dict("b0" => b0,
                     "b1" => b1,
                     "b2" => b2,
                     "b3" => b3,
                     "b4" => b4)
    
    cosmology = LimberJack.Cosmology(Ωm, 0.05, h, 0.96, s8,
                                     tk_mode="EisHu",
                                     Pk_mode="Halofit")
    
    theory = TheoryTest(cosmology, nuisances, Cls_meta, files)
    return theory
end;

In [21]:
bench = @benchmarkable th(0.30, 0.60, 0.80)
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  1.06 GiB
  allocs estimate:  55609105
  --------------
  minimum time:     1.546 s (10.52% GC)
  median time:      1.570 s (10.36% GC)
  mean time:        1.583 s (10.57% GC)
  maximum time:     1.634 s (11.26% GC)
  --------------
  samples:          3
  evals/sample:     1

In [12]:
using QuadGK
using Trapz
n = 1_000_000
a = rand(n);
b = rand(n);
f(ai,bi) = acos((ai*bi)/(ai+bi))+asin((ai*bi)/(ai+bi))
f2(ai) = quadgk(x -> exp(-x^2), 0, abs(ai), rtol=1e-8)[1]
f3(ai, ai_p1) = 0.5*(ai_p1+ai)#-0.5*a[1]

f3 (generic function with 2 methods)

In [19]:
function serial_1(a)
    n = length(a)
    c = zeros(n)
    for i in 1:n
        #c[i] = f(a[i], b[i])
        c[i] = f2(a[i])
    end
end

function parallel_1(a)
    n = length(a)
    c = zeros(n)
    Threads.@threads for i in 1:n
        #c[i] = f(a[i], b[i])
        c[i] = f2(a[i])
    end
end 

function serial_2(a)
    n = length(a)
    c = zeros(n)
    Threads.@threads for i in 1:n-1
        #c[i] = f(a[i], b[i])
        c[i] = f3(a[i], a[i+1])
    end
    c = c.+a[1]
end

function parallel_2(a)
    n = length(a)
    c = zeros(n)
    Threads.@threads for i in 1:n-1
        #c[i] = f(a[i], b[i])
        c[i] = f3(a[i], a[i+1])
    end
    c = c.+a[1]
end 

parallel_2 (generic function with 1 method)

In [20]:
bench = @benchmarkable serial_1(a)
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  190.73 MiB
  allocs estimate:  9000002
  --------------
  minimum time:     419.626 ms (2.64% GC)
  median time:      452.478 ms (2.84% GC)
  mean time:        448.346 ms (3.20% GC)
  maximum time:     483.953 ms (4.80% GC)
  --------------
  samples:          12
  evals/sample:     1

In [21]:
bench = @benchmarkable serial_2(a)
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  45.77 MiB
  allocs estimate:  1999556
  --------------
  minimum time:     6.575 ms (0.00% GC)
  median time:      33.797 ms (0.00% GC)
  mean time:        35.881 ms (15.75% GC)
  maximum time:     144.593 ms (0.00% GC)
  --------------
  samples:          136
  evals/sample:     1

In [22]:
bench = @benchmarkable parallel_1(a)
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  190.74 MiB
  allocs estimate:  9000064
  --------------
  minimum time:     211.870 ms (0.00% GC)
  median time:      229.415 ms (0.00% GC)
  mean time:        255.459 ms (17.96% GC)
  maximum time:     366.624 ms (45.60% GC)
  --------------
  samples:          20
  evals/sample:     1

In [23]:
bench = @benchmarkable parallel_2(a)
run(bench)

BenchmarkTools.Trial: 
  memory estimate:  45.77 MiB
  allocs estimate:  1999556
  --------------
  minimum time:     7.045 ms (0.00% GC)
  median time:      33.102 ms (0.00% GC)
  mean time:        32.192 ms (16.19% GC)
  maximum time:     84.447 ms (0.00% GC)
  --------------
  samples:          156
  evals/sample:     1

In [44]:
for i in 1:10
    @time simple_example(1000000)
end

LoadError: UndefVarError: simple_example not defined

In [57]:
for i in 1:10
    @time simple_parallel_example(1000000)
end

LoadError: UndefVarError: simple_parallel_example not defined

In [34]:
foo(x) = rand(x,x,x)

function f_parallel(n)
    Threads.@threads for _ in 1:Threads.nthreads()
        foo(n)
    end
end

function f_serial(n)
    for _ in 1:Threads.nthreads()
        foo(n)
    end
end

f_serial (generic function with 1 method)

In [36]:
@btime f_parallel(100)

  7.866 ms (85 allocations: 91.56 MiB)


In [37]:
@btime f_serial(100)

  12.164 ms (24 allocations: 91.55 MiB)


In [38]:
@btime f_parallel(1)

  38.300 μs (73 allocations: 6.52 KiB)


In [39]:
@btime f_serial(1)

  368.116 ns (12 allocations: 768 bytes)


In [40]:
collect(1:100_000_000)

100000000-element Vector{Int64}:
         1
         2
         3
         4
         5
         6
         7
         8
         9
        10
        11
        12
        13
         ⋮
  99999989
  99999990
  99999991
  99999992
  99999993
  99999994
  99999995
  99999996
  99999997
  99999998
  99999999
 100000000

In [51]:
nset = collect(1:100_000_000);

foo(i) = sqrt((i*(i-1))%(i+1)^2) # some function

function fparallel(nset) 
    nset_perthread = round(Int64,length(nset)/Threads.nthreads())
    Threads.@threads for i in 1:Threads.nthreads()
        jfirst = nset_perthread*(i-1)+1
        jlast = jfirst + nset_perthread - 1
        for j in jfirst:jlast
           foo(j)
        end
    end
end

function fserial(nset) 
    for i in 1:length(nset)
        foo(i)
    end
end

function fparallel2(nset) 
    Threads.@threads for i in 1:length(nset)
        foo(i)
    end
end

fparallel2 (generic function with 1 method)

In [52]:
@btime fserial($nset)

@btime fparallel($nset)

@btime fparallel2($nset)

  809.550 ms (0 allocations: 0 bytes)
  125.218 ms (61 allocations: 5.77 KiB)
  123.177 ms (61 allocations: 5.77 KiB)
