https://discourse.julialang.org/t/comparing-numba-and-julia-for-a-complex-matrix-computation/63703/11

In [1]:
# Print the Julia version and platform details so the timings below can be put in context.
versioninfo()

Julia Version 1.6.1
Commit 6aaedecc44 (2021-04-23 05:59 UTC)
Platform Info:
  OS: Windows (x86_64-w64-mingw32)
  CPU: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-11.0.1 (ORCJIT, skylake)
Environment:
  JULIA_DEPOT_PATH = D:\.julia
  JULIA_NUM_THREADS = 12
  JULIA_PYTHONCALL_EXE = D:\.julia\conda\3\python.exe


In [2]:
# resized original version

# Number of rows and columns of the lookup window (`height_mat` / `angle_mat`).
height, width = 187, 746;
#org_sized = rand(Float32, (2001, 2001)) * 60;
org_sized = rand(Float32, (401, 401)) * 60;
#org_sized = rand(Float32, (101, 101)) * 60;
# Output buffer, same shape as `org_sized`; the compute functions below write into it.
shadow_time_hrs = zeros(Float32, size(org_sized));
height_mat = rand(Float32, (height, width)) * 100; # originally values getting larger from (0, width//2) to the outside with the distance squared

# Segment index for every window cell: twice the angle in degrees (0…180), rounded, then
# shifted by one so the values are valid 1-based indices in 1:361.
angle_mat = round.(Int32, 2 .* atand.(0:height-1, (0:width-1)' .- (width/2 - 1))) .+ 1;

# `org_sized` embedded in a zero-filled array that is `height` rows and `width` columns
# larger, so that a full window can be taken starting at any cell of `org_sized`.
enlarged_size = zeros(eltype(org_sized), size(org_sized, 1) + height, size(org_sized, 2) + width);
# Integer division keeps the column offset valid for odd widths as well
# (`Int(width/2)` throws an InexactError when `width` is odd).
enlarged_size[1:size(org_sized, 1), range(width ÷ 2, length=size(org_sized, 2))] = org_sized;

# NOTE: the Float64 literal `0.01` promotes `weights` to Float64 here; the revised cells
# convert it back with `Float32.(weights)`.
weights = rand(Float32, 361)/ 10 .+ 0.01;  # weights originally larger in the middle

"""
    computeSumHours(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x)

Return the sum of the weights of all angle segments that are overshadowed for the cell
`org_sized[y, x]`.

A window of `size(height_mat)` cells is taken from `enlarged_size`, starting at `(y, x)`.
A window cell `(y2, x2)` is overshadowed when its elevation minus `org_sized[y, x]`
exceeds `height_mat[y2, x2]`. Each overshadowed cell marks the segment
`angle_mat[y2, x2]`; every marked segment contributes `weights[segment]` exactly once.

# Arguments
- `org_sized`: matrix holding the reference elevation at `[y, x]`.
- `enlarged_size`: matrix the window is taken from; must contain rows
  `y:y+height-1` and columns `x:x+width-1`.
- `angle_mat`: segment index per window cell; entries must be valid indices into `weights`.
- `height_mat`: per-cell threshold; its size defines the window size.
- `weights`: weight per segment.
- `y`, `x`: row and column of the reference cell.

# Throws
- `BoundsError` if the window does not fit into `enlarged_size` or a segment index is
  outside `weights`.
"""
function computeSumHours(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x)
    height, width = size(height_mat)
    # Take exactly height × width cells. The former `y:y+height` range was one row and one
    # column too large and failed when the window ended at the array border.
    short_elevations = @view enlarged_size[y:(y + height - 1), x:(x + width - 1)]
    # One accumulator slot per weight instead of a hard-coded 361.
    shadowed_segments = zeros(eltype(weights), length(weights))
    reference = org_sized[y, x]

    # Column-major traversal: the row index runs in the inner loop.
    for x2 in 1:width, y2 in 1:height
        if short_elevations[y2, x2] - reference > height_mat[y2, x2]
            angle = angle_mat[y2, x2]
            # A segment counts once, no matter how many cells mark it.
            if shadowed_segments[angle] == 0.0
                shadowed_segments[angle] = weights[angle]
            end
        end
    end

    return sum(shadowed_segments)
end

"""
    computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights)

Evaluate `computeSumHours` for every cell of `org_sized` except those in its last row and
last column, write the values into `shadow_time_hrs`, and return `shadow_time_hrs`.

The value computed for the reference cell `(y, x)` is stored at `shadow_time_hrs[x, y]`.

NOTE(review): the output index is transposed relative to `org_sized[y, x]`; this only
stays in bounds when both dimensions are compatible (e.g. square input) — confirm that
the transposed layout is intended.
"""
function computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights)
    ncols = size(org_sized, 2)
    nrows = size(org_sized, 1)
    for col in 1:(ncols - 1), row in 1:(nrows - 1)
        shadow_time_hrs[col, row] = computeSumHours(
            org_sized, enlarged_size, angle_mat, height_mat, weights, row, col
        )
    end
    return shadow_time_hrs
end

# Time three consecutive runs of the original version.
@time result = computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights)
@time result = computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights)
@time result = computeAllLines(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights)
# `computeAllLines` returns `shadow_time_hrs` itself, and the later versions overwrite that
# same array in place. Keep an independent snapshot so `r0` really holds this version's
# output when the results are compared.
r0 = copy(result);

 27.782046 seconds (358.24 k allocations: 480.214 MiB, 0.23% gc time, 0.23% compilation time)
 27.535812 seconds (160.00 k allocations: 468.750 MiB, 0.18% gc time)
 27.422671 seconds (160.00 k allocations: 468.750 MiB, 0.17% gc time)


In [3]:
# revised version

# Convert the weights to Float32 so the accumulator below and the sums stay in Float32.
weights = Float32.(weights)
# Reusable accumulator with one slot per weight (sized from `weights`, not hard-coded).
shadowed_segments = zeros(eltype(weights), length(weights))

"""
    computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x,
                     shadowed_segments)

Return the sum of the weights of all angle segments that are overshadowed for the cell
`org_sized[y, x]`, using `shadowed_segments` as a caller-provided scratch buffer.

`shadowed_segments` is reset to zero and then receives `weights[s]` for every segment `s`
marked by an overshadowed window cell; on return it holds the per-segment contributions.
A window cell `(y2, x2)` is overshadowed when its elevation in `enlarged_size` minus
`org_sized[y, x]` exceeds `height_mat[y2, x2]`.

# Arguments
- `enlarged_size`: must contain rows `y:y+height-1` and columns `x:x+width-1`, where
  `(height, width) = size(height_mat)`.
- `angle_mat`: segment index per window cell. The loop runs under `@inbounds`, so the
  entries must be valid indices into both `weights` and `shadowed_segments`, and
  `angle_mat` must cover the whole window; this is not checked.

# Throws
- `BoundsError` if the window does not fit into `enlarged_size` or `(y, x)` is outside
  `org_sized`.
"""
function computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x, shadowed_segments)
    height, width = size(height_mat)
    # Take exactly height × width cells. The former `y:y+height` range was one row and one
    # column too large and failed when the window ended at the array border.
    short_elevations = @view enlarged_size[y:(y + height - 1), x:(x + width - 1)]
    fill!(shadowed_segments, 0)
    # Loop-invariant reference elevation, read once (and bounds-checked) up front.
    reference = org_sized[y, x]

    @inbounds for x2 in 1:width
        for y2 in 1:height
            if short_elevations[y2, x2] - reference > height_mat[y2, x2]
                angle = angle_mat[y2, x2]
                # A segment counts once, no matter how many cells mark it.
                if shadowed_segments[angle] == 0.0
                    shadowed_segments[angle] = weights[angle]
                end
            end
        end
    end

    return sum(shadowed_segments)
end

"""
    computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs,
                     weights, shadowed_segments)

Run `computeSumHours!` for every cell of `org_sized` except those in its last row and last
column, reusing `shadowed_segments` as scratch space for each call. The values are written
into `shadow_time_hrs`, which is also the return value.

The value computed for the reference cell `(y, x)` is stored at `shadow_time_hrs[x, y]`.

NOTE(review): the output index is transposed relative to `org_sized[y, x]` — confirm that
this layout is intended; it cannot work for arbitrary non-square inputs.
"""
function computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
    last_col = size(org_sized, 2) - 1
    for col in 1:last_col
        last_row = size(org_sized, 1) - 1
        for row in 1:last_row
            hours = computeSumHours!(
                org_sized, enlarged_size, angle_mat, height_mat, weights, row, col,
                shadowed_segments,
            )
            shadow_time_hrs[col, row] = hours
        end
    end
    return shadow_time_hrs
end

# Time three consecutive runs of the revised version.
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
# `computeAllLines!` returns `shadow_time_hrs` itself, which every other run overwrites in
# place. Keep an independent snapshot so `r1` really holds this version's output.
r1 = copy(result);

 20.327024 seconds (185.93 k allocations: 11.005 MiB, 0.26% compilation time)
 20.414127 seconds
 20.300706 seconds


In [4]:
# revised version 2

# Convert the weights to Float32 so the accumulator below and the sums stay in Float32.
weights = Float32.(weights)
# Reusable accumulator with one slot per weight (sized from `weights`, not hard-coded).
shadowed_segments = zeros(eltype(weights), length(weights))

"""
    computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x,
                     shadowed_segments)

Return the sum of the weights of all angle segments that are overshadowed for the cell
`org_sized[y, x]`. The window columns are processed with `Threads.@threads`.

`shadowed_segments` is a caller-provided scratch buffer: it is reset to zero and then
receives `weights[s]` for every segment `s` marked by an overshadowed window cell. A
window cell `(y2, x2)` is overshadowed when its elevation in `enlarged_size` minus
`org_sized[y, x]` exceeds `height_mat[y2, x2]`.

# Arguments
- `enlarged_size`: must contain rows `y:y+height-1` and columns `x:x+width-1`, where
  `(height, width) = size(height_mat)`.
- `angle_mat`: segment index per window cell. Array accesses in the loop run under
  `@inbounds`, so the entries must be valid indices into both `weights` and
  `shadowed_segments`; this is not checked.

# Throws
- `BoundsError` if the window does not fit into `enlarged_size` or `(y, x)` is outside
  `org_sized`.
"""
function computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x, shadowed_segments)
    height, width = size(height_mat)
    # Take exactly height × width cells. The former `y:y+height` range was one row and one
    # column too large and failed when the window ended at the array border.
    short_elevations = @view enlarged_size[y:(y + height - 1), x:(x + width - 1)]
    fill!(shadowed_segments, 0)
    # Loop-invariant reference elevation, read once (and bounds-checked) up front.
    reference = org_sized[y, x]

    # NOTE(review): all threads read and write the shared `shadowed_segments` without
    # synchronization. Every writer of slot `angle` stores the same value
    # (`weights[angle]`), so the sum does not depend on the interleaving, but this is
    # still formally a data race — consider threading the caller's loop with per-task
    # buffers instead.
    Threads.@threads for x2 in 1:width
        for y2 in 1:height
            @inbounds if short_elevations[y2, x2] - reference > height_mat[y2, x2]
                angle = angle_mat[y2, x2]
                if shadowed_segments[angle] == 0.0
                    shadowed_segments[angle] = weights[angle]
                end
            end
        end
    end

    return sum(shadowed_segments)
end

"""
    computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs,
                     weights, shadowed_segments)

Call `computeSumHours!` once per cell of `org_sized`, skipping its last row and last
column, and collect the values in `shadow_time_hrs`, which is returned.
`shadowed_segments` is handed to every call as its scratch buffer.

The value computed for the reference cell `(y, x)` is stored at `shadow_time_hrs[x, y]`.

NOTE(review): the output index is transposed relative to `org_sized[y, x]` — confirm that
this layout is intended; it cannot work for arbitrary non-square inputs.
"""
function computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
    rows = 1:(size(org_sized, 1) - 1)
    cols = 1:(size(org_sized, 2) - 1)
    # `product` varies its first iterator fastest: rows run inside, columns outside.
    for (row, col) in Iterators.product(rows, cols)
        shadow_time_hrs[col, row] = computeSumHours!(
            org_sized, enlarged_size, angle_mat, height_mat, weights, row, col,
            shadowed_segments,
        )
    end
    return shadow_time_hrs
end

# Show how many threads are available to the `Threads.@threads` loop.
@show Threads.nthreads()
# Time three consecutive runs of the threaded version.
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
# `computeAllLines!` returns `shadow_time_hrs` itself, which every other run overwrites in
# place. Keep an independent snapshot so `r2` really holds this version's output.
r2 = copy(result);

Threads.nthreads() = 12
  5.735022 seconds (10.10 M allocations: 996.348 MiB, 4.01% gc time, 0.75% compilation time)
  5.994589 seconds (9.96 M allocations: 987.436 MiB, 3.52% gc time)
  5.925293 seconds (9.95 M allocations: 987.364 MiB, 3.75% gc time)


In [5]:
# revised version 3

# Convert the weights to Float32 so the accumulator below and the sums stay in Float32.
weights = Float32.(weights)
# Reusable accumulator with one slot per weight (sized from `weights`, not hard-coded).
shadowed_segments = zeros(eltype(weights), length(weights))

"""
    computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x,
                     shadowed_segments)

Return the sum of the weights of all angle segments that are overshadowed for the cell
`org_sized[y, x]`. The window columns are processed with `Threads.@threads`, and the
loop over the rows of each column is annotated with `@simd`.

`shadowed_segments` is a caller-provided scratch buffer: it is reset to zero and then
receives `weights[s]` for every segment `s` marked by an overshadowed window cell. A
window cell `(y2, x2)` is overshadowed when its elevation in `enlarged_size` minus
`org_sized[y, x]` exceeds `height_mat[y2, x2]`.

# Arguments
- `enlarged_size`: must contain rows `y:y+height-1` and columns `x:x+width-1`, where
  `(height, width) = size(height_mat)`.
- `angle_mat`: segment index per window cell. The inner loop runs under `@inbounds`, so
  the entries must be valid indices into both `weights` and `shadowed_segments`; this is
  not checked.

# Throws
- `BoundsError` if the window does not fit into `enlarged_size` or `(y, x)` is outside
  `org_sized`.
"""
function computeSumHours!(org_sized, enlarged_size, angle_mat, height_mat, weights, y, x, shadowed_segments)
    height, width = size(height_mat)
    # Take exactly height × width cells. The former `y:y+height` range was one row and one
    # column too large and failed when the window ended at the array border.
    short_elevations = @view enlarged_size[y:(y + height - 1), x:(x + width - 1)]
    fill!(shadowed_segments, 0)
    # Loop-invariant reference elevation, read once (and bounds-checked) up front.
    reference = org_sized[y, x]

    # NOTE(review): all threads read and write the shared `shadowed_segments` without
    # synchronization, and `@simd` additionally permits reordering of the row iterations.
    # Every writer of slot `angle` stores the same value (`weights[angle]`), so the sum
    # does not depend on the order, but the shared access is still formally a data race.
    Threads.@threads for x2 in 1:width
        @inbounds @simd for y2 in 1:height
            if short_elevations[y2, x2] - reference > height_mat[y2, x2]
                angle = angle_mat[y2, x2]
                if shadowed_segments[angle] == 0.0
                    shadowed_segments[angle] = weights[angle]
                end
            end
        end
    end

    return sum(shadowed_segments)
end

"""
    computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs,
                     weights, shadowed_segments)

Run `computeSumHours!` for every cell of `org_sized` except those in its last row and last
column, write the values into `shadow_time_hrs`, and return `shadow_time_hrs`.

`shadowed_segments` is the scratch buffer passed on to every `computeSumHours!` call. It
is used as supplied by the caller; it is no longer shadowed by a freshly allocated local
array, which silently ignored the argument.

The value computed for the reference cell `(y, x)` is stored at `shadow_time_hrs[x, y]`.

NOTE(review): the output index is transposed relative to `org_sized[y, x]` — confirm that
this layout is intended; it cannot work for arbitrary non-square inputs.
"""
function computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
    for x in 1:(size(org_sized, 2) - 1)
        for y in 1:(size(org_sized, 1) - 1)
            shadow_time_hrs[x, y] = computeSumHours!(
                org_sized, enlarged_size, angle_mat, height_mat, weights, y, x,
                shadowed_segments,
            )
        end
    end
    return shadow_time_hrs
end

# Show how many threads are available to the `Threads.@threads` loop.
@show Threads.nthreads()
# Time three consecutive runs of the threaded + `@simd` version.
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
@time result = computeAllLines!(org_sized, enlarged_size, angle_mat, height_mat, shadow_time_hrs, weights, shadowed_segments)
# `computeAllLines!` returns `shadow_time_hrs` itself, so re-running any other cell would
# overwrite it in place. Keep an independent snapshot so `r3` holds this version's output.
r3 = copy(result);

Threads.nthreads() = 12
  6.086136 seconds (10.12 M allocations: 997.237 MiB, 3.73% gc time, 0.65% compilation time)
  6.001222 seconds (9.95 M allocations: 987.398 MiB, 3.53% gc time)
  6.127604 seconds (9.96 M allocations: 987.518 MiB, 3.43% gc time)


In [6]:
# Cross-check that consecutive versions produced approximately equal results.
# NOTE(review): this is only meaningful if r0…r3 are independent arrays rather than
# aliases of one shared output buffer — confirm where they are assigned.
isapprox(r0, r1) && isapprox(r1, r2) && isapprox(r2, r3)

true