In [1]:
"""
    invcumsum2d!(B::AbstractMatrix, C::AbstractMatrix)

Invert a two-dimensional cumulative sum: for every index `(i, j)` of `C`, store

    C[i, j] - C[i-1, j] - C[i, j-1] + C[i-1, j-1]

in `B[i, j]`, where any term whose index falls outside `C` counts as
`zero(eltype(C))`. Return `B`.
"""
function invcumsum2d!(B::AbstractMatrix, C::AbstractMatrix)
    z = zero(eltype(C))
    for idx in CartesianIndices(C)
        i, j = Tuple(idx)
        # Out-of-range neighbours (first row / first column) fall back to zero.
        above = get(C, (i - 1, j), z)
        left = get(C, (i, j - 1), z)
        corner = get(C, (i - 1, j - 1), z)
        B[i, j] = C[i, j] - above - left + corner
    end
    return B
end

"""
    invcumsum2d(C::AbstractMatrix)

Allocating counterpart of `invcumsum2d!`: create an output with `similar(C)`, fill it
via `invcumsum2d!`, and return it. `C` is not modified.
"""
function invcumsum2d(C::AbstractMatrix)
    out = similar(C)
    return invcumsum2d!(out, C)
end

"""
    invcumsum!(B, C; dims)

Undo a cumulative sum along dimension `dims`: write into `B` the first differences of
`C` along `dims` and return `B`. The first slice along `dims` is copied from `C`
unchanged; every later slice becomes its difference from the preceding slice.

`B` and `C` must have the same number of dimensions; their element types may differ.
Other dimensions are addressed through `UnitRange(axes(C, i))`.

# Keywords
- `dims::Integer`: dimension to difference along (required). If `dims` exceeds the
  number of dimensions, a cumulative sum along it is the identity, so `C` is simply
  copied into `B`.

# Throws
- `ArgumentError` if `dims < 1`.
"""
function invcumsum!(
    B::AbstractArray{<:Any,N}, C::AbstractArray{<:Any,N}; dims::Integer
) where {N}
    dims >= 1 || throw(ArgumentError("dims must be a positive integer, got $dims"))

    if dims > N
        # Differencing along a trailing singleton dimension changes nothing.
        # (Previously this path produced `C .- C`, i.e. all zeros.)
        whole = ntuple(i -> UnitRange(axes(C, i)), Val(N))
        view(B, whole...) .= view(C, whole...)
        return B
    end

    ax = axes(C, dims)
    # An empty axis means there are no elements to write; also avoids taking
    # `first`/`last` of an empty range below.
    isempty(ax) && return B

    # Use the actual first/last index instead of assuming the axis starts at 1.
    lo, hi = first(ax), last(ax)
    head = ntuple(i -> i == dims ? (lo:lo) : UnitRange(axes(C, i)), Val(N))
    prev = ntuple(i -> i == dims ? (lo:(hi - 1)) : UnitRange(axes(C, i)), Val(N))
    next = ntuple(i -> i == dims ? ((lo + 1):hi) : UnitRange(axes(C, i)), Val(N))

    view(B, head...) .= view(C, head...)
    view(B, next...) .= view(C, next...) .- view(C, prev...)
    return B
end

"""
    invcumsum2d_view!(B::AbstractMatrix, C::AbstractMatrix)

Invert a two-dimensional cumulative sum using broadcasts over views instead of an
element loop. With `I, J = axes(C)`, `B` is filled in four pieces:

- the corner `(first(I), first(J))` is copied from `C`;
- the rest of the first column is differenced along the first dimension;
- the rest of the first row is differenced along the second dimension;
- the remaining interior gets `C[i,j] - C[i-1,j] - C[i,j-1] + C[i-1,j-1]`.

Return `B`. An empty `C` leaves `B` untouched.
"""
function invcumsum2d_view!(B::AbstractMatrix, C::AbstractMatrix)
    # Nothing to compute, and `I[begin]` / `J[begin]` below would throw on an empty axis.
    isempty(C) && return B

    I, J = axes(C)
    i0, j0 = I[begin], J[begin]
    Ilo, Ihi = I[begin:end-1], I[begin+1:end]
    Jlo, Jhi = J[begin:end-1], J[begin+1:end]

    B[i0, j0] = C[i0, j0]
    @views begin
        B[Ihi, j0] .= C[Ihi, j0] .- C[Ilo, j0]
        B[i0, Jhi] .= C[i0, Jhi] .- C[i0, Jlo]
        B[Ihi, Jhi] .= C[Ihi, Jhi] .- C[Ilo, Jhi] .- C[Ihi, Jlo] .+ C[Ilo, Jlo]
    end
    return B
end

invcumsum2d_view! (generic function with 1 method)

In [2]:
# Benchmark setup: a 1024×1024 random matrix `A` and one preallocated output buffer
# per in-place call (B1 holds the intermediate result of the two-pass variant).
n = 2^10
A = randn(n, n)
B0 = similar(A)
B1 = similar(A)
B2 = similar(A)
B3 = similar(A)
# C is the 2-D cumulative sum of A (along dims=2, then dims=1); each routine below is
# expected to recover A from it.
C = cumsum(cumsum(A, dims=2), dims=1)
# Round 1: first call of every routine, so the timings include compilation.
@showtime A1 = invcumsum2d!(B0, C)
@showtime A2 = invcumsum!(B2, invcumsum!(B1, C; dims=2), dims=1)
@showtime A3 = invcumsum2d_view!(B3, C)
@showtime A4 = invcumsum2d(C)
println()
# Round 2: same calls again, now measuring already-compiled code.
@showtime A1 = invcumsum2d!(B0, C)
@showtime A2 = invcumsum!(B2, invcumsum!(B1, C; dims=2), dims=1)
@showtime A3 = invcumsum2d_view!(B3, C)
@showtime A4 = invcumsum2d(C)
println()
# Round 3: repeat once more to see run-to-run variation.
@showtime A1 = invcumsum2d!(B0, C)
@showtime A2 = invcumsum!(B2, invcumsum!(B1, C; dims=2), dims=1)
@showtime A3 = invcumsum2d_view!(B3, C)
@showtime A4 = invcumsum2d(C)
# Chained comparison: A1 and A4 come from the same routine and must match exactly;
# the other variants, and the original A, are only required to agree approximately.
A1 == A4 ≈ A2 ≈ A3 ≈ A

A1 = invcumsum2d!(B0, C): 0.040601 seconds (70.67 k allocations: 3.610 MiB, 91.46% compilation time)
A2 = invcumsum!(B2, invcumsum!(B1, C; dims = 2), dims = 1): 0.422290 seconds (2.44 M allocations: 112.591 MiB, 6.55% gc time, 98.82% compilation time)
A3 = invcumsum2d_view!(B3, C): 0.713199 seconds (3.06 M allocations: 137.786 MiB, 27.16% gc time, 99.42% compilation time)
A4 = invcumsum2d(C): 0.006460 seconds (974 allocations: 8.050 MiB, 55.60% compilation time)

A1 = invcumsum2d!(B0, C): 0.002000 seconds
A2 = invcumsum!(B2, invcumsum!(B1, C; dims = 2), dims = 1): 0.003262 seconds
A3 = invcumsum2d_view!(B3, C): 0.002136 seconds
A4 = invcumsum2d(C): 0.003603 seconds (2 allocations: 8.000 MiB)

A1 = invcumsum2d!(B0, C): 0.001917 seconds
A2 = invcumsum!(B2, invcumsum!(B1, C; dims = 2), dims = 1): 0.003122 seconds
A3 = invcumsum2d_view!(B3, C): 0.002040 seconds
A4 = invcumsum2d(C): 0.017222 seconds (2 allocations: 8.000 MiB, 83.66% gc time)


true