Merge pull request #83 from eth-cscs/LoopVectorizationExt
Use extension for LoopVectorization dependency
omlins committed Jan 16, 2024
2 parents 26a72df + 529b88d commit 13a00ff
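With package extensions (Julia >= 1.9), LoopVectorization becomes an optional dependency: the vectorized copy path is only compiled when the user also loads LoopVectorization. A minimal sketch of how a user opts in (the package's stub error message, shown further below, asks for LoopVectorization to be imported before ImplicitGlobalGrid):

import LoopVectorization   # load before ImplicitGlobalGrid to activate the extension
using ImplicitGlobalGrid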
Showing 9 changed files with 103 additions and 33 deletions.
19 changes: 12 additions & 7 deletions Project.toml
@@ -3,23 +3,28 @@ name = "ImplicitGlobalGrid"
uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
version = "0.14.0"

+[deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
+
+[weakdeps]
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
+
+[extensions]
+ImplicitGlobalGrid_LoopVectorizationExt = "LoopVectorization"

[compat]
AMDGPU = "0.5, 0.6, 0.7, 0.8"
CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4, 5"
+LoopVectorization = "0.12"
MPI = "0.20"
julia = "1.9"

-[deps]
-AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
-MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
-
[extras]
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
-test = ["Test", "MPIPreferences"]
+test = ["Test", "MPIPreferences", "LoopVectorization"]
3 changes: 3 additions & 0 deletions ext/ImplicitGlobalGrid_LoopVectorizationExt.jl
@@ -0,0 +1,3 @@
module ImplicitGlobalGrid_LoopVectorizationExt
include(joinpath(@__DIR__, "..", "src", "LoopVectorizationExt", "memcopy_LV.jl"))
end
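A quick way to verify that the extension actually loaded in a session, using the standard Base.get_extension API (Julia >= 1.9); the module symbol is taken from the [extensions] entry above:

import LoopVectorization, ImplicitGlobalGrid
ext = Base.get_extension(ImplicitGlobalGrid, :ImplicitGlobalGrid_LoopVectorizationExt)
ext === nothing && @warn "LoopVectorization extension did not load"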
49 changes: 49 additions & 0 deletions src/Exceptions.jl
@@ -0,0 +1,49 @@
module Exceptions
export @ModuleInternalError, @IncoherentCallError, @NotInitializedError, @NotLoadedError, @IncoherentArgumentError, @KeywordArgumentError, @ArgumentEvaluationError, @ArgumentError
export ModuleInternalError, IncoherentCallError, NotInitializedError, NotLoadedError, IncoherentArgumentError, KeywordArgumentError, ArgumentEvaluationError

macro ModuleInternalError(msg) esc(:(throw(ModuleInternalError($msg)))) end
macro IncoherentCallError(msg) esc(:(throw(IncoherentCallError($msg)))) end
macro NotInitializedError(msg) esc(:(throw(NotInitializedError($msg)))) end
macro NotLoadedError(msg) esc(:(throw(NotLoadedError($msg)))) end
macro IncoherentArgumentError(msg) esc(:(throw(IncoherentArgumentError($msg)))) end
macro KeywordArgumentError(msg) esc(:(throw(KeywordArgumentError($msg)))) end
macro ArgumentEvaluationError(msg) esc(:(throw(ArgumentEvaluationError($msg)))) end
macro ArgumentError(msg) esc(:(throw(ArgumentError($msg)))) end

struct ModuleInternalError <: Exception
msg::String
end
Base.showerror(io::IO, e::ModuleInternalError) = print(io, "ModuleInternalError: ", e.msg)

struct IncoherentCallError <: Exception
msg::String
end
Base.showerror(io::IO, e::IncoherentCallError) = print(io, "IncoherentCallError: ", e.msg)

struct NotInitializedError <: Exception
msg::String
end
Base.showerror(io::IO, e::NotInitializedError) = print(io, "NotInitializedError: ", e.msg)

struct NotLoadedError <: Exception
msg::String
end
Base.showerror(io::IO, e::NotLoadedError) = print(io, "NotLoadedError: ", e.msg)

struct IncoherentArgumentError <: Exception
msg::String
end
Base.showerror(io::IO, e::IncoherentArgumentError) = print(io, "IncoherentArgumentError: ", e.msg)

struct KeywordArgumentError <: Exception
msg::String
end
Base.showerror(io::IO, e::KeywordArgumentError) = print(io, "KeywordArgumentError: ", e.msg)

struct ArgumentEvaluationError <: Exception
msg::String
end
Base.showerror(io::IO, e::ArgumentEvaluationError) = print(io, "ArgumentEvaluationError: ", e.msg)

end # Module Exceptions
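For illustration, a hypothetical caller showing how these macros are meant to be used from within the package's own modules; each macro simply wraps a throw of the matching exception type:

# Illustrative only; assumes Exceptions.jl has been included in the enclosing module.
using .Exceptions

function require_initialized(initialized::Bool)
    if !initialized
        @NotInitializedError("the global grid is not initialized.")  # throws NotInitializedError
    end
    return nothing
end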
7 changes: 7 additions & 0 deletions src/ImplicitGlobalGrid.jl
@@ -34,9 +34,16 @@ To see a description of a function type `?<functionname>`.
"""
module ImplicitGlobalGrid

+## Include of exception module
+include("Exceptions.jl");
+using .Exceptions
+
## Include of shared constant parameters, types and syntax sugar
include("shared.jl")

+## Alphabetical include of defaults for extensions
+include(joinpath("LoopVectorizationExt", "memcopy_LV_default.jl"))
+
## Alphabetical include of files
include("finalize_global_grid.jl")
include("gather.jl")
9 changes: 9 additions & 0 deletions src/LoopVectorizationExt/memcopy_LV.jl
@@ -0,0 +1,9 @@
import ImplicitGlobalGrid
import ImplicitGlobalGrid: GGNumber
using LoopVectorization

function ImplicitGlobalGrid.memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
@tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface : https://github.com/JuliaArrays/ArrayInterface.jl/issues/228
@inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently anyways done by LoopVectorization) as this copy between two flat vectors (which must have the right length) is considered safe.
end
end
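A quick sanity check of the extension's copy kernel (illustrative only; memcopy_loopvect! is an internal function and this snippet is not part of the PR):

import LoopVectorization          # triggers the extension
using ImplicitGlobalGrid
src = rand(Float64, 64, 64); dst = similar(src)
ImplicitGlobalGrid.memcopy_loopvect!(dst, src)
@assert dst == src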
3 changes: 3 additions & 0 deletions src/LoopVectorizationExt/memcopy_LV_default.jl
@@ -0,0 +1,3 @@
const ERRMSG_EXTENSION_NOT_LOADED = "the LoopVectorization extension was not loaded. Make sure to import LoopVectorization before ImplicitGlobalGrid."

memcopy_loopvect!(args...) = @NotLoadedError(ERRMSG_EXTENSION_NOT_LOADED)
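Without LoopVectorization loaded first, calling the stub raises the custom error (illustrative REPL session):

using ImplicitGlobalGrid                                # LoopVectorization not imported
ImplicitGlobalGrid.memcopy_loopvect!(rand(4), rand(4))
# ERROR: NotLoadedError: the LoopVectorization extension was not loaded. ...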
1 change: 0 additions & 1 deletion src/shared.jl
Expand Up @@ -2,7 +2,6 @@ import MPI
using CUDA
using AMDGPU
using Base.Threads
-using LoopVectorization


##-------------------------
44 changes: 19 additions & 25 deletions src/update_halo.jl
@@ -583,13 +583,14 @@ function write_h2h!(sendbuf::AbstractArray{T}, A::Array{T}, sendranges::Array{Un
ix = (length(sendranges[1])==1) ? sendranges[1][1] : sendranges[1];
iy = (length(sendranges[2])==1) ? sendranges[2][1] : sendranges[2];
iz = (length(sendranges[3])==1) ? sendranges[3][1] : sendranges[3];
-if (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(sendbuf, view(A,ix, :, :), loopvectorization(dim));
-elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(sendbuf, view(A,ix, :,iz), loopvectorization(dim));
-elseif (dim == 1 && length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(sendbuf, view(A,ix,iy,iz), loopvectorization(dim));
-elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(sendbuf, view(A, :,iy, :), loopvectorization(dim));
-elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(sendbuf, view(A, :,iy,iz), loopvectorization(dim));
-elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2) ) memcopy!(sendbuf, view(A, :, :,iz), loopvectorization(dim));
-elseif (dim == 1 || dim == 2 || dim == 3) memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the three optimised cases above (the result would be the same, of course).
+if (length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(sendbuf, 1, :, :), view(A,ix, :, :), loopvectorization(dim));
+elseif (length(ix)==1 && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf, 1, 1, :), view(A,ix,iy, :), loopvectorization(dim));
+elseif (length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(sendbuf, 1, :, 1), view(A,ix, :,iz), loopvectorization(dim));
+elseif (length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(sendbuf, 1, 1, 1), view(A,ix,iy,iz), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf, :, 1, :), view(A, :,iy, :), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(sendbuf, :, 1, 1), view(A, :,iy,iz), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(sendbuf, :, :, 1), view(A, :, :,iz), loopvectorization(dim));
+else memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course).
end
end

@@ -598,13 +599,14 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni
ix = (length(recvranges[1])==1) ? recvranges[1][1] : recvranges[1];
iy = (length(recvranges[2])==1) ? recvranges[2][1] : recvranges[2];
iz = (length(recvranges[3])==1) ? recvranges[3][1] : recvranges[3];
-if (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), recvbuf, loopvectorization(dim));
-elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A,ix, :,iz), recvbuf, loopvectorization(dim));
-elseif (dim == 1 && length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(A,ix,iy,iz), recvbuf, loopvectorization(dim));
-elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), recvbuf, loopvectorization(dim));
-elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(A, :,iy,iz), recvbuf, loopvectorization(dim));
-elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2) ) memcopy!(view(A, :, :,iz), recvbuf, loopvectorization(dim));
-elseif (dim == 1 || dim == 2 || dim == 3) memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the three optimised cases above (the result would be the same, of course).
+if (length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), view(recvbuf, 1, :, :), loopvectorization(dim));
+elseif (length(ix)==1 && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A,ix,iy, :), view(recvbuf, 1, 1, :), loopvectorization(dim));
+elseif (length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A,ix, :,iz), view(recvbuf, 1, :, 1), loopvectorization(dim));
+elseif (length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(A,ix,iy,iz), view(recvbuf, 1, 1, 1), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), view(recvbuf, :, 1, :), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(A, :,iy,iz), view(recvbuf, :, 1, 1), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A, :, :,iz), view(recvbuf, :, :, 1), loopvectorization(dim));
+else memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course).
end
end
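The rewritten cases pair each array view with a buffer view of matching shape, so both sides of the copy drop the same singleton dimensions. A standalone sketch of the idea for a dim-1 boundary (names are illustrative, not taken from the PR):

A       = rand(8, 8, 8)
sendbuf = zeros(1, 8, 8)                            # halo buffer for one x-boundary plane
copyto!(view(sendbuf, 1, :, :), view(A, 1, :, :))   # shapes agree: both are 8x8
@assert sendbuf[1, :, :] == A[1, :, :]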

@@ -757,7 +759,7 @@ function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer)
end

function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorization::Bool) where T <: GGNumber
-if loopvectorization && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers and copy reinterpreted arrays leads to bad performance.
+if loopvectorization && nthreads() > 1 && length(src) > 1 && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers and copy reinterpreted arrays leads to bad performance.
memcopy_loopvect!(dst, src)
else
dst_flat = view(dst,:)
@@ -766,7 +768,9 @@ function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorizatio
end
end


# (CPU functions)

function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
if nthreads() > 1 && sizeof(src) >= GG_THREADCOPY_THRESHOLD
@threads for i = 1:length(dst) # NOTE: Set the number of threads e.g. as: export JULIA_NUM_THREADS=12
@@ -777,16 +781,6 @@ function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T
end
end

-function memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
-if nthreads() > 1 && length(src) > 1
-@tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface : https://github.com/JuliaArrays/ArrayInterface.jl/issues/228
-@inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently anyways done by LoopVectorization) as this copy between two flat vectors (which must have the right length) is considered safe.
-end
-else
-@inbounds copyto!(dst, src)
-end
-end
-

# (CUDA functions)

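Net effect on the CPU copy dispatch: the threading and size guards that previously lived in memcopy_loopvect! now sit in memcopy!, so the extension function contains only the @tturbo loop. A sketch of the combined logic, condensed from the diff rather than a verbatim excerpt (the exact flat-copy branch of memcopy! is elided in the hunk above):

import LoopVectorization, ImplicitGlobalGrid   # both needed for the extension path

function memcopy_sketch!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorization::Bool) where T
    if loopvectorization && Threads.nthreads() > 1 && length(src) > 1 && !(T <: Complex)
        ImplicitGlobalGrid.memcopy_loopvect!(dst, src)   # extension path (@tturbo)
    else
        @inbounds copyto!(view(dst, :), view(src, :))    # stand-in for the elided flat-copy branch
    end
end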
1 change: 1 addition & 0 deletions test/test_update_halo.jl
@@ -4,6 +4,7 @@

push!(LOAD_PATH, "../src")
using Test
+import LoopVectorization
using ImplicitGlobalGrid; GG = ImplicitGlobalGrid
import MPI
using CUDA
