diff --git a/Project.toml b/Project.toml index b33b80a..739de78 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,17 @@ name = "ImplicitGlobalGrid" uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0" version = "0.14.0" +[deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" + +[weakdeps] +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" + +[extensions] +ImplicitGlobalGrid_LoopVectorizationExt = "LoopVectorization" + [compat] AMDGPU = "0.5, 0.6, 0.7, 0.8" CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4, 5" @@ -10,16 +21,10 @@ LoopVectorization = "0.12" MPI = "0.20" julia = "1.9" -[deps] -AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" -MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" - [extras] CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test", "MPIPreferences"] +test = ["Test", "MPIPreferences", "LoopVectorization"] diff --git a/ext/ImplicitGlobalGrid_LoopVectorizationExt.jl b/ext/ImplicitGlobalGrid_LoopVectorizationExt.jl new file mode 100644 index 0000000..5efa430 --- /dev/null +++ b/ext/ImplicitGlobalGrid_LoopVectorizationExt.jl @@ -0,0 +1,3 @@ +module ImplicitGlobalGrid_LoopVectorizationExt + include(joinpath(@__DIR__, "..", "src", "LoopVectorizationExt", "memcopy_LV.jl")) +end \ No newline at end of file diff --git a/src/Exceptions.jl b/src/Exceptions.jl new file mode 100644 index 0000000..3ecde56 --- /dev/null +++ b/src/Exceptions.jl @@ -0,0 +1,49 @@ +module Exceptions +export @ModuleInternalError, @IncoherentCallError, @NotInitializedError, @NotLoadedError, @IncoherentArgumentError, @KeywordArgumentError, @ArgumentEvaluationError, @ArgumentError +export ModuleInternalError, IncoherentCallError, NotInitializedError, NotLoadedError, IncoherentArgumentError, KeywordArgumentError, ArgumentEvaluationError + +macro ModuleInternalError(msg) esc(:(throw(ModuleInternalError($msg)))) end +macro IncoherentCallError(msg) esc(:(throw(IncoherentCallError($msg)))) end +macro NotInitializedError(msg) esc(:(throw(NotInitializedError($msg)))) end +macro NotLoadedError(msg) esc(:(throw(NotLoadedError($msg)))) end +macro IncoherentArgumentError(msg) esc(:(throw(IncoherentArgumentError($msg)))) end +macro KeywordArgumentError(msg) esc(:(throw(KeywordArgumentError($msg)))) end +macro ArgumentEvaluationError(msg) esc(:(throw(ArgumentEvaluationError($msg)))) end +macro ArgumentError(msg) esc(:(throw(ArgumentError($msg)))) end + +struct ModuleInternalError <: Exception + msg::String +end +Base.showerror(io::IO, e::ModuleInternalError) = print(io, "ModuleInternalError: ", e.msg) + +struct IncoherentCallError <: Exception + msg::String +end +Base.showerror(io::IO, e::IncoherentCallError) = print(io, "IncoherentCallError: ", e.msg) + +struct NotInitializedError <: Exception + msg::String +end +Base.showerror(io::IO, e::NotInitializedError) = print(io, "NotInitializedError: ", e.msg) + +struct NotLoadedError <: Exception + msg::String +end +Base.showerror(io::IO, e::NotLoadedError) = print(io, "NotLoadedError: ", e.msg) + +struct IncoherentArgumentError <: Exception + msg::String +end +Base.showerror(io::IO, e::IncoherentArgumentError) = print(io, "IncoherentArgumentError: ", e.msg) + +struct KeywordArgumentError <: Exception + msg::String +end +Base.showerror(io::IO, e::KeywordArgumentError) = print(io, "KeywordArgumentError: ", e.msg) + +struct ArgumentEvaluationError <: Exception + msg::String +end +Base.showerror(io::IO, e::ArgumentEvaluationError) = print(io, "ArgumentEvaluationError: ", e.msg) + +end # Module Exceptions diff --git a/src/ImplicitGlobalGrid.jl b/src/ImplicitGlobalGrid.jl index 95f94c9..628d799 100644 --- a/src/ImplicitGlobalGrid.jl +++ b/src/ImplicitGlobalGrid.jl @@ -34,9 +34,16 @@ To see a description of a function type `?`. """ module ImplicitGlobalGrid +## Include of exception module +include("Exceptions.jl"); +using .Exceptions + ## Include of shared constant parameters, types and syntax sugar include("shared.jl") +## Alphabetical include of defaults for extensions +include(joinpath("LoopVectorizationExt", "memcopy_LV_default.jl")) + ## Alphabetical include of files include("finalize_global_grid.jl") include("gather.jl") diff --git a/src/LoopVectorizationExt/memcopy_LV.jl b/src/LoopVectorizationExt/memcopy_LV.jl new file mode 100644 index 0000000..c0f7034 --- /dev/null +++ b/src/LoopVectorizationExt/memcopy_LV.jl @@ -0,0 +1,9 @@ +import ImplicitGlobalGrid +import ImplicitGlobalGrid: GGNumber +using LoopVectorization + +function ImplicitGlobalGrid.memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber + @tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface : https://github.com/JuliaArrays/ArrayInterface.jl/issues/228 + @inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently anyways done by LoopVectorization) as this copy between two flat vectors (which must have the right length) is considered safe. + end +end \ No newline at end of file diff --git a/src/LoopVectorizationExt/memcopy_LV_default.jl b/src/LoopVectorizationExt/memcopy_LV_default.jl new file mode 100644 index 0000000..d71e763 --- /dev/null +++ b/src/LoopVectorizationExt/memcopy_LV_default.jl @@ -0,0 +1,3 @@ +const ERRMSG_EXTENSION_NOT_LOADED = "AD: the LoopVectorization extension was not loaded. Make sure to import LoopVectorization before ImplicitGlobalGrid." + +memcopy_loopvect!(args...) = @NotLoadedError(ERRMSG_EXTENSION_NOT_LOADED) \ No newline at end of file diff --git a/src/shared.jl b/src/shared.jl index 1188958..b9a9907 100644 --- a/src/shared.jl +++ b/src/shared.jl @@ -2,7 +2,6 @@ import MPI using CUDA using AMDGPU using Base.Threads -using LoopVectorization ##------------------------- diff --git a/src/update_halo.jl b/src/update_halo.jl index 189fe9c..4be9d87 100644 --- a/src/update_halo.jl +++ b/src/update_halo.jl @@ -583,13 +583,14 @@ function write_h2h!(sendbuf::AbstractArray{T}, A::Array{T}, sendranges::Array{Un ix = (length(sendranges[1])==1) ? sendranges[1][1] : sendranges[1]; iy = (length(sendranges[2])==1) ? sendranges[2][1] : sendranges[2]; iz = (length(sendranges[3])==1) ? sendranges[3][1] : sendranges[3]; - if (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(sendbuf, view(A,ix, :, :), loopvectorization(dim)); - elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(sendbuf, view(A,ix, :,iz), loopvectorization(dim)); - elseif (dim == 1 && length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(sendbuf, view(A,ix,iy,iz), loopvectorization(dim)); - elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(sendbuf, view(A, :,iy, :), loopvectorization(dim)); - elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(sendbuf, view(A, :,iy,iz), loopvectorization(dim)); - elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2) ) memcopy!(sendbuf, view(A, :, :,iz), loopvectorization(dim)); - elseif (dim == 1 || dim == 2 || dim == 3) memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the three optimised cases above (the result would be the same, of course). + if (length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(sendbuf, 1, :, :), view(A,ix, :, :), loopvectorization(dim)); + elseif (length(ix)==1 && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf, 1, 1, :), view(A,ix,iy, :), loopvectorization(dim)); + elseif (length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(sendbuf, 1, :, 1), view(A,ix, :,iz), loopvectorization(dim)); + elseif (length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(sendbuf, 1, 1, 1), view(A,ix,iy,iz), loopvectorization(dim)); + elseif (ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf, :, 1, :), view(A, :,iy, :), loopvectorization(dim)); + elseif (ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(sendbuf, :, 1, 1), view(A, :,iy,iz), loopvectorization(dim)); + elseif (ix == 1:size(A,1) && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(sendbuf, :, :, 1), view(A, :, :,iz), loopvectorization(dim)); + else memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course). end end @@ -598,13 +599,14 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni ix = (length(recvranges[1])==1) ? recvranges[1][1] : recvranges[1]; iy = (length(recvranges[2])==1) ? recvranges[2][1] : recvranges[2]; iz = (length(recvranges[3])==1) ? recvranges[3][1] : recvranges[3]; - if (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), recvbuf, loopvectorization(dim)); - elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A,ix, :,iz), recvbuf, loopvectorization(dim)); - elseif (dim == 1 && length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(A,ix,iy,iz), recvbuf, loopvectorization(dim)); - elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), recvbuf, loopvectorization(dim)); - elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(A, :,iy,iz), recvbuf, loopvectorization(dim)); - elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2) ) memcopy!(view(A, :, :,iz), recvbuf, loopvectorization(dim)); - elseif (dim == 1 || dim == 2 || dim == 3) memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the three optimised cases above (the result would be the same, of course). + if (length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), view(recvbuf, 1, :, :), loopvectorization(dim)); + elseif (length(ix)==1 && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A,ix,iy, :), view(recvbuf, 1, 1, :), loopvectorization(dim)); + elseif (length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A,ix, :,iz), view(recvbuf, 1, :, 1), loopvectorization(dim)); + elseif (length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(A,ix,iy,iz), view(recvbuf, 1, 1, 1), loopvectorization(dim)); + elseif (ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), view(recvbuf, :, 1, :), loopvectorization(dim)); + elseif (ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(A, :,iy,iz), view(recvbuf, :, 1, 1), loopvectorization(dim)); + elseif (ix == 1:size(A,1) && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A, :, :,iz), view(recvbuf, :, :, 1), loopvectorization(dim)); + else memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course). end end @@ -757,7 +759,7 @@ function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer) end function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorization::Bool) where T <: GGNumber - if loopvectorization && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers and copy reinterpreted arrays leads to bad performance. + if loopvectorization && nthreads() > 1 && length(src) > 1 && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers and copy reinterpreted arrays leads to bad performance. memcopy_loopvect!(dst, src) else dst_flat = view(dst,:) @@ -766,7 +768,9 @@ function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorizatio end end + # (CPU functions) + function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber if nthreads() > 1 && sizeof(src) >= GG_THREADCOPY_THRESHOLD @threads for i = 1:length(dst) # NOTE: Set the number of threads e.g. as: export JULIA_NUM_THREADS=12 @@ -777,16 +781,6 @@ function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T end end -function memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber - if nthreads() > 1 && length(src) > 1 - @tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface : https://github.com/JuliaArrays/ArrayInterface.jl/issues/228 - @inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently anyways done by LoopVectorization) as this copy between two flat vectors (which must have the right length) is considered safe. - end - else - @inbounds copyto!(dst, src) - end -end - # (CUDA functions) diff --git a/test/test_update_halo.jl b/test/test_update_halo.jl index 3a742fd..fb57038 100644 --- a/test/test_update_halo.jl +++ b/test/test_update_halo.jl @@ -4,6 +4,7 @@ push!(LOAD_PATH, "../src") using Test +import LoopVectorization using ImplicitGlobalGrid; GG = ImplicitGlobalGrid import MPI using CUDA