Merge pull request #83 from eth-cscs/LoopVectorizationExt
Use extension for LoopVectorization dependency
omlins committed Jan 16, 2024
2 parents 26a72df + 529b88d commit 13a00ff
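With package extensions (Julia >= 1.9), LoopVectorization becomes an optional dependency: the vectorized copy path is only compiled when the user also loads LoopVectorization. A minimal sketch of how a user opts in (the package's stub error message, shown further below, asks for LoopVectorization to be imported before ImplicitGlobalGrid):

import LoopVectorization   # load before ImplicitGlobalGrid to activate the extension
using ImplicitGlobalGrid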
Showing 9 changed files with 103 additions and 33 deletions.
19 changes: 12 additions & 7 deletions Project.toml
@@ -3,23 +3,28 @@ name = "ImplicitGlobalGrid"
uuid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0"
version = "0.14.0"

+[deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
+
+[weakdeps]
+LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
+
+[extensions]
+ImplicitGlobalGrid_LoopVectorizationExt = "LoopVectorization"

[compat]
AMDGPU = "0.5, 0.6, 0.7, 0.8"
CUDA = "1, ~3.1, ~3.2, ~3.3, ~3.7.1, ~3.8, ~3.9, ~3.10, ~3.11, ~3.12, ~3.13, 4, 5"
+LoopVectorization = "0.12"
MPI = "0.20"
julia = "1.9"

-[deps]
-AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
-MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
-
[extras]
CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
-test = ["Test", "MPIPreferences"]
+test = ["Test", "MPIPreferences", "LoopVectorization"]
3 changes: 3 additions & 0 deletions ext/ImplicitGlobalGrid_LoopVectorizationExt.jl
@@ -0,0 +1,3 @@
module ImplicitGlobalGrid_LoopVectorizationExt
include(joinpath(@__DIR__, "..", "src", "LoopVectorizationExt", "memcopy_LV.jl"))
end
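A quick way to verify that the extension actually loaded in a session, using the standard Base.get_extension API (Julia >= 1.9); the module symbol is taken from the [extensions] entry above:

import LoopVectorization, ImplicitGlobalGrid
ext = Base.get_extension(ImplicitGlobalGrid, :ImplicitGlobalGrid_LoopVectorizationExt)
ext === nothing && @warn "LoopVectorization extension did not load"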
49 changes: 49 additions & 0 deletions src/Exceptions.jl
@@ -0,0 +1,49 @@
module Exceptions
export @ModuleInternalError, @IncoherentCallError, @NotInitializedError, @NotLoadedError, @IncoherentArgumentError, @KeywordArgumentError, @ArgumentEvaluationError, @ArgumentError
export ModuleInternalError, IncoherentCallError, NotInitializedError, NotLoadedError, IncoherentArgumentError, KeywordArgumentError, ArgumentEvaluationError

macro ModuleInternalError(msg) esc(:(throw(ModuleInternalError($msg)))) end
macro IncoherentCallError(msg) esc(:(throw(IncoherentCallError($msg)))) end
macro NotInitializedError(msg) esc(:(throw(NotInitializedError($msg)))) end
macro NotLoadedError(msg) esc(:(throw(NotLoadedError($msg)))) end
macro IncoherentArgumentError(msg) esc(:(throw(IncoherentArgumentError($msg)))) end
macro KeywordArgumentError(msg) esc(:(throw(KeywordArgumentError($msg)))) end
macro ArgumentEvaluationError(msg) esc(:(throw(ArgumentEvaluationError($msg)))) end
macro ArgumentError(msg) esc(:(throw(ArgumentError($msg)))) end

struct ModuleInternalError <: Exception
msg::String
end
Base.showerror(io::IO, e::ModuleInternalError) = print(io, "ModuleInternalError: ", e.msg)

struct IncoherentCallError <: Exception
msg::String
end
Base.showerror(io::IO, e::IncoherentCallError) = print(io, "IncoherentCallError: ", e.msg)

struct NotInitializedError <: Exception
msg::String
end
Base.showerror(io::IO, e::NotInitializedError) = print(io, "NotInitializedError: ", e.msg)

struct NotLoadedError <: Exception
msg::String
end
Base.showerror(io::IO, e::NotLoadedError) = print(io, "NotLoadedError: ", e.msg)

struct IncoherentArgumentError <: Exception
msg::String
end
Base.showerror(io::IO, e::IncoherentArgumentError) = print(io, "IncoherentArgumentError: ", e.msg)

struct KeywordArgumentError <: Exception
msg::String
end
Base.showerror(io::IO, e::KeywordArgumentError) = print(io, "KeywordArgumentError: ", e.msg)

struct ArgumentEvaluationError <: Exception
msg::String
end
Base.showerror(io::IO, e::ArgumentEvaluationError) = print(io, "ArgumentEvaluationError: ", e.msg)

end # Module Exceptions
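For illustration, a hypothetical caller showing how these macros are meant to be used from within the package's own modules; each macro simply wraps a throw of the matching exception type:

# Illustrative only; assumes Exceptions.jl has been included in the enclosing module.
using .Exceptions

function require_initialized(initialized::Bool)
    if !initialized
        @NotInitializedError("the global grid is not initialized.")  # throws NotInitializedError
    end
    return nothing
end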
7 changes: 7 additions & 0 deletions src/ImplicitGlobalGrid.jl
@@ -34,9 +34,16 @@ To see a description of a function type `?<functionname>`.
"""
module ImplicitGlobalGrid

+## Include of exception module
+include("Exceptions.jl");
+using .Exceptions
+
## Include of shared constant parameters, types and syntax sugar
include("shared.jl")

+## Alphabetical include of defaults for extensions
+include(joinpath("LoopVectorizationExt", "memcopy_LV_default.jl"))
+
## Alphabetical include of files
include("finalize_global_grid.jl")
include("gather.jl")
9 changes: 9 additions & 0 deletions src/LoopVectorizationExt/memcopy_LV.jl
@@ -0,0 +1,9 @@
import ImplicitGlobalGrid
import ImplicitGlobalGrid: GGNumber
using LoopVectorization

function ImplicitGlobalGrid.memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
@tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface : https://github.com/JuliaArrays/ArrayInterface.jl/issues/228
@inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently anyways done by LoopVectorization) as this copy between two flat vectors (which must have the right length) is considered safe.
end
end
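A quick sanity check of the extension's copy kernel (illustrative only; memcopy_loopvect! is an internal function and this snippet is not part of the PR):

import LoopVectorization          # triggers the extension
using ImplicitGlobalGrid
src = rand(Float64, 64, 64); dst = similar(src)
ImplicitGlobalGrid.memcopy_loopvect!(dst, src)
@assert dst == src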
3 changes: 3 additions & 0 deletions src/LoopVectorizationExt/memcopy_LV_default.jl
@@ -0,0 +1,3 @@
const ERRMSG_EXTENSION_NOT_LOADED = "the LoopVectorization extension was not loaded. Make sure to import LoopVectorization before ImplicitGlobalGrid."

memcopy_loopvect!(args...) = @NotLoadedError(ERRMSG_EXTENSION_NOT_LOADED)
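Without LoopVectorization loaded first, calling the stub raises the custom error (illustrative REPL session):

using ImplicitGlobalGrid                                # LoopVectorization not imported
ImplicitGlobalGrid.memcopy_loopvect!(rand(4), rand(4))
# ERROR: NotLoadedError: the LoopVectorization extension was not loaded. ...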
1 change: 0 additions & 1 deletion src/shared.jl
Expand Up @@ -2,7 +2,6 @@ import MPI
using CUDA
using AMDGPU
using Base.Threads
-using LoopVectorization


##-------------------------
44 changes: 19 additions & 25 deletions src/update_halo.jl
@@ -583,13 +583,14 @@ function write_h2h!(sendbuf::AbstractArray{T}, A::Array{T}, sendranges::Array{Un
ix = (length(sendranges[1])==1) ? sendranges[1][1] : sendranges[1];
iy = (length(sendranges[2])==1) ? sendranges[2][1] : sendranges[2];
iz = (length(sendranges[3])==1) ? sendranges[3][1] : sendranges[3];
-if (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(sendbuf, view(A,ix, :, :), loopvectorization(dim));
-elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(sendbuf, view(A,ix, :,iz), loopvectorization(dim));
-elseif (dim == 1 && length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(sendbuf, view(A,ix,iy,iz), loopvectorization(dim));
-elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(sendbuf, view(A, :,iy, :), loopvectorization(dim));
-elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(sendbuf, view(A, :,iy,iz), loopvectorization(dim));
-elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2) ) memcopy!(sendbuf, view(A, :, :,iz), loopvectorization(dim));
-elseif (dim == 1 || dim == 2 || dim == 3) memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the three optimised cases above (the result would be the same, of course).
+if (length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(sendbuf, 1, :, :), view(A,ix, :, :), loopvectorization(dim));
+elseif (length(ix)==1 && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf, 1, 1, :), view(A,ix,iy, :), loopvectorization(dim));
+elseif (length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(sendbuf, 1, :, 1), view(A,ix, :,iz), loopvectorization(dim));
+elseif (length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(sendbuf, 1, 1, 1), view(A,ix,iy,iz), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(sendbuf, :, 1, :), view(A, :,iy, :), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(sendbuf, :, 1, 1), view(A, :,iy,iz), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(sendbuf, :, :, 1), view(A, :, :,iz), loopvectorization(dim));
+else memcopy!(sendbuf, view(A,sendranges...), loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course).
end
end

@@ -598,13 +599,14 @@ function read_h2h!(recvbuf::AbstractArray{T}, A::Array{T}, recvranges::Array{Uni
ix = (length(recvranges[1])==1) ? recvranges[1][1] : recvranges[1];
iy = (length(recvranges[2])==1) ? recvranges[2][1] : recvranges[2];
iz = (length(recvranges[3])==1) ? recvranges[3][1] : recvranges[3];
-if (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), recvbuf, loopvectorization(dim));
-elseif (dim == 1 && length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A,ix, :,iz), recvbuf, loopvectorization(dim));
-elseif (dim == 1 && length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(A,ix,iy,iz), recvbuf, loopvectorization(dim));
-elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), recvbuf, loopvectorization(dim));
-elseif (dim == 2 && ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(A, :,iy,iz), recvbuf, loopvectorization(dim));
-elseif (dim == 3 && ix == 1:size(A,1) && iy == 1:size(A,2) ) memcopy!(view(A, :, :,iz), recvbuf, loopvectorization(dim));
-elseif (dim == 1 || dim == 2 || dim == 3) memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the three optimised cases above (the result would be the same, of course).
+if (length(ix)==1 && iy == 1:size(A,2) && iz == 1:size(A,3)) memcopy!(view(A,ix, :, :), view(recvbuf, 1, :, :), loopvectorization(dim));
+elseif (length(ix)==1 && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A,ix,iy, :), view(recvbuf, 1, 1, :), loopvectorization(dim));
+elseif (length(ix)==1 && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A,ix, :,iz), view(recvbuf, 1, :, 1), loopvectorization(dim));
+elseif (length(ix)==1 && length(iy)==1 && length(iz)==1 ) memcopy!(view(A,ix,iy,iz), view(recvbuf, 1, 1, 1), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && length(iy)==1 && iz == 1:size(A,3)) memcopy!(view(A, :,iy, :), view(recvbuf, :, 1, :), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && length(iy)==1 && length(iz)==1 ) memcopy!(view(A, :,iy,iz), view(recvbuf, :, 1, 1), loopvectorization(dim));
+elseif (ix == 1:size(A,1) && iy == 1:size(A,2) && length(iz)==1 ) memcopy!(view(A, :, :,iz), view(recvbuf, :, :, 1), loopvectorization(dim));
+else memcopy!(view(A,recvranges...), recvbuf, loopvectorization(dim)); # This general case is slower than the optimised cases above (the result would be the same, of course).
end
end
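The rewritten cases pair each array view with a buffer view of matching shape, so both sides of the copy drop the same singleton dimensions. A standalone sketch of the idea for a dim-1 boundary (names are illustrative, not taken from the PR):

A       = rand(8, 8, 8)
sendbuf = zeros(1, 8, 8)                            # halo buffer for one x-boundary plane
copyto!(view(sendbuf, 1, :, :), view(A, 1, :, :))   # shapes agree: both are 8x8
@assert sendbuf[1, :, :] == A[1, :, :]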

@@ -757,7 +759,7 @@ function sendrecv_halo_local(n::Integer, dim::Integer, F::GGField, i::Integer)
end

function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorization::Bool) where T <: GGNumber
-if loopvectorization && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers and copy reinterpreted arrays leads to bad performance.
+if loopvectorization && nthreads() > 1 && length(src) > 1 && !(T <: Complex) # NOTE: LoopVectorization does not yet support Complex numbers and copy reinterpreted arrays leads to bad performance.
memcopy_loopvect!(dst, src)
else
dst_flat = view(dst,:)
@@ -766,7 +768,9 @@ function memcopy!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorizatio
end
end


# (CPU functions)

function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
if nthreads() > 1 && sizeof(src) >= GG_THREADCOPY_THRESHOLD
@threads for i = 1:length(dst) # NOTE: Set the number of threads e.g. as: export JULIA_NUM_THREADS=12
@@ -777,16 +781,6 @@ function memcopy_threads!(dst::AbstractArray{T}, src::AbstractArray{T}) where T
end
end

-function memcopy_loopvect!(dst::AbstractArray{T}, src::AbstractArray{T}) where T <: GGNumber
-if nthreads() > 1 && length(src) > 1
-@tturbo for i ∈ eachindex(dst, src) # NOTE: tturbo will use maximally Threads.nthreads() threads. Set the number of threads e.g. as: export JULIA_NUM_THREADS=12. NOTE: tturbo fails if src_flat and dst_flat are used due to an issue in ArrayInterface : https://github.com/JuliaArrays/ArrayInterface.jl/issues/228
-@inbounds dst[i] = src[i] # NOTE: We fix here exceptionally the use of @inbounds (currently anyways done by LoopVectorization) as this copy between two flat vectors (which must have the right length) is considered safe.
-end
-else
-@inbounds copyto!(dst, src)
-end
-end
-

# (CUDA functions)

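Net effect on the CPU copy dispatch: the threading and size guards that previously lived in memcopy_loopvect! now sit in memcopy!, so the extension function contains only the @tturbo loop. A sketch of the combined logic, condensed from the diff rather than a verbatim excerpt (the exact flat-copy branch of memcopy! is elided in the hunk above):

import LoopVectorization, ImplicitGlobalGrid   # both needed for the extension path

function memcopy_sketch!(dst::AbstractArray{T}, src::AbstractArray{T}, loopvectorization::Bool) where T
    if loopvectorization && Threads.nthreads() > 1 && length(src) > 1 && !(T <: Complex)
        ImplicitGlobalGrid.memcopy_loopvect!(dst, src)   # extension path (@tturbo)
    else
        @inbounds copyto!(view(dst, :), view(src, :))    # stand-in for the elided flat-copy branch
    end
end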
1 change: 1 addition & 0 deletions test/test_update_halo.jl
@@ -4,6 +4,7 @@

push!(LOAD_PATH, "../src")
using Test
+import LoopVectorization
using ImplicitGlobalGrid; GG = ImplicitGlobalGrid
import MPI
using CUDA
