From 4d9ebea32dab4f6199d5b0b1a4529ac11edcaa3f Mon Sep 17 00:00:00 2001
From: Samuel Omlin
Date: Thu, 18 Jan 2024 19:19:18 +0100
Subject: [PATCH] introduce functional check and make select_device extension
 compatible

---
 ext/ImplicitGlobalGrid_AMDGPUExt.jl |  1 +
 ext/ImplicitGlobalGrid_CUDAExt.jl   |  1 +
 src/AMDGPUExt/defaults.jl           |  6 ++++++
 src/AMDGPUExt/select_device.jl      |  2 ++
 src/AMDGPUExt/shared.jl             |  1 +
 src/CUDAExt/defaults.jl             |  6 ++++++
 src/CUDAExt/select_device.jl        |  2 ++
 src/CUDAExt/shared.jl               |  1 +
 src/defaults_shared.jl              |  6 ++++++
 src/init_global_grid.jl             |  8 +++++---
 src/select_device.jl                | 14 +++++++-------
 src/shared.jl                       | 22 +++++++++++++---------
 12 files changed, 51 insertions(+), 19 deletions(-)
 create mode 100644 src/AMDGPUExt/select_device.jl
 create mode 100644 src/CUDAExt/select_device.jl

diff --git a/ext/ImplicitGlobalGrid_AMDGPUExt.jl b/ext/ImplicitGlobalGrid_AMDGPUExt.jl
index dbc5bf3..5ac806f 100644
--- a/ext/ImplicitGlobalGrid_AMDGPUExt.jl
+++ b/ext/ImplicitGlobalGrid_AMDGPUExt.jl
@@ -1,4 +1,5 @@
 module ImplicitGlobalGrid_AMDGPUExt
     include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "shared.jl"))
+    include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "select_device.jl"))
     include(joinpath(@__DIR__, "..", "src", "AMDGPUExt", "update_halo.jl"))
 end
\ No newline at end of file
diff --git a/ext/ImplicitGlobalGrid_CUDAExt.jl b/ext/ImplicitGlobalGrid_CUDAExt.jl
index 381fd59..58775fd 100644
--- a/ext/ImplicitGlobalGrid_CUDAExt.jl
+++ b/ext/ImplicitGlobalGrid_CUDAExt.jl
@@ -1,4 +1,5 @@
 module ImplicitGlobalGrid_CUDAExt
     include(joinpath(@__DIR__, "..", "src", "CUDAExt", "shared.jl"))
+    include(joinpath(@__DIR__, "..", "src", "CUDAExt", "select_device.jl"))
     include(joinpath(@__DIR__, "..", "src", "CUDAExt", "update_halo.jl"))
 end
\ No newline at end of file
diff --git a/src/AMDGPUExt/defaults.jl b/src/AMDGPUExt/defaults.jl
index 50a1523..9fec08b 100644
--- a/src/AMDGPUExt/defaults.jl
+++ b/src/AMDGPUExt/defaults.jl
@@ -3,6 +3,12 @@
 is_rocarray(A::GGArray) = false
 
 
+# select_device.jl
+
+function nb_rocdevices end
+function rocdevice! end
+
+
 # update_halo.jl
 
 function free_update_halo_rocbuffers end
diff --git a/src/AMDGPUExt/select_device.jl b/src/AMDGPUExt/select_device.jl
new file mode 100644
index 0000000..cb8cce3
--- /dev/null
+++ b/src/AMDGPUExt/select_device.jl
@@ -0,0 +1,2 @@
+ImplicitGlobalGrid.nb_rocdevices() = length(AMDGPU.devices())
+ImplicitGlobalGrid.rocdevice!(device_id) = AMDGPU.device_id!(device_id)
\ No newline at end of file
diff --git a/src/AMDGPUExt/shared.jl b/src/AMDGPUExt/shared.jl
index 50f7d3c..402cdc2 100644
--- a/src/AMDGPUExt/shared.jl
+++ b/src/AMDGPUExt/shared.jl
@@ -14,6 +14,7 @@ const ROCField{T,N} = GGField{T,N,ROCArray{T,N}}
 ## HANDLING OF CUDA AND AMDGPU SUPPORT
 
 ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_AMDGPUExt}) = true
+ImplicitGlobalGrid.is_functional(::Val{:AMDGPU}) = AMDGPU.functional()
 
 
 ##-------------
diff --git a/src/CUDAExt/defaults.jl b/src/CUDAExt/defaults.jl
index 5389086..187f4c5 100644
--- a/src/CUDAExt/defaults.jl
+++ b/src/CUDAExt/defaults.jl
@@ -3,6 +3,12 @@
 is_cuarray(A::GGArray) = false
 
 
+# select_device.jl
+
+function nb_cudevices end
+function cudevice! end
+
+
 # update_halo.jl
 
 function free_update_halo_cubuffers end
diff --git a/src/CUDAExt/select_device.jl b/src/CUDAExt/select_device.jl
new file mode 100644
index 0000000..bcffa29
--- /dev/null
+++ b/src/CUDAExt/select_device.jl
@@ -0,0 +1,2 @@
+ImplicitGlobalGrid.nb_cudevices() = length(CUDA.devices())
+ImplicitGlobalGrid.cudevice!(device_id) = CUDA.device!(device_id)
\ No newline at end of file
diff --git a/src/CUDAExt/shared.jl b/src/CUDAExt/shared.jl
index 3f6930a..93fc6dc 100644
--- a/src/CUDAExt/shared.jl
+++ b/src/CUDAExt/shared.jl
@@ -14,6 +14,7 @@ const CuField{T,N} = GGField{T,N,CuArray{T,N}}
 ## HANDLING OF CUDA AND AMDGPU SUPPORT
 
 ImplicitGlobalGrid.is_loaded(::Val{:ImplicitGlobalGrid_CUDAExt}) = true
+ImplicitGlobalGrid.is_functional(::Val{:CUDA}) = CUDA.functional(true)
 
 
 ##-------------
diff --git a/src/defaults_shared.jl b/src/defaults_shared.jl
index 7fe1e81..334dae9 100644
--- a/src/defaults_shared.jl
+++ b/src/defaults_shared.jl
@@ -1,3 +1,9 @@
+# shared.jl
+
+is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing)
+is_functional(arg) = false
+
+
 # update_halo.jl
 
 function gpusendbuf end
diff --git a/src/init_global_grid.jl b/src/init_global_grid.jl
index 0e3ed41..4f8ba63 100644
--- a/src/init_global_grid.jl
+++ b/src/init_global_grid.jl
@@ -41,7 +41,9 @@ See also: [`finalize_global_grid`](@ref), [`select_device`](@ref)
 function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0, dimy::Integer=0, dimz::Integer=0, periodx::Integer=0, periody::Integer=0, periodz::Integer=0, overlaps::Tuple{Int,Int,Int}=(2,2,2), halowidths::Tuple{Int,Int,Int}=max.(1,overlaps.÷2), disp::Integer=1, reorder::Integer=1, comm::MPI.Comm=MPI.COMM_WORLD, init_MPI::Bool=true, device_type::String=DEVICE_TYPE_AUTO, select_device::Bool=true, quiet::Bool=false)
     if grid_is_initialized() error("The global grid has already been initialized.") end
     set_cuda_loaded()
+    set_cuda_functional()
     set_amdgpu_loaded()
+    set_amdgpu_functional()
     nxyz    = [nx, ny, nz];
     dims    = [dimx, dimy, dimz];
     periods = [periodx, periody, periodz];
@@ -71,10 +73,10 @@ function init_global_grid(nx::Integer, ny::Integer, nz::Integer; dimx::Integer=0
         if haskey(ENV, "IGG_LOOPVECTORIZATION_DIMZ") loopvectorization[3] = (parse(Int64, ENV["IGG_LOOPVECTORIZATION_DIMZ"]) > 0); end
     end
     if !(device_type in [DEVICE_TYPE_NONE, DEVICE_TYPE_AUTO, DEVICE_TYPE_CUDA, DEVICE_TYPE_AMDGPU]) error("Argument `device_type`: invalid value obtained ($device_type). Valid values are: $DEVICE_TYPE_CUDA, $DEVICE_TYPE_AMDGPU, $DEVICE_TYPE_NONE, $DEVICE_TYPE_AUTO") end
-    if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && amdgpu_loaded()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
+    if ((device_type == DEVICE_TYPE_AUTO) && cuda_loaded() && cuda_functional() && amdgpu_loaded() && amdgpu_functional()) error("Automatic detection of the device type to be used not possible: both CUDA and AMDGPU extensions are loaded and functional. Set keyword argument `device_type` to $DEVICE_TYPE_CUDA or $DEVICE_TYPE_AMDGPU.") end
     if (device_type != DEVICE_TYPE_NONE)
-        if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO])   cuda_enabled   = cuda_loaded()   end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
-        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
+        if (device_type in [DEVICE_TYPE_CUDA, DEVICE_TYPE_AUTO])   cuda_enabled   = cuda_loaded() && cuda_functional()     end # NOTE: cuda could be enabled/disabled depending on some additional criteria.
+        if (device_type in [DEVICE_TYPE_AMDGPU, DEVICE_TYPE_AUTO]) amdgpu_enabled = amdgpu_loaded() && amdgpu_functional() end # NOTE: amdgpu could be enabled/disabled depending on some additional criteria.
     end
     if (any(nxyz .< 1)) error("Invalid arguments: nx, ny, and nz cannot be less than 1."); end
     if (any(dims .< 0)) error("Invalid arguments: dimx, dimy, and dimz cannot be negative."); end
diff --git a/src/select_device.jl b/src/select_device.jl
index fc3d5c0..5df62cf 100644
--- a/src/select_device.jl
+++ b/src/select_device.jl
@@ -13,22 +13,22 @@ Select the device (GPU) corresponding to the node-local MPI rank and return its
 See also: [`init_global_grid`](@ref)
 """
 function select_device()
+    check_initialized()
     if (cuda_enabled() && amdgpu_enabled()) error("Cannot select a device because both CUDA and AMDGPU are enabled (meaning that both modules were imported before ImplicitGlobalGrid).") end
     if cuda_enabled() || amdgpu_enabled()
-        check_initialized();
         if cuda_enabled()
-            @assert CUDA.functional(true)
-            nb_devices = length(CUDA.devices())
+            @assert cuda_functional()
+            nb_devices = nb_cudevices()
         elseif amdgpu_enabled()
-            @assert AMDGPU.functional()
-            nb_devices = length(AMDGPU.devices())
+            @assert amdgpu_functional()
+            nb_devices = nb_rocdevices()
         end
         comm_l = MPI.Comm_split_type(comm(), MPI.COMM_TYPE_SHARED, me())
         if (MPI.Comm_size(comm_l) > nb_devices) error("More processes have been launched per node than there are GPUs available."); end
         me_l      = MPI.Comm_rank(comm_l)
         device_id = amdgpu_enabled() ? me_l+1 : me_l
-        if cuda_enabled()       CUDA.device!(device_id)
-        elseif amdgpu_enabled() AMDGPU.device_id!(device_id)
+        if cuda_enabled()       cudevice!(device_id)
+        elseif amdgpu_enabled() rocdevice!(device_id)
         end
         return device_id
     else
diff --git a/src/shared.jl b/src/shared.jl
index f75a135..1e6a4a9 100644
--- a/src/shared.jl
+++ b/src/shared.jl
@@ -5,16 +5,20 @@ using Base.Threads
 ##------------------------------------
 ## HANDLING OF CUDA AND AMDGPU SUPPORT
 
-is_loaded(arg) = false #TODO: this would not work as it should be the caller module...: (Base.get_extension(@__MODULE__, ext) !== nothing)
-
 let
-    global cuda_loaded, amdgpu_loaded, set_cuda_loaded, set_amdgpu_loaded
-    _cuda_loaded::Bool = false
-    _amdgpu_loaded::Bool = false
-    cuda_loaded()::Bool = _cuda_loaded
-    amdgpu_loaded()::Bool = _amdgpu_loaded
-    set_cuda_loaded() = (_cuda_loaded = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt)))
-    set_amdgpu_loaded() = (_amdgpu_loaded = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt)))
+    global cuda_loaded, cuda_functional, amdgpu_loaded, amdgpu_functional, set_cuda_loaded, set_cuda_functional, set_amdgpu_loaded, set_amdgpu_functional
+    _cuda_loaded::Bool       = false
+    _cuda_functional::Bool   = false
+    _amdgpu_loaded::Bool     = false
+    _amdgpu_functional::Bool = false
+    cuda_loaded()::Bool       = _cuda_loaded
+    cuda_functional()::Bool   = _cuda_functional
+    amdgpu_loaded()::Bool     = _amdgpu_loaded
+    amdgpu_functional()::Bool = _amdgpu_functional
+    set_cuda_loaded()       = (_cuda_loaded       = is_loaded(Val(:ImplicitGlobalGrid_CUDAExt)))
+    set_cuda_functional()   = (_cuda_functional   = is_functional(Val(:CUDA)))
+    set_amdgpu_loaded()     = (_amdgpu_loaded     = is_loaded(Val(:ImplicitGlobalGrid_AMDGPUExt)))
+    set_amdgpu_functional() = (_amdgpu_functional = is_functional(Val(:AMDGPU)))
 end
 
 
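
The mechanism this patch relies on is Val-based method overriding: ImplicitGlobalGrid defines fallback stubs (is_functional, and the empty functions nb_cudevices, cudevice!, nb_rocdevices, rocdevice!) and the CUDA/AMDGPU package extensions add the concrete methods, so select_device and init_global_grid can query GPU availability without a hard dependency on CUDA.jl or AMDGPU.jl. Below is a minimal, self-contained sketch of that pattern; the module MiniIGG and the name backend_functional are illustrative only (not part of the package), and it runs in plain Julia without any GPU package installed.

    # Sketch of the fallback-plus-extension-override pattern used in this patch.
    # `MiniIGG` and `backend_functional` are hypothetical names for illustration.
    module MiniIGG
        is_functional(arg) = false                         # fallback: backend assumed not functional
        backend_functional() = is_functional(Val(:CUDA))   # analogous to what set_cuda_functional() evaluates
    end

    MiniIGG.backend_functional()                           # -> false (no "extension" method defined yet)

    # What the CUDA extension would contribute (simulated here without CUDA.jl):
    MiniIGG.is_functional(::Val{:CUDA}) = true

    MiniIGG.backend_functional()                           # -> true (dispatch now hits the override)

Keeping the empty stubs in the extensions' defaults.jl files lets select_device() reference nb_cudevices, cudevice!, nb_rocdevices and rocdevice! unconditionally; the concrete methods exist only once the corresponding extension is loaded, and the new is_functional check ensures they are only reached when the backend is actually usable.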