Merge Port Dense to Dpcpp
This PR adds DPC++ support for the current Ginkgo dense functionality.

Summary:
- port the dense kernels
- port some components, such as reduction/thread_ids ...
- use config selection to select the kernel
- add some macros/functions for the usual DPC++ cases
- add oneMKL CMake configuration
Note: there is some porting-related code that we need to revisit afterward.

Related PR: #710
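
For readers new to the porting pattern, here is a minimal sketch (hypothetical example, not code from this PR) of how a CUDA-style dense kernel maps to DPC++: the CUDA thread grid becomes a SYCL parallel_for over a range, submitted to a queue.

// Minimal sketch of the CUDA -> DPC++ porting pattern for a dense kernel.
// Hypothetical example, not code from this PR.
#include <CL/sycl.hpp>

// CUDA version (conceptually):
//   __global__ void scale(int n, double alpha, double* x) {
//       auto i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < n) x[i] *= alpha;
//   }
void scale(sycl::queue& q, int n, double alpha, double* x)
{
    // one work-item per entry replaces the CUDA thread grid
    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
        x[i] *= alpha;
    }).wait();
}

int main()
{
    sycl::queue q;  // default device selection
    const int n = 1024;
    auto* x = sycl::malloc_shared<double>(n, q);
    for (int i = 0; i < n; ++i) x[i] = 1.0;
    scale(q, n, 2.0, x);
    sycl::free(x, q);
}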
yhmtsai committed Jul 21, 2021
2 parents c101790 + d175116 commit 47ba37d
Showing 34 changed files with 3,216 additions and 92 deletions.
4 changes: 2 additions & 2 deletions .gitlab-ci.yml
@@ -754,7 +754,7 @@ build/dpcpp/opencl_igpu/release/static:
SYCL_DEVICE_FILTER: "OpenCL"
SYCL_DEVICE_TYPE: "GPU"

-build/dpcpp/level_zero_igpu/debug/static:
+build/dpcpp/level_zero_igpu/debug/shared:
<<: *default_build_with_test
extends:
- .full_test_condition
@@ -765,7 +765,7 @@ build/dpcpp/level_zero_igpu/debug/static:
CXX_COMPILER: "dpcpp"
BUILD_DPCPP: "ON"
BUILD_TYPE: "Debug"
-BUILD_SHARED_LIBS: "OFF"
+BUILD_SHARED_LIBS: "ON"
DPCPP_SINGLE_MODE: "ON"
SYCL_DEVICE_FILTER: "Level_Zero:GPU"

5 changes: 5 additions & 0 deletions cmake/GinkgoConfig.cmake.in
@@ -78,6 +78,7 @@ set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@)

set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@)
set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@)
+set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@)

set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@)

@@ -168,4 +169,8 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP)
find_package(rocrand REQUIRED)
endif()

+if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP)
+    find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}")
+endif()

include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake)
8 changes: 6 additions & 2 deletions cmake/create_test.cmake
@@ -42,6 +42,10 @@ function(ginkgo_create_dpcpp_test test_name)
target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}")
target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel)
ginkgo_set_test_target_properties(${test_name} ${test_target_name})
+    # Note: MKL_ENV is empty on Linux. We may need to apply MKL_ENV to all tests.
+    if (MKL_ENV)
+        set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}")
+    endif()
endfunction(ginkgo_create_dpcpp_test)

function(ginkgo_create_thread_test test_name)
@@ -165,7 +169,7 @@ function(ginkgo_create_common_test test_name)
# use float for DPC++ if necessary
if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)
target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1)
-endif()
+endif()
ginkgo_set_test_target_properties(${test_name}_${exec} ${test_target_name})
endforeach()
-endfunction(ginkgo_create_common_test)
+endfunction(ginkgo_create_common_test)
2 changes: 1 addition & 1 deletion common/components/prefix_sum.hpp.inc
@@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element,
total_sum = element;
#pragma unroll
// hypercube prefix sum
-for (auto step = 1; step < subwarp.size(); step *= 2) {
+for (int step = 1; step < subwarp.size(); step *= 2) {
auto neighbor = subwarp.shfl_xor(total_sum, step);
total_sum += neighbor;
prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0;
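
With `auto step = 1` the deduced type is already `int`, so this change spells the signed index type out explicitly, as the PR does consistently across the ported kernels. For context, below is a host-side sketch of the hypercube prefix sum this loop implements, with the shfl_xor exchange simulated on a plain array (illustrative only, assuming a power-of-two subwarp size; not the Ginkgo code).

#include <cstdio>

int main()
{
    const int width = 8;                       // stands in for subwarp.size()
    int total[width], prefix[width];
    for (int lane = 0; lane < width; ++lane) {
        total[lane] = lane + 1;                // element owned by each lane
        prefix[lane] = 0;                      // exclusive prefix sum
    }
    for (int step = 1; step < width; step *= 2) {
        int next[width];
        for (int lane = 0; lane < width; ++lane) {
            int neighbor = total[lane ^ step];  // shfl_xor(total_sum, step)
            next[lane] = total[lane] + neighbor;
            if (lane & step) prefix[lane] += neighbor;
        }
        for (int lane = 0; lane < width; ++lane) total[lane] = next[lane];
    }
    // prefix now holds exclusive prefix sums, total the full reduction
    for (int lane = 0; lane < width; ++lane)
        std::printf("lane %d: prefix=%d total=%d\n", lane, prefix[lane], total[lane]);
}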
12 changes: 6 additions & 6 deletions common/components/sorting.hpp.inc
@@ -70,7 +70,7 @@ struct bitonic_local {
bool reverse)
{
auto els_mid = els + (num_elements / 2);
-for (auto i = 0; i < num_elements / 2; ++i) {
+for (int i = 0; i < num_elements / 2; ++i) {
bitonic_cas(els[i], els_mid[i], reverse);
}
half::merge(els, reverse);
@@ -131,7 +131,7 @@ struct bitonic_warp {
auto tile =
group::tiled_partition<num_threads>(group::this_thread_block());
auto new_reverse = reverse != upper_half();
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
auto other = tile.shfl_xor(els[i], num_threads / 2);
bitonic_cas(els[i], other, new_reverse);
}
@@ -206,7 +206,7 @@ struct bitonic_global {
auto upper_shared_els = shared_els + (num_groups * num_threads / 2);
// only the lower group executes the CAS
if (!upper_half()) {
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
auto j = shared_idx(i);
bitonic_cas(shared_els[j], upper_shared_els[j], reverse);
}
@@ -241,11 +241,11 @@ struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
bool reverse)
{
group::this_thread_block().sync();
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
local_els[i] = shared_els[shared_idx(i)];
}
warp::merge(local_els, reverse);
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
shared_els[shared_idx(i)] = local_els[i];
}
}
@@ -258,7 +258,7 @@ struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
// This is the first step, so we don't need to load from shared memory
warp::sort(local_els, reverse);
// store the sorted elements in shared memory
-for (auto i = 0; i < num_local; ++i) {
+for (int i = 0; i < num_local; ++i) {
shared_els[shared_idx(i)] = local_els[i];
}
}
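
The loops changed above all drive bitonic sorting networks built from compare-and-swap (CAS) steps. As a rough host-side illustration of the same structure (not the Ginkgo implementation), a bitonic sort first sorts the two halves in opposite directions, then merges the resulting bitonic sequence with CASes between elements half a sequence apart:

#include <algorithm>
#include <cstdio>

void bitonic_cas(int& a, int& b, bool reverse)
{
    // swap so that a <= b, or a >= b when reverse is set
    if ((a > b) != reverse) std::swap(a, b);
}

void bitonic_merge(int* els, int n, bool reverse)
{
    if (n == 1) return;
    for (int i = 0; i < n / 2; ++i) bitonic_cas(els[i], els[i + n / 2], reverse);
    bitonic_merge(els, n / 2, reverse);
    bitonic_merge(els + n / 2, n / 2, reverse);
}

void bitonic_sort(int* els, int n, bool reverse)
{
    if (n == 1) return;
    bitonic_sort(els, n / 2, false);         // ascending lower half
    bitonic_sort(els + n / 2, n / 2, true);  // descending upper half
    bitonic_merge(els, n, reverse);          // merge the bitonic sequence
}

int main()
{
    int data[8] = {5, 1, 7, 3, 8, 2, 6, 4};  // size must be a power of two
    bitonic_sort(data, 8, false);
    for (int v : data) std::printf("%d ", v);  // 1 2 3 4 5 6 7 8
}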
7 changes: 4 additions & 3 deletions common/components/uninitialized_array.hpp.inc
@@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* Stores an array with uninitialized contents.
*
- * This class needed for datatypes that do have a non-empty constructor when`
+ * This class is needed for datatypes that do have a non-empty constructor when
* using them as shared memory, for example `thrust::complex<float>`.
*
* @tparam ValueType the type of values
@@ -49,7 +49,7 @@ public:
*
* @return the constexpr pointer to the first entry of the array.
*/
-constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept
+constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept
{
return &(*this)[0];
}
@@ -70,7 +70,8 @@ public:
*
* @return a reference to the array entry at the given index.
*/
-constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept
+constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const
+    noexcept
{
return reinterpret_cast<const ValueType *>(data_)[pos];
}
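
For context, here is a minimal sketch of such an uninitialized-array wrapper with the const-qualified accessors added above (simplified host-only version; names and details differ from the Ginkgo class):

#include <cstddef>

template <typename ValueType, std::size_t size>
class uninitialized_array {
public:
    operator const ValueType*() const noexcept { return &(*this)[0]; }
    operator ValueType*() noexcept { return &(*this)[0]; }

    const ValueType& operator[](std::size_t pos) const noexcept
    {
        return reinterpret_cast<const ValueType*>(data_)[pos];
    }
    ValueType& operator[](std::size_t pos) noexcept
    {
        return reinterpret_cast<ValueType*>(data_)[pos];
    }

private:
    // raw, suitably aligned bytes: no ValueType constructor runs, which is
    // what makes such a type usable as shared memory for non-trivial
    // ValueTypes like thrust::complex<float>
    alignas(ValueType) unsigned char data_[sizeof(ValueType) * size];
};

int main()
{
    uninitialized_array<double, 4> arr;
    arr[0] = 3.5;  // assign before any read; storage starts uninitialized
    return arr[0] == 3.5 ? 0 : 1;
}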
4 changes: 2 additions & 2 deletions common/factorization/par_ilut_filter_kernels.hpp.inc
@@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs,
auto end = row_ptrs[row + 1];
begin_cb(row);
auto num_steps = ceildiv(end - begin, subwarp_size);
-for (auto step = 0; step < num_steps; ++step) {
+for (IndexType step = 0; step < num_steps; ++step) {
auto idx = begin + lane + step * subwarp_size;
auto keep = idx < end && pred(idx, begin, end);
auto mask = subwarp.ballot(keep);
@@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter(
}


-} // namespace kernel
+} // namespace kernel
8 changes: 4 additions & 4 deletions common/factorization/par_ilut_select_kernels.hpp.inc
@@ -62,7 +62,7 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree(
// assuming rounding towards zero
auto stride = double(size) / sample_size;
#pragma unroll
-for (auto i = 0; i < sampleselect_oversampling; ++i) {
+for (int i = 0; i < sampleselect_oversampling; ++i) {
auto lidx = idx * sampleselect_oversampling + i;
auto val = input[static_cast<IndexType>(lidx * stride)];
samples[i] = abs(val);
@@ -119,7 +119,7 @@ __global__ __launch_bounds__(default_block_size) void count_buckets(
auto el = abs(input[i]);
IndexType tree_idx{};
#pragma unroll
-for (auto level = 0; level < sampleselect_searchtree_height; ++level) {
+for (int level = 0; level < sampleselect_searchtree_height; ++level) {
auto cmp = !(el < sh_tree[tree_idx]);
tree_idx = 2 * tree_idx + 1 + cmp;
}
@@ -168,7 +168,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum(
// compute prefix sum over warp-sized blocks
IndexType total{};
auto base_idx = warp_idx * work_per_warp * warp.size();
-for (auto step = 0; step < work_per_warp; ++step) {
+for (IndexType step = 0; step < work_per_warp; ++step) {
auto idx = warp_lane + step * warp.size() + base_idx;
auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
IndexType warp_total{};
@@ -207,7 +207,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum(
// add block prefix sum to each warp's block of data
block.sync();
auto warp_prefixsum = warp_sums[warp_idx];
-for (auto step = 0; step < work_per_warp; ++step) {
+for (IndexType step = 0; step < work_per_warp; ++step) {
auto idx = warp_lane + step * warp.size() + base_idx;
auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
if (idx < num_blocks) {
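
The count_buckets loop above descends an implicit complete binary tree of splitters stored in an array: the children of node i live at 2*i+1 and 2*i+2, and the comparison result is folded into the index arithmetic so the descent is branch-free. A host-side sketch with made-up values (not code from the PR):

#include <cstdio>

int main()
{
    // 3-level tree of 7 sorted splitters; its leaves are 8 buckets
    const double tree[] = {4.0, 2.0, 6.0, 1.0, 3.0, 5.0, 7.0};
    const int height = 3;
    const double el = 5.5;

    int tree_idx = 0;
    for (int level = 0; level < height; ++level) {
        // cmp is 0 for the left child, 1 for the right child
        const int cmp = !(el < tree[tree_idx]);
        tree_idx = 2 * tree_idx + 1 + cmp;
    }
    // map the leaf index back to a bucket number in [0, 8)
    const int bucket = tree_idx - 7;  // subtract the number of inner nodes
    std::printf("element %.1f falls into bucket %d\n", el, bucket);  // bucket 5
}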
2 changes: 1 addition & 1 deletion common/matrix/dense_kernels.hpp.inc
@@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = row_ptrs[tidx];
-for (auto i = 0; i < num_cols; i++) {
+for (size_type i = 0; i < num_cols; i++) {
if (source[stride * tidx + i] != zero<ValueType>()) {
values[write_to] = source[stride * tidx + i];
col_idxs[write_to] = i;
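
To show the data flow of the fill_in_csr kernel above: each row writes its nonzero values and column indices starting at the offset recorded in row_ptrs, which a prior prefix sum over the per-row nonzero counts produced. A serial host-side equivalent with made-up values (illustrative only):

#include <cstdio>

int main()
{
    const int num_rows = 3, num_cols = 3, stride = 3;
    const double source[] = {1.0, 0.0, 2.0,   // row 0
                             0.0, 0.0, 0.0,   // row 1
                             0.0, 3.0, 0.0};  // row 2
    const int row_ptrs[] = {0, 2, 2, 3};      // from a prefix sum of row counts
    double values[3];
    int col_idxs[3];

    for (int row = 0; row < num_rows; ++row) {  // one thread per row on device
        int write_to = row_ptrs[row];
        for (int i = 0; i < num_cols; ++i) {
            if (source[stride * row + i] != 0.0) {
                values[write_to] = source[stride * row + i];
                col_idxs[write_to] = i;
                ++write_to;
            }
        }
    }
    for (int k = 0; k < row_ptrs[num_rows]; ++k)
        std::printf("(%d, %g) ", col_idxs[k], values[k]);  // (0, 1) (2, 2) (1, 3)
}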
4 changes: 2 additions & 2 deletions common/matrix/ell_kernels.hpp.inc
@@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense(
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_rows) {
-for (auto col = 0; col < nnz; col++) {
+for (size_type col = 0; col < nnz; col++) {
result[tidx * result_stride +
col_idxs[tidx + col * source_stride]] +=
values[tidx + col * source_stride];
@@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = result_row_ptrs[tidx];
-for (auto i = 0; i < max_nnz_per_row; i++) {
+for (size_type i = 0; i < max_nnz_per_row; i++) {
const auto source_idx = tidx + stride * i;
if (source_values[source_idx] != zero<ValueType>()) {
result_values[write_to] = source_values[source_idx];
2 changes: 1 addition & 1 deletion common/matrix/hybrid_kernels.hpp.inc
@@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = result_row_ptrs[tidx];
-for (auto i = 0; i < max_nnz_per_row; i++) {
+for (size_type i = 0; i < max_nnz_per_row; i++) {
const auto source_idx = tidx + stride * i;
if (ell_val[source_idx] != zero<ValueType>()) {
result_values[write_to] = ell_val[source_idx];
6 changes: 3 additions & 3 deletions cuda/components/prefix_sum.cu
@@ -49,16 +49,16 @@ template <typename IndexType>
void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
size_type num_entries)
{
-// prefix_sum should be on the valid array
+// prefix_sum should only be performed on a valid array
if (num_entries > 0) {
auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
Array<IndexType> block_sum_array(exec, num_blocks - 1);
auto block_sums = block_sum_array.get_data();
start_prefix_sum<prefix_sum_block_size>
<<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
block_sums);
-// add the total sum of the previous block only when the number of block
-// is larger than 1.
+// add the total sum of the previous block only when the number of
+// blocks is larger than 1.
if (num_blocks > 1) {
finalize_prefix_sum<prefix_sum_block_size>
<<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
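
The two kernels here split the scan into two phases: start_prefix_sum scans each block locally and records the block totals, and finalize_prefix_sum adds the accumulated totals of the preceding blocks, which is only needed when there is more than one block. A serial sketch of that decomposition (illustrative, not the device code):

#include <cstdio>
#include <vector>

int main()
{
    const int block_size = 4;
    std::vector<int> counts = {1, 2, 1, 3, 2, 2, 1, 1, 4};
    const int n = static_cast<int>(counts.size());
    const int num_blocks = (n + block_size - 1) / block_size;

    // phase 1 (start_prefix_sum): exclusive scan inside each block,
    // remembering every block's total
    std::vector<int> block_sums(num_blocks, 0);
    for (int b = 0; b < num_blocks; ++b) {
        int sum = 0;
        for (int i = b * block_size; i < (b + 1) * block_size && i < n; ++i) {
            const int v = counts[i];
            counts[i] = sum;
            sum += v;
        }
        block_sums[b] = sum;
    }
    // phase 2 (finalize_prefix_sum): only needed when num_blocks > 1
    int offset = 0;
    for (int b = 1; b < num_blocks; ++b) {
        offset += block_sums[b - 1];
        for (int i = b * block_size; i < (b + 1) * block_size && i < n; ++i) {
            counts[i] += offset;
        }
    }
    for (const int v : counts) std::printf("%d ", v);  // 0 1 3 4 7 9 11 12 13
}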
2 changes: 1 addition & 1 deletion cuda/test/components/sorting_kernels.cu
@@ -99,7 +99,7 @@ protected:
{
// we want some duplicate elements
std::uniform_int_distribution<gko::int32> dist(0, num_elements / 2);
-for (auto i = 0; i < num_elements; ++i) {
+for (int i = 0; i < num_elements; ++i) {
ref_shared.get_data()[i] = dist(rng);
}
ddata = gko::Array<gko::int32>{cuda, ref_shared};
2 changes: 1 addition & 1 deletion cuda/test/matrix/dense_kernels.cpp
@@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef)
&dnnz_per_row);

auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
-for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) {
+for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
}
}
2 changes: 1 addition & 1 deletion cuda/test/matrix/ell_kernels.cpp
@@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef)
&dnnz_per_row);

auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
-for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) {
+for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
}
}
1 change: 0 additions & 1 deletion dev_tools/scripts/format_header.sh
@@ -121,7 +121,6 @@ GINKGO_LICENSE_BEACON="******************************<GINKGO LICENSE>***********

CONTENT="content.cpp" # Store the residual part (start from namespace)
BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file
-BEGIN="begin.cpp" # Store the header before license
HAS_HIP_RUNTIME="false"
DURING_LICENSE="false"
INCLUDE_REGEX="^#include.*"
2 changes: 1 addition & 1 deletion dev_tools/scripts/regroup
@@ -2,7 +2,7 @@ IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<(rapidjson|gflags|gtest|papi).*'
Priority: 3
-- Regex: '^<(omp|cu|hip|thrust|CL/|cooperative).*'
+- Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi).*'
Priority: 2
- Regex: '^<ginkgo.*'
Priority: 5
19 changes: 17 additions & 2 deletions dpcpp/CMakeLists.txt
@@ -6,11 +6,15 @@ endif()
ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION)
set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE)

+find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}")
+set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE)

add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> "")
target_sources(ginkgo_dpcpp
PRIVATE
base/version.dp.cpp
base/executor.dp.cpp
+base/helper.dp.cpp
components/absolute_array.dp.cpp
components/fill_array.dp.cpp
components/prefix_sum.dp.cpp
@@ -54,13 +58,24 @@ target_sources(ginkgo_dpcpp
ginkgo_compile_features(ginkgo_dpcpp)
target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP)

set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -fsycl)
+set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE)
target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}")
+# Note: add MKL as PRIVATE, not PUBLIC (as the MKL example shows), to avoid
+# propagating find_package(MKL) everywhere when linking ginkgo (see the MKL example
+# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html)
+target_compile_options(ginkgo_dpcpp PRIVATE $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_COMPILE_OPTIONS>)
target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17)
+target_include_directories(ginkgo_dpcpp PRIVATE $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_INCLUDE_DIRECTORIES>)
target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all)
-target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel)
+# When building ginkgo as a static library, users need to link with dpcpp
+# and the per_kernel link option when their program uses a DPC++-related function.
+if (BUILD_SHARED_LIBS)
+    target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel)
+else ()
+    target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel)
+endif()
target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
+target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP)
if (GINKGO_DPCPP_SINGLE_MODE)
target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
endif()
