diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index df3265cb58c..1a884c2b408 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -754,7 +754,7 @@ build/dpcpp/opencl_igpu/release/static: SYCL_DEVICE_FILTER: "OpenCL" SYCL_DEVICE_TYPE: "GPU" -build/dpcpp/level_zero_igpu/debug/static: +build/dpcpp/level_zero_igpu/debug/shared: <<: *default_build_with_test extends: - .full_test_condition @@ -765,7 +765,7 @@ build/dpcpp/level_zero_igpu/debug/static: CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" BUILD_TYPE: "Debug" - BUILD_SHARED_LIBS: "OFF" + BUILD_SHARED_LIBS: "ON" DPCPP_SINGLE_MODE: "ON" SYCL_DEVICE_FILTER: "Level_Zero:GPU" diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 61da405cf6f..4675bcb4781 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -78,6 +78,7 @@ set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@) set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@) set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@) +set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@) set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@) @@ -168,4 +169,8 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) find_package(rocrand REQUIRED) endif() +if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP) + find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}") +endif() + include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index ebf70232dd3..9d36b49911e 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -42,6 +42,10 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}") target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_name} ${test_target_name}) + # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. + if (MKL_ENV) + set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") + endif() endfunction(ginkgo_create_dpcpp_test) function(ginkgo_create_thread_test test_name) @@ -165,7 +169,7 @@ function(ginkgo_create_common_test test_name) # use float for DPC++ if necessary if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1) - endif() + endif() ginkgo_set_test_target_properties(${test_name}_${exec} ${test_target_name}) endforeach() -endfunction(ginkgo_create_common_test) \ No newline at end of file +endfunction(ginkgo_create_common_test) diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc index 8f759b1dc95..1d57c20b2e5 100644 --- a/common/components/prefix_sum.hpp.inc +++ b/common/components/prefix_sum.hpp.inc @@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ? 
neighbor : 0; diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc index ef5bd690937..cd772e08adb 100644 --- a/common/components/sorting.hpp.inc +++ b/common/components/sorting.hpp.inc @@ -70,7 +70,7 @@ struct bitonic_local { bool reverse) { auto els_mid = els + (num_elements / 2); - for (auto i = 0; i < num_elements / 2; ++i) { + for (int i = 0; i < num_elements / 2; ++i) { bitonic_cas(els[i], els_mid[i], reverse); } half::merge(els, reverse); @@ -131,7 +131,7 @@ struct bitonic_warp { auto tile = group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto other = tile.shfl_xor(els[i], num_threads / 2); bitonic_cas(els[i], other, new_reverse); } @@ -206,7 +206,7 @@ struct bitonic_global { auto upper_shared_els = shared_els + (num_groups * num_threads / 2); // only the lower group executes the CAS if (!upper_half()) { - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto j = shared_idx(i); bitonic_cas(shared_els[j], upper_shared_els[j], reverse); } @@ -241,11 +241,11 @@ struct bitonic_global { bool reverse) { group::this_thread_block().sync(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { local_els[i] = shared_els[shared_idx(i)]; } warp::merge(local_els, reverse); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } @@ -258,7 +258,7 @@ struct bitonic_global { // This is the first step, so we don't need to load from shared memory warp::sort(local_els, reverse); // store the sorted elements in shared memory - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc index 3a8b3796c12..e951cf06860 100644 --- a/common/components/uninitialized_array.hpp.inc +++ b/common/components/uninitialized_array.hpp.inc @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -49,7 +49,7 @@ public: * * @return the constexpr pointer to the first entry of the array. */ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -70,7 +70,8 @@ public: * * @return a reference to the array entry at the given index. 
*/ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const + noexcept { return reinterpret_cast(data_)[pos]; } diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc index 25b43e789ee..b5f7d43db67 100644 --- a/common/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/factorization/par_ilut_filter_kernels.hpp.inc @@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs, auto end = row_ptrs[row + 1]; begin_cb(row); auto num_steps = ceildiv(end - begin, subwarp_size); - for (auto step = 0; step < num_steps; ++step) { + for (IndexType step = 0; step < num_steps; ++step) { auto idx = begin + lane + step * subwarp_size; auto keep = idx < end && pred(idx, begin, end); auto mask = subwarp.ballot(keep); @@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } -} // namespace kernel \ No newline at end of file +} // namespace kernel diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc index 059069faf41..e443d7b6ba7 100644 --- a/common/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/factorization/par_ilut_select_kernels.hpp.inc @@ -62,7 +62,7 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree( // assuming rounding towards zero auto stride = double(size) / sample_size; #pragma unroll - for (auto i = 0; i < sampleselect_oversampling; ++i) { + for (int i = 0; i < sampleselect_oversampling; ++i) { auto lidx = idx * sampleselect_oversampling + i; auto val = input[static_cast(lidx * stride)]; samples[i] = abs(val); @@ -119,7 +119,7 @@ __global__ __launch_bounds__(default_block_size) void count_buckets( auto el = abs(input[i]); IndexType tree_idx{}; #pragma unroll - for (auto level = 0; level < sampleselect_searchtree_height; ++level) { + for (int level = 0; level < sampleselect_searchtree_height; ++level) { auto cmp = !(el < sh_tree[tree_idx]); tree_idx = 2 * tree_idx + 1 + cmp; } @@ -168,7 +168,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // compute prefix sum over warp-sized blocks IndexType total{}; auto base_idx = warp_idx * work_per_warp * warp.size(); - for (auto step = 0; step < work_per_warp; ++step) { + for (IndexType step = 0; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? local_counters[idx] : zero(); IndexType warp_total{}; @@ -207,7 +207,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // add block prefix sum to each warp's block of data block.sync(); auto warp_prefixsum = warp_sums[warp_idx]; - for (auto step = 0; step < work_per_warp; ++step) { + for (IndexType step = 0; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? 
local_counters[idx] : zero(); if (idx < num_blocks) { diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc index d46b202a8ff..c7ebafd0627 100644 --- a/common/matrix/dense_kernels.hpp.inc +++ b/common/matrix/dense_kernels.hpp.inc @@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 2323d512258..399dd5070ac 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( { const auto tidx = thread::get_thread_id_flat(); if (tidx < num_rows) { - for (auto col = 0; col < nnz; col++) { + for (size_type col = 0; col < nnz; col++) { result[tidx * result_stride + col_idxs[tidx + col * source_stride]] += values[tidx + col * source_stride]; @@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (source_values[source_idx] != zero()) { result_values[write_to] = source_values[source_idx]; diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc index b6af7c2be36..c7c192189e0 100644 --- a/common/matrix/hybrid_kernels.hpp.inc +++ b/common/matrix/hybrid_kernels.hpp.inc @@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (ell_val[source_idx] != zero()) { result_values[write_to] = ell_val[source_idx]; diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu index 54739c783c8..ce108fa8cf9 100644 --- a/cuda/components/prefix_sum.cu +++ b/cuda/components/prefix_sum.cu @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -57,8 +57,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, start_prefix_sum <<>>(num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. 
if (num_blocks > 1) { finalize_prefix_sum <<>>(num_entries, counts, diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu index e973cc0f650..e2b7abc51d7 100644 --- a/cuda/test/components/sorting_kernels.cu +++ b/cuda/test/components/sorting_kernels.cu @@ -99,7 +99,7 @@ protected: { // we want some duplicate elements std::uniform_int_distribution dist(0, num_elements / 2); - for (auto i = 0; i < num_elements; ++i) { + for (int i = 0; i < num_elements; ++i) { ref_shared.get_data()[i] = dist(rng); } ddata = gko::Array{cuda, ref_shared}; diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 6e40ce5b5a3..de96d27d823 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 2df1c397f4c..51c12fab531 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh index cbe1b2e8ab9..7a24f50bef6 100755 --- a/dev_tools/scripts/format_header.sh +++ b/dev_tools/scripts/format_header.sh @@ -121,7 +121,6 @@ GINKGO_LICENSE_BEACON="***************************************** CONTENT="content.cpp" # Store the residual part (start from namespace) BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file -BEGIN="begin.cpp" # Store the header before license HAS_HIP_RUNTIME="false" DURING_LICENSE="false" INCLUDE_REGEX="^#include.*" diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup index 9e8a5172d05..1756481e2e4 100644 --- a/dev_tools/scripts/regroup +++ b/dev_tools/scripts/regroup @@ -2,7 +2,7 @@ IncludeBlocks: Regroup IncludeCategories: - Regex: '^<(rapidjson|gflags|gtest|papi).*' Priority: 3 - - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative).*' + - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi).*' Priority: 2 - Regex: '^ "") target_sources(ginkgo_dpcpp PRIVATE base/version.dp.cpp base/executor.dp.cpp + base/helper.dp.cpp components/absolute_array.dp.cpp components/fill_array.dp.cpp components/prefix_sum.dp.cpp @@ -54,13 +58,24 @@ target_sources(ginkgo_dpcpp ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) -set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -fsycl) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") +# Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating +# find_package(MKL) everywhere when linking ginkgo (see the MKL example +# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) +target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp 
PRIVATE cxx_std_17) +target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) -target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +# When building ginkgo as a static library, we need to use dpcpp and per_kernel +# link option when the program uses a dpcpp related function. +if (BUILD_SHARED_LIBS) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +else () + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) +endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) +target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp new file mode 100644 index 00000000000..5e6c1a579f5 --- /dev/null +++ b/dpcpp/base/helper.dp.cpp @@ -0,0 +1,62 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include "dpcpp/base/helper.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +bool validate(sycl::queue *queue, unsigned int workgroup_size, + unsigned int subgroup_size) +{ + auto device = queue->get_device(); + auto subgroup_size_list = + device.get_info(); + auto max_workgroup_size = + device.get_info(); + bool allowed = false; + for (auto &i : subgroup_size_list) { + allowed |= (i == subgroup_size); + } + return allowed && (workgroup_size <= max_workgroup_size); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index c888eb9d99d..cb98e4c511e 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -40,31 +40,55 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include +#include "core/base/types.hpp" #include "dpcpp/base/dim3.dp.hpp" +/** + * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those + * kernels which require encoded config but do not need explicit template + * parameter and shared memory + * + * @param name_ the name of the host function with config + * @param kernel_ the kernel name + */ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, gko::size_type, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ + } + + /** * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ - inline void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template \ + inline void name_(dim3 grid, dim3 block, gko::size_type, \ + sycl::queue *queue, InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -82,7 +106,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ template \ void name_(std::uint32_t desired_cfg, dim3 grid, dim3 block, \ - size_t dynamic_shared_memory, sycl::queue *queue, \ + gko::size_type dynamic_shared_memory, sycl::queue *queue, \ InferredArgs... args) \ { \ callable_( \ @@ -114,19 +138,42 @@ namespace kernels { namespace dpcpp { +/** + * This is the validate function for common check. It checks the workgroup size + * is below device max workgroup size and subgroup size is in the supported + * subgroup size. + * + * @param queue the sycl queue pointer + * @param workgroup_size the workgroup size (block size in cuda sense) + * @param subgroup_size the subgroup size (warp size in cuda sense) + * + * @return the given arguments are valid or not in given queue. + */ bool validate(sycl::queue *queue, unsigned workgroup_size, - unsigned subgroup_size) + unsigned subgroup_size); + + +/** + * get_first_cfg will return the first valid config by validate function from + * given config array. 
+ * + * @tparam IterArr the iterable array type + * @tparam Validate the validate function type + * + * @param arr the config array + * @param verify the validate function + * + * @return the first valid config + */ +template +std::uint32_t get_first_cfg(IterArr &arr, Validate verify) { - auto device = queue->get_device(); - auto subgroup_size_list = - device.get_info(); - auto max_workgroup_size = - device.get_info(); - bool allowed = false; - for (auto &i : subgroup_size_list) { - allowed |= (i == subgroup_size); + for (auto &cfg : arr) { + if (verify(cfg)) { + return cfg; + } } - return allowed && (workgroup_size <= max_workgroup_size); + GKO_NOT_SUPPORTED(arr); } diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp new file mode 100644 index 00000000000..1c9a8dabb30 --- /dev/null +++ b/dpcpp/base/onemkl_bindings.hpp @@ -0,0 +1,131 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ +#define GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ + + +#include + + +#include +#include + + +namespace gko { +/** + * @brief The device specific kernels namespace. + * + * @ingroup kernels + */ +namespace kernels { +/** + * @brief The DPCPP namespace. + * + * @ingroup dpcpp + */ +namespace dpcpp { +/** + * @brief The ONEMKL namespace. + * + * @ingroup onemkl + */ +namespace onemkl { +/** + * @brief The detail namespace. + * + * @ingroup detail + */ +namespace detail { + + +template +inline void not_implemented(Args &&...)
GKO_NOT_IMPLEMENTED; + + +} // namespace detail + + +template +struct is_supported : std::false_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; + + +#define GKO_BIND_DOT(ValueType, Name, Func) \ + void Name(::cl::sycl::queue &exec_queue, std::int64_t n, \ + const ValueType *x, std::int64_t incx, const ValueType *y, \ + std::int64_t incy, ValueType *result) \ + { \ + Func(exec_queue, n, x, incx, y, incy, result); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +// Bind the dot for x^T * y +GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +template +GKO_BIND_DOT(ValueType, dot, detail::not_implemented); + +// Bind the conj_dot for x' * y +GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(std::complex, conj_dot, + oneapi::mkl::blas::row_major::dotc); +template +GKO_BIND_DOT(ValueType, conj_dot, detail::not_implemented); + +#undef GKO_BIND_DOT + + +} // namespace onemkl +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 4b7f816b381..63f33e9ba35 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -36,7 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include + + +#include "core/base/types.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/prefix_sum.dp.hpp" namespace gko { @@ -45,22 +50,52 @@ namespace dpcpp { namespace components { +using BlockCfg = ConfigSet<11>; + +constexpr auto block_cfg_list = + ::gko::syn::value_list(); + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) +GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, + block_cfg_list) + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_prefix_sum, + finalize_prefix_sum) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_prefix_sum_call, finalize_prefix_sum, + block_cfg_list) + + template -void prefix_sum(std::shared_ptr exec, IndexType *counts, +void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // TODO actually implement parallel prefix sum - exec->get_queue()->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{1}, [=](sycl::id<1> idx) { - IndexType sum{}; - for (size_type i = 0; i < num_entries; i++) { - sum += std::exchange(counts[i], sum); - } - }); - }); + // prefix_sum should only be performed on a valid array + if (num_entries > 0) { + auto queue = exec->get_queue(); + constexpr auto block_cfg_array = as_array(block_cfg_list); + const std::uint32_t cfg = + get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) { + return validate(queue, BlockCfg::decode<0>(cfg), 16); + }); + const auto wg_size = BlockCfg::decode<0>(cfg); + auto num_blocks = ceildiv(num_entries, wg_size); + Array block_sum_array(exec, num_blocks - 1); + auto block_sums = block_sum_array.get_data(); + start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), + num_entries, counts, block_sums); + // add the total sum of the previous block only when the number of + // blocks is larger than 1. + if (num_blocks > 1) { + finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0, + exec->get_queue(), num_entries, counts, + block_sums); + } + } } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); + // instantiate for size_type as well, as this is used in the Sellp format template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp new file mode 100644 index 00000000000..334d4239c56 --- /dev/null +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -0,0 +1,253 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ + + +#include + + +#include + + +#include "core/base/types.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +/** + * @internal + * Computes the prefix sum and total sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param total_sum will be set to the total sum of `element` in this subwarp. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note For this function to work on architectures with independent thread + * scheduling, all threads of the subwarp have to execute it. + */ +template +__dpct_inline__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, + ValueType &total_sum, Group subwarp) +{ + prefix_sum = inclusive ? element : zero(); + total_sum = element; +#pragma unroll + // hypercube prefix sum + for (int step = 1; step < subwarp.size(); step *= 2) { + auto neighbor = subwarp.shfl_xor(total_sum, step); + total_sum += neighbor; + prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0; + } +} + +/** + * @internal + * Computes the prefix sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note All threads of the subwarp have to execute this function for it to work + * (and not dead-lock on newer architectures). + */ +template +__dpct_inline__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, Group subwarp) +{ + ValueType tmp{}; + subwarp_prefix_sum(element, prefix_sum, tmp, subwarp); +} + + +/** + * @internal + * First step of the calculation of a prefix sum. Calculates the prefix sum + * in-place on parts of the array `elements`. 
+ * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array which stores the total sum of each block, requires at + * least `ceildiv(num_elements, block_size) - 1` elements + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, also size of blocks on + * which this kernel calculates the prefix sum in-place + * + * @note To calculate the prefix sum over an array of size bigger than + * `block_size`, `finalize_prefix_sum` has to be used as well. + */ +template +void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, + ValueType *__restrict__ block_sum, + sycl::nd_item<3> item_ct1, + UninitializedArray *prefix_helper) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + const auto element_id = item_ct1.get_local_id(2); + + // do not need to access the last element when exclusive prefix sum + (*prefix_helper)[element_id] = + (tidx + 1 < num_elements) ? elements[tidx] : zero(); + auto this_block = group::this_thread_block(item_ct1); + this_block.sync(); + + // Do a normal reduction +#pragma unroll + for (int i = 1; i < block_size; i <<= 1) { + const auto ai = i * (2 * element_id + 1) - 1; + const auto bi = i * (2 * element_id + 2) - 1; + if (bi < block_size) { + (*prefix_helper)[bi] += (*prefix_helper)[ai]; + } + this_block.sync(); + } + + if (element_id == 0) { + // Store the total sum except the last block + if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) { + block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1]; + } + (*prefix_helper)[block_size - 1] = zero(); + } + + this_block.sync(); + + // Perform the down-sweep phase to get the true prefix sum +#pragma unroll + for (int i = block_size >> 1; i > 0; i >>= 1) { + const auto ai = i * (2 * element_id + 1) - 1; + const auto bi = i * (2 * element_id + 2) - 1; + if (bi < block_size) { + auto tmp = (*prefix_helper)[ai]; + (*prefix_helper)[ai] = (*prefix_helper)[bi]; + (*prefix_helper)[bi] += tmp; + } + this_block.sync(); + } + if (tidx < num_elements) { + elements[tidx] = (*prefix_helper)[element_id]; + } +} + +template +void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_elements, + ValueType *elements, ValueType *block_sum) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + prefix_helper_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + start_prefix_sum( + num_elements, elements, block_sum, item_ct1, + prefix_helper_acc_ct1.get_pointer().get()); + }); + }); +} + + +/** + * @internal + * Second step of the calculation of a prefix sum. Increases the value of each + * entry of `elements` by the total sum of all preceding blocks. + * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array storing the total sum of each block + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, has to be the same as + * for `start_prefix_sum` + * + * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. 
+ */ +template +void finalize_prefix_sum(size_type num_elements, + ValueType *__restrict__ elements, + const ValueType *__restrict__ block_sum, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < num_elements) { + ValueType prefix_block_sum = zero(); + for (size_type i = 0; i < item_ct1.get_group(2); i++) { + prefix_block_sum += block_sum[i]; + } + elements[tidx] += prefix_block_sum; + } +} + +template +void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_elements, + ValueType *elements, const ValueType *block_sum) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + finalize_prefix_sum( + num_elements, elements, block_sum, item_ct1); + }); + }); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp new file mode 100644 index 00000000000..094f2093a95 --- /dev/null +++ b/dpcpp/components/reduction.dp.hpp @@ -0,0 +1,298 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ + + +#include + + +#include + + +#include +#include +#include + + +#include "core/base/types.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +constexpr int default_block_size = 256; +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::value_list(); +constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on a group + * `group`. Each thread contributes with one element `local_data`. The local + * thread element is always passed as the first parameter to the `reduce_op`. + * The function returns the result of the reduction on all threads. + * + * @note The function is guaranteed to return the correct value on all threads + * only if `reduce_op` is commutative (in addition to being associative). + * Otherwise, the correct value is returned only to the thread with + * subwarp index 0. + */ +template < + typename Group, typename ValueType, typename Operator, + typename = std::enable_if_t::value>> +__dpct_inline__ ValueType reduce(const Group &group, ValueType local_data, + Operator reduce_op = Operator{}) +{ +#pragma unroll + for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) { + const auto remote_data = group.shfl_xor(local_data, bitmask); + local_data = reduce_op(local_data, remote_data); + } + return local_data; +} + + +/** + * @internal + * + * Returns the index of the thread that has the element with the largest + * magnitude among all the threads in the group. + * Only the values from threads which set `is_pivoted` to `false` will be + * considered. + */ +template < + typename Group, typename ValueType, + typename = std::enable_if_t::value>> +__dpct_inline__ int choose_pivot(const Group &group, ValueType local_data, + bool is_pivoted) +{ + using real = remove_complex; + real lmag = is_pivoted ? -one() : abs(local_data); + const auto pivot = + reduce(group, group.thread_rank(), [&](int lidx, int ridx) { + const auto rmag = group.shfl(lmag, ridx); + if (rmag > lmag) { + lmag = rmag; + lidx = ridx; + } + return lidx; + }); + // pivot operator not commutative, make sure everyone has the same pivot + return group.shfl(pivot, 0); +} + + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on entire block. + * The data for the reduction is taken from the `data` array which has to be of + * size `block_size` and accessible from all threads. The `data` array is also + * used as work space (so its content will be destroyed in the process), as well + * as to store the return value - which is stored in the 0-th position of the + * array. 
+ */ +template < + unsigned int sg_size = 32, typename Group, typename ValueType, + typename Operator, + typename = std::enable_if_t::value>> +void reduce(const Group &__restrict__ group, ValueType *__restrict__ data, + Operator reduce_op = Operator{}) +{ + const auto local_id = group.thread_rank(); + + for (int k = group.size() / 2; k >= sg_size; k /= 2) { + group.sync(); + if (local_id < k) { + data[local_id] = reduce_op(data[local_id], data[local_id + k]); + } + } + + const auto warp = group::tiled_partition(group); + const auto warp_id = group.thread_rank() / warp.size(); + if (warp_id > 0) { + return; + } + auto result = ::gko::kernels::dpcpp::reduce(warp, data[warp.thread_rank()], + reduce_op); + if (warp.thread_rank() == 0) { + data[0] = result; + } +} + + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on an array + * `source` of any size. Has to be called a second time on `result` to reduce + * an array larger than `block_size`. + */ +template +void reduce_array(size_type size, const ValueType *__restrict__ source, + ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, + Operator reduce_op = Operator{}) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto thread_result = zero(); + for (auto i = tidx; i < size; + i += item_ct1.get_local_range().get(2) * item_ct1.get_group_range(2)) { + thread_result = reduce_op(thread_result, source[i]); + } + result[item_ct1.get_local_id(2)] = thread_result; + + group::this_thread_block(item_ct1).sync(); + + // Stores the result of the reduction inside `result[0]` + reduce(group::this_thread_block(item_ct1), result, reduce_op); +} + + +/** + * @internal + * + * Computes a reduction using the add operation (+) on an array + * `source` of any size. Has to be called a second time on `result` to reduce + * an array larger than `block_size`. + */ +template +void reduce_add_array( + size_type size, const ValueType *__restrict__ source, + ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *block_sum) +{ + reduce_array(cfg)>( + size, source, static_cast((*block_sum)), item_ct1, + [](const ValueType &x, const ValueType &y) { return x + y; }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = (*block_sum)[0]; + } +} + +template +void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *source, ValueType *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor(cfg)>, + 0, sycl::access::mode::read_write, + sycl::access::target::local> + block_sum_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_add_array(size, source, result, item_ct1, + block_sum_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, + reduce_add_array); + +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, + kcfg_1d_list); + + +/** + * Compute a reduction using add operation (+). 
+ * + * @param exec Executor associated to the array + * @param size size of the array + * @param source the pointer of the array + * + * @return the reduction result + */ +template +ValueType reduce_add_array(std::shared_ptr exec, + size_type size, const ValueType *source) +{ + auto block_results_val = source; + size_type grid_dim = size; + auto block_results = Array(exec); + ValueType answer = zero(); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + if (size > wg_size) { + const auto n = ceildiv(size, wg_size); + grid_dim = (n <= wg_size) ? n : wg_size; + + block_results.resize_and_reset(grid_dim); + + reduce_add_array_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), + size, source, block_results.get_data()); + + block_results_val = block_results.get_const_data(); + } + + auto d_result = Array(exec, 1); + + reduce_add_array_call(cfg, 1, wg_size, 0, exec->get_queue(), grid_dim, + block_results_val, d_result.get_data()); + answer = exec->copy_val_to_host(d_result.get_const_data()); + return answer; +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp new file mode 100644 index 00000000000..70ad76d9ccb --- /dev/null +++ b/dpcpp/components/thread_ids.dp.hpp @@ -0,0 +1,327 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_ + + +#include + + +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dpct.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The DPCPP thread namespace. + * + * @ingroup dpcpp_thread + */ +namespace thread { + + +// TODO: porting - need to refine the functions and their names in this file +// the grid/block description is given with the cuda dim3 type, i.e. dim3 is used +// to launch dpcpp kernels; the kernel reverses the ordering to keep the same +// linear memory usage as cuda. + + +/** + * @internal + * + * Returns the ID of the block group this thread belongs to. + * + * @return the ID of the block group this thread belongs to + * + * @note Assumes that grid dimensions are in cuda standard format: + * `(block_group_size, first_grid_dimension, second grid_dimension)` + */ +__dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1) +{ + return static_cast(item_ct1.get_group(0)) * + item_ct1.get_group_range(1) + + item_ct1.get_group(1); +} + +/** + * @internal + * + * Returns the ID of the block this thread belongs to. + * + * @return the ID of the block this thread belongs to + * + * @note Assumes that grid dimensions are in cuda standard format: + * `(block_group_size, first_grid_dimension, second grid_dimension)` + */ +__dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1) +{ + return get_block_group_id(item_ct1) * item_ct1.get_group_range(2) + + item_ct1.get_group(2); +} + + +/** + * @internal + * + * Returns the local ID of the warp (relative to the block) this thread belongs + * to. + * + * @return the local ID of the warp (relative to the block) this thread belongs + * to + * + * @note Assumes that block dimensions are in cuda standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +__dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1) +{ + return static_cast(item_ct1.get_local_id(0)); +} + + +/** + * @internal + * + * Returns the local ID of the sub-warp (relative to the block) this thread + * belongs to. + * + * @tparam subwarp_size size of the subwarp + * + * @return the local ID of the sub-warp (relative to the block) this thread + * belongs to + * + * @note Assumes that block dimensions are in cuda standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +template +__dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1) +{ + // dpcpp does not have subwarp. + constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; + return get_local_warp_id(item_ct1) * subwarps_per_warp + + item_ct1.get_local_id(1); +} + + +/** + * @internal + * + * Returns the local ID of the thread (relative to the block). + * + * @tparam subwarp_size size of the subwarp + * + * @return the local ID of the thread (relative to the block) + * + * @note Assumes that block dimensions are in cuda standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +template +__dpct_inline__ size_type get_local_thread_id(sycl::nd_item<3> item_ct1) +{ + return get_local_subwarp_id(item_ct1) * subwarp_size + + item_ct1.get_local_id(2); +} + + +/** + * @internal + * + * Returns the global ID of the warp this thread belongs to.
+ * + * @tparam warps_per_block number of warps within each block + * + * @return the global ID of the warp this thread belongs to. + * + * @note Assumes that block dimensions and grid dimensions are in cuda standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) +{ + return get_block_id(item_ct1) * warps_per_block + + get_local_warp_id(item_ct1); +} + + +/** + * @internal + * + * Returns the global ID of the sub-warp this thread belongs to. + * + * @tparam subwarp_size size of the subwarp + * + * @return the global ID of the sub-warp this thread belongs to. + * + * @note Assumes that block dimensions and grid dimensions are in cuda standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) +{ + // dpcpp does not have subwarp + constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; + return get_warp_id(item_ct1) * subwarps_per_warp + + item_ct1.get_local_id(1); +} + + +/** + * @internal + * + * Returns the global ID of the thread. + * + * @return the global ID of the thread. + * + * @tparam subwarp_size size of the subwarp + * + * @note Assumes that block dimensions and grid dimensions are in cuda standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1) +{ + return get_subwarp_id(item_ct1) * + subwarp_size + + item_ct1.get_local_id(2); +} + + +/** + * @internal + * + * Returns the global ID of the thread in the given index type. + * This function assumes one-dimensional thread and block indexing in cuda + * sense. It uses the position information from the third dimension. + * + * @return the global ID of the thread in the given index type. + * + * @tparam IndexType the index type + */ +template +__dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1) +{ + return item_ct1.get_local_id(2) + + static_cast(item_ct1.get_local_range().get(2)) * + item_ct1.get_group(2); +} + + +/** + * @internal + * + * Returns the total number of threads in the given index type. + * This function assumes one-dimensional thread and block indexing in cuda + * sense. It uses the position information from the third dimension. + * + * @return the total number of threads in the given index type. + * + * @tparam IndexType the index type + */ +template +__dpct_inline__ IndexType get_thread_num_flat(sycl::nd_item<3> item_ct1) +{ + return item_ct1.get_local_range().get(2) * + static_cast(item_ct1.get_group_range(2)); +} + + +/** + * @internal + * + * Returns the global ID of the subwarp in the given index type. + * This function assumes one-dimensional thread and block indexing in cuda sense + * with a power of two block size of at least subwarp_size. + * + * @return the global ID of the subwarp in the given index type. + * + * @tparam subwarp_size the size of the subwarp. Must be a power of two!
+ * @tparam IndexType the index type + */ +template +__dpct_inline__ IndexType get_subwarp_id_flat(sycl::nd_item<3> item_ct1) +{ + static_assert(!(subwarp_size & (subwarp_size - 1)), + "subwarp_size must be a power of two"); + return item_ct1.get_local_id(2) / subwarp_size + + static_cast(item_ct1.get_local_range().get(2) / + subwarp_size) * + item_ct1.get_group(2); +} + + +/** + * @internal + * + * Returns the total number of subwarps in the given index type. + * This function assumes one-dimensional thread and block indexing in cuda sense + * with a power of two block size of at least subwarp_size. + * + * @return the total number of subwarps in the given index type. + * + * @tparam subwarp_size the size of the subwarp. Must be a power of two! + * @tparam IndexType the index type + */ +template +__dpct_inline__ IndexType get_subwarp_num_flat(sycl::nd_item<3> item_ct1) +{ + static_assert(!(subwarp_size & (subwarp_size - 1)), + "subwarp_size must be a power of two"); + return item_ct1.get_local_range().get(2) / subwarp_size * + static_cast(item_ct1.get_group_range(2)); +} + + +} // namespace thread +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_ diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp new file mode 100644 index 00000000000..eb8a36770d7 --- /dev/null +++ b/dpcpp/components/uninitialized_array.hpp @@ -0,0 +1,121 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ +#define GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ + + +#include + + +#include "dpcpp/base/dpct.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +// TODO: porting - consider directly use the array as shared memory + + +/** + * Stores an array with uninitialized contents. 
+ *
+ * This class is needed for datatypes that do have a non-empty constructor when
+ * using them as shared memory, for example `thrust::complex`.
+ *
+ * @tparam ValueType  the type of values
+ * @tparam size  the size of the array
+ */
+template <typename ValueType, size_type size>
+class UninitializedArray {
+public:
+    /**
+     * Operator for casting an UninitializedArray into its constexpr value
+     * pointer.
+     *
+     * @return the constexpr pointer to the first entry of the array.
+     */
+    constexpr __dpct_inline__ operator const ValueType *() const noexcept
+    {
+        return &(*this)[0];
+    }
+
+    /**
+     * Operator for casting an UninitializedArray into its non-const value
+     * pointer.
+     *
+     * @return the non-const pointer to the first entry of the array.
+     */
+    __dpct_inline__ operator ValueType *() noexcept { return &(*this)[0]; }
+
+    /**
+     * constexpr array access operator.
+     *
+     * @param pos The array index. Using a value outside [0, size) is undefined
+     * behavior.
+     *
+     * @return a reference to the array entry at the given index.
+     */
+    constexpr __dpct_inline__ const ValueType &operator[](size_type pos) const
+        noexcept
+    {
+        return data_[pos];
+    }
+
+    /**
+     * Non-const array access operator.
+     *
+     * @param pos The array index. Using a value outside [0, size) is undefined
+     * behavior.
+     *
+     * @return a reference to the array entry at the given index.
+     */
+    __dpct_inline__ ValueType &operator[](size_type pos) noexcept
+    {
+        return data_[pos];
+    }
+
+private:
+    // If dpcpp represented the data as a char array, compilation would fail.
+    // Thankfully, dpcpp supports allocating complex data directly.
+    ValueType data_[size];
+};
+
+
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 531cfd52259..b17e44f9706 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -33,25 +33,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/matrix/dense_kernels.hpp" -#include - - #include +#include -#include #include #include #include #include #include #include -#include #include #include #include "core/components/prefix_sum.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" namespace gko { @@ -65,11 +69,625 @@ namespace dpcpp { namespace dense { +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::value_list(); +constexpr auto subgroup_list = + syn::value_list(); +constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); +constexpr auto default_block_size = 256; + + +namespace kernel { + + +template +void compute_partial_reduce( + size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value, + CallableReduce reduce_op, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + + constexpr auto warps_per_block = wg_size / sg_size; + + const auto num_blocks = item_ct1.get_group_range(2); + const auto local_id = thread::get_local_thread_id(item_ct1); + const auto global_id = + thread::get_thread_id(item_ct1); + + OutType *tmp_work_array = *tmp_work; + auto tmp = zero(); + for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { + tmp = reduce_op(tmp, get_value(i)); + } + + tmp_work_array[local_id] = tmp; + + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, reduce_op); + + if (local_id == 0) { + work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; + } +} + + +template +void finalize_reduce_computation( + size_type size, const ValueType *work, ValueType *result, + CallableReduce reduce_op, CallableFinalize finalize_op, + sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto local_id = thread::get_local_thread_id(item_ct1); + + ValueType tmp = zero(); + for (auto i = local_id; i < size; i += wg_size) { + tmp = reduce_op(tmp, work[i]); + } + ValueType *tmp_work_array = *tmp_work; + tmp_work_array[local_id] = tmp; + + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, reduce_op); + + if (local_id == 0) { + *result = finalize_op(tmp_work_array[0]); + } +} + + +template +void compute_partial_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return x[i * stride_x] * y[i * stride_y]; + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + const ValueType *x, size_type stride_x, + const ValueType *y, size_type stride_y, + ValueType *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + 
tmp_work_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_dot(num_rows, x, stride_x, y, stride_y, + work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, + compute_partial_dot) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, + kcfg_1d_list) + + +template +void compute_partial_conj_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return conj(x[i * stride_x]) * y[i * stride_y]; + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_conj_dot(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, const ValueType *x, + size_type stride_x, const ValueType *y, + size_type stride_y, ValueType *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_conj_dot( + num_rows, x, stride_x, y, stride_y, work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, + compute_partial_conj_dot) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, + compute_partial_conj_dot, kcfg_1d_list) + + +template +void finalize_sum_reduce_computation( + size_type size, const ValueType *work, ValueType *result, + sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return x; }, item_ct1, tmp_work); +} + +template +void finalize_sum_reduce_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + finalize_sum_reduce_computation( + size, work, result, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation, + finalize_sum_reduce_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, + finalize_sum_reduce_computation, kcfg_1d_list) + + +template +void compute_partial_norm2( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray, KCFG_1D::decode<0>(cfg)> + *tmp_work) +{ + using norm_type = remove_complex; + compute_partial_reduce( + num_rows, work, + [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, + [](const norm_type &x, const norm_type &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_norm2(dim3 grid, 
dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + const ValueType *x, size_type stride_x, + remove_complex *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, wg_size>, + 0, sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + compute_partial_norm2( + num_rows, x, stride_x, work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, + compute_partial_norm2) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, + compute_partial_norm2, kcfg_1d_list) + + +template +void finalize_sqrt_reduce_computation( + size_type size, const ValueType *work, ValueType *result, + sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return std::sqrt(x); }, item_ct1, tmp_work); +} + +template +void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + finalize_sqrt_reduce_computation( + size, work, result, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation, + finalize_sqrt_reduce_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, + finalize_sqrt_reduce_computation, kcfg_1d_list) + + +template +void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, + const size_type *__restrict__ row_ptrs, + const ValueType *__restrict__ source, + IndexType *__restrict__ row_idxs, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + if (tidx < num_rows) { + size_type write_to = row_ptrs[tidx]; + + for (size_type i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + row_idxs[write_to] = tidx; + write_to++; + } + } + } +} + +GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) + + +template +void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ work, + IndexType *__restrict__ result, + sycl::nd_item<3> item_ct1) +{ + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto row_idx = thread::get_subwarp_id_flat(item_ct1); + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + + if (row_idx < num_rows) { + IndexType part_result{}; + for (auto i = warp_tile.thread_rank(); i < num_cols; i += sg_size) { + if (work[stride * row_idx + i] != zero()) { + part_result += 1; + } + } + result[row_idx] = ::gko::kernels::dpcpp::reduce( + warp_tile, part_result, + [](const size_type &a, const size_type &b) { return a + b; }); + } +} + +GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row) 
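+// Sketch of how the configurable kernels are exposed: the
+// GKO_ENABLE_DEFAULT_HOST_CONFIG / GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION /
+// GKO_ENABLE_DEFAULT_CONFIG_CALL macros appear to generate, respectively, a
+// config-templated host launcher, one instantiation per entry of
+// kcfg_1d_list, and a runtime dispatch function. The host code further down
+// validates a (workgroup size, subgroup size) encoding for the current queue
+// and then goes through the `_call` entry point, e.g. in
+// calculate_nonzeros_per_row:
+//
+//     kernel::count_nnz_per_row_call(
+//         cfg, grid_size, block_size, 0, exec->get_queue(),
+//         source->get_size()[0], source->get_size()[1], source->get_stride(),
+//         source->get_const_values(), result->get_data());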
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row) +GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, + kcfg_1d_list) + + +template +void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ source, + IndexType *__restrict__ row_ptrs, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < num_rows) { + auto write_to = row_ptrs[tidx]; + for (size_type i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + write_to++; + } + } + } +} + +GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) + + +template +void fill_in_ell(size_type num_rows, size_type num_cols, + size_type source_stride, const ValueType *__restrict__ source, + size_type max_nnz_per_row, size_type result_stride, + IndexType *__restrict__ col_ptrs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + if (tidx < num_rows) { + IndexType col_idx = 0; + for (size_type col = 0; col < num_cols; col++) { + if (source[tidx * source_stride + col] != zero()) { + col_ptrs[col_idx * result_stride + tidx] = col; + values[col_idx * result_stride + tidx] = + source[tidx * source_stride + col]; + col_idx++; + } + } + for (size_type j = col_idx; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } else if (tidx < result_stride) { + for (size_type j = 0; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } +} + +GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) + + +template +void calculate_slice_lengths(size_type num_rows, size_type slice_size, + int slice_num, size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ slice_lengths, + size_type *__restrict__ slice_sets, + sycl::nd_item<3> item_ct1) +{ + constexpr auto sg_size = cfg; + const auto sliceid = item_ct1.get_group(2); + const auto tid_in_warp = item_ct1.get_local_id(2); + const bool runable = sliceid * slice_size + tid_in_warp < num_rows; + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { + thread_result = + (i + slice_size * sliceid < num_rows) + ? 
max(thread_result, nnz_per_row[sliceid * slice_size + i]) + : thread_result; + } + + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + auto warp_result = ::gko::kernels::dpcpp::reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0 && runable) { + auto slice_length = ceildiv(warp_result, stride_factor) * stride_factor; + slice_lengths[sliceid] = slice_length; + slice_sets[sliceid] = slice_length; + } +} + +GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, + calculate_slice_lengths) +GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, + calculate_slice_lengths, subgroup_list) + + +template +void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, + size_type stride, const ValueType *__restrict__ source, + size_type *__restrict__ slice_lengths, + size_type *__restrict__ slice_sets, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ vals, sycl::nd_item<3> item_ct1) +{ + const auto global_row = thread::get_thread_id_flat(item_ct1); + const auto row = global_row % slice_size; + const auto sliceid = global_row / slice_size; + + if (global_row < num_rows) { + size_type sellp_ind = slice_sets[sliceid] * slice_size + row; + + for (size_type col = 0; col < num_cols; col++) { + auto val = source[global_row * stride + col]; + if (val != zero()) { + col_idxs[sellp_ind] = col; + vals[sellp_ind] = val; + sellp_ind += slice_size; + } + } + for (size_type i = sellp_ind; + i < + (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; + i += slice_size) { + col_idxs[i] = 0; + vals[i] = zero(); + } + } +} + +GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) + + +template +void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result, sycl::nd_item<3> item_ct1, + uint8_t *dpct_local) +{ + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + auto block_max = (size_type *)dpct_local; + + reduce_array( + size, nnz_per_row, block_max, item_ct1, + [](const size_type &x, const size_type &y) { return max(x, y); }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = block_max[0]; + } +} + +template +void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const size_type *nnz_per_row, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor + dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); + + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz(size, nnz_per_row, result, item_ct1, + dpct_local_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, + kcfg_1d_list) + + +template +void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, + size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result, + sycl::nd_item<3> item_ct1) +{ + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto warpid = thread::get_subwarp_id_flat(item_ct1); + const auto tid_in_warp = warp_tile.thread_rank(); + const auto slice_num = ceildiv(num_rows, slice_size); + + size_type thread_result 
= 0; + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { + if (warpid * slice_size + i < num_rows) { + thread_result = + max(thread_result, nnz_per_row[warpid * slice_size + i]); + } + } + + auto warp_result = ::gko::kernels::dpcpp::reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0 && warpid < slice_num) { + result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; + } +} + +GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice, + reduce_max_nnz_per_slice) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice, + reduce_max_nnz_per_slice) +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, + reduce_max_nnz_per_slice, kcfg_1d_list) + + +template +void reduce_total_cols(size_type num_slices, + const size_type *__restrict__ max_nnz_per_slice, + size_type *__restrict__ result, + sycl::nd_item<3> item_ct1, uint8_t *dpct_local) +{ + auto block_result = (size_type *)dpct_local; + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + reduce_array( + num_slices, max_nnz_per_slice, block_result, item_ct1, + [](const size_type &x, const size_type &y) { return x + y; }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = block_result[0]; + } +} + +template +void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_slices, + const size_type *max_nnz_per_slice, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor + dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_total_cols(num_slices, max_nnz_per_slice, result, + item_ct1, + dpct_local_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols, + reduce_total_cols); +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, + kcfg_1d_list) + + +} // namespace kernel + + template void simple_apply(std::shared_ptr exec, const matrix::Dense *a, const matrix::Dense *b, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + matrix::Dense *c) +{ + using namespace oneapi::mkl; + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], one(), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), zero(), c->get_values(), c->get_stride()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -78,8 +696,17 @@ template void apply(std::shared_ptr exec, const matrix::Dense *alpha, const matrix::Dense *a, const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + const matrix::Dense *beta, matrix::Dense *c) +{ + using namespace oneapi::mkl; + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); @@ -88,7 +715,43 @@ template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if 
(x->get_size()[1] == 1) { + // TODO: write a custom kernel which does this more efficiently + onemkl::dot(*exec->get_queue(), x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_dot_call( + cfg, grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_sum_reduce_computation_call( + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); @@ -97,7 +760,46 @@ template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if (x->get_size()[1] == 1) { + // TODO: write a custom kernel which does this more efficiently + onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values(), x->get_stride(), + y->get_const_values(), y->get_stride(), + result->get_values()); + + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_conj_dot_call( + cfg, grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_sum_reduce_computation_call( + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); @@ -106,7 +808,43 @@ template void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, matrix::Dense> 
*result) - GKO_NOT_IMPLEMENTED; +{ + if (x->get_size()[1] == 1) { + oneapi::mkl::blas::row_major::nrm2( + *exec->get_queue(), x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); + } else { + using norm_type = remove_complex; + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_norm2_call( + cfg, grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + work.get_data()); + kernel::finalize_sqrt_reduce_computation_call( + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); @@ -115,7 +853,36 @@ template void convert_to_coo(std::shared_ptr exec, const matrix::Dense *source, matrix::Coo *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_idxs = result->get_row_idxs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto stride = source->get_stride(); + + auto nnz_prefix_sum = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); + + components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); + + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + size_type grid_dim = ceildiv(num_rows, wg_size); + + kernel::fill_in_coo(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, stride, nnz_prefix_sum.get_const_data(), + source->get_const_values(), row_idxs, col_idxs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); @@ -125,7 +892,41 @@ template void convert_to_csr(std::shared_ptr exec, const matrix::Dense *source, matrix::Csr *result) - GKO_NOT_IMPLEMENTED; +{ + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto 
stride = source->get_stride(); + + const auto rows_per_block = ceildiv(wg_size, sg_size); + const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); + + kernel::count_nnz_per_row_call( + cfg, grid_dim_nnz, wg_size, 0, exec->get_queue(), num_rows, num_cols, + stride, source->get_const_values(), row_ptrs); + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + + size_type grid_dim = ceildiv(num_rows, wg_size); + + kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, stride, source->get_const_values(), + row_ptrs, col_idxs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); @@ -135,7 +936,31 @@ template void convert_to_ell(std::shared_ptr exec, const matrix::Dense *source, matrix::Ell *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + auto max_nnz_per_row = result->get_num_stored_elements_per_row(); + + auto col_ptrs = result->get_col_idxs(); + auto values = result->get_values(); + + auto source_stride = source->get_stride(); + auto result_stride = result->get_stride(); + + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + auto grid_dim = ceildiv(result_stride, wg_size); + kernel::fill_in_ell(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, source_stride, source->get_const_values(), + max_nnz_per_row, result_stride, col_ptrs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); @@ -155,7 +980,56 @@ template void convert_to_sellp(std::shared_ptr exec, const matrix::Dense *source, matrix::Sellp *result) - GKO_NOT_IMPLEMENTED; +{ + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto stride = source->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + + auto vals = result->get_values(); + auto col_idxs = result->get_col_idxs(); + auto slice_lengths = result->get_slice_lengths(); + auto slice_sets = result->get_slice_sets(); + + const auto slice_size = (result->get_slice_size() == 0) + ? matrix::default_slice_size + : result->get_slice_size(); + const auto stride_factor = (result->get_stride_factor() == 0) + ? 
matrix::default_stride_factor + : result->get_stride_factor(); + const int slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto grid_dim = slice_num; + + if (grid_dim > 0) { + kernel::calculate_slice_lengths_call( + sg_size, grid_dim, sg_size, 0, exec->get_queue(), num_rows, + slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), + slice_lengths, slice_sets); + } + + components::prefix_sum(exec, slice_sets, slice_num + 1); + + grid_dim = ceildiv(num_rows, wg_size); + if (grid_dim > 0) { + kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, slice_size, stride, + source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); @@ -173,8 +1047,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzeros(std::shared_ptr exec, - const matrix::Dense *source, - size_type *result) GKO_NOT_IMPLEMENTED; + const matrix::Dense *source, size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); @@ -182,7 +1063,37 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); template void calculate_max_nnz_per_row(std::shared_ptr exec, const matrix::Dense *source, - size_type *result) GKO_NOT_IMPLEMENTED; + size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto n = ceildiv(num_rows, wg_size); + const size_type grid_dim = (n <= wg_size) ? 
n : wg_size; + + auto block_results = Array(exec, grid_dim); + + kernel::reduce_max_nnz_call( + cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + num_rows, nnz_per_row.get_const_data(), block_results.get_data()); + + auto d_result = Array(exec, 1); + + kernel::reduce_max_nnz_call( + cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + grid_dim, block_results.get_const_data(), d_result.get_data()); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); @@ -191,7 +1102,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void calculate_nonzeros_per_row(std::shared_ptr exec, const matrix::Dense *source, - Array *result) GKO_NOT_IMPLEMENTED; + Array *result) +{ + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + const dim3 block_size(wg_size, 1, 1); + auto rows_per_block = ceildiv(wg_size, sg_size); + const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); + const dim3 grid_size(grid_x, 1, 1); + if (grid_x > 0) { + kernel::count_nnz_per_row_call( + cfg, grid_size, block_size, 0, exec->get_queue(), + source->get_size()[0], source->get_size()[1], source->get_stride(), + source->get_const_values(), result->get_data()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); @@ -201,7 +1133,56 @@ template void calculate_total_cols(std::shared_ptr exec, const matrix::Dense *source, size_type *result, size_type stride_factor, - size_type slice_size) GKO_NOT_IMPLEMENTED; + size_type slice_size) +{ + const auto num_rows = source->get_size()[0]; + + if (num_rows == 0) { + *result = 0; + return; + } + + const auto num_cols = source->get_size()[1]; + const auto slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto max_nnz_per_slice = Array(exec, slice_num); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + auto grid_dim = ceildiv(slice_num * sg_size, wg_size); + + kernel::reduce_max_nnz_per_slice_call( + cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, + stride_factor, nnz_per_row.get_const_data(), + max_nnz_per_slice.get_data()); + + grid_dim = ceildiv(slice_num, wg_size); + auto block_results = Array(exec, grid_dim); + + kernel::reduce_total_cols_call( + cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + slice_num, max_nnz_per_slice.get_const_data(), + block_results.get_data()); + + auto d_result = Array(exec, 1); + + kernel::reduce_total_cols_call( + cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + grid_dim, block_results.get_const_data(), d_result.get_data()); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); diff --git 
a/dpcpp/test/components/CMakeLists.txt b/dpcpp/test/components/CMakeLists.txt index 87a034a64df..77ad6684840 100644 --- a/dpcpp/test/components/CMakeLists.txt +++ b/dpcpp/test/components/CMakeLists.txt @@ -2,3 +2,4 @@ ginkgo_create_test(absolute_array) ginkgo_create_dpcpp_test(cooperative_groups_kernels) ginkgo_create_test(fill_array) ginkgo_create_test(precision_conversion) +ginkgo_create_test(prefix_sum) diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp new file mode 100644 index 00000000000..402192d0b77 --- /dev/null +++ b/dpcpp/test/components/prefix_sum.cpp @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +class PrefixSum : public ::testing::Test { +protected: + using index_type = gko::int32; + PrefixSum() + : ref(gko::ReferenceExecutor::create()), + exec(gko::DpcppExecutor::create(0, ref)), + rand(293), + total_size(42793), + vals(ref, total_size), + dvals(exec) + { + std::uniform_int_distribution dist(0, 1000); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + } + dvals = vals; + } + + void test(gko::size_type size) + { + gko::kernels::reference::components::prefix_sum(ref, vals.get_data(), + size); + gko::kernels::dpcpp::components::prefix_sum(exec, dvals.get_data(), + size); + + GKO_ASSERT_ARRAY_EQ(vals, dvals); + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; +}; + + +TEST_F(PrefixSum, SmallEqualsReference) { test(100); } + + +TEST_F(PrefixSum, BigEqualsReference) { test(total_size); } + + +} // namespace diff --git a/dpcpp/test/matrix/CMakeLists.txt b/dpcpp/test/matrix/CMakeLists.txt index af64b693718..ba8f0fb70fe 100644 --- a/dpcpp/test/matrix/CMakeLists.txt +++ b/dpcpp/test/matrix/CMakeLists.txt @@ -1 +1,2 @@ -ginkgo_create_test(csr_kernels) \ No newline at end of file +ginkgo_create_test(csr_kernels) +ginkgo_create_test(dense_kernels) diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp new file mode 100644 index 00000000000..cf1bbe26cd4 --- /dev/null +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -0,0 +1,598 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/components/fill_array.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/test/utils.hpp" +#include "dpcpp/test/utils.hpp" + + +namespace { + + +class Dense : public ::testing::Test { +protected: + using itype = int; +#if GINKGO_DPCPP_SINGLE_MODE + using vtype = float; +#else + using vtype = double; +#endif // GINKGO_DPCPP_SINGLE_MODE + using Mtx = gko::matrix::Dense; + using MixedMtx = gko::matrix::Dense>; + using NormVector = gko::matrix::Dense>; + using Arr = gko::Array; + using ComplexMtx = gko::matrix::Dense>; + using MixedComplexMtx = + gko::matrix::Dense>>; + + Dense() : rand_engine(15) {} + + void SetUp() + { + ASSERT_GT(gko::DpcppExecutor::get_num_devices("all"), 0); + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + } + + void TearDown() + { + if (dpcpp != nullptr) { + ASSERT_NO_THROW(dpcpp->synchronize()); + } + } + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void set_up_vector_data(gko::size_type num_vecs, + bool different_alpha = false) + { + x = gen_mtx(1000, num_vecs); + y = gen_mtx(1000, num_vecs); + if (different_alpha) { + alpha = gen_mtx(1, num_vecs); + } else { + alpha = gko::initialize({2.0}, ref); + } + dx = Mtx::create(dpcpp); + dx->copy_from(x.get()); + dy = Mtx::create(dpcpp); + dy->copy_from(y.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + expected = Mtx::create(ref, gko::dim<2>{1, num_vecs}); + dresult = Mtx::create(dpcpp, gko::dim<2>{1, num_vecs}); + } + + void set_up_apply_data() + { + x = gen_mtx(65, 25); + c_x = gen_mtx(65, 25); + y = gen_mtx(25, 35); + expected = gen_mtx(65, 35); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + square = gen_mtx(x->get_size()[0], x->get_size()[0]); + dx = Mtx::create(dpcpp); + dx->copy_from(x.get()); + dc_x = ComplexMtx::create(dpcpp); + dc_x->copy_from(c_x.get()); + dy = Mtx::create(dpcpp); + dy->copy_from(y.get()); + dresult = Mtx::create(dpcpp); + dresult->copy_from(expected.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + dbeta = Mtx::create(dpcpp); + dbeta->copy_from(beta.get()); + dsquare = Mtx::create(dpcpp); + dsquare->copy_from(square.get()); + + std::vector tmp(x->get_size()[0], 0); + auto rng = std::default_random_engine{}; + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rng); + std::vector tmp2(x->get_size()[1], 0); + std::iota(tmp2.begin(), tmp2.end(), 0); + std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector tmp3(x->get_size()[0] / 10); + std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + for (auto &i : tmp3) { + i = row_dist(rng); + } + rpermute_idxs = + std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + cpermute_idxs = + std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); + rgather_idxs = + std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + } + + template + std::unique_ptr convert(InputType &&input) + { + auto result = ConvertedType::create(input->get_executor()); + input->convert_to(result.get()); + return result; + } + + std::shared_ptr ref; + std::shared_ptr dpcpp; + + std::ranlux48 rand_engine; + + 
std::unique_ptr x; + std::unique_ptr c_x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr square; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dc_x; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; + std::unique_ptr dsquare; + std::unique_ptr rpermute_idxs; + std::unique_ptr cpermute_idxs; + std::unique_ptr rgather_idxs; +}; + + +TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, SingleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, MultipleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef) +{ + set_up_vector_data(20); + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->dpcpp, norm_size); + + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); +} + + +TEST_F(Dense, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(y.get(), expected.get()); + dx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + + x->apply(convert(y).get(), convert(expected).get()); + dx->apply(convert(dy).get(), convert(dresult).get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7); +} + + +TEST_F(Dense, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + + x->apply(convert(alpha).get(), convert(y).get(), + convert(beta).get(), convert(expected).get()); + dx->apply(convert(dalpha).get(), convert(dy).get(), + convert(dbeta).get(), convert(dresult).get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7); +} + + +TEST_F(Dense, ApplyToComplexIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r::value); +} + + +TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + auto 
complex_b = gen_mtx(25, 1); + auto dcomplex_b = MixedComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = MixedComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7); +} + + +TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); + dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r::value); +} + + +TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = MixedComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = MixedComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(convert(alpha).get(), complex_b.get(), + convert(beta).get(), complex_x.get()); + dx->apply(convert(dalpha).get(), dcomplex_b.get(), + convert(dbeta).get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7); +} + + +TEST_F(Dense, ComputeDotComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(1234, 2); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(1234, 2); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2}); + auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2}); + + complex_b->compute_dot(complex_x.get(), result.get()); + dcomplex_b->compute_dot(dcomplex_x.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(result, dresult, r::value); +} + + +TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(1234, 2); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(1234, 2); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2}); + auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2}); + + complex_b->compute_conj_dot(complex_x.get(), result.get()); + dcomplex_b->compute_conj_dot(dcomplex_x.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(result, dresult, r::value); +} + + +TEST_F(Dense, ConvertToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo::create(ref); + auto dcoo_mtx = gko::matrix::Coo::create(dpcpp); + + x->convert_to(coo_mtx.get()); + dx->convert_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); +} + + +TEST_F(Dense, MoveToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo::create(ref); + auto dcoo_mtx = gko::matrix::Coo::create(dpcpp); + + x->move_to(coo_mtx.get()); + dx->move_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + 
coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); +} + + +TEST_F(Dense, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr::create(ref); + auto dcsr_mtx = gko::matrix::Csr::create(dpcpp); + + x->convert_to(csr_mtx.get()); + dx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); +} + + +TEST_F(Dense, MoveToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr::create(ref); + auto dcsr_mtx = gko::matrix::Csr::create(dpcpp); + + x->move_to(csr_mtx.get()); + dx->move_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); +} + + +TEST_F(Dense, ConvertToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell::create(ref); + auto dell_mtx = gko::matrix::Ell::create(dpcpp); + + x->convert_to(ell_mtx.get()); + dx->convert_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); +} + + +TEST_F(Dense, MoveToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell::create(ref); + auto dell_mtx = gko::matrix::Ell::create(dpcpp); + + x->move_to(ell_mtx.get()); + dx->move_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); +} + + +TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp::create(ref); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); + + x->convert_to(sellp_mtx.get()); + dx->convert_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); +} + + +TEST_F(Dense, MoveToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp::create(ref); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); + + x->move_to(sellp_mtx.get()); + dx->move_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); +} + + +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(dpcpp); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + +TEST_F(Dense, CountNNZIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type nnz; + gko::size_type dnnz; + + gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz); + gko::kernels::dpcpp::dense::count_nonzeros(dpcpp, dx.get(), &dnnz); + + ASSERT_EQ(nnz, dnnz); +} + + +TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::Array nnz_per_row(ref); + nnz_per_row.resize_and_reset(x->get_size()[0]); + gko::Array dnnz_per_row(dpcpp); + dnnz_per_row.resize_and_reset(dx->get_size()[0]); + + gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), + &nnz_per_row); + gko::kernels::dpcpp::dense::calculate_nonzeros_per_row(dpcpp, dx.get(), + &dnnz_per_row); + + auto tmp = gko::Array(ref, dnnz_per_row); + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { + ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); + } +} + + +TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type max_nnz; + gko::size_type dmax_nnz; + + gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(), + &max_nnz); + gko::kernels::dpcpp::dense::calculate_max_nnz_per_row(dpcpp, dx.get(), + &dmax_nnz); + + ASSERT_EQ(max_nnz, dmax_nnz); +} + + +TEST_F(Dense, 
CalculateTotalColsIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type total_cols; + gko::size_type dtotal_cols; + + gko::kernels::reference::dense::calculate_total_cols( + ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size); + gko::kernels::dpcpp::dense::calculate_total_cols( + dpcpp, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); + + ASSERT_EQ(total_cols, dtotal_cols); +} + + +} // namespace diff --git a/dpcpp/test/utils.hpp b/dpcpp/test/utils.hpp new file mode 100644 index 00000000000..57e703b8ef1 --- /dev/null +++ b/dpcpp/test/utils.hpp @@ -0,0 +1,56 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_TEST_UTILS_HPP_ +#define GKO_DPCPP_TEST_UTILS_HPP_ + + +#include + + +namespace { + + +#if GINKGO_DPCPP_SINGLE_MODE +#define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode" +#else +#define SKIP_IF_SINGLE_MODE \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif + + +} // namespace + + +#endif // GKO_DPCPP_TEST_UTILS_HPP_ diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp index 28cd01b4fb5..9302fc07b9a 100644 --- a/hip/components/prefix_sum.hip.cpp +++ b/hip/components/prefix_sum.hip.cpp @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -58,8 +58,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, HIP_KERNEL_NAME(start_prefix_sum), dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. 
+ // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { hipLaunchKernelGGL( HIP_KERNEL_NAME(finalize_prefix_sum), diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index ebd5b441f6c..0e9570540fa 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_PUBLIC_CORE_SYNTHESIZER_CONTAINERS_HPP_ +#include #include @@ -46,14 +47,32 @@ namespace gko { namespace syn { +/** + * value_list records several values of the same type as template parameters. + * + * @tparam T the value type of the list + * @tparam Values the values in the list + */ template struct value_list {}; +/** + * type_list records several types as template parameters. + * + * @tparam Types the types in the list + */ template struct type_list {}; +/** + * range records the start, end, and step as template parameters. + * + * @tparam Start the start of the range + * @tparam End the end of the range + * @tparam Step the step of the range. The default is 1 + */ template struct range {}; @@ -61,9 +80,22 @@ struct range {}; namespace detail { +/** + * concatenate_impl base type. + * + * @tparam List1 the first list + * @tparam List2 the second list + */ template struct concatenate_impl; +/** + * concatenate_impl specializes for two value_list with the same value type. + * + * @tparam T the value type of both value_list + * @tparam Values1 the values of the first list + * @tparam Values2 the values of the second list + */ template struct concatenate_impl, value_list> { using type = value_list; @@ -73,6 +105,12 @@ struct concatenate_impl, value_list> { } // namespace detail +/** + * concatenate combines two value_list into one value_list. + * + * @tparam List1 the first list + * @tparam List2 the second list + */ template using concatenate = typename detail::concatenate_impl::type; @@ -80,19 +118,43 @@ using concatenate = typename detail::concatenate_impl::type; namespace detail { +/** + * as_list_impl base type. + * + * @tparam T the input list or range type + */ template struct as_list_impl; +/** + * as_list_impl specializes for the value_list + * + * @tparam T the value_list type + * @tparam Values the values of the value_list + */ template struct as_list_impl> { using type = value_list; }; +/** + * as_list_impl specializes for the type_list + * + * @tparam ...Types the types of the type_list + */ template struct as_list_impl> { using type = type_list; }; +/** + * as_list_impl specializes for the range. This is the recursive case: it + * concatenates Start with the list generated from the remaining range. + * + * @tparam Start the start of the range + * @tparam End the end of the range + * @tparam Step the step of the range + */ template struct as_list_impl, std::enable_if_t<(Start < End)>> { using type = concatenate< @@ -100,6 +162,13 @@ struct as_list_impl, std::enable_if_t<(Start < End)>> { typename as_list_impl>::type>; }; +/** + * as_list_impl specializes for the range. This is the terminating case, which + * yields an empty value_list. + * + * @tparam Start the start of the range + * @tparam End the end of the range + * @tparam Step the step of the range + */ template struct as_list_impl, std::enable_if_t<(Start >= End)>> { using type = value_list; }; @@ -109,10 +178,34 @@ struct as_list_impl, std::enable_if_t<(Start >= End)>> { } // namespace detail +/** + * as_list is an alias for as_list_impl::type. It yields the list + * (itself) if the input is already a list, or generates the list type from a range input.
+ * + * @tparam T the input list or range + */ template using as_list = typename detail::as_list_impl::type; +/** + * as_array returns a std::array containing the values of a value_list. It is + * helpful when the values need to be iterated over at runtime. + * + * @tparam T the value type of the value_list + * @tparam Value the values of the value_list + * + * @param vl the input value_list + * + * @return a std::array containing the values of the value_list + */ +template +constexpr std::array as_array(value_list vl) +{ + return std::array{Value...}; +} + + } // namespace syn } // namespace gko
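For reference, a minimal usage sketch (not part of the diff) of how the synthesizer containers documented above compose. It assumes the template parameter lists elided in the hunks above read value_list<T, Values...>, range<Start, End, Step = 1>, as_list<T>, and as_array(value_list<T, Values...>), and that the public include path is <ginkgo/core/synthesizer/containers.hpp>; adjust if the header differs.

#include <ginkgo/core/synthesizer/containers.hpp>

#include <type_traits>

int main()
{
    using gko::syn::as_array;
    using gko::syn::as_list;
    using gko::syn::concatenate;
    using gko::syn::range;
    using gko::syn::value_list;

    // A short compile-time list and one generated from a range.
    using small = value_list<int, 1, 2>;
    using generated = as_list<range<4, 10, 2>>;  // 4, 6, 8 (End is exclusive)
    static_assert(std::is_same<generated, value_list<int, 4, 6, 8>>::value,
                  "the Start < End recursion stops before reaching End");

    // concatenate merges two value_list with the same value type.
    using merged = concatenate<small, generated>;
    static_assert(std::is_same<merged, value_list<int, 1, 2, 4, 6, 8>>::value,
                  "concatenation preserves the order of both lists");

    // as_array exposes the compile-time values as a std::array for runtime loops.
    constexpr auto arr = as_array(merged{});
    static_assert(arr.size() == 5, "one array entry per value in the list");
    return arr[0] == 1 ? 0 : 1;
}

The assertions follow directly from the recursion shown in the hunks: Start is prepended while Start < End, so End itself is never included, and as_array is only needed once the values have to be inspected in an ordinary runtime loop.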