From 8259e01843b0e38cf629ae8c8bdf7413a450a512 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Fri, 19 Feb 2021 21:20:50 +0800 Subject: [PATCH 01/22] update format header --- dev_tools/scripts/format_header.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh index cbe1b2e8ab9..7a24f50bef6 100755 --- a/dev_tools/scripts/format_header.sh +++ b/dev_tools/scripts/format_header.sh @@ -121,7 +121,6 @@ GINKGO_LICENSE_BEACON="***************************************** CONTENT="content.cpp" # Store the residual part (start from namespace) BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file -BEGIN="begin.cpp" # Store the header before license HAS_HIP_RUNTIME="false" DURING_LICENSE="false" INCLUDE_REGEX="^#include.*" From fa90ab8cea0fd6dbab8c28a9d46fd0db8a0b1433 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 20 Feb 2021 00:28:42 +0800 Subject: [PATCH 02/22] auto --- dpcpp/components/reduction.dp.hpp | 278 ++++++++++++++++++++ dpcpp/components/thread_ids.dp.hpp | 317 +++++++++++++++++++++++ dpcpp/components/uninitialized_array.hpp | 113 ++++++++ 3 files changed, 708 insertions(+) create mode 100644 dpcpp/components/reduction.dp.hpp create mode 100644 dpcpp/components/thread_ids.dp.hpp create mode 100644 dpcpp/components/uninitialized_array.hpp diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp new file mode 100644 index 00000000000..2fe6c516e9c --- /dev/null +++ b/dpcpp/components/reduction.dp.hpp @@ -0,0 +1,278 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/
+
+#ifndef GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_
+#define GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_
+
+
+#include <type_traits>
+
+
+#include <CL/sycl.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/components/uninitialized_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+constexpr int default_block_size = 512;
+
+
+// #include "common/components/reduction.hpp.inc"
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on a group
+ * `group`. Each thread contributes with one element `local_data`. The local
+ * thread element is always passed as the first parameter to the `reduce_op`.
+ * The function returns the result of the reduction on all threads.
+ *
+ * @note The function is guaranteed to return the correct value on all threads
+ *       only if `reduce_op` is commutative (in addition to being associative).
+ *       Otherwise, the correct value is returned only to the thread with
+ *       subwarp index 0.
+ */
+template <
+    typename Group, typename ValueType, typename Operator,
+    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
+__dpct_inline__ ValueType reduce(const Group &group, ValueType local_data,
+                                 Operator reduce_op = Operator{})
+{
+#pragma unroll
+    for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) {
+        const auto remote_data = group.shfl_xor(local_data, bitmask);
+        local_data = reduce_op(local_data, remote_data);
+    }
+    return local_data;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the index of the thread that has the element with the largest
+ * magnitude among all the threads in the group.
+ * Only the values from threads which set `is_pivoted` to `false` will be
+ * considered.
+ */
+template <
+    typename Group, typename ValueType,
+    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
+__dpct_inline__ int choose_pivot(const Group &group, ValueType local_data,
+                                 bool is_pivoted)
+{
+    using real = remove_complex<ValueType>;
+    real lmag = is_pivoted ? -one<real>() : abs(local_data);
+    const auto pivot = std::reduce(
+        oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+        group, group.thread_rank(), [&](int lidx, int ridx) {
+            const auto rmag = group.shfl(lmag, ridx);
+            if (rmag > lmag) {
+                lmag = rmag;
+                lidx = ridx;
+            }
+            return lidx;
+        });
+    // pivot operator not commutative, make sure everyone has the same pivot
+    return group.shfl(pivot, 0);
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on entire block.
+ * The data for the reduction is taken from the `data` array which has to be of
+ * size `block_size` and accessible from all threads. The `data` array is also
+ * used as work space (so its content will be destroyed in the process), as
+ * well as to store the return value - which is stored in the 0-th position of
+ * the array.
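+ *
+ * A usage sketch for illustration (editorial addition, not from the original
+ * patch), assuming `data` points to `block_size` elements in local memory:
+ *
+ *     // sums data[0..block_size) into data[0]
+ *     reduce(group::this_thread_block(item_ct1), data,
+ *            [](const ValueType &x, const ValueType &y) { return x + y; });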
+ */
+template <
+    typename Group, typename ValueType, typename Operator,
+    typename = std::enable_if_t<group::is_synchronizable_group<Group>::value>>
+void reduce(const Group &__restrict__ group, ValueType *__restrict__ data,
+            Operator reduce_op = Operator{})
+{
+    const auto local_id = group.thread_rank();
+
+    for (int k = group.size() / 2; k >= config::warp_size; k /= 2) {
+        group.sync();
+        if (local_id < k) {
+            data[local_id] = reduce_op(data[local_id], data[local_id + k]);
+        }
+    }
+
+    const auto warp = group::tiled_partition<config::warp_size>(group);
+    const auto warp_id = group.thread_rank() / warp.size();
+    if (warp_id > 0) {
+        return;
+    }
+    auto result = std::reduce(
+        oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+        warp, data[warp.thread_rank()], reduce_op);
+    if (warp.thread_rank() == 0) {
+        data[0] = result;
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on an array
+ * `source` of any size. Has to be called a second time on `result` to reduce
+ * an array larger than `block_size`.
+ */
+template <typename Operator, typename ValueType>
+void reduce_array(size_type size, const ValueType *__restrict__ source,
+                  ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
+                  Operator reduce_op = Operator{})
+{
+    const auto tidx = thread::get_thread_id_flat(item_ct1);
+    auto thread_result = zero<ValueType>();
+    for (auto i = tidx; i < size;
+         i += item_ct1.get_local_range().get(2) * item_ct1.get_group_range(2)) {
+        thread_result = reduce_op(thread_result, source[i]);
+    }
+    result[item_ct1.get_local_id(2)] = thread_result;
+
+    group::this_thread_block(item_ct1).sync();
+
+    // Stores the result of the reduction inside `result[0]`
+    reduce(group::this_thread_block(item_ct1), result, reduce_op);
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the add operation (+) on an array
+ * `source` of any size. Has to be called a second time on `result` to reduce
+ * an array larger than `default_block_size`.
+ */
+template <typename ValueType>
+void reduce_add_array(
+    size_type size, const ValueType *__restrict__ source,
+    ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
+    UninitializedArray<ValueType, default_block_size> *block_sum)
+{
+    reduce_array(size, source, static_cast<ValueType *>((*block_sum)),
+                 item_ct1,
+                 [](const ValueType &x, const ValueType &y) { return x + y; });
+
+    if (item_ct1.get_local_id(2) == 0) {
+        result[item_ct1.get_group(2)] = (*block_sum)[0];
+    }
+}
+
+template <typename ValueType>
+void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory,
+                      sycl::queue *stream, size_type size,
+                      const ValueType *source, ValueType *result)
+{
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::accessor<UninitializedArray<ValueType, default_block_size>, 0,
+                       sycl::access::mode::read_write,
+                       sycl::access::target::local>
+            block_sum_acc_ct1(cgh);
+
+        auto local_range = block.reverse();
+        auto global_range = grid.reverse() * local_range;
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(global_range, local_range),
+            [=](sycl::nd_item<3> item_ct1) {
+                reduce_add_array(
+                    size, source, result, item_ct1,
+                    (UninitializedArray<ValueType, default_block_size> *)
+                        block_sum_acc_ct1.get_pointer());
+            });
+    });
+}
+
+
+/**
+ * Compute a reduction using add operation (+).
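+ *
+ * Illustrative call (editorial sketch, not from the original patch; assumes a
+ * DpcppExecutor `exec` and a device array `vals` holding `n` values):
+ *
+ *     auto sum = reduce_add_array(exec, n, vals.get_const_data());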
+ *
+ * @param exec  Executor associated to the array
+ * @param size  size of the array
+ * @param source  the pointer of the array
+ *
+ * @return the reduction result
+ */
+template <typename ValueType>
+ValueType reduce_add_array(std::shared_ptr<const DpcppExecutor> exec,
+                           size_type size, const ValueType *source)
+{
+    auto block_results_val = source;
+    size_type grid_dim = size;
+    auto block_results = Array<ValueType>(exec);
+    if (size > default_block_size) {
+        const auto n = ceildiv(size, default_block_size);
+        grid_dim = (n <= default_block_size) ? n : default_block_size;
+
+        block_results.resize_and_reset(grid_dim);
+
+        reduce_add_array(grid_dim, default_block_size, 0, exec->get_queue(),
+                         size, source, block_results.get_data());
+
+        block_results_val = block_results.get_const_data();
+    }
+
+    auto d_result = Array<ValueType>(exec, 1);
+
+    reduce_add_array(1, default_block_size, 0, exec->get_queue(), grid_dim,
+                     block_results_val, d_result.get_data());
+    auto answer = exec->copy_val_to_host(d_result.get_const_data());
+    return answer;
+}
+
+
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
new file mode 100644
index 00000000000..4f27302dbc5
--- /dev/null
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -0,0 +1,317 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_
+#define GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_
+
+
+#include <CL/sycl.hpp>
+
+
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dpct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+/**
+ * @brief The DPCPP thread namespace.
+ *
+ * @ingroup dpcpp_thread
+ */
+namespace thread {
+
+
+// #include "common/components/thread_ids.hpp.inc"
+/**
+ * @internal
+ *
+ * Returns the ID of the block group this thread belongs to.
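+ * For illustration (editorial addition): with a group range `(G0, G1, G2)`,
+ * the work-group at index `(i, j, k)` belongs to block group `i * G1 + j`;
+ * the fastest dimension, 2, does not enter the computation.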
+ *
+ * @return the ID of the block group this thread belongs to
+ *
+ * @note Assumes that grid dimensions are in standard format:
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`
+ */
+__dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1)
+{
+    return static_cast<size_type>(item_ct1.get_group(0)) *
+               item_ct1.get_group_range(1) +
+           item_ct1.get_group(1);
+}
+
+/**
+ * @internal
+ *
+ * Returns the ID of the block this thread belongs to.
+ *
+ * @return the ID of the block this thread belongs to
+ *
+ * @note Assumes that grid dimensions are in standard format:
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`
+ */
+__dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1)
+{
+    return get_block_group_id(item_ct1) * item_ct1.get_group_range(2) +
+           item_ct1.get_group(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the warp (relative to the block) this thread belongs
+ * to.
+ *
+ * @return the local ID of the warp (relative to the block) this thread belongs
+ * to
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)`
+ */
+__dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1)
+{
+    return static_cast<size_type>(item_ct1.get_local_id(0));
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the sub-warp (relative to the block) this thread
+ * belongs to.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @return the local ID of the sub-warp (relative to the block) this thread
+ * belongs to
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)`
+ */
+template <int subwarp_size>
+__dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1)
+{
+    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    return get_local_warp_id(item_ct1) * subwarps_per_warp +
+           item_ct1.get_local_id(1);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the thread (relative to the block).
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @return the local ID of the thread (relative to the block)
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)`
+ */
+template <int subwarp_size>
+__dpct_inline__ size_type get_local_thread_id(sycl::nd_item<3> item_ct1)
+{
+    return get_local_subwarp_id<subwarp_size>(item_ct1) * subwarp_size +
+           item_ct1.get_local_id(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the warp this thread belongs to.
+ *
+ * @tparam warps_per_block  number of warps within each block
+ *
+ * @return the global ID of the warp this thread belongs to.
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <size_type warps_per_block>
+__dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1)
+{
+    return get_block_id(item_ct1) * warps_per_block +
+           get_local_warp_id(item_ct1);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the sub-warp this thread belongs to.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @return the global ID of the sub-warp this thread belongs to.
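+ * (computed as the warp's global ID times `config::warp_size / subwarp_size`
+ * plus the subwarp's index inside its warp, so subwarps of one warp receive
+ * consecutive IDs)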
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <int subwarp_size, size_type warps_per_block>
+__dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1)
+{
+    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    return get_warp_id<warps_per_block>(item_ct1) * subwarps_per_warp +
+           item_ct1.get_local_id(1);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the thread.
+ *
+ * @return the global ID of the thread.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <int subwarp_size, size_type warps_per_block>
+__dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1)
+{
+    return get_subwarp_id<subwarp_size, warps_per_block>(item_ct1) *
+               subwarp_size +
+           item_ct1.get_local_id(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the thread in the given index type.
+ * This function assumes one-dimensional thread and block indexing.
+ *
+ * @return the global ID of the thread in the given index type.
+ *
+ * @tparam IndexType  the index type
+ */
+template <typename IndexType = size_type>
+__dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1)
+{
+    return item_ct1.get_local_id(2) +
+           static_cast<IndexType>(item_ct1.get_local_range().get(2)) *
+               item_ct1.get_group(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the total number of threads in the given index type.
+ * This function assumes one-dimensional thread and block indexing.
+ *
+ * @return the total number of threads in the given index type.
+ *
+ * @tparam IndexType  the index type
+ */
+template <typename IndexType = size_type>
+__dpct_inline__ IndexType get_thread_num_flat(sycl::nd_item<3> item_ct1)
+{
+    return item_ct1.get_local_range().get(2) *
+           static_cast<IndexType>(item_ct1.get_group_range(2));
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the subwarp in the given index type.
+ * This function assumes one-dimensional thread and block indexing
+ * with a power of two block size of at least subwarp_size.
+ *
+ * @return the global ID of the subwarp in the given index type.
+ *
+ * @tparam subwarp_size  the size of the subwarp. Must be a power of two!
+ * @tparam IndexType  the index type
+ */
+template <int subwarp_size, typename IndexType = size_type>
+__dpct_inline__ IndexType get_subwarp_id_flat(sycl::nd_item<3> item_ct1)
+{
+    static_assert(!(subwarp_size & (subwarp_size - 1)),
+                  "subwarp_size must be a power of two");
+    return item_ct1.get_local_id(2) / subwarp_size +
+           static_cast<IndexType>(item_ct1.get_local_range().get(2) /
+                                  subwarp_size) *
+               item_ct1.get_group(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the total number of subwarps in the given index type.
+ * This function assumes one-dimensional thread and block indexing
+ * with a power of two block size of at least subwarp_size.
+ *
+ * @return the total number of subwarps in the given index type.
+ *
+ * @tparam subwarp_size  the size of the subwarp. Must be a power of two!
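+ *        (a power of two has a single set bit, so the expression
+ *        `subwarp_size & (subwarp_size - 1)` in the static_assert below is
+ *        zero exactly for powers of two)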
+ * @tparam IndexType  the index type
+ */
+template <int subwarp_size, typename IndexType = size_type>
+__dpct_inline__ IndexType get_subwarp_num_flat(sycl::nd_item<3> item_ct1)
+{
+    static_assert(!(subwarp_size & (subwarp_size - 1)),
+                  "subwarp_size must be a power of two");
+    return item_ct1.get_local_range().get(2) / subwarp_size *
+           static_cast<IndexType>(item_ct1.get_group_range(2));
+}
+
+}  // namespace thread
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_
diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp
new file mode 100644
index 00000000000..b8d3006007d
--- /dev/null
+++ b/dpcpp/components/uninitialized_array.hpp
@@ -0,0 +1,113 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
+#define GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+// #include "common/components/uninitialized_array.hpp.inc"
+/**
+ * Stores an array with uninitialized contents.
+ *
+ * This class is needed for datatypes that do have a non-empty constructor when
+ * using them as shared memory, for example `thrust::complex<float>`.
+ *
+ * @tparam ValueType  the type of values
+ * @tparam size  the size of the array
+ */
+template <typename ValueType, size_type size>
+class UninitializedArray {
+public:
+    /**
+     * Operator for casting an UninitializedArray into its constexpr value
+     * pointer.
+     *
+     * @return the constexpr pointer to the first entry of the array.
+     */
+    constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept
+    {
+        return &(*this)[0];
+    }
+
+    /**
+     * Operator for casting an UninitializedArray into its non-const value
+     * pointer.
+     *
+     * @return the non-const pointer to the first entry of the array.
+     */
+    GKO_ATTRIBUTES operator ValueType *() noexcept { return &(*this)[0]; }
+
+    /**
+     * constexpr array access operator.
+     *
+     * @param pos The array index.
Using a value outside [0, size) is undefined + * behavior. + * + * @return a reference to the array entry at the given index. + */ + constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + { + return reinterpret_cast(data_)[pos]; + } + + /** + * Non-const array access operator. + * + * @param pos The array index. Using a value outside [0, size) is undefined + * behavior. + * + * @return a reference to the array entry at the given index. + */ + GKO_ATTRIBUTES ValueType &operator[](size_type pos) noexcept + { + return reinterpret_cast(data_)[pos]; + } + +private: + unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; +}; + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ From 12f669ba0c9e69887ecabc3d505ebb570e2a6104 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 20 Feb 2021 00:37:10 +0800 Subject: [PATCH 03/22] manual modification --- dpcpp/components/reduction.dp.hpp | 12 +++++------- dpcpp/components/thread_ids.dp.hpp | 1 + dpcpp/components/uninitialized_array.hpp | 10 +++++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 2fe6c516e9c..4caf46229c8 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -57,7 +57,7 @@ namespace kernels { namespace dpcpp { -constexpr int default_block_size = 512; +constexpr int default_block_size = 256; // #include "common/components/reduction.hpp.inc" @@ -105,9 +105,8 @@ __dpct_inline__ int choose_pivot(const Group &group, ValueType local_data, { using real = remove_complex; real lmag = is_pivoted ? -one() : abs(local_data); - const auto pivot = std::reduce( - oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), - group, group.thread_rank(), [&](int lidx, int ridx) { + const auto pivot = + reduce(group, group.thread_rank(), [&](int lidx, int ridx) { const auto rmag = group.shfl(lmag, ridx); if (rmag > lmag) { lmag = rmag; @@ -150,9 +149,8 @@ void reduce(const Group &__restrict__ group, ValueType *__restrict__ data, if (warp_id > 0) { return; } - auto result = std::reduce( - oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), - warp, data[warp.thread_rank()], reduce_op); + auto result = ::gko::kernels::dpcpp::reduce(warp, data[warp.thread_rank()], + reduce_op); if (warp.thread_rank() == 0) { data[0] = result; } diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 4f27302dbc5..8694d6a88c9 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -308,6 +308,7 @@ __dpct_inline__ IndexType get_subwarp_num_flat(sycl::nd_item<3> item_ct1) static_cast(item_ct1.get_group_range(2)); } + } // namespace thread } // namespace dpcpp } // namespace kernels diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index b8d3006007d..fb7575bc202 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -37,6 +37,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "dpcpp/base/dpct.hpp" + + namespace gko { namespace kernels { namespace dpcpp { @@ -61,7 +64,7 @@ class UninitializedArray { * * @return the constexpr pointer to the first entry of the array. 
*/ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + constexpr __dpct_inline__ operator ValueType *() const noexcept { return &(*this)[0]; } @@ -72,7 +75,7 @@ class UninitializedArray { * * @return the non-const pointer to the first entry of the array. */ - GKO_ATTRIBUTES operator ValueType *() noexcept { return &(*this)[0]; } + __dpct_inline__ operator ValueType *() noexcept { return &(*this)[0]; } /** * constexpr array access operator. @@ -82,7 +85,8 @@ class UninitializedArray { * * @return a reference to the array entry at the given index. */ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + constexpr __dpct_inline__ ValueType &operator[](size_type pos) const + noexcept { return reinterpret_cast(data_)[pos]; } From 0e222387fc7a90274583e73135be2947723ce0be Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 20 Feb 2021 01:04:38 +0800 Subject: [PATCH 04/22] auto dense --- dpcpp/matrix/dense_kernels.dp.cpp | 1574 ++++++++++++++++++++++++++++- 1 file changed, 1555 insertions(+), 19 deletions(-) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 531cfd52259..36935b1a4d6 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,25 +33,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" -#include +#include +#include #include -#include #include #include #include #include #include #include -#include #include #include #include "core/components/prefix_sum.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" namespace gko { @@ -65,11 +70,1245 @@ namespace dpcpp { namespace dense { +constexpr auto default_block_size = 512; + + +// #include "common/matrix/dense_kernels.hpp.inc" +namespace kernel { + + +template +void strided_fill(size_type num_rows, size_type num_cols, size_type stride, + ValueType *__restrict__ mat, ValueType value, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + mat[row_id * stride + col_id] = value; + } +} + +template +void strided_fill(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type stride, ValueType *mat, ValueType value) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + strided_fill(num_rows, num_cols, stride, mat, + value, item_ct1); + }); + }); +} + + +template +void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, + const ValueType *__restrict__ alpha, ValueType *__restrict__ x, + size_type stride_x, sycl::nd_item<3> item_ct1) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; + if (row_id < num_rows) { + x[row_id * stride_x + col_id] = + alpha[alpha_id] == zero() + ? 
zero() + : x[row_id * stride_x + col_id] * alpha[alpha_id]; + } +} + +template +void scale(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type num_alpha_cols, const ValueType *alpha, ValueType *x, + size_type stride_x) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + scale(num_rows, num_cols, + num_alpha_cols, alpha, x, + stride_x, item_ct1); + }); + }); +} + + +template +void add_scaled(size_type num_rows, size_type num_cols, + size_type num_alpha_cols, const ValueType *__restrict__ alpha, + const ValueType *__restrict__ x, size_type stride_x, + ValueType *__restrict__ y, size_type stride_y, + sycl::nd_item<3> item_ct1) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; + if (row_id < num_rows && alpha[alpha_id] != zero()) { + y[row_id * stride_y + col_id] += + x[row_id * stride_x + col_id] * alpha[alpha_id]; + } +} + +template +void add_scaled(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type num_alpha_cols, const ValueType *alpha, + const ValueType *x, size_type stride_x, ValueType *y, + size_type stride_y) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + add_scaled( + num_rows, num_cols, num_alpha_cols, alpha, x, + stride_x, y, stride_y, item_ct1); + }); + }); +} + + +template +void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, + const ValueType *__restrict__ diag, + ValueType *__restrict__ y, size_type stride_y, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx >= size) { + return; + } + + y[tidx * stride_y + tidx] += alpha[0] * diag[tidx]; +} + +template +void add_scaled_diag(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *alpha, const ValueType *diag, + ValueType *y, size_type stride_y) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + add_scaled_diag(size, alpha, diag, y, stride_y, + item_ct1); + }); + }); +} + + +template +void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work, + CallableGetValue get_value, + CallableReduce reduce_op, sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + + const auto num_blocks = item_ct1.get_group_range(2); + const auto local_id = + thread::get_local_thread_id(item_ct1); + const auto global_id = + thread::get_thread_id(item_ct1); + + auto tmp = zero(); + for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { + tmp = reduce_op(tmp, get_value(i)); + } + + (*tmp_work)[local_id] = tmp; + + reduce(group::this_thread_block(item_ct1), + static_cast((*tmp_work)), reduce_op); 
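+    // at this point (*tmp_work)[0] holds this block's partial result;
+    // thread 0 publishes it into `work` below, and a second kernel
+    // invocation (finalize_reduce_computation) combines the per-block values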
+ + if (local_id == 0) { + work[thread::get_block_id(item_ct1)] = (*tmp_work)[0]; + } +} + + +template +void finalize_reduce_computation( + size_type size, const ValueType *work, ValueType *result, + CallableReduce reduce_op, CallableFinalize finalize_op, + sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + const auto local_id = + thread::get_local_thread_id(item_ct1); + + ValueType tmp = zero(); + for (auto i = local_id; i < size; i += block_size) { + tmp = reduce_op(tmp, work[i]); + } + + (*tmp_work)[local_id] = tmp; + + reduce(group::this_thread_block(item_ct1), + static_cast((*tmp_work)), reduce_op); + + if (local_id == 0) { + *result = finalize_op((*tmp_work)[0]); + } +} + + +template +void compute_partial_dot(size_type num_rows, const ValueType *__restrict__ x, + size_type stride_x, const ValueType *__restrict__ y, + size_type stride_y, ValueType *__restrict__ work, + sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + compute_partial_reduce( + /* + DPCT1007:4: Migration of this CUDA API is not supported by the Intel(R) + DPC++ Compatibility Tool. + */ + num_rows, + work, + [x, stride_x, y, stride_y](size_type i) { + return x[i * stride_x] * conj(y[i * stride_y]); + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + const ValueType *x, size_type stride_x, + const ValueType *y, size_type stride_y, + ValueType *work) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + compute_partial_dot( + num_rows, x, stride_x, y, stride_y, work, + item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + + +template +void finalize_dot_computation( + size_type size, const ValueType *work, ValueType *result, + sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return x; }, item_ct1, tmp_work); +} + +template +void finalize_dot_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type size, const ValueType *work, + ValueType *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + finalize_dot_computation( + size, work, result, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + + +template +void compute_partial_norm2( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray, block_size> *tmp_work) +{ + using norm_type = remove_complex; + compute_partial_reduce( + num_rows, work, + [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, + [](const norm_type &x, const norm_type &y) { return x + y; }, item_ct1, + tmp_work); +} + +template 
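+// launch helper for the partial norm2 reduction; it allocates one
+// local-memory accumulator array of `block_size` entries per work-group,
+// mirroring the compute_partial_dot launcher above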
+void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + const ValueType *x, size_type stride_x, + remove_complex *work) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor< + UninitializedArray, block_size>, 0, + sycl::access::mode::read_write, sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for( + sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + compute_partial_norm2( + num_rows, x, stride_x, work, item_ct1, + (UninitializedArray, block_size> + *)tmp_work_acc_ct1.get_pointer()); + }); + }); +} + + +template +void finalize_norm2_computation( + size_type size, const ValueType *work, ValueType *result, + sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return sycl::sqrt((float)x); }, item_ct1, + tmp_work); +} + +template +void finalize_norm2_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + finalize_norm2_computation( + size, work, result, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + + +template +void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, + const size_type *__restrict__ row_ptrs, + const ValueType *__restrict__ source, + IndexType *__restrict__ row_idxs, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + if (tidx < num_rows) { + size_type write_to = row_ptrs[tidx]; + + for (size_type i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + row_idxs[write_to] = tidx; + write_to++; + } + } + } +} + +template +void fill_in_coo(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type stride, const size_type *row_ptrs, + const ValueType *source, IndexType *row_idxs, + IndexType *col_idxs, ValueType *values) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + fill_in_coo(num_rows, num_cols, stride, row_ptrs, + source, row_idxs, col_idxs, values, + item_ct1); + }); + }); +} + + +template +void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ work, + IndexType *__restrict__ result, + sycl::nd_item<3> item_ct1) +{ + constexpr auto warp_size = config::warp_size; + const auto row_idx = thread::get_subwarp_id_flat(item_ct1); + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + + if (row_idx < num_rows) { + IndexType part_result{}; + for (auto i = 
warp_tile.thread_rank(); i < num_cols; i += warp_size) { + if (work[stride * row_idx + i] != zero()) { + part_result += 1; + } + } + result[row_idx] = std::reduce( + oneapi::dpl::execution::make_device_policy( + dpct::get_default_queue()), + warp_tile, part_result, + [](const size_type &a, const size_type &b) { return a + b; }); + } +} + +template +void count_nnz_per_row(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, size_type stride, + const ValueType *work, IndexType *result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + count_nnz_per_row(num_rows, num_cols, stride, work, + result, item_ct1); + }); + }); +} + + +template +void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ source, + IndexType *__restrict__ row_ptrs, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < num_rows) { + auto write_to = row_ptrs[tidx]; + for (auto i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + write_to++; + } + } + } +} + +template +void fill_in_csr(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type stride, const ValueType *source, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *values) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + fill_in_csr(num_rows, num_cols, stride, source, + row_ptrs, col_idxs, values, item_ct1); + }); + }); +} + + +template +void fill_in_ell(size_type num_rows, size_type num_cols, + size_type source_stride, const ValueType *__restrict__ source, + size_type max_nnz_per_row, size_type result_stride, + IndexType *__restrict__ col_ptrs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + if (tidx < num_rows) { + IndexType col_idx = 0; + for (size_type col = 0; col < num_cols; col++) { + if (source[tidx * source_stride + col] != zero()) { + col_ptrs[col_idx * result_stride + tidx] = col; + values[col_idx * result_stride + tidx] = + source[tidx * source_stride + col]; + col_idx++; + } + } + for (size_type j = col_idx; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } else if (tidx < result_stride) { + for (size_type j = 0; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } +} + +template +void fill_in_ell(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type source_stride, const ValueType *source, + size_type max_nnz_per_row, size_type result_stride, + IndexType *col_ptrs, ValueType *values) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + 
[=](sycl::nd_item<3> item_ct1) { + fill_in_ell(num_rows, num_cols, source_stride, + source, max_nnz_per_row, result_stride, + col_ptrs, values, item_ct1); + }); + }); +} + + +void calculate_slice_lengths(size_type num_rows, size_type slice_size, + int slice_num, size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ slice_lengths, + size_type *__restrict__ slice_sets, + sycl::nd_item<3> item_ct1) +{ + constexpr auto warp_size = config::warp_size; + const auto sliceid = item_ct1.get_group(2); + const auto tid_in_warp = item_ct1.get_local_id(2); + + if (sliceid * slice_size + tid_in_warp < num_rows) { + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + thread_result = + (i + slice_size * sliceid < num_rows) + ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) + : thread_result; + } + + auto warp_tile = group::tiled_partition( + group::this_thread_block(item_ct1)); + auto warp_result = reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0) { + auto slice_length = + ceildiv(warp_result, stride_factor) * stride_factor; + slice_lengths[sliceid] = slice_length; + slice_sets[sliceid] = slice_length; + } + } +} + +void calculate_slice_lengths(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, size_type slice_size, + int slice_num, size_type stride_factor, + const size_type *nnz_per_row, + size_type *slice_lengths, size_type *slice_sets) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + calculate_slice_lengths(num_rows, slice_size, + slice_num, stride_factor, + nnz_per_row, slice_lengths, + slice_sets, item_ct1); + }); + }); +} + + +template +void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, + size_type stride, const ValueType *__restrict__ source, + size_type *__restrict__ slice_lengths, + size_type *__restrict__ slice_sets, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ vals, sycl::nd_item<3> item_ct1) +{ + const auto global_row = thread::get_thread_id_flat(item_ct1); + const auto row = global_row % slice_size; + const auto sliceid = global_row / slice_size; + + if (global_row < num_rows) { + size_type sellp_ind = slice_sets[sliceid] * slice_size + row; + + for (size_type col = 0; col < num_cols; col++) { + auto val = source[global_row * stride + col]; + if (val != zero()) { + col_idxs[sellp_ind] = col; + vals[sellp_ind] = val; + sellp_ind += slice_size; + } + } + for (size_type i = sellp_ind; + i < + (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; + i += slice_size) { + col_idxs[i] = 0; + vals[i] = zero(); + } + } +} + +template +void fill_in_sellp(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type slice_size, size_type stride, + const ValueType *source, size_type *slice_lengths, + size_type *slice_sets, IndexType *col_idxs, ValueType *vals) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + fill_in_sellp(num_rows, num_cols, slice_size, + stride, source, 
slice_lengths, + slice_sets, col_idxs, vals, + item_ct1); + }); + }); +} + + +void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result, sycl::nd_item<3> item_ct1, + uint8_t *dpct_local) +{ + auto block_max = (size_type *)dpct_local; + + reduce_array( + size, nnz_per_row, block_max, + [](const size_type &x, const size_type &y) { return max(x, y); }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = block_max[0]; + } +} + +void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const size_type *nnz_per_row, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor + dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz(size, nnz_per_row, result, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); +} + + +void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, + size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result, + sycl::nd_item<3> item_ct1) +{ + constexpr auto warp_size = config::warp_size; + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto warpid = thread::get_subwarp_id_flat(item_ct1); + const auto tid_in_warp = warp_tile.thread_rank(); + const auto slice_num = ceildiv(num_rows, slice_size); + + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + if (warpid * slice_size + i < num_rows) { + thread_result = + max(thread_result, nnz_per_row[warpid * slice_size + i]); + } + } + + auto warp_result = reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0 && warpid < slice_num) { + result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; + } +} + +void reduce_max_nnz_per_slice(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, size_type slice_size, + size_type stride_factor, + const size_type *nnz_per_row, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz_per_slice( + num_rows, slice_size, stride_factor, + nnz_per_row, result, item_ct1); + }); + }); +} + + +void reduce_total_cols(size_type num_slices, + const size_type *__restrict__ max_nnz_per_slice, + size_type *__restrict__ result, + sycl::nd_item<3> item_ct1, uint8_t *dpct_local) +{ + auto block_result = (size_type *)dpct_local; + + reduce_array(num_slices, max_nnz_per_slice, block_result, + [](const size_type &x, const size_type &y) { return x + y; }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = block_result[0]; + } +} + +void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_slices, + const size_type *max_nnz_per_slice, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor + dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + 
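+        // dim3 stores sizes CUDA-style with x varying fastest; reverse()
+        // flips the order because sycl::range<3> varies fastest in its last
+        // dimension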
cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + reduce_total_cols( + num_slices, max_nnz_per_slice, result, + item_ct1, dpct_local_acc_ct1.get_pointer()); + }); + }); +} + + +template +void symm_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ result, size_type stride_result, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + col_id] = + orig[perm_idxs[row_id] * stride_orig + perm_idxs[col_id]]; + } +} + +template +void symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const IndexType *perm_idxs, const ValueType *orig, + size_type stride_orig, ValueType *result, + size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + symm_permute(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, + item_ct1); + }); + }); +} + + +template +void inv_symm_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ result, size_type stride_result, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[perm_idxs[row_id] * stride_result + perm_idxs[col_id]] = + orig[row_id * stride_orig + col_id]; + } +} + +template +void inv_symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, const IndexType *perm_idxs, + const ValueType *orig, size_type stride_orig, + ValueType *result, size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + inv_symm_permute(num_rows, num_cols, perm_idxs, + orig, stride_orig, result, + stride_result, item_ct1); + }); + }); +} + + +template +void row_gather(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ result, size_type stride_result, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + col_id] = + orig[perm_idxs[row_id] * stride_orig + col_id]; + } +} + +template +void row_gather(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const IndexType *perm_idxs, const ValueType *orig, + size_type stride_orig, ValueType *result, + size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; 
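+        // one work-item per output entry: work-item i writes entry
+        // (i / num_cols, i % num_cols) of `result`, gathered from row
+        // perm_idxs[i / num_cols] of `orig`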
+ + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + row_gather(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, + item_ct1); + }); + }); +} + + +template +void column_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ result, size_type stride_result, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + col_id] = + orig[row_id * stride_orig + perm_idxs[col_id]]; + } +} + +template +void column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const IndexType *perm_idxs, const ValueType *orig, + size_type stride_orig, ValueType *result, + size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + column_permute(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, + item_ct1); + }); + }); +} + + +template +void inverse_row_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, + size_type stride_orig, ValueType *__restrict__ result, + size_type stride_result, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[perm_idxs[row_id] * stride_result + col_id] = + orig[row_id * stride_orig + col_id]; + } +} + +template +void inverse_row_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, const IndexType *perm_idxs, + const ValueType *orig, size_type stride_orig, + ValueType *result, size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + inverse_row_permute(num_rows, num_cols, perm_idxs, + orig, stride_orig, result, + stride_result, item_ct1); + }); + }); +} + + +template +void inverse_column_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, + size_type stride_orig, + ValueType *__restrict__ result, + size_type stride_result, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + perm_idxs[col_id]] = + orig[row_id * stride_orig + col_id]; + } +} + +template +void inverse_column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, const IndexType *perm_idxs, + const ValueType *orig, size_type stride_orig, + ValueType *result, size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() 
* local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + inverse_column_permute( + num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, item_ct1); + }); + }); +} + + +template +void extract_diagonal(size_type problem_size, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ diag, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < problem_size) { + diag[tidx] = orig[tidx * stride_orig + tidx]; + } +} + +template +void extract_diagonal(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type problem_size, + const ValueType *orig, size_type stride_orig, + ValueType *diag) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + extract_diagonal(problem_size, orig, stride_orig, + diag, item_ct1); + }); + }); +} + + +template +void inplace_absolute_dense(size_type num_rows, size_type num_cols, + ValueType *__restrict__ data, size_type stride, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + data[row * stride + col] = dpcpp::abs(data[row * stride + col]); + } +} + +template +void inplace_absolute_dense(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, ValueType *data, + size_type stride) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + inplace_absolute_dense(num_rows, num_cols, data, + stride, item_ct1); + }); + }); +} + + +template +void outplace_absolute_dense(size_type num_rows, size_type num_cols, + const ValueType *__restrict__ in, + size_type stride_in, + remove_complex *__restrict__ out, + size_type stride_out, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + out[row * stride_out + col] = dpcpp::abs(in[row * stride_in + col]); + } +} + +template +void outplace_absolute_dense(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, size_type num_cols, + const ValueType *in, size_type stride_in, + remove_complex *out, + size_type stride_out) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + outplace_absolute_dense(num_rows, num_cols, in, + stride_in, out, stride_out, + item_ct1); + }); + }); +} + + +template +void make_complex(size_type num_rows, size_type num_cols, + const ValueType *__restrict__ in, size_type stride_in, + ComplexType *__restrict__ out, size_type stride_out, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + out[row * stride_out + col] = in[row * stride_in + col]; + } +} + +template +void make_complex(dim3 grid, dim3 block, size_t 
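// Editorial note: the elementwise kernels above all decompose a flat thread
// id into a (row, col) pair and bounds-check only the row, since
// col = tidx % num_cols already lies in [0, num_cols). The shared shape,
// with `op` standing in for the elementwise operation:
//
//     const auto tidx = thread::get_thread_id_flat(item_ct1);
//     const auto row = tidx / num_cols;  // can overshoot -> needs the guard
//     const auto col = tidx % num_cols;  // always valid
//     if (row < num_rows) {
//         out[row * stride_out + col] = op(in[row * stride_in + col]);
//     }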
dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const ValueType *in, size_type stride_in, ComplexType *out, + size_type stride_out) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + make_complex(num_rows, num_cols, in, stride_in, + out, stride_out, item_ct1); + }); + }); +} + + +template +void get_real(size_type num_rows, size_type num_cols, + const ValueType *__restrict__ in, size_type stride_in, + remove_complex *__restrict__ out, size_type stride_out, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + out[row * stride_out + col] = real(in[row * stride_in + col]); + } +} + +template +void get_real(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const ValueType *in, size_type stride_in, + remove_complex *out, size_type stride_out) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + get_real(num_rows, num_cols, in, stride_in, out, + stride_out, item_ct1); + }); + }); +} + + +template +void get_imag(size_type num_rows, size_type num_cols, + const ValueType *__restrict__ in, size_type stride_in, + remove_complex *__restrict__ out, size_type stride_out, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + out[row * stride_out + col] = imag(in[row * stride_in + col]); + } +} + +template +void get_imag(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const ValueType *in, size_type stride_in, + remove_complex *out, size_type stride_out) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + get_imag(num_rows, num_cols, in, stride_in, out, + stride_out, item_ct1); + }); + }); +} + + +} // namespace kernel + + template void simple_apply(std::shared_ptr exec, const matrix::Dense *a, const matrix::Dense *b, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + matrix::Dense *c) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::gemm(handle, oneapi::mkl::transpose::nontrans, + oneapi::mkl::transpose::nontrans, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -78,8 +1317,20 @@ template void apply(std::shared_ptr exec, const matrix::Dense *alpha, const matrix::Dense *a, const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + const matrix::Dense *beta, matrix::Dense *c) +{ + if (cublas::is_supported::value) 
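// Editorial note: these cuBLAS-era fallbacks (simple_apply above, apply just
// below) drive a column-major gemm with the operands swapped. For row-major
// matrices, C = A * B is computed as C^T = B^T * A^T on the column-major
// BLAS, which is why m and n are passed as c->get_size()[1] and
// c->get_size()[0] and b precedes a. Schematically, assuming a column-major
// gemm(m, n, k, A, lda, B, ldb, C, ldc) computing C = alpha*A*B + beta*C:
//
//     gemm(handle, nontrans, nontrans,
//          /* m = */ cols(C), /* n = */ rows(C), /* k = */ cols(A),
//          &alpha, B, ldb, A, lda, &beta, C, ldc);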
{ + cublas::gemm( + exec->get_cublas_handle(), oneapi::mkl::transpose::nontrans, + oneapi::mkl::transpose::nontrans, c->get_size()[1], + c->get_size()[0], a->get_size()[1], alpha->get_const_values(), + b->get_const_values(), b->get_stride(), a->get_const_values(), + a->get_stride(), beta->get_const_values(), c->get_values(), + c->get_stride()); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); @@ -88,7 +1339,40 @@ template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if (cublas::is_supported::value) { + // TODO: write a custom kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + cublas::dot(exec->get_cublas_handle(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); + } + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + constexpr auto block_size = 1024; + + constexpr auto work_per_block = work_per_thread * block_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_dot( + grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_dot_computation( + 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); @@ -106,7 +1390,37 @@ template void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, matrix::Dense> *result) - GKO_NOT_IMPLEMENTED; +{ + if (cublas::is_supported::value) { + for (size_type col = 0; col < x->get_size()[1]; ++col) { + cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + result->get_values() + col); + } + } else { + using norm_type = remove_complex; + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + constexpr auto block_size = 1024; + + constexpr auto work_per_block = work_per_thread * block_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_norm2( + grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), work.get_data()); + kernel::finalize_norm2_computation( + 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); @@ -115,7 +1429,28 @@ template void convert_to_coo(std::shared_ptr exec, const matrix::Dense 
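// Editorial note: the non-BLAS paths of compute_dot and compute_norm2 above
// use the classic two-kernel reduction: compute_partial_* folds
// work_per_thread elements per thread and writes one partial result per
// work-group into `work`, then finalize_*_computation reduces those
// grid_dim.x partials within a single work-group. Host-side shape of the
// pattern, with the names used in this file:
//
//     Array<ValueType> work(exec, grid_dim.x);
//     kernel::compute_partial_dot(grid_dim, block_dim, 0, exec->get_queue(),
//                                 n, x_vals, stride_x, y_vals, stride_y,
//                                 work.get_data());
//     kernel::finalize_dot_computation(1, block_dim, 0, exec->get_queue(),
//                                      grid_dim.x, work.get_const_data(),
//                                      result_val);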
*source, matrix::Coo *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_idxs = result->get_row_idxs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto stride = source->get_stride(); + + auto nnz_prefix_sum = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); + + components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); + + size_type grid_dim = ceildiv(num_rows, default_block_size); + + kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, stride, + nnz_prefix_sum.get_const_data(), + source->get_const_values(), row_idxs, col_idxs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); @@ -125,7 +1460,31 @@ template void convert_to_csr(std::shared_ptr exec, const matrix::Dense *source, matrix::Csr *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto stride = source->get_stride(); + + const auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); + + kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, + exec->get_queue(), num_rows, num_cols, stride, + source->get_const_values(), row_ptrs); + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + + size_type grid_dim = ceildiv(num_rows, default_block_size); + + kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, stride, source->get_const_values(), + row_ptrs, col_idxs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); @@ -135,7 +1494,23 @@ template void convert_to_ell(std::shared_ptr exec, const matrix::Dense *source, matrix::Ell *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + auto max_nnz_per_row = result->get_num_stored_elements_per_row(); + + auto col_ptrs = result->get_col_idxs(); + auto values = result->get_values(); + + auto source_stride = source->get_stride(); + auto result_stride = result->get_stride(); + + auto grid_dim = ceildiv(result_stride, default_block_size); + kernel::fill_in_ell(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, source_stride, + source->get_const_values(), max_nnz_per_row, + result_stride, col_ptrs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); @@ -155,7 +1530,46 @@ template void convert_to_sellp(std::shared_ptr exec, const matrix::Dense *source, matrix::Sellp *result) - GKO_NOT_IMPLEMENTED; +{ + const auto stride = source->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + + auto vals = result->get_values(); + auto col_idxs = result->get_col_idxs(); + auto slice_lengths = result->get_slice_lengths(); + auto slice_sets = result->get_slice_sets(); + + const auto slice_size = (result->get_slice_size() == 0) + ? matrix::default_slice_size + : result->get_slice_size(); + const auto stride_factor = (result->get_stride_factor() == 0) + ? 
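// Editorial note: the Dense -> Coo/Csr conversions here follow the standard
// three-phase scheme: (1) count the nonzeros of each row, (2) take an
// exclusive prefix sum of the counts to turn them into output offsets,
// (3) fill the index/value arrays at those offsets. For Csr this reads:
//
//     kernel::count_nnz_per_row(...);                        // phase 1
//     components::prefix_sum(exec, row_ptrs, num_rows + 1);  // phase 2
//     kernel::fill_in_csr(...);                              // phase 3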
matrix::default_stride_factor + : result->get_stride_factor(); + const int slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto grid_dim = slice_num; + + if (grid_dim > 0) { + kernel::calculate_slice_lengths( + grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, + slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), + slice_lengths, slice_sets); + } + + components::prefix_sum(exec, slice_sets, slice_num + 1); + + grid_dim = ceildiv(num_rows, default_block_size); + if (grid_dim > 0) { + kernel::fill_in_sellp(grid_dim, default_block_size, 0, + exec->get_queue(), num_rows, num_cols, slice_size, + stride, source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); @@ -173,8 +1587,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzeros(std::shared_ptr exec, - const matrix::Dense *source, - size_type *result) GKO_NOT_IMPLEMENTED; + const matrix::Dense *source, size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); @@ -182,7 +1603,33 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); template void calculate_max_nnz_per_row(std::shared_ptr exec, const matrix::Dense *source, - size_type *result) GKO_NOT_IMPLEMENTED; + size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + const auto n = ceildiv(num_rows, default_block_size); + const size_type grid_dim = + (n <= default_block_size) ? 
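// Editorial note: grid_dim for the max-nnz reduction is deliberately capped
// at default_block_size, so the intermediate block_results array never
// grows beyond what the second, single-work-group reduce_max_nnz launch
// below can sweep in a few strided passes:
//
//     pass 1: grid_dim blocks reduce num_rows counts -> grid_dim partials
//     pass 2: 1 block reduces grid_dim partials      -> final maximum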
n : default_block_size; + + auto block_results = Array(exec, grid_dim); + + kernel::reduce_max_nnz( + grid_dim, default_block_size, default_block_size * sizeof(size_type), + exec->get_queue(), num_rows, nnz_per_row.get_const_data(), + block_results.get_data()); + + auto d_result = Array(exec, 1); + + kernel::reduce_max_nnz(1, default_block_size, + default_block_size * sizeof(size_type), + exec->get_queue(), grid_dim, + block_results.get_const_data(), d_result.get_data()); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); @@ -191,7 +1638,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void calculate_nonzeros_per_row(std::shared_ptr exec, const matrix::Dense *source, - Array *result) GKO_NOT_IMPLEMENTED; + Array *result) +{ + const dim3 block_size(default_block_size, 1, 1); + auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); + const dim3 grid_size(grid_x, 1, 1); + if (grid_x > 0) { + kernel::count_nnz_per_row( + grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], + source->get_size()[1], source->get_stride(), + source->get_const_values(), result->get_data()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); @@ -201,7 +1660,48 @@ template void calculate_total_cols(std::shared_ptr exec, const matrix::Dense *source, size_type *result, size_type stride_factor, - size_type slice_size) GKO_NOT_IMPLEMENTED; + size_type slice_size) +{ + const auto num_rows = source->get_size()[0]; + + if (num_rows == 0) { + *result = 0; + return; + } + + const auto num_cols = source->get_size()[1]; + const auto slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto max_nnz_per_slice = Array(exec, slice_num); + + auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); + + kernel::reduce_max_nnz_per_slice( + grid_dim, default_block_size, 0, exec->get_queue(), num_rows, + slice_size, stride_factor, nnz_per_row.get_const_data(), + max_nnz_per_slice.get_data()); + + grid_dim = ceildiv(slice_num, default_block_size); + auto block_results = Array(exec, grid_dim); + + kernel::reduce_total_cols( + grid_dim, default_block_size, default_block_size * sizeof(size_type), + exec->get_queue(), slice_num, max_nnz_per_slice.get_const_data(), + block_results.get_data()); + + auto d_result = Array(exec, 1); + + kernel::reduce_total_cols( + 1, default_block_size, default_block_size * sizeof(size_type), + exec->get_queue(), grid_dim, block_results.get_const_data(), + d_result.get_data()); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); @@ -210,7 +1710,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void transpose(std::shared_ptr exec, const matrix::Dense *orig, - matrix::Dense *trans) GKO_NOT_IMPLEMENTED; + matrix::Dense *trans) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::geam( + handle, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, static_cast(nullptr), + trans->get_size()[1], 
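// Editorial note: this transpose path reuses the BLAS extension geam, which
// computes C = alpha * op(A) + beta * op(B); with alpha = 1, beta = 0 and
// op(A) = A^T it degenerates into an out-of-place transpose, so the nullptr
// passed for B is never dereferenced. conj_transpose below is identical
// except that it requests conjtrans.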
trans->get_values(), trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +}; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); @@ -218,7 +1736,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, const matrix::Dense *orig, - matrix::Dense *trans) GKO_NOT_IMPLEMENTED; + matrix::Dense *trans) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::geam( + handle, oneapi::mkl::transpose::conjtrans, + oneapi::mkl::transpose::nontrans, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, static_cast(nullptr), + trans->get_size()[1], trans->get_values(), trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); From 2e63c8d321082effa6bdec5ecb8927f59c4d7a62 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 24 Feb 2021 16:09:26 +0800 Subject: [PATCH 05/22] dense, prefix_sum and uninitialized_array --- dpcpp/CMakeLists.txt | 1 + dpcpp/components/prefix_sum.dp.cpp | 33 +- dpcpp/components/prefix_sum.dp.hpp | 260 ++++++++ dpcpp/components/uninitialized_array.hpp | 9 +- dpcpp/matrix/dense_kernels.dp.cpp | 247 +++---- dpcpp/test/components/CMakeLists.txt | 1 + dpcpp/test/components/prefix_sum.cpp | 96 +++ dpcpp/test/matrix/CMakeLists.txt | 3 +- dpcpp/test/matrix/dense_kernels.cpp | 806 +++++++++++++++++++++++ dpcpp/test/utils.hpp | 54 ++ 10 files changed, 1372 insertions(+), 138 deletions(-) create mode 100644 dpcpp/components/prefix_sum.dp.hpp create mode 100644 dpcpp/test/components/prefix_sum.cpp create mode 100644 dpcpp/test/matrix/dense_kernels.cpp create mode 100644 dpcpp/test/utils.hpp diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 97cf8a5daf6..b3101d8b2e2 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -60,6 +60,7 @@ target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +target_link_libraries(ginkgo_dpcpp PRIVATE "mkl_sycl;mkl_intel_ilp64;mkl_sequential;mkl_core") target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 4b7f816b381..b4961809a8b 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include "dpcpp/components/prefix_sum.dp.hpp" namespace gko { @@ -45,22 +45,33 @@ namespace dpcpp { namespace components { +constexpr int prefix_sum_block_size = 256; + + template -void prefix_sum(std::shared_ptr exec, IndexType *counts, +void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // TODO actually implement parallel prefix sum - exec->get_queue()->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{1}, [=](sycl::id<1> idx) { - IndexType sum{}; - for (size_type i = 0; i < num_entries; i++) { - sum += std::exchange(counts[i], sum); - } - }); - }); + // prefix_sum should be on the valid array + if (num_entries > 0) { + auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); + Array block_sum_array(exec, num_blocks - 1); + auto block_sums = block_sum_array.get_data(); + start_prefix_sum( + num_blocks, prefix_sum_block_size, 0, exec->get_queue(), + num_entries, counts, block_sums); + // add the total sum of the previous block only when the number of block + // is larger than 1. + if (num_blocks > 1) { + finalize_prefix_sum( + num_blocks, prefix_sum_block_size, 0, exec->get_queue(), + num_entries, counts, block_sums); + } + } } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); + // instantiate for size_type as well, as this is used in the Sellp format template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp new file mode 100644 index 00000000000..6b3498d1dea --- /dev/null +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -0,0 +1,260 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
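Editorial note: the rewritten prefix_sum above replaces the old
single-work-item serial scan with a blocked two-phase scan: start_prefix_sum
computes an exclusive scan within each block of prefix_sum_block_size
elements and records each block's total in block_sums, and
finalize_prefix_sum then adds the accumulated totals of all preceding blocks
to every element. A serial reference of what the pair computes (the same
exclusive scan the removed kernel performed, useful as a test oracle):

    IndexType sum{};
    for (size_type i = 0; i < num_entries; ++i) {
        auto tmp = counts[i];
        counts[i] = sum;  // counts[i] becomes the sum of counts[0..i-1]
        sum += tmp;
    }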
+*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ + + +#include + + +#include + + +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +// #include "common/components/prefix_sum.hpp.inc" +/** + * @internal + * Computes the prefix sum and total sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param total_sum will be set to the total sum of `element` in this subwarp. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note For this function to work on architectures with independent thread + * scheduling, all threads of the subwarp have to execute it. + */ +template +__dpct_inline__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, + ValueType &total_sum, Group subwarp) +{ + prefix_sum = inclusive ? element : zero(); + total_sum = element; +#pragma unroll + // hypercube prefix sum + for (auto step = 1; step < subwarp.size(); step *= 2) { + auto neighbor = subwarp.shfl_xor(total_sum, step); + total_sum += neighbor; + prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0; + } +} + +/** + * @internal + * Computes the prefix sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note All threads of the subwarp have to execute this function for it to work + * (and not dead-lock on newer architectures). + */ +template +__dpct_inline__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, Group subwarp) +{ + ValueType tmp{}; + subwarp_prefix_sum(element, prefix_sum, tmp, subwarp); +} + + +/** + * @internal + * First step of the calculation of a prefix sum. Calculates the prefix sum + * in-place on parts of the array `elements`. + * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array which stores the total sum of each block, requires at + * least `ceildiv(num_elements, block_size) - 1` elements + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, also size of blocks on + * which this kernel calculates the prefix sum in-place + * + * @note To calculate the prefix sum over an array of size bigger than + * `block_size`, `finalize_prefix_sum` has to be used as well. 
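+ *
+ * Usage sketch, mirroring the host wrapper in prefix_sum.dp.cpp: for
+ * `num_blocks = ceildiv(num_elements, block_size)` blocks, `block_sum`
+ * needs `num_blocks - 1` entries, and the finalize step is only required
+ * when more than one block is involved:
+ *
+ *     start_prefix_sum<block_size>(num_blocks, block_size, 0, queue,
+ *                                  num_elements, elements, block_sums);
+ *     if (num_blocks > 1) {
+ *         finalize_prefix_sum<block_size>(num_blocks, block_size, 0, queue,
+ *                                         num_elements, elements,
+ *                                         block_sums);
+ *     }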
+ */ +template +void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, + ValueType *__restrict__ block_sum, + sycl::nd_item<3> item_ct1, + UninitializedArray *prefix_helper) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + const auto element_id = item_ct1.get_local_id(2); + + // do not need to access the last element when exclusive prefix sum + (*prefix_helper)[element_id] = + (tidx + 1 < num_elements) ? elements[tidx] : zero(); + auto this_block = group::this_thread_block(item_ct1); + this_block.sync(); + + // Do a normal reduction +#pragma unroll + for (int i = 1; i < block_size; i <<= 1) { + const auto ai = i * (2 * element_id + 1) - 1; + const auto bi = i * (2 * element_id + 2) - 1; + if (bi < block_size) { + (*prefix_helper)[bi] += (*prefix_helper)[ai]; + } + this_block.sync(); + } + + if (element_id == 0) { + // Store the total sum except the last block + if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) { + block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1]; + } + (*prefix_helper)[block_size - 1] = zero(); + } + + this_block.sync(); + + // Perform the down-sweep phase to get the true prefix sum +#pragma unroll + for (int i = block_size >> 1; i > 0; i >>= 1) { + const auto ai = i * (2 * element_id + 1) - 1; + const auto bi = i * (2 * element_id + 2) - 1; + if (bi < block_size) { + auto tmp = (*prefix_helper)[ai]; + (*prefix_helper)[ai] = (*prefix_helper)[bi]; + (*prefix_helper)[bi] += tmp; + } + this_block.sync(); + } + if (tidx < num_elements) { + elements[tidx] = (*prefix_helper)[element_id]; + } +} + +template +void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_elements, + ValueType *elements, ValueType *block_sum) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + prefix_helper_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + start_prefix_sum( + num_elements, elements, block_sum, item_ct1, + (UninitializedArray *) + prefix_helper_acc_ct1.get_pointer()); + }); + }); +} + + +/** + * @internal + * Second step of the calculation of a prefix sum. Increases the value of each + * entry of `elements` by the total sum of all preceding blocks. + * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array storing the total sum of each block + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, has to be the same as + * for `start_prefix_sum` + * + * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. 
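+ *
+ * @note (editorial) Each thread recomputes the running total of all
+ * preceding `block_sum` entries with a serial loop, so this step costs
+ * O(num_blocks) additions per element; in effect
+ *
+ *     elements[i] += block_sum[0] + ... + block_sum[group_id - 1]
+ *
+ * which is simple and race-free, if not asymptotically optimal.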
+ */ +template +void finalize_prefix_sum(size_type num_elements, + ValueType *__restrict__ elements, + const ValueType *__restrict__ block_sum, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < num_elements) { + ValueType prefix_block_sum = zero(); + for (size_type i = 0; i < item_ct1.get_group(2); i++) { + prefix_block_sum += block_sum[i]; + } + elements[tidx] += prefix_block_sum; + } +} + +template +void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_elements, + ValueType *elements, const ValueType *block_sum) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + finalize_prefix_sum( + num_elements, elements, block_sum, item_ct1); + }); + }); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index fb7575bc202..415126b8ed3 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -88,7 +88,7 @@ class UninitializedArray { constexpr __dpct_inline__ ValueType &operator[](size_type pos) const noexcept { - return reinterpret_cast(data_)[pos]; + return data_[pos]; } /** @@ -99,13 +99,14 @@ class UninitializedArray { * * @return a reference to the array entry at the given index. */ - GKO_ATTRIBUTES ValueType &operator[](size_type pos) noexcept + __dpct_inline__ ValueType &operator[](size_type pos) noexcept { - return reinterpret_cast(data_)[pos]; + return data_[pos]; } private: - unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; + // unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; + ValueType data_[size]; }; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 36935b1a4d6..c5074b5cc38 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,11 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" -#include -#include - - #include +#include #include @@ -48,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
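Editorial note: the UninitializedArray change above swaps the raw
`unsigned char` backing buffer, which deliberately avoided running
ValueType's constructors, for a plain `ValueType data_[size]` member (and
switches GKO_ATTRIBUTES to __dpct_inline__ for the DPC++ backend). That
removes the reinterpret_casts, but the array is then only truly
"uninitialized" for trivially constructible value types, which is
presumably acceptable for the arithmetic types it holds in local memory.
The replaced idiom, for reference:

    unsigned char data_[sizeof(ValueType) * size];  // storage, no ctors run
    ValueType &operator[](size_type pos) noexcept
    {
        return reinterpret_cast<ValueType *>(data_)[pos];
    }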
#include #include #include +#include #include "core/components/prefix_sum.hpp" @@ -70,7 +68,7 @@ namespace dpcpp { namespace dense { -constexpr auto default_block_size = 512; +constexpr auto default_block_size = 256; // #include "common/matrix/dense_kernels.hpp.inc" @@ -236,18 +234,20 @@ void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work, const auto global_id = thread::get_thread_id(item_ct1); + OutType *tmp_work_array=*tmp_work; auto tmp = zero(); for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { tmp = reduce_op(tmp, get_value(i)); } - (*tmp_work)[local_id] = tmp; + tmp_work_array[local_id] = tmp; - reduce(group::this_thread_block(item_ct1), - static_cast((*tmp_work)), reduce_op); + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, + reduce_op); if (local_id == 0) { - work[thread::get_block_id(item_ct1)] = (*tmp_work)[0]; + work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; } } @@ -267,14 +267,15 @@ void finalize_reduce_computation( for (auto i = local_id; i < size; i += block_size) { tmp = reduce_op(tmp, work[i]); } + ValueType *tmp_work_array=*tmp_work; + tmp_work_array[local_id] = tmp; - (*tmp_work)[local_id] = tmp; - - reduce(group::this_thread_block(item_ct1), - static_cast((*tmp_work)), reduce_op); + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, + reduce_op); if (local_id == 0) { - *result = finalize_op((*tmp_work)[0]); + *result = finalize_op(tmp_work_array[0]); } } @@ -287,12 +288,7 @@ void compute_partial_dot(size_type num_rows, const ValueType *__restrict__ x, UninitializedArray *tmp_work) { compute_partial_reduce( - /* - DPCT1007:4: Migration of this CUDA API is not supported by the Intel(R) - DPC++ Compatibility Tool. 
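// Editorial note: the reductions in this hunk are now spelled out as
// ::gko::kernels::dpcpp::reduce rather than an unqualified reduce. With the
// oneDPL headers dropped from this file, the full qualification pins the
// cooperative-group reduction and keeps std::reduce (and any ADL candidate)
// out of overload resolution. The call shape used throughout:
//
//     ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1),
//                                   tmp_work_array, reduce_op);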
- */ - num_rows, - work, + num_rows, work, [x, stride_x, y, stride_y](size_type i) { return x[i * stride_x] * conj(y[i * stride_y]); }, @@ -416,8 +412,7 @@ void finalize_norm2_computation( finalize_reduce_computation( size, work, result, [](const ValueType &x, const ValueType &y) { return x + y; }, - [](const ValueType &x) { return sycl::sqrt((float)x); }, item_ct1, - tmp_work); + [](const ValueType &x) { return sqrt(x); }, item_ct1, tmp_work); } template @@ -508,9 +503,7 @@ void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, part_result += 1; } } - result[row_idx] = std::reduce( - oneapi::dpl::execution::make_device_policy( - dpct::get_default_queue()), + result[row_idx] = ::gko::kernels::dpcpp::reduce( warp_tile, part_result, [](const size_type &a, const size_type &b) { return a + b; }); } @@ -648,7 +641,7 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, auto warp_tile = group::tiled_partition( group::this_thread_block(item_ct1)); - auto warp_result = reduce( + auto warp_result = ::gko::kernels::dpcpp::reduce( warp_tile, thread_result, [](const size_type &a, const size_type &b) { return max(a, b); }); @@ -745,7 +738,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, auto block_max = (size_type *)dpct_local; reduce_array( - size, nnz_per_row, block_max, + size, nnz_per_row, block_max, item_ct1, [](const size_type &x, const size_type &y) { return max(x, y); }); if (item_ct1.get_local_id(2) == 0) { @@ -795,7 +788,7 @@ void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, } } - auto warp_result = reduce( + auto warp_result = ::gko::kernels::dpcpp::reduce( warp_tile, thread_result, [](const size_type &a, const size_type &b) { return max(a, b); }); @@ -831,7 +824,7 @@ void reduce_total_cols(size_type num_slices, { auto block_result = (size_type *)dpct_local; - reduce_array(num_slices, max_nnz_per_slice, block_result, + reduce_array(num_slices, max_nnz_per_slice, block_result, item_ct1, [](const size_type &x, const size_type &y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { @@ -1292,22 +1285,13 @@ void simple_apply(std::shared_ptr exec, const matrix::Dense *b, matrix::Dense *c) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - { - cublas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - cublas::gemm(handle, oneapi::mkl::transpose::nontrans, - oneapi::mkl::transpose::nontrans, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } + using namespace oneapi::mkl; + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + one(), a->get_const_values(), a->get_stride(), + b->get_const_values(), b->get_stride(), zero(), + c->get_values(), c->get_stride()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -1319,17 +1303,14 @@ void apply(std::shared_ptr exec, const matrix::Dense *a, const matrix::Dense *b, const matrix::Dense *beta, matrix::Dense *c) { - if (cublas::is_supported::value) { - cublas::gemm( - exec->get_cublas_handle(), oneapi::mkl::transpose::nontrans, - oneapi::mkl::transpose::nontrans, c->get_size()[1], - c->get_size()[0], a->get_size()[1], alpha->get_const_values(), - b->get_const_values(), 
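// Editorial note: simple_apply now calls oneapi::mkl::blas::row_major::gemm
// directly, which retires the swapped-operand trick above: the row-major
// entry point takes C's dimensions in their natural (rows, cols) order. The
// USM gemm also takes alpha and beta by value on the host, which is why the
// advanced apply below fetches them with exec->copy_val_to_host() instead
// of passing device pointers:
//
//     oneapi::mkl::blas::row_major::gemm(
//         *exec->get_queue(), transpose::nontrans, transpose::nontrans,
//         m, n, k, exec->copy_val_to_host(alpha->get_const_values()),
//         a_vals, lda, b_vals, ldb,
//         exec->copy_val_to_host(beta->get_const_values()), c_vals, ldc);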
b->get_stride(), a->get_const_values(), - a->get_stride(), beta->get_const_values(), c->get_values(), - c->get_stride()); - } else { - GKO_NOT_IMPLEMENTED; - } + using namespace oneapi::mkl; + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); @@ -1341,20 +1322,20 @@ void compute_dot(std::shared_ptr exec, const matrix::Dense *y, matrix::Dense *result) { - if (cublas::is_supported::value) { + if (0) { // TODO: write a custom kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); + dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); } } else { // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified // appropriately constexpr auto work_per_thread = 32; - constexpr auto block_size = 1024; + constexpr auto block_size = default_block_size; constexpr auto work_per_block = work_per_thread * block_size; const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); @@ -1366,7 +1347,8 @@ void compute_dot(std::shared_ptr exec, kernel::compute_partial_dot( grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); + y->get_const_values() + col, y->get_stride(), + work.get_data()); kernel::finalize_dot_computation( 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); @@ -1391,19 +1373,20 @@ void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, matrix::Dense> *result) { - if (cublas::is_supported::value) { + if (0) { for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); + oneapi::mkl::blas::row_major::nrm2( + *exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + result->get_values() + col); } } else { using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately + // // TODO: these are tuning parameters obtained experimentally, once + // // we decide how to handle this uniformly, they should be modified + // // appropriately constexpr auto work_per_thread = 32; - constexpr auto block_size = 1024; + constexpr auto block_size = default_block_size; constexpr auto work_per_block = work_per_thread * block_size; const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); @@ -1414,7 +1397,8 @@ void compute_norm2(std::shared_ptr exec, for (size_type col = 0; col < x->get_size()[1]; ++col) { kernel::compute_partial_norm2( grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), 
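// Editorial note: compute_norm2 reuses the dot-product reduction machinery:
// the partial kernel accumulates squared magnitudes and the finalize kernel
// sums them and applies the square root as its finalize_op, i.e.
//
//     result = sqrt(sum_i |x_i|^2)
//
// Two details from this hunk worth flagging: the finalize step now calls
// sqrt(x) instead of sycl::sqrt((float)x), so double-precision norms are no
// longer truncated through float, and the `if (0)` guards here and in
// compute_dot above keep the oneMKL dot/nrm2 paths compiled but
// unreachable, presumably until the per-column BLAS calls are validated.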
work.get_data()); + x->get_const_values() + col, x->get_stride(), + work.get_data()); kernel::finalize_norm2_computation( 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); @@ -1449,7 +1433,8 @@ void convert_to_coo(std::shared_ptr exec, kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(), num_rows, num_cols, stride, nnz_prefix_sum.get_const_data(), - source->get_const_values(), row_idxs, col_idxs, values); + source->get_const_values(), row_idxs, col_idxs, + values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1470,8 +1455,9 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(default_block_size, config::warp_size); - const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); + const auto rows_per_block = ceildiv(default_block_size, + config::warp_size); const auto grid_dim_nnz = + ceildiv(source->get_size()[0], rows_per_block); kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, exec->get_queue(), num_rows, num_cols, stride, @@ -1482,8 +1468,9 @@ void convert_to_csr(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, source->get_const_values(), - row_ptrs, col_idxs, values); + num_rows, num_cols, stride, + source->get_const_values(), row_ptrs, col_idxs, + values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1549,25 +1536,34 @@ void convert_to_sellp(std::shared_ptr exec, const int slice_num = ceildiv(num_rows, slice_size); auto nnz_per_row = Array(exec, num_rows); + std::cout << "calculate_nonzeros_per_row" << std::endl; calculate_nonzeros_per_row(exec, source, &nnz_per_row); - + exec->synchronize(); + std::cout << "calculate_nonzeros_per_row finish" << std::endl; auto grid_dim = slice_num; if (grid_dim > 0) { + std::cout << "calculate_slice_lengths" << std::endl; kernel::calculate_slice_lengths( grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, - slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), - slice_lengths, slice_sets); + slice_size, slice_num, stride_factor, + nnz_per_row.get_const_data(), slice_lengths, slice_sets); + exec->synchronize(); + std::cout << "calculate_slice_lengths finish" << std::endl; } - + std::cout << "prefix_sum" << std::endl; components::prefix_sum(exec, slice_sets, slice_num + 1); - + // exec->synchronize(); + std::cout << "prefix_sum finish" << std::endl; grid_dim = ceildiv(num_rows, default_block_size); if (grid_dim > 0) { + std::cout << "fill_in_sellp" << std::endl; kernel::fill_in_sellp(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, slice_size, - stride, source->get_const_values(), slice_lengths, - slice_sets, col_idxs, vals); + exec->get_queue(), num_rows, num_cols, + slice_size, stride, source->get_const_values(), + slice_lengths, slice_sets, col_idxs, vals); + exec->synchronize(); + std::cout << "fill_in_sellp finish" << std::endl; } } @@ -1626,7 +1622,8 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, kernel::reduce_max_nnz(1, default_block_size, default_block_size * sizeof(size_type), exec->get_queue(), grid_dim, - block_results.get_const_data(), d_result.get_data()); + block_results.get_const_data(), + d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } @@ -1646,9 +1643,10 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const dim3 
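// Editorial note: the std::cout progress markers and the extra
// exec->synchronize() calls in convert_to_sellp above read as temporary
// debugging instrumentation (narrowing down which kernel launch fails);
// they serialize the queue on every conversion and would normally be
// stripped, or moved behind a debug-logging facility, before this patch is
// merged.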
grid_size(grid_x, 1, 1); if (grid_x > 0) { kernel::count_nnz_per_row( - grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], - source->get_size()[1], source->get_stride(), - source->get_const_values(), result->get_data()); + grid_size, block_size, 0, exec->get_queue(), + source->get_size()[0], source->get_size()[1], + source->get_stride(), source->get_const_values(), + result->get_data()); } } @@ -1678,7 +1676,8 @@ void calculate_total_cols(std::shared_ptr exec, auto max_nnz_per_slice = Array(exec, slice_num); - auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); + auto grid_dim = ceildiv(slice_num * config::warp_size, + default_block_size); kernel::reduce_max_nnz_per_slice( grid_dim, default_block_size, 0, exec->get_queue(), num_rows, @@ -1712,22 +1711,24 @@ void transpose(std::shared_ptr exec, const matrix::Dense *orig, matrix::Dense *trans) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - { - cublas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - cublas::geam( - handle, oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, static_cast(nullptr), - trans->get_size()[1], trans->get_values(), trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } + // if (cublas::is_supported::value) { + // auto handle = exec->get_cublas_handle(); + // { + // cublas::pointer_mode_guard pm_guard(handle); + // auto alpha = one(); + // auto beta = zero(); + // cublas::geam( + // handle, oneapi::mkl::transpose::trans, + // oneapi::mkl::transpose::nontrans, orig->get_size()[0], + // orig->get_size()[1], &alpha, orig->get_const_values(), + // orig->get_stride(), &beta, static_cast(nullptr), trans->get_size()[1], trans->get_values(), + // trans->get_stride()); + // } + // } else { + // GKO_NOT_IMPLEMENTED; + // } + GKO_NOT_IMPLEMENTED; }; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); @@ -1738,22 +1739,24 @@ void conj_transpose(std::shared_ptr exec, const matrix::Dense *orig, matrix::Dense *trans) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - { - cublas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - cublas::geam( - handle, oneapi::mkl::transpose::conjtrans, - oneapi::mkl::transpose::nontrans, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, static_cast(nullptr), - trans->get_size()[1], trans->get_values(), trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } + // if (cublas::is_supported::value) { + // auto handle = exec->get_cublas_handle(); + // { + // cublas::pointer_mode_guard pm_guard(handle); + // auto alpha = one(); + // auto beta = zero(); + // cublas::geam( + // handle, oneapi::mkl::transpose::conjtrans, + // oneapi::mkl::transpose::nontrans, orig->get_size()[0], + // orig->get_size()[1], &alpha, orig->get_const_values(), + // orig->get_stride(), &beta, static_cast(nullptr), trans->get_size()[1], trans->get_values(), + // trans->get_stride()); + // } + // } else { + // GKO_NOT_IMPLEMENTED; + // } + GKO_NOT_IMPLEMENTED; } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); diff --git a/dpcpp/test/components/CMakeLists.txt b/dpcpp/test/components/CMakeLists.txt index 87a034a64df..77ad6684840 100644 --- a/dpcpp/test/components/CMakeLists.txt +++ 
b/dpcpp/test/components/CMakeLists.txt @@ -2,3 +2,4 @@ ginkgo_create_test(absolute_array) ginkgo_create_dpcpp_test(cooperative_groups_kernels) ginkgo_create_test(fill_array) ginkgo_create_test(precision_conversion) +ginkgo_create_test(prefix_sum) diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp new file mode 100644 index 00000000000..3e2e7ca9d64 --- /dev/null +++ b/dpcpp/test/components/prefix_sum.cpp @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "dpcpp/test/utils.hpp" + + +namespace { + + +class PrefixSum : public ::testing::Test { +protected: + using index_type = gko::int32; + PrefixSum() + : ref(gko::ReferenceExecutor::create()), + exec(gko::DpcppExecutor::create(0, ref)), + rand(293), + total_size(42793), + vals(ref, total_size), + dvals(exec) + { + std::uniform_int_distribution dist(0, 1000); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + } + dvals = vals; + } + + void test(gko::size_type size) + { + gko::kernels::reference::components::prefix_sum(ref, vals.get_data(), + size); + gko::kernels::dpcpp::components::prefix_sum(exec, dvals.get_data(), + size); + + GKO_ASSERT_ARRAY_EQ(vals, dvals); + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; +}; + + +TEST_F(PrefixSum, SmallEqualsReference) { test(100); } + + +TEST_F(PrefixSum, BigEqualsReference) { test(total_size); } + + +} // namespace diff --git a/dpcpp/test/matrix/CMakeLists.txt b/dpcpp/test/matrix/CMakeLists.txt index af64b693718..ba8f0fb70fe 100644 --- a/dpcpp/test/matrix/CMakeLists.txt +++ b/dpcpp/test/matrix/CMakeLists.txt @@ -1 +1,2 @@ -ginkgo_create_test(csr_kernels) \ No newline at end of file +ginkgo_create_test(csr_kernels) +ginkgo_create_test(dense_kernels) diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp new file mode 100644 index 00000000000..7c65e8b0f84 --- /dev/null +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -0,0 +1,806 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/components/fill_array.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "dpcpp/test/utils.hpp" + + +namespace { + + +class Dense : public ::testing::Test { +protected: + using itype = int; + using vtype = double; + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + using Arr = gko::Array; + using ComplexMtx = gko::matrix::Dense>; + + Dense() : rand_engine(15) {} + + void SetUp() + { + ASSERT_GT(gko::DpcppExecutor::get_num_devices("gpu"), 0); + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + } + + void TearDown() + { + if (dpcpp != nullptr) { + ASSERT_NO_THROW(dpcpp->synchronize()); + } + } + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void set_up_vector_data(gko::size_type num_vecs, + bool different_alpha = false) + { + x = gen_mtx(1000, num_vecs); + y = gen_mtx(1000, num_vecs); + if (different_alpha) { + alpha = gen_mtx(1, num_vecs); + } else { + alpha = gko::initialize({2.0}, ref); + } + dx = Mtx::create(dpcpp); + dx->copy_from(x.get()); + dy = Mtx::create(dpcpp); + dy->copy_from(y.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + expected = Mtx::create(ref, gko::dim<2>{1, num_vecs}); + dresult = Mtx::create(dpcpp, gko::dim<2>{1, num_vecs}); + } + + void set_up_apply_data() + { + x = gen_mtx(65, 25); + c_x = gen_mtx(65, 25); + y = gen_mtx(25, 35); + expected = gen_mtx(65, 35); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + square = gen_mtx(x->get_size()[0], x->get_size()[0]); + dx = Mtx::create(dpcpp); + dx->copy_from(x.get()); + dc_x = ComplexMtx::create(dpcpp); + dc_x->copy_from(c_x.get()); + dy = Mtx::create(dpcpp); + dy->copy_from(y.get()); + dresult = Mtx::create(dpcpp); + dresult->copy_from(expected.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + dbeta = Mtx::create(dpcpp); + dbeta->copy_from(beta.get()); + dsquare = Mtx::create(dpcpp); + dsquare->copy_from(square.get()); + + std::vector tmp(x->get_size()[0], 0); + auto rng = std::default_random_engine{}; + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rng); + std::vector tmp2(x->get_size()[1], 0); + std::iota(tmp2.begin(), tmp2.end(), 0); + std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector tmp3(x->get_size()[0] / 10); + std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + for (auto &i : tmp3) { + i = row_dist(rng); + } + rpermute_idxs = + std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + cpermute_idxs = + std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); + rgather_idxs = + std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + } + + std::shared_ptr ref; + std::shared_ptr dpcpp; + + std::ranlux48 rand_engine; + + std::unique_ptr x; + std::unique_ptr c_x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr square; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dc_x; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; + std::unique_ptr dsquare; + std::unique_ptr rpermute_idxs; + std::unique_ptr 
cpermute_idxs; + std::unique_ptr rgather_idxs; +}; + + +TEST_F(Dense, DpcppFillIsEquivalentToRef) +{ + set_up_vector_data(3); + auto result = Mtx::create(ref); + + x->fill(42); + dx->fill(42); + result->copy_from(dx.get()); + + GKO_ASSERT_MTX_NEAR(result, x, 1e-14); +} + + +TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) +{ + using T = double; + auto x = gko::initialize>( + 4, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}, ref); + auto dx = gko::initialize>( + 4, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}, dpcpp); + auto result = Mtx::create(ref); + + x->fill(42); + dx->fill(42); + result->copy_from(dx.get()); + + GKO_ASSERT_MTX_NEAR(result, x, 1e-14); +} + + +TEST_F(Dense, SingleVectorDpcppScaleIsEquivalentToRef) +{ + set_up_vector_data(1); + auto result = Mtx::create(ref); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + result->copy_from(dx.get()); + + GKO_ASSERT_MTX_NEAR(result, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppScaleIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppScaleWithDifferentAlphaIsEquivalentToRef) +{ + set_up_vector_data(20, true); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, SingleVectorDpcppAddScaledIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppAddScaledIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppAddScaledWithDifferentAlphaIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, AddsScaledDiagIsEquivalentToRef) +{ + auto mat = gen_mtx(532, 532); + gko::Array diag_values(ref, 532); + gko::kernels::reference::components::fill_array(ref, diag_values.get_data(), + 532, Mtx::value_type{2.0}); + auto diag = + gko::matrix::Diagonal::create(ref, 532, diag_values); + alpha = gko::initialize({2.0}, ref); + auto dmat = Mtx::create(dpcpp); + dmat->copy_from(mat.get()); + auto ddiag = gko::matrix::Diagonal::create(dpcpp); + ddiag->copy_from(diag.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + + mat->add_scaled(alpha.get(), diag.get()); + dmat->add_scaled(dalpha.get(), ddiag.get()); + + GKO_ASSERT_MTX_NEAR(mat, dmat, 1e-14); +} + + +TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef) +{ + set_up_vector_data(20); + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->dpcpp, norm_size); + + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14); +} + + 
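// A minimal host-side sketch of the quantity DpcppComputeNorm2IsEquivalentToRef
// checks above: the column-wise Euclidean norm of a row-major matrix with a
// stride. Plain C++ with a hypothetical helper name (not part of this test
// file); double storage is assumed purely for illustration.
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<double> column_norm2(const std::vector<double> &mat,
                                 std::size_t num_rows, std::size_t num_cols,
                                 std::size_t stride)
{
    std::vector<double> result(num_cols);
    for (std::size_t col = 0; col < num_cols; ++col) {
        double sum = 0.0;
        for (std::size_t row = 0; row < num_rows; ++row) {
            const auto value = mat[row * stride + col];
            sum += value * value;  // accumulate the squared column entries
        }
        result[col] = std::sqrt(sum);  // 2-norm of column `col`
    }
    return result;
}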
+TEST_F(Dense, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(y.get(), expected.get()); + dx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, ApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +} + + +TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); + dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +} + + +TEST_F(Dense, IsTransposable) +{ + set_up_apply_data(); + + auto trans = x->transpose(); + auto dtrans = dx->transpose(); + + GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), + static_cast(trans.get()), 0); +} + + +TEST_F(Dense, IsConjugateTransposable) +{ + set_up_apply_data(); + + auto trans = c_x->conj_transpose(); + auto dtrans = dc_x->conj_transpose(); + + GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), + static_cast(trans.get()), 0); +} + + +TEST_F(Dense, ConvertToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo<>::create(ref); + auto dcoo_mtx = gko::matrix::Coo<>::create(dpcpp); + + x->convert_to(coo_mtx.get()); + dx->convert_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo<>::create(ref); + auto dcoo_mtx = gko::matrix::Coo<>::create(dpcpp); + + x->move_to(coo_mtx.get()); + dx->move_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(dpcpp); + + x->convert_to(csr_mtx.get()); + dx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(dpcpp); + + x->move_to(csr_mtx.get()); + dx->move_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell<>::create(ref); + auto dell_mtx = gko::matrix::Ell<>::create(dpcpp); + + 
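    // For orientation, a worked ELL example (illustrative comment only, not
    // part of the conversion under test): ELL keeps at most max_nnz_per_row
    // entries per row in column-major, zero-padded arrays, so row i's k-th
    // stored entry lives at values[k * stride + i]. The dense matrix
    //     [1 0 2]
    //     [0 3 0]
    // with max_nnz_per_row = 2 and stride = 2 becomes
    //     values   = [1, 3, 2, 0]
    //     col_idxs = [0, 1, 2, 0]
    // where the trailing (0, 0) pair pads the shorter second row.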
x->convert_to(ell_mtx.get()); + dx->convert_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell<>::create(ref); + auto dell_mtx = gko::matrix::Ell<>::create(dpcpp); + + x->move_to(ell_mtx.get()); + dx->move_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp<>::create(ref); + auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); + + x->convert_to(sellp_mtx.get()); + dx->convert_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); +} + + +TEST_F(Dense, MoveToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp<>::create(ref); + auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); + + x->move_to(sellp_mtx.get()); + dx->move_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); +} + + +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(dpcpp); + auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + +TEST_F(Dense, CountNNZIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type nnz; + gko::size_type dnnz; + + gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz); + gko::kernels::dpcpp::dense::count_nonzeros(dpcpp, dx.get(), &dnnz); + + ASSERT_EQ(nnz, dnnz); +} + + +TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::Array nnz_per_row(ref); + nnz_per_row.resize_and_reset(x->get_size()[0]); + gko::Array dnnz_per_row(dpcpp); + dnnz_per_row.resize_and_reset(dx->get_size()[0]); + + gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), + &nnz_per_row); + gko::kernels::dpcpp::dense::calculate_nonzeros_per_row(dpcpp, dx.get(), + &dnnz_per_row); + + auto tmp = gko::Array(ref, dnnz_per_row); + for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); + } +} + + +TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type max_nnz; + gko::size_type dmax_nnz; + + gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(), + &max_nnz); + gko::kernels::dpcpp::dense::calculate_max_nnz_per_row(dpcpp, dx.get(), + &dmax_nnz); + + ASSERT_EQ(max_nnz, dmax_nnz); +} + + +TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type total_cols; + gko::size_type dtotal_cols; + + gko::kernels::reference::dense::calculate_total_cols( + ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size); + gko::kernels::dpcpp::dense::calculate_total_cols( + dpcpp, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); + + ASSERT_EQ(total_cols, dtotal_cols); +} + + +TEST_F(Dense, CanGatherRows) +{ + set_up_apply_data(); + + auto r_gather = x->row_gather(rgather_idxs.get()); + auto dr_gather = dx->row_gather(rgather_idxs.get()); + + GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); +} + + +TEST_F(Dense, CanGatherRowsIntoDense) +{ + set_up_apply_data(); + auto gather_size = + gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]}; + auto r_gather = Mtx::create(ref, gather_size); + // test make_temporary_clone and non-default stride + 
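    // Allocating dr_gather on the reference executor (instead of dpcpp)
    // forces the device kernel to go through make_temporary_clone, and the
    // stride of num_cols + 2 leaves two unused entries per row, so the
    // kernel must honor the stride rather than assume contiguous storage.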
auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2); + + x->row_gather(rgather_idxs.get(), r_gather.get()); + dx->row_gather(rgather_idxs.get(), dr_gather.get()); + + GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); +} + + +TEST_F(Dense, IsPermutable) +{ + set_up_apply_data(); + + auto permuted = square->permute(rpermute_idxs.get()); + auto dpermuted = dsquare->permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), + static_cast(dpermuted.get()), 0); +} + + +TEST_F(Dense, IsInversePermutable) +{ + set_up_apply_data(); + + auto permuted = square->inverse_permute(rpermute_idxs.get()); + auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), + static_cast(dpermuted.get()), 0); +} + + +TEST_F(Dense, IsRowPermutable) +{ + set_up_apply_data(); + + auto r_permute = x->row_permute(rpermute_idxs.get()); + auto dr_permute = dx->row_permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), + static_cast(dr_permute.get()), 0); +} + + +TEST_F(Dense, IsColPermutable) +{ + set_up_apply_data(); + + auto c_permute = x->column_permute(cpermute_idxs.get()); + auto dc_permute = dx->column_permute(cpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), + static_cast(dc_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseRowPermutable) +{ + set_up_apply_data(); + + auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), + static_cast(d_inverse_r_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseColPermutable) +{ + set_up_apply_data(); + + auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); + auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), + static_cast(d_inverse_c_permute.get()), 0); +} + + +TEST_F(Dense, ExtractDiagonalIsEquivalentToRef) +{ + set_up_apply_data(); + + auto diag = x->extract_diagonal(); + auto ddiag = dx->extract_diagonal(); + + GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0); +} + + +TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + x->compute_absolute_inplace(); + dx->compute_absolute_inplace(); + + GKO_ASSERT_MTX_NEAR(x, dx, 1e-14); +} + + +TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + auto abs_x = x->compute_absolute(); + auto dabs_x = dx->compute_absolute(); + + GKO_ASSERT_MTX_NEAR(abs_x, dabs_x, 1e-14); +} + + +TEST_F(Dense, MakeComplexIsEquivalentToRef) +{ + set_up_apply_data(); + + auto complex_x = x->make_complex(); + auto dcomplex_x = dx->make_complex(); + + GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); +} + + +TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto complex_x = ComplexMtx::create(ref, x->get_size()); + x->make_complex(complex_x.get()); + auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size()); + dx->make_complex(dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); +} + + +TEST_F(Dense, GetRealIsEquivalentToRef) +{ + set_up_apply_data(); + + auto real_x = x->get_real(); + auto dreal_x = dx->get_real(); + + GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); +} + + +TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto real_x = Mtx::create(ref, x->get_size()); + x->get_real(real_x.get()); + auto 
dreal_x = Mtx::create(dpcpp, dx->get_size()); + dx->get_real(dreal_x.get()); + + GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); +} + + +TEST_F(Dense, GetImagIsEquivalentToRef) +{ + set_up_apply_data(); + + auto imag_x = x->get_imag(); + auto dimag_x = dx->get_imag(); + + GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); +} + + +TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto imag_x = Mtx::create(ref, x->get_size()); + x->get_imag(imag_x.get()); + auto dimag_x = Mtx::create(dpcpp, dx->get_size()); + dx->get_imag(dimag_x.get()); + + GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); +} + + +} // namespace diff --git a/dpcpp/test/utils.hpp b/dpcpp/test/utils.hpp new file mode 100644 index 00000000000..88d98f0d9f6 --- /dev/null +++ b/dpcpp/test/utils.hpp @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_TEST_UTILS_HPP_ +#define GKO_DPCPP_TEST_UTILS_HPP_ + + +#include "core/test/utils.hpp" + + +#include + + +namespace { + + +// prevent device reset after each test +auto no_reset_exec = + gko::DpcppExecutor::create(0, gko::ReferenceExecutor::create()); + + +} // namespace + + +#endif // GKO_DPCPP_TEST_UTILS_HPP_ From ccc609853574a902d4312d7f59eab80a90fc6b32 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 20 May 2021 11:42:51 +0200 Subject: [PATCH 06/22] use warp_size 32 to check --- dpcpp/base/config.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp index 78fe25978a7..78fdcc2b819 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp @@ -49,6 +49,12 @@ struct config { */ using lane_mask_type = uint64; + + /** + * The number of threads within a CUDA warp. + */ + static constexpr uint32 warp_size = 32; + /** * The bitmask of the entire warp. */ From 8e8f9c7c1802766fc539fdc8913caacdff29a50a Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Thu, 20 May 2021 16:06:36 +0200 Subject: [PATCH 07/22] fix extract_diag, try default, and single test --- dpcpp/base/helper.hpp | 22 ++ dpcpp/components/prefix_sum.dp.hpp | 8 +- dpcpp/components/reduction.dp.hpp | 4 +- dpcpp/matrix/dense_kernels.dp.cpp | 363 +++++++++------------------ dpcpp/test/matrix/dense_kernels.cpp | 374 +++++++++------------------- 5 files changed, 274 insertions(+), 497 deletions(-) diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index c888eb9d99d..f8eee93f25b 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -46,6 +46,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/dim3.dp.hpp" +/** + * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those + * kernels which require encoded config but do not need explicit template + * parameter and share memory + * + * @param name_ the name of the host function with config + * @param kernel_ the kernel name + */ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ + sycl::queue *queue, InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ + } + + /** * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those * kernels which require encoded config but do not need explicit template diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 6b3498d1dea..c6f7c7cfb20 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -189,8 +189,8 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> prefix_helper_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; + auto local_range = block.get_range(); + auto global_range = grid.get_range() * local_range; cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), [=](sycl::nd_item<3> item_ct1) { @@ -240,8 +240,8 @@ void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *elements, const ValueType *block_sum) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; + auto local_range = block.get_range(); + auto global_range = grid.get_range() * local_range; cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), [=](sycl::nd_item<3> item_ct1) { diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 4caf46229c8..e47d9038af3 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -216,8 +216,8 @@ void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> block_sum_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; + auto local_range = block.get_range(); + auto global_range = grid.get_range() * local_range; cgh.parallel_for( sycl::nd_range<3>(global_range, local_range), diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index c5074b5cc38..494caff94c2 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
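// A minimal sketch of what the GKO_ENABLE_DEFAULT_HOST macro (added in
// dpcpp/base/helper.hpp above) generates, written out for a hypothetical
// kernel `my_fill`; dim3, size_type and sycl_nd_range are the Ginkgo dpcpp
// helpers used throughout this file. GKO_ENABLE_DEFAULT_HOST(my_fill,
// my_fill) emits a host-side launcher overload with the CUDA-style
// (grid, block, shared_memory, queue, args...) signature that forwards its
// arguments into a SYCL parallel_for:
template <typename ValueType>
void my_fill(size_type n, ValueType *data, ValueType value,
             sycl::nd_item<3> item_ct1);  // the device kernel (assumed)

template <typename... InferredArgs>
void my_fill(dim3 grid, dim3 block, size_t dynamic_shared_memory,
             sycl::queue *queue, InferredArgs... args)
{
    queue->submit([&](sycl::handler &cgh) {
        cgh.parallel_for(sycl_nd_range(grid, block),
                         [=](sycl::nd_item<3> item_ct1) {
                             // dispatches to the device kernel overload
                             my_fill(args..., item_ct1);
                         });
    });
}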
#include "core/components/prefix_sum.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" @@ -88,22 +89,7 @@ void strided_fill(size_type num_rows, size_type num_cols, size_type stride, } } -template -void strided_fill(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type stride, ValueType *mat, ValueType value) -{ - stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - strided_fill(num_rows, num_cols, stride, mat, - value, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(strided_fill, strided_fill) template @@ -132,15 +118,11 @@ void scale(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_x) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - scale(num_rows, num_cols, - num_alpha_cols, alpha, x, - stride_x, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + scale(num_rows, num_cols, num_alpha_cols, alpha, x, + stride_x, item_ct1); + }); }); } @@ -172,15 +154,12 @@ void add_scaled(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_y) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - add_scaled( - num_rows, num_cols, num_alpha_cols, alpha, x, - stride_x, y, stride_y, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + add_scaled(num_rows, num_cols, num_alpha_cols, + alpha, x, stride_x, y, stride_y, + item_ct1); + }); }); } @@ -207,14 +186,10 @@ void add_scaled_diag(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *y, size_type stride_y) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - add_scaled_diag(size, alpha, diag, y, stride_y, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + add_scaled_diag(size, alpha, diag, y, stride_y, item_ct1); + }); }); } @@ -234,7 +209,7 @@ void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work, const auto global_id = thread::get_thread_id(item_ct1); - OutType *tmp_work_array=*tmp_work; + OutType *tmp_work_array = *tmp_work; auto tmp = zero(); for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { tmp = reduce_op(tmp, get_value(i)); @@ -243,8 +218,7 @@ void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work, tmp_work_array[local_id] = tmp; ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, - reduce_op); + tmp_work_array, reduce_op); if (local_id == 0) { work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; @@ -267,12 +241,11 @@ void finalize_reduce_computation( for (auto i 
= local_id; i < size; i += block_size) { tmp = reduce_op(tmp, work[i]); } - ValueType *tmp_work_array=*tmp_work; + ValueType *tmp_work_array = *tmp_work; tmp_work_array[local_id] = tmp; ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, - reduce_op); + tmp_work_array, reduce_op); if (local_id == 0) { *result = finalize_op(tmp_work_array[0]); @@ -309,17 +282,14 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> tmp_work_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - compute_partial_dot( - num_rows, x, stride_x, y, stride_y, work, - item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_dot( + num_rows, x, stride_x, y, stride_y, work, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); }); } @@ -348,10 +318,8 @@ void finalize_dot_computation(dim3 grid, dim3 block, sycl::access::target::local> tmp_work_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { finalize_dot_computation( size, work, result, item_ct1, @@ -388,12 +356,9 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; cgh.parallel_for( - sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_norm2( num_rows, x, stride_x, work, item_ct1, (UninitializedArray, block_size> @@ -427,10 +392,8 @@ void finalize_norm2_computation(dim3 grid, dim3 block, sycl::access::target::local> tmp_work_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { finalize_norm2_computation( size, work, result, item_ct1, @@ -472,15 +435,11 @@ void fill_in_coo(dim3 grid, dim3 block, size_t dynamic_shared_memory, IndexType *col_idxs, ValueType *values) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - fill_in_coo(num_rows, num_cols, stride, row_ptrs, - source, row_idxs, col_idxs, values, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + fill_in_coo(num_rows, num_cols, stride, row_ptrs, source, + row_idxs, col_idxs, values, item_ct1); + }); }); } @@ -516,10 +475,7 @@ void count_nnz_per_row(dim3 grid, dim3 block, size_t dynamic_shared_memory, const ValueType *work, IndexType *result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> 
item_ct1) { count_nnz_per_row(num_rows, num_cols, stride, work, result, item_ct1); @@ -556,10 +512,7 @@ void fill_in_csr(dim3 grid, dim3 block, size_t dynamic_shared_memory, IndexType *col_idxs, ValueType *values) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { fill_in_csr(num_rows, num_cols, stride, source, row_ptrs, col_idxs, values, item_ct1); @@ -606,10 +559,7 @@ void fill_in_ell(dim3 grid, dim3 block, size_t dynamic_shared_memory, IndexType *col_ptrs, ValueType *values) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { fill_in_ell(num_rows, num_cols, source_stride, source, max_nnz_per_row, result_stride, @@ -662,16 +612,12 @@ void calculate_slice_lengths(dim3 grid, dim3 block, size_type *slice_lengths, size_type *slice_sets) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - calculate_slice_lengths(num_rows, slice_size, - slice_num, stride_factor, - nnz_per_row, slice_lengths, - slice_sets, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + calculate_slice_lengths(num_rows, slice_size, slice_num, + stride_factor, nnz_per_row, + slice_lengths, slice_sets, item_ct1); + }); }); } @@ -717,16 +663,12 @@ void fill_in_sellp(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type *slice_sets, IndexType *col_idxs, ValueType *vals) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - fill_in_sellp(num_rows, num_cols, slice_size, - stride, source, slice_lengths, - slice_sets, col_idxs, vals, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + fill_in_sellp(num_rows, num_cols, slice_size, stride, source, + slice_lengths, slice_sets, col_idxs, vals, + item_ct1); + }); }); } @@ -755,10 +697,8 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_max_nnz(size, nnz_per_row, result, item_ct1, dpct_local_acc_ct1.get_pointer()); @@ -804,15 +744,11 @@ void reduce_max_nnz_per_slice(dim3 grid, dim3 block, const size_type *nnz_per_row, size_type *result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - reduce_max_nnz_per_slice( - num_rows, slice_size, stride_factor, - nnz_per_row, result, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, 
block), [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz_per_slice(num_rows, slice_size, stride_factor, + nnz_per_row, result, item_ct1); + }); }); } @@ -841,15 +777,12 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - reduce_total_cols( - num_slices, max_nnz_per_slice, result, - item_ct1, dpct_local_acc_ct1.get_pointer()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_total_cols(num_slices, max_nnz_per_slice, result, + item_ct1, dpct_local_acc_ct1.get_pointer()); + }); }); } @@ -878,15 +811,11 @@ void symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - symm_permute(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + symm_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, + result, stride_result, item_ct1); + }); }); } @@ -915,15 +844,11 @@ void inv_symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *result, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - inv_symm_permute(num_rows, num_cols, perm_idxs, - orig, stride_orig, result, - stride_result, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + inv_symm_permute(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, item_ct1); + }); }); } @@ -952,15 +877,11 @@ void row_gather(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - row_gather(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + row_gather(num_rows, num_cols, perm_idxs, orig, stride_orig, + result, stride_result, item_ct1); + }); }); } @@ -989,15 +910,11 @@ void column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - column_permute(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + column_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, + result, stride_result, item_ct1); + }); }); } @@ -1026,10 +943,7 @@ void 
inverse_row_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *result, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { inverse_row_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, result, @@ -1064,26 +978,21 @@ void inverse_column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *result, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - inverse_column_permute( - num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + inverse_column_permute(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, + item_ct1); + }); }); } - template void extract_diagonal(size_type problem_size, const ValueType *__restrict__ orig, size_type stride_orig, ValueType *__restrict__ diag, sycl::nd_item<3> item_ct1) { - const auto tidx = thread::get_thread_id_flat(item_ct1); - + const auto tidx = thread::get_thread_id_flat(item_ct1); if (tidx < problem_size) { diag[tidx] = orig[tidx * stride_orig + tidx]; } @@ -1096,10 +1005,7 @@ void extract_diagonal(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *diag) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { extract_diagonal(problem_size, orig, stride_orig, diag, item_ct1); @@ -1117,7 +1023,7 @@ void inplace_absolute_dense(size_type num_rows, size_type num_cols, auto row = tidx / num_cols; auto col = tidx % num_cols; if (row < num_rows) { - data[row * stride + col] = dpcpp::abs(data[row * stride + col]); + data[row * stride + col] = std::abs(data[row * stride + col]); } } @@ -1128,10 +1034,7 @@ void inplace_absolute_dense(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { inplace_absolute_dense(num_rows, num_cols, data, stride, item_ct1); @@ -1151,7 +1054,7 @@ void outplace_absolute_dense(size_type num_rows, size_type num_cols, auto row = tidx / num_cols; auto col = tidx % num_cols; if (row < num_rows) { - out[row * stride_out + col] = dpcpp::abs(in[row * stride_in + col]); + out[row * stride_out + col] = std::abs(in[row * stride_in + col]); } } @@ -1164,15 +1067,11 @@ void outplace_absolute_dense(dim3 grid, dim3 block, size_type stride_out) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - outplace_absolute_dense(num_rows, num_cols, in, - stride_in, out, stride_out, - item_ct1); - }); + cgh.parallel_for( 
+ sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + outplace_absolute_dense(num_rows, num_cols, in, stride_in, out, + stride_out, item_ct1); + }); }); } @@ -1198,10 +1097,7 @@ void make_complex(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_out) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { make_complex(num_rows, num_cols, in, stride_in, out, stride_out, item_ct1); @@ -1231,10 +1127,7 @@ void get_real(dim3 grid, dim3 block, size_t dynamic_shared_memory, remove_complex *out, size_type stride_out) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { get_real(num_rows, num_cols, in, stride_in, out, stride_out, item_ct1); @@ -1264,10 +1157,7 @@ void get_imag(dim3 grid, dim3 block, size_t dynamic_shared_memory, remove_complex *out, size_type stride_out) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { get_imag(num_rows, num_cols, in, stride_in, out, stride_out, item_ct1); @@ -1288,10 +1178,9 @@ void simple_apply(std::shared_ptr exec, using namespace oneapi::mkl; oneapi::mkl::blas::row_major::gemm( *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - one(), a->get_const_values(), a->get_stride(), - b->get_const_values(), b->get_stride(), zero(), - c->get_values(), c->get_stride()); + c->get_size()[0], c->get_size()[1], a->get_size()[1], one(), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), zero(), c->get_values(), c->get_stride()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -1347,8 +1236,7 @@ void compute_dot(std::shared_ptr exec, kernel::compute_partial_dot( grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - work.get_data()); + y->get_const_values() + col, y->get_stride(), work.get_data()); kernel::finalize_dot_computation( 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); @@ -1397,8 +1285,7 @@ void compute_norm2(std::shared_ptr exec, for (size_type col = 0; col < x->get_size()[1]; ++col) { kernel::compute_partial_norm2( grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - work.get_data()); + x->get_const_values() + col, x->get_stride(), work.get_data()); kernel::finalize_norm2_computation( 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); @@ -1433,8 +1320,7 @@ void convert_to_coo(std::shared_ptr exec, kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(), num_rows, num_cols, stride, nnz_prefix_sum.get_const_data(), - source->get_const_values(), row_idxs, col_idxs, - values); + source->get_const_values(), row_idxs, col_idxs, values); } 
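// A sequential sketch of the three-step strategy convert_to_coo implements
// with the kernels above: count the nonzeros per row, run a prefix sum over
// the counts to get each row's write offset, then fill the index/value
// arrays. Plain C++ with double/int types and a hypothetical helper name,
// for illustration only.
#include <vector>

struct coo_data {
    std::vector<int> row_idxs, col_idxs;
    std::vector<double> values;
};

coo_data dense_to_coo(const std::vector<double> &mat, int num_rows,
                      int num_cols, int stride)
{
    // step 1: count the nonzeros of each row
    std::vector<int> offsets(num_rows + 1, 0);
    for (int row = 0; row < num_rows; ++row) {
        for (int col = 0; col < num_cols; ++col) {
            offsets[row + 1] += (mat[row * stride + col] != 0.0);
        }
    }
    // step 2: prefix sum turns the counts into per-row write offsets
    for (int row = 0; row < num_rows; ++row) {
        offsets[row + 1] += offsets[row];
    }
    // step 3: every row writes its entries starting at its own offset,
    // which is what makes the device version safely parallel over rows
    coo_data result;
    result.row_idxs.resize(offsets[num_rows]);
    result.col_idxs.resize(offsets[num_rows]);
    result.values.resize(offsets[num_rows]);
    for (int row = 0; row < num_rows; ++row) {
        auto idx = offsets[row];
        for (int col = 0; col < num_cols; ++col) {
            const auto val = mat[row * stride + col];
            if (val != 0.0) {
                result.row_idxs[idx] = row;
                result.col_idxs[idx] = col;
                result.values[idx] = val;
                ++idx;
            }
        }
    }
    return result;
}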
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1455,9 +1341,8 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(default_block_size, - config::warp_size); const auto grid_dim_nnz = - ceildiv(source->get_size()[0], rows_per_block); + const auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, exec->get_queue(), num_rows, num_cols, stride, @@ -1468,9 +1353,8 @@ void convert_to_csr(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, - source->get_const_values(), row_ptrs, col_idxs, - values); + num_rows, num_cols, stride, source->get_const_values(), + row_ptrs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1546,8 +1430,8 @@ void convert_to_sellp(std::shared_ptr exec, std::cout << "calculate_slice_lengths" << std::endl; kernel::calculate_slice_lengths( grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, - slice_size, slice_num, stride_factor, - nnz_per_row.get_const_data(), slice_lengths, slice_sets); + slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), + slice_lengths, slice_sets); exec->synchronize(); std::cout << "calculate_slice_lengths finish" << std::endl; } @@ -1559,9 +1443,9 @@ void convert_to_sellp(std::shared_ptr exec, if (grid_dim > 0) { std::cout << "fill_in_sellp" << std::endl; kernel::fill_in_sellp(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, - slice_size, stride, source->get_const_values(), - slice_lengths, slice_sets, col_idxs, vals); + exec->get_queue(), num_rows, num_cols, slice_size, + stride, source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); exec->synchronize(); std::cout << "fill_in_sellp finish" << std::endl; } @@ -1622,8 +1506,7 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, kernel::reduce_max_nnz(1, default_block_size, default_block_size * sizeof(size_type), exec->get_queue(), grid_dim, - block_results.get_const_data(), - d_result.get_data()); + block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } @@ -1643,10 +1526,9 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const dim3 grid_size(grid_x, 1, 1); if (grid_x > 0) { kernel::count_nnz_per_row( - grid_size, block_size, 0, exec->get_queue(), - source->get_size()[0], source->get_size()[1], - source->get_stride(), source->get_const_values(), - result->get_data()); + grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], + source->get_size()[1], source->get_stride(), + source->get_const_values(), result->get_data()); } } @@ -1676,8 +1558,7 @@ void calculate_total_cols(std::shared_ptr exec, auto max_nnz_per_slice = Array(exec, slice_num); - auto grid_dim = ceildiv(slice_num * config::warp_size, - default_block_size); + auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); kernel::reduce_max_nnz_per_slice( grid_dim, default_block_size, 0, exec->get_queue(), num_rows, diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 7c65e8b0f84..3cd080313cf 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -59,17 +59,21 @@ namespace { class Dense : public ::testing::Test { protected: 
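    // Note on the types chosen below: not every DPC++ device supports
    // double precision, so a single-precision build
    // (GINKGO_DPCPP_SINGLE_MODE) switches vtype to float, and the
    // hard-coded 1e-14 tolerances give way to the precision-dependent
    // r<vtype>::value.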
using itype = int; +#if GINKGO_DPCPP_SINGLE_MODE + using vtype = float; +#else using vtype = double; +#endif // GINKGO_DPCPP_SINGLE_MODE using Mtx = gko::matrix::Dense; using NormVector = gko::matrix::Dense>; using Arr = gko::Array; - using ComplexMtx = gko::matrix::Dense>; + // using ComplexMtx = gko::matrix::Dense>; Dense() : rand_engine(15) {} void SetUp() { - ASSERT_GT(gko::DpcppExecutor::get_num_devices("gpu"), 0); + ASSERT_GT(gko::DpcppExecutor::get_num_devices("all"), 0); ref = gko::ReferenceExecutor::create(); dpcpp = gko::DpcppExecutor::create(0, ref); } @@ -113,16 +117,15 @@ class Dense : public ::testing::Test { void set_up_apply_data() { x = gen_mtx(65, 25); - c_x = gen_mtx(65, 25); + // c_x = gen_mtx(65, 25); y = gen_mtx(25, 35); expected = gen_mtx(65, 35); alpha = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); - square = gen_mtx(x->get_size()[0], x->get_size()[0]); dx = Mtx::create(dpcpp); dx->copy_from(x.get()); - dc_x = ComplexMtx::create(dpcpp); - dc_x->copy_from(c_x.get()); + // dc_x = ComplexMtx::create(dpcpp); + // dc_x->copy_from(c_x.get()); dy = Mtx::create(dpcpp); dy->copy_from(y.get()); dresult = Mtx::create(dpcpp); @@ -131,8 +134,6 @@ class Dense : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Mtx::create(dpcpp); dbeta->copy_from(beta.get()); - dsquare = Mtx::create(dpcpp); - dsquare->copy_from(square.get()); std::vector tmp(x->get_size()[0], 0); auto rng = std::default_random_engine{}; @@ -141,17 +142,14 @@ class Dense : public ::testing::Test { std::vector tmp2(x->get_size()[1], 0); std::iota(tmp2.begin(), tmp2.end(), 0); std::shuffle(tmp2.begin(), tmp2.end(), rng); - std::vector tmp3(x->get_size()[0] / 10); - std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); - for (auto &i : tmp3) { - i = row_dist(rng); - } rpermute_idxs = std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + drpermute_idxs = + std::unique_ptr(new Arr{dpcpp, tmp.begin(), tmp.end()}); cpermute_idxs = std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); - rgather_idxs = - std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + dcpermute_idxs = + std::unique_ptr(new Arr{dpcpp, tmp2.begin(), tmp2.end()}); } std::shared_ptr ref; @@ -160,22 +158,21 @@ class Dense : public ::testing::Test { std::ranlux48 rand_engine; std::unique_ptr x; - std::unique_ptr c_x; + // std::unique_ptr c_x; std::unique_ptr y; std::unique_ptr alpha; std::unique_ptr beta; std::unique_ptr expected; - std::unique_ptr square; std::unique_ptr dresult; std::unique_ptr dx; - std::unique_ptr dc_x; + // std::unique_ptr dc_x; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; - std::unique_ptr dsquare; std::unique_ptr rpermute_idxs; + std::unique_ptr drpermute_idxs; std::unique_ptr cpermute_idxs; - std::unique_ptr rgather_idxs; + std::unique_ptr dcpermute_idxs; }; @@ -188,13 +185,13 @@ TEST_F(Dense, DpcppFillIsEquivalentToRef) dx->fill(42); result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, 1e-14); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) { - using T = double; + using T = vtype; auto x = gko::initialize>( 4, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}, ref); auto dx = gko::initialize>( @@ -205,7 +202,7 @@ TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) dx->fill(42); result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, 1e-14); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } @@ -218,7 +215,7 @@ TEST_F(Dense, SingleVectorDpcppScaleIsEquivalentToRef) dx->scale(dalpha.get()); 
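    // copy the device result back to a host-side matrix for the comparison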
result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, 1e-14); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } @@ -229,7 +226,7 @@ TEST_F(Dense, MultipleVectorDpcppScaleIsEquivalentToRef) x->scale(alpha.get()); dx->scale(dalpha.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -240,7 +237,7 @@ TEST_F(Dense, MultipleVectorDpcppScaleWithDifferentAlphaIsEquivalentToRef) x->scale(alpha.get()); dx->scale(dalpha.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -251,7 +248,7 @@ TEST_F(Dense, SingleVectorDpcppAddScaledIsEquivalentToRef) x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -262,7 +259,7 @@ TEST_F(Dense, MultipleVectorDpcppAddScaledIsEquivalentToRef) x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -273,7 +270,7 @@ TEST_F(Dense, MultipleVectorDpcppAddScaledWithDifferentAlphaIsEquivalentToRef) x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -296,7 +293,7 @@ TEST_F(Dense, AddsScaledDiagIsEquivalentToRef) mat->add_scaled(alpha.get(), diag.get()); dmat->add_scaled(dalpha.get(), ddiag.get()); - GKO_ASSERT_MTX_NEAR(mat, dmat, 1e-14); + GKO_ASSERT_MTX_NEAR(mat, dmat, r::value); } @@ -307,7 +304,7 @@ TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef) x->compute_dot(y.get(), expected.get()); dx->compute_dot(dy.get(), dresult.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); } @@ -318,7 +315,7 @@ TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) x->compute_dot(y.get(), expected.get()); dx->compute_dot(dy.get(), dresult.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); } @@ -332,7 +329,7 @@ TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef) x->compute_norm2(norm_expected.get()); dx->compute_norm2(dnorm.get()); - GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14); + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); } @@ -343,7 +340,7 @@ TEST_F(Dense, SimpleApplyIsEquivalentToRef) x->apply(y.get(), expected.get()); dx->apply(dy.get(), dresult.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); } @@ -354,186 +351,186 @@ TEST_F(Dense, AdvancedApplyIsEquivalentToRef) x->apply(alpha.get(), y.get(), beta.get(), expected.get()); dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); } -TEST_F(Dense, ApplyToComplexIsEquivalentToRef) -{ - set_up_apply_data(); - auto complex_b = gen_mtx(25, 1); - auto dcomplex_b = ComplexMtx::create(dpcpp); - dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(65, 1); - auto dcomplex_x = ComplexMtx::create(dpcpp); - dcomplex_x->copy_from(complex_x.get()); +// TEST_F(Dense, ApplyToComplexIsEquivalentToRef) +// { +// set_up_apply_data(); +// auto complex_b = gen_mtx(25, 1); +// auto dcomplex_b = ComplexMtx::create(dpcpp); +// dcomplex_b->copy_from(complex_b.get()); +// auto complex_x = gen_mtx(65, 1); +// auto dcomplex_x = ComplexMtx::create(dpcpp); +// dcomplex_x->copy_from(complex_x.get()); - x->apply(complex_b.get(), complex_x.get()); - 
dx->apply(dcomplex_b.get(), dcomplex_x.get()); +// x->apply(complex_b.get(), complex_x.get()); +// dx->apply(dcomplex_b.get(), dcomplex_x.get()); - GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); -} +// GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +// } -TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) -{ - set_up_apply_data(); - auto complex_b = gen_mtx(25, 1); - auto dcomplex_b = ComplexMtx::create(dpcpp); - dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(65, 1); - auto dcomplex_x = ComplexMtx::create(dpcpp); - dcomplex_x->copy_from(complex_x.get()); +// TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) +// { +// set_up_apply_data(); +// auto complex_b = gen_mtx(25, 1); +// auto dcomplex_b = ComplexMtx::create(dpcpp); +// dcomplex_b->copy_from(complex_b.get()); +// auto complex_x = gen_mtx(65, 1); +// auto dcomplex_x = ComplexMtx::create(dpcpp); +// dcomplex_x->copy_from(complex_x.get()); - x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); - dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); +// x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); +// dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); - GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); -} +// GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +// } -TEST_F(Dense, IsTransposable) -{ - set_up_apply_data(); +// TEST_F(Dense, IsTransposable) +// { +// set_up_apply_data(); - auto trans = x->transpose(); - auto dtrans = dx->transpose(); +// auto trans = x->transpose(); +// auto dtrans = dx->transpose(); - GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), - static_cast(trans.get()), 0); -} +// GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), +// static_cast(trans.get()), 0); +// } -TEST_F(Dense, IsConjugateTransposable) -{ - set_up_apply_data(); +// TEST_F(Dense, IsConjugateTransposable) +// { +// set_up_apply_data(); - auto trans = c_x->conj_transpose(); - auto dtrans = dc_x->conj_transpose(); +// auto trans = c_x->conj_transpose(); +// auto dtrans = dc_x->conj_transpose(); - GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), - static_cast(trans.get()), 0); -} +// GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), +// static_cast(trans.get()), 0); +// } TEST_F(Dense, ConvertToCooIsEquivalentToRef) { set_up_apply_data(); - auto coo_mtx = gko::matrix::Coo<>::create(ref); - auto dcoo_mtx = gko::matrix::Coo<>::create(dpcpp); + auto coo_mtx = gko::matrix::Coo::create(ref); + auto dcoo_mtx = gko::matrix::Coo::create(dpcpp); x->convert_to(coo_mtx.get()); dx->convert_to(dcoo_mtx.get()); ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); } TEST_F(Dense, MoveToCooIsEquivalentToRef) { set_up_apply_data(); - auto coo_mtx = gko::matrix::Coo<>::create(ref); - auto dcoo_mtx = gko::matrix::Coo<>::create(dpcpp); + auto coo_mtx = gko::matrix::Coo::create(ref); + auto dcoo_mtx = gko::matrix::Coo::create(dpcpp); x->move_to(coo_mtx.get()); dx->move_to(dcoo_mtx.get()); ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); } TEST_F(Dense, ConvertToCsrIsEquivalentToRef) { set_up_apply_data(); - auto csr_mtx = gko::matrix::Csr<>::create(ref); - auto dcsr_mtx = gko::matrix::Csr<>::create(dpcpp); + auto csr_mtx = 
gko::matrix::Csr::create(ref); + auto dcsr_mtx = gko::matrix::Csr::create(dpcpp); x->convert_to(csr_mtx.get()); dx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); } TEST_F(Dense, MoveToCsrIsEquivalentToRef) { set_up_apply_data(); - auto csr_mtx = gko::matrix::Csr<>::create(ref); - auto dcsr_mtx = gko::matrix::Csr<>::create(dpcpp); + auto csr_mtx = gko::matrix::Csr::create(ref); + auto dcsr_mtx = gko::matrix::Csr::create(dpcpp); x->move_to(csr_mtx.get()); dx->move_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); } TEST_F(Dense, ConvertToEllIsEquivalentToRef) { set_up_apply_data(); - auto ell_mtx = gko::matrix::Ell<>::create(ref); - auto dell_mtx = gko::matrix::Ell<>::create(dpcpp); + auto ell_mtx = gko::matrix::Ell::create(ref); + auto dell_mtx = gko::matrix::Ell::create(dpcpp); x->convert_to(ell_mtx.get()); dx->convert_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); } TEST_F(Dense, MoveToEllIsEquivalentToRef) { set_up_apply_data(); - auto ell_mtx = gko::matrix::Ell<>::create(ref); - auto dell_mtx = gko::matrix::Ell<>::create(dpcpp); + auto ell_mtx = gko::matrix::Ell::create(ref); + auto dell_mtx = gko::matrix::Ell::create(dpcpp); x->move_to(ell_mtx.get()); dx->move_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); } -TEST_F(Dense, ConvertToSellpIsEquivalentToRef) -{ - set_up_apply_data(); - auto sellp_mtx = gko::matrix::Sellp<>::create(ref); - auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +// TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +// { +// set_up_apply_data(); +// auto sellp_mtx = gko::matrix::Sellp<>::create(ref); +// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); - x->convert_to(sellp_mtx.get()); - dx->convert_to(dsellp_mtx.get()); +// x->convert_to(sellp_mtx.get()); +// dx->convert_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); -} +// GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6); +// } -TEST_F(Dense, MoveToSellpIsEquivalentToRef) -{ - set_up_apply_data(); - auto sellp_mtx = gko::matrix::Sellp<>::create(ref); - auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +// TEST_F(Dense, MoveToSellpIsEquivalentToRef) +// { +// set_up_apply_data(); +// auto sellp_mtx = gko::matrix::Sellp<>::create(ref); +// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); - x->move_to(sellp_mtx.get()); - dx->move_to(dsellp_mtx.get()); +// x->move_to(sellp_mtx.get()); +// dx->move_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); -} +// GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6); +// } -TEST_F(Dense, ConvertsEmptyToSellp) -{ - auto dempty_mtx = Mtx::create(dpcpp); - auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +// TEST_F(Dense, ConvertsEmptyToSellp) +// { +// auto dempty_mtx = Mtx::create(dpcpp); +// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); - dempty_mtx->convert_to(dsellp_mtx.get()); +// dempty_mtx->convert_to(dsellp_mtx.get()); - ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); - ASSERT_FALSE(dsellp_mtx->get_size()); -} +// ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), +// 0); ASSERT_FALSE(dsellp_mtx->get_size()); +// } TEST_F(Dense, 
CountNNZIsEquivalentToRef) @@ -599,63 +596,12 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) } -TEST_F(Dense, CanGatherRows) -{ - set_up_apply_data(); - - auto r_gather = x->row_gather(rgather_idxs.get()); - auto dr_gather = dx->row_gather(rgather_idxs.get()); - - GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); -} - - -TEST_F(Dense, CanGatherRowsIntoDense) -{ - set_up_apply_data(); - auto gather_size = - gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]}; - auto r_gather = Mtx::create(ref, gather_size); - // test make_temporary_clone and non-default stride - auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2); - - x->row_gather(rgather_idxs.get(), r_gather.get()); - dx->row_gather(rgather_idxs.get(), dr_gather.get()); - - GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); -} - - -TEST_F(Dense, IsPermutable) -{ - set_up_apply_data(); - - auto permuted = square->permute(rpermute_idxs.get()); - auto dpermuted = dsquare->permute(rpermute_idxs.get()); - - GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), - static_cast(dpermuted.get()), 0); -} - - -TEST_F(Dense, IsInversePermutable) -{ - set_up_apply_data(); - - auto permuted = square->inverse_permute(rpermute_idxs.get()); - auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get()); - - GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), - static_cast(dpermuted.get()), 0); -} - - TEST_F(Dense, IsRowPermutable) { set_up_apply_data(); auto r_permute = x->row_permute(rpermute_idxs.get()); - auto dr_permute = dx->row_permute(rpermute_idxs.get()); + auto dr_permute = dx->row_permute(drpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), static_cast(dr_permute.get()), 0); @@ -667,7 +613,7 @@ TEST_F(Dense, IsColPermutable) set_up_apply_data(); auto c_permute = x->column_permute(cpermute_idxs.get()); - auto dc_permute = dx->column_permute(cpermute_idxs.get()); + auto dc_permute = dx->column_permute(dcpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), static_cast(dc_permute.get()), 0); @@ -679,7 +625,7 @@ TEST_F(Dense, IsInverseRowPermutable) set_up_apply_data(); auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); - auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), static_cast(d_inverse_r_permute.get()), 0); @@ -691,7 +637,7 @@ TEST_F(Dense, IsInverseColPermutable) set_up_apply_data(); auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); - auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get()); + auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), static_cast(d_inverse_c_permute.get()), 0); @@ -716,7 +662,7 @@ TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef) x->compute_absolute_inplace(); dx->compute_absolute_inplace(); - GKO_ASSERT_MTX_NEAR(x, dx, 1e-14); + GKO_ASSERT_MTX_NEAR(x, dx, r::value); } @@ -727,79 +673,7 @@ TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef) auto abs_x = x->compute_absolute(); auto dabs_x = dx->compute_absolute(); - GKO_ASSERT_MTX_NEAR(abs_x, dabs_x, 1e-14); -} - - -TEST_F(Dense, MakeComplexIsEquivalentToRef) -{ - set_up_apply_data(); - - auto complex_x = x->make_complex(); - auto dcomplex_x = dx->make_complex(); - - GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); -} - - -TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef) -{ - 
set_up_apply_data(); - - auto complex_x = ComplexMtx::create(ref, x->get_size()); - x->make_complex(complex_x.get()); - auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size()); - dx->make_complex(dcomplex_x.get()); - - GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); -} - - -TEST_F(Dense, GetRealIsEquivalentToRef) -{ - set_up_apply_data(); - - auto real_x = x->get_real(); - auto dreal_x = dx->get_real(); - - GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); -} - - -TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef) -{ - set_up_apply_data(); - - auto real_x = Mtx::create(ref, x->get_size()); - x->get_real(real_x.get()); - auto dreal_x = Mtx::create(dpcpp, dx->get_size()); - dx->get_real(dreal_x.get()); - - GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); -} - - -TEST_F(Dense, GetImagIsEquivalentToRef) -{ - set_up_apply_data(); - - auto imag_x = x->get_imag(); - auto dimag_x = dx->get_imag(); - - GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); -} - - -TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef) -{ - set_up_apply_data(); - - auto imag_x = Mtx::create(ref, x->get_size()); - x->get_imag(imag_x.get()); - auto dimag_x = Mtx::create(dpcpp, dx->get_size()); - dx->get_imag(dimag_x.get()); - - GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); + GKO_ASSERT_MTX_NEAR(abs_x, dabs_x, r::value); } From 9e54bd2f5e7a07b977b463614882af7330ce627b Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Fri, 21 May 2021 14:31:26 +0200 Subject: [PATCH 08/22] use simple macro --- dpcpp/matrix/dense_kernels.dp.cpp | 319 +++--------------------------- 1 file changed, 24 insertions(+), 295 deletions(-) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 494caff94c2..58199a221d6 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -92,14 +92,12 @@ void strided_fill(size_type num_rows, size_type num_cols, size_type stride, GKO_ENABLE_DEFAULT_HOST(strided_fill, strided_fill) -template +template void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, const ValueType *__restrict__ alpha, ValueType *__restrict__ x, size_type stride_x, sycl::nd_item<3> item_ct1) { - constexpr auto warps_per_block = block_size / config::warp_size; - const auto global_id = - thread::get_thread_id(item_ct1); + const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row_id = global_id / num_cols; const auto col_id = global_id % num_cols; const auto alpha_id = num_alpha_cols == 1 ? 
0 : col_id; @@ -111,32 +109,16 @@ void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, } } -template -void scale(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type num_alpha_cols, const ValueType *alpha, ValueType *x, - size_type stride_x) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - scale(num_rows, num_cols, num_alpha_cols, alpha, x, - stride_x, item_ct1); - }); - }); -} - +GKO_ENABLE_DEFAULT_HOST(scale, scale) -template +template void add_scaled(size_type num_rows, size_type num_cols, size_type num_alpha_cols, const ValueType *__restrict__ alpha, const ValueType *__restrict__ x, size_type stride_x, ValueType *__restrict__ y, size_type stride_y, sycl::nd_item<3> item_ct1) { - constexpr auto warps_per_block = block_size / config::warp_size; - const auto global_id = - thread::get_thread_id(item_ct1); + const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row_id = global_id / num_cols; const auto col_id = global_id % num_cols; const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; @@ -146,22 +128,7 @@ void add_scaled(size_type num_rows, size_type num_cols, } } -template -void add_scaled(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type num_alpha_cols, const ValueType *alpha, - const ValueType *x, size_type stride_x, ValueType *y, - size_type stride_y) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - add_scaled(num_rows, num_cols, num_alpha_cols, - alpha, x, stride_x, y, stride_y, - item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(add_scaled, add_scaled) template @@ -179,19 +146,7 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, y[tidx * stride_y + tidx] += alpha[0] * diag[tidx]; } -template -void add_scaled_diag(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type size, - const ValueType *alpha, const ValueType *diag, - ValueType *y, size_type stride_y) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - add_scaled_diag(size, alpha, diag, y, stride_y, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) template tmp_work_acc_ct1(cgh); - cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_dot( @@ -318,7 +272,6 @@ void finalize_dot_computation(dim3 grid, dim3 block, sycl::access::target::local> tmp_work_acc_ct1(cgh); - cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { finalize_dot_computation( @@ -356,7 +309,6 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); - cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_norm2( @@ -427,21 +379,7 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, } } -template -void fill_in_coo(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type stride, const size_type *row_ptrs, - const ValueType *source, IndexType *row_idxs, - IndexType *col_idxs, ValueType *values) -{ - stream->submit([&](sycl::handler &cgh) 
{ - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - fill_in_coo(num_rows, num_cols, stride, row_ptrs, source, - row_idxs, col_idxs, values, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) template @@ -505,20 +443,7 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, } } -template -void fill_in_csr(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type stride, const ValueType *source, IndexType *row_ptrs, - IndexType *col_idxs, ValueType *values) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - fill_in_csr(num_rows, num_cols, stride, source, - row_ptrs, col_idxs, values, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) template @@ -551,22 +476,7 @@ void fill_in_ell(size_type num_rows, size_type num_cols, } } -template -void fill_in_ell(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type source_stride, const ValueType *source, - size_type max_nnz_per_row, size_type result_stride, - IndexType *col_ptrs, ValueType *values) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - fill_in_ell(num_rows, num_cols, source_stride, - source, max_nnz_per_row, result_stride, - col_ptrs, values, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) void calculate_slice_lengths(size_type num_rows, size_type slice_size, @@ -655,22 +565,7 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, } } -template -void fill_in_sellp(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type slice_size, size_type stride, - const ValueType *source, size_type *slice_lengths, - size_type *slice_sets, IndexType *col_idxs, ValueType *vals) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - fill_in_sellp(num_rows, num_cols, slice_size, stride, source, - slice_lengths, slice_sets, col_idxs, vals, - item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, @@ -777,7 +672,6 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_total_cols(num_slices, max_nnz_per_slice, result, @@ -803,21 +697,7 @@ void symm_permute(size_type num_rows, size_type num_cols, } } -template -void symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const IndexType *perm_idxs, const ValueType *orig, - size_type stride_orig, ValueType *result, - size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - symm_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, - result, stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(symm_permute, symm_permute) template @@ -836,21 +716,7 @@ void inv_symm_permute(size_type num_rows, size_type num_cols, } } 
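// GKO_ENABLE_DEFAULT_HOST generates the host-side launcher that the removed
// wrappers below spelled out by hand. Inferred from those handwritten
// wrappers, the generated code is presumably of roughly this shape (a sketch,
// not the macro's literal definition; `kernel_name` stands for the kernel
// passed to the macro):
//
//     template <typename... InferredArgs>
//     void kernel_name(dim3 grid, dim3 block, size_t, sycl::queue *queue,
//                      InferredArgs... args)
//     {
//         queue->submit([&](sycl::handler &cgh) {
//             cgh.parallel_for(sycl_nd_range(grid, block),
//                              [=](sycl::nd_item<3> item_ct1) {
//                                  kernel_name(args..., item_ct1);
//                              });
//         });
//     }
//
// Because every handwritten wrapper in this file follows exactly this shape,
// the macro can replace them one-for-one.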
-template -void inv_symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, const IndexType *perm_idxs, - const ValueType *orig, size_type stride_orig, - ValueType *result, size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - inv_symm_permute(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(inv_symm_permute, inv_symm_permute) template @@ -869,21 +735,7 @@ void row_gather(size_type num_rows, size_type num_cols, } } -template -void row_gather(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const IndexType *perm_idxs, const ValueType *orig, - size_type stride_orig, ValueType *result, - size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - row_gather(num_rows, num_cols, perm_idxs, orig, stride_orig, - result, stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(row_gather, row_gather) template @@ -902,21 +754,7 @@ void column_permute(size_type num_rows, size_type num_cols, } } -template -void column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const IndexType *perm_idxs, const ValueType *orig, - size_type stride_orig, ValueType *result, - size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - column_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, - result, stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(column_permute, column_permute) template @@ -935,22 +773,7 @@ void inverse_row_permute(size_type num_rows, size_type num_cols, } } -template -void inverse_row_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, const IndexType *perm_idxs, - const ValueType *orig, size_type stride_orig, - ValueType *result, size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - inverse_row_permute(num_rows, num_cols, perm_idxs, - orig, stride_orig, result, - stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(inverse_row_permute, inverse_row_permute) template @@ -970,22 +793,8 @@ void inverse_column_permute(size_type num_rows, size_type num_cols, } } -template -void inverse_column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, const IndexType *perm_idxs, - const ValueType *orig, size_type stride_orig, - ValueType *result, size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - inverse_column_permute(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, - item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(inverse_column_permute, inverse_column_permute) + template void extract_diagonal(size_type problem_size, @@ -998,20 +807,7 @@ void extract_diagonal(size_type problem_size, } } -template -void extract_diagonal(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type 
problem_size, - const ValueType *orig, size_type stride_orig, - ValueType *diag) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - extract_diagonal(problem_size, orig, stride_orig, - diag, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal) template @@ -1027,20 +823,7 @@ void inplace_absolute_dense(size_type num_rows, size_type num_cols, } } -template -void inplace_absolute_dense(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, ValueType *data, - size_type stride) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - inplace_absolute_dense(num_rows, num_cols, data, - stride, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(inplace_absolute_dense, inplace_absolute_dense) template @@ -1058,22 +841,7 @@ void outplace_absolute_dense(size_type num_rows, size_type num_cols, } } -template -void outplace_absolute_dense(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue *stream, - size_type num_rows, size_type num_cols, - const ValueType *in, size_type stride_in, - remove_complex *out, - size_type stride_out) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - outplace_absolute_dense(num_rows, num_cols, in, stride_in, out, - stride_out, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(outplace_absolute_dense, outplace_absolute_dense) template @@ -1090,20 +858,7 @@ void make_complex(size_type num_rows, size_type num_cols, } } -template -void make_complex(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const ValueType *in, size_type stride_in, ComplexType *out, - size_type stride_out) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - make_complex(num_rows, num_cols, in, stride_in, - out, stride_out, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(make_complex, make_complex) template @@ -1120,20 +875,7 @@ void get_real(size_type num_rows, size_type num_cols, } } -template -void get_real(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const ValueType *in, size_type stride_in, - remove_complex *out, size_type stride_out) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - get_real(num_rows, num_cols, in, stride_in, out, - stride_out, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(get_real, get_real) template @@ -1150,20 +892,7 @@ void get_imag(size_type num_rows, size_type num_cols, } } -template -void get_imag(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const ValueType *in, size_type stride_in, - remove_complex *out, size_type stride_out) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - get_imag(num_rows, num_cols, in, stride_in, out, - stride_out, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(get_imag, get_imag) } // namespace kernel From d9c6f6432d3ac7f1de0c83710b6a5f02f0e4adfd Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Sun, 23 May 2021 00:08:32 +0200 Subject: [PATCH 09/22] add as_array and reduce_add_array for config --- dpcpp/components/reduction.dp.hpp | 89 ++++++++++++------- .../ginkgo/core/synthesizer/containers.hpp | 8 ++ 2 files changed, 63 insertions(+), 34 deletions(-) diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index e47d9038af3..4f1835dfea7 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -44,21 +44,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/uninitialized_array.hpp" - namespace gko { namespace kernels { namespace dpcpp { constexpr int default_block_size = 256; - +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::value_list(); +constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); // #include "common/components/reduction.hpp.inc" /** @@ -130,21 +138,22 @@ __dpct_inline__ int choose_pivot(const Group &group, ValueType local_data, * array. */ template < - typename Group, typename ValueType, typename Operator, + unsigned int sg_size = 32, typename Group, typename ValueType, + typename Operator, typename = std::enable_if_t::value>> void reduce(const Group &__restrict__ group, ValueType *__restrict__ data, Operator reduce_op = Operator{}) { const auto local_id = group.thread_rank(); - for (int k = group.size() / 2; k >= config::warp_size; k /= 2) { + for (int k = group.size() / 2; k >= sg_size; k /= 2) { group.sync(); if (local_id < k) { data[local_id] = reduce_op(data[local_id], data[local_id + k]); } } - const auto warp = group::tiled_partition(group); + const auto warp = group::tiled_partition(group); const auto warp_id = group.thread_rank() / warp.size(); if (warp_id > 0) { return; @@ -164,7 +173,7 @@ void reduce(const Group &__restrict__ group, ValueType *__restrict__ data, * `source` of any size. Has to be called a second time on `result` to reduce * an array larger than `block_size`. */ -template +template void reduce_array(size_type size, const ValueType *__restrict__ source, ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, Operator reduce_op = Operator{}) @@ -180,7 +189,7 @@ void reduce_array(size_type size, const ValueType *__restrict__ source, group::this_thread_block(item_ct1).sync(); // Stores the result of the reduction inside `result[0]` - reduce(group::this_thread_block(item_ct1), result, reduce_op); + reduce(group::this_thread_block(item_ct1), result, reduce_op); } @@ -189,47 +198,50 @@ void reduce_array(size_type size, const ValueType *__restrict__ source, * * Computes a reduction using the add operation (+) on an array * `source` of any size. Has to be called a second time on `result` to reduce - * an array larger than `default_block_size`. + * an array larger than `block_size`. 
*/ -template +template void reduce_add_array( size_type size, const ValueType *__restrict__ source, ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, - UninitializedArray *block_sum) + UninitializedArray(cfg)> *block_sum) { - reduce_array(size, source, static_cast((*block_sum)), item_ct1, - [](const ValueType &x, const ValueType &y) { return x + y; }); + reduce_array(cfg)>( + size, source, static_cast((*block_sum)), item_ct1, + [](const ValueType &x, const ValueType &y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { result[item_ct1.get_group(2)] = (*block_sum)[0]; } } -template +template void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const ValueType *source, ValueType *result) { stream->submit([&](sycl::handler &cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, + sycl::accessor(cfg)>, + 0, sycl::access::mode::read_write, sycl::access::target::local> block_sum_acc_ct1(cgh); - auto local_range = block.get_range(); - auto global_range = grid.get_range() * local_range; - cgh.parallel_for( - sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - reduce_add_array( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_add_array( size, source, result, item_ct1, - (UninitializedArray *) + (UninitializedArray(cfg)> *) block_sum_acc_ct1.get_pointer()); }); }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, + reduce_add_array); + +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, + KCFG_1D, kcfg_1d_list); + /** * Compute a reduction using add operation (+). @@ -247,23 +259,32 @@ ValueType reduce_add_array(std::shared_ptr exec, auto block_results_val = source; size_type grid_dim = size; auto block_results = Array(exec); - if (size > default_block_size) { - const auto n = ceildiv(size, default_block_size); - grid_dim = (n <= default_block_size) ? n : default_block_size; + ValueType answer = zero(); + for (auto &cfg : kcfg_1d_array) { + const auto block_size = KCFG_1D::decode<0>(cfg); + const auto warp_size = KCFG_1D::decode<1>(cfg); + if (!validate(exec->get_queue(), block_size, warp_size)) { + continue; + } + if (size > block_size) { + const auto n = ceildiv(size, block_size); + grid_dim = (n <= block_size) ? 
n : block_size; - block_results.resize_and_reset(grid_dim); + block_results.resize_and_reset(grid_dim); - reduce_add_array(grid_dim, default_block_size, 0, exec->get_queue(), - size, source, block_results.get_data()); + reduce_add_array_call(grid_dim, block_size, 0, exec->get_queue(), + size, source, block_results.get_data()); - block_results_val = block_results.get_const_data(); - } + block_results_val = block_results.get_const_data(); + } - auto d_result = Array(exec, 1); + auto d_result = Array(exec, 1); - reduce_add_array(1, default_block_size, 0, exec->get_queue(), grid_dim, - block_results_val, d_result.get_data()); - auto answer = exec->copy_val_to_host(d_result.get_const_data()); + reduce_add_array_call(1, block_size, 0, exec->get_queue(), grid_dim, + block_results_val, d_result.get_data()); + answer = exec->copy_val_to_host(d_result.get_const_data()); + break; + } return answer; } diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index ebd5b441f6c..3c79c7b7455 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_PUBLIC_CORE_SYNTHESIZER_CONTAINERS_HPP_ +#include #include @@ -113,6 +114,13 @@ template using as_list = typename detail::as_list_impl::type; +template +constexpr std::array as_array(value_list vl) +{ + return std::array{Value...}; +} + + } // namespace syn } // namespace gko From 3d06e1779feb8adff5135c8a299c943d65f767d7 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 25 May 2021 00:46:29 +0200 Subject: [PATCH 10/22] fix dpcpp doesn't have subwarp and use ConfigSet --- dpcpp/CMakeLists.txt | 1 + dpcpp/base/helper.dp.cpp | 31 ++ dpcpp/base/helper.hpp | 22 +- dpcpp/components/reduction.dp.hpp | 1 + dpcpp/components/thread_ids.dp.hpp | 6 +- dpcpp/matrix/dense_kernels.dp.cpp | 485 +++++++++++++++++------------ 6 files changed, 336 insertions(+), 210 deletions(-) create mode 100644 dpcpp/base/helper.dp.cpp diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index b3101d8b2e2..48addebaf5f 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -11,6 +11,7 @@ target_sources(ginkgo_dpcpp PRIVATE base/version.dp.cpp base/executor.dp.cpp + base/helper.dp.cpp components/absolute_array.dp.cpp components/fill_array.dp.cpp components/prefix_sum.dp.cpp diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp new file mode 100644 index 00000000000..fe4395e2534 --- /dev/null +++ b/dpcpp/base/helper.dp.cpp @@ -0,0 +1,31 @@ +#include + +#include "dpcpp/base/helper.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +bool validate(sycl::queue *queue, unsigned int workgroup_size, + unsigned int subgroup_size) +{ + { + auto device = queue->get_device(); + auto subgroup_size_list = + device.get_info(); + auto max_workgroup_size = + device.get_info(); + bool allowed = false; + for (auto &i : subgroup_size_list) { + allowed |= (i == subgroup_size); + } + return allowed && (workgroup_size <= max_workgroup_size); + } +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index f8eee93f25b..3979caa905c 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #include
+#include
 #include
@@ -137,18 +138,19 @@ namespace dpcpp {
 bool validate(sycl::queue *queue, unsigned workgroup_size,
-              unsigned subgroup_size)
+              unsigned subgroup_size);
+
+
+template <typename IterArr, typename Validate>
+ConfigSetType get_first_cfg(IterArr &arr, Validate verify)
 {
-    auto device = queue->get_device();
-    auto subgroup_size_list =
-        device.get_info<sycl::info::device::sub_group_sizes>();
-    auto max_workgroup_size =
-        device.get_info<sycl::info::device::max_work_group_size>();
-    bool allowed = false;
-    for (auto &i : subgroup_size_list) {
-        allowed |= (i == subgroup_size);
+    for (auto &cfg : arr) {
+        if (verify(cfg)) {
+            return cfg;
+        }
     }
-    return allowed && (workgroup_size <= max_workgroup_size);
+    GKO_NOT_SUPPORTED(arr);
+    return 0;
 }
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index 4f1835dfea7..bc9937ddb1a 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -54,6 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/components/uninitialized_array.hpp"

+
 namespace gko {
 namespace kernels {
 namespace dpcpp {
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
index 8694d6a88c9..5b656c5e0db 100644
--- a/dpcpp/components/thread_ids.dp.hpp
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -124,7 +124,8 @@ __dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1)
 template
 __dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1)
 {
-    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    // dpcpp does not have subwarp.
+    constexpr auto subwarps_per_warp = subwarp_size / subwarp_size;
     return get_local_warp_id(item_ct1) * subwarps_per_warp +
            item_ct1.get_local_id(1);
 }
@@ -195,7 +196,8 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1)
 template
 __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1)
 {
-    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    // dpcpp does not have subwarp.
+    constexpr auto subwarps_per_warp = subwarp_size / subwarp_size;
     return get_warp_id(item_ct1) * subwarps_per_warp + item_ct1.get_local_id(1);
 }
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 58199a221d6..0d1be735e17 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -68,7 +68,13 @@ namespace dpcpp {
  */
 namespace dense {

-
+using KCFG_1D = ConfigSet<11, 7>;
+constexpr auto kcfg_1d_list =
+    syn::value_list();
+constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
 constexpr auto default_block_size = 256;


@@ -149,58 +155,67 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha,
 GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag)


-template
-void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work,
-                            CallableGetValue get_value,
-                            CallableReduce reduce_op, sycl::nd_item<3> item_ct1,
-                            UninitializedArray<OutType, block_size> *tmp_work)
+template <ConfigSetType cfg, typename OutType, typename CallableGetValue,
+          typename CallableReduce>
+void compute_partial_reduce(
+    size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value,
+    CallableReduce reduce_op, sycl::nd_item<3> item_ct1,
+    UninitializedArray<OutType, KCFG_1D::decode<0>(cfg)> *tmp_work)
 {
-    constexpr auto warps_per_block = block_size / config::warp_size;
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
+
+    constexpr auto warps_per_block = wg_size / sg_size;
     const auto num_blocks = item_ct1.get_group_range(2);
-    const auto local_id = 
thread::get_local_thread_id(item_ct1); const auto global_id = - thread::get_thread_id(item_ct1); + thread::get_thread_id(item_ct1); OutType *tmp_work_array = *tmp_work; auto tmp = zero(); - for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { + for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { tmp = reduce_op(tmp, get_value(i)); } tmp_work_array[local_id] = tmp; - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, reduce_op); if (local_id == 0) { work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; } } +// GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_reduce_config, +// compute_partial_reduce); +// GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_reduce_call, +// compute_partial_reduce_config, +// KCFG_1D, kcfg_1d_list); -template +template void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, CallableReduce reduce_op, CallableFinalize finalize_op, sycl::nd_item<3> item_ct1, - UninitializedArray *tmp_work) + UninitializedArray(cfg)> *tmp_work) { - const auto local_id = - thread::get_local_thread_id(item_ct1); + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto local_id = thread::get_local_thread_id(item_ct1); ValueType tmp = zero(); - for (auto i = local_id; i < size; i += block_size) { + for (auto i = local_id; i < size; i += wg_size) { tmp = reduce_op(tmp, work[i]); } ValueType *tmp_work_array = *tmp_work; tmp_work_array[local_id] = tmp; - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, reduce_op); if (local_id == 0) { *result = finalize_op(tmp_work_array[0]); @@ -208,14 +223,14 @@ void finalize_reduce_computation( } -template -void compute_partial_dot(size_type num_rows, const ValueType *__restrict__ x, - size_type stride_x, const ValueType *__restrict__ y, - size_type stride_y, ValueType *__restrict__ work, - sycl::nd_item<3> item_ct1, - UninitializedArray *tmp_work) +template +void compute_partial_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) { - compute_partial_reduce( + compute_partial_reduce( num_rows, work, [x, stride_x, y, stride_y](size_type i) { return x[i * stride_x] * conj(y[i * stride_y]); @@ -224,122 +239,175 @@ void compute_partial_dot(size_type num_rows, const ValueType *__restrict__ x, tmp_work); } -template +template void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, const ValueType *y, size_type stride_y, ValueType *work) { + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + std::cout << "partial " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { - sycl::accessor, 0, + sycl::accessor, 0, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_dot( + compute_partial_dot( num_rows, x, stride_x, y, stride_y, work, item_ct1, - (UninitializedArray *) + (UninitializedArray *) tmp_work_acc_ct1.get_pointer()); }); }); } -template 
+template void finalize_dot_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, - UninitializedArray *tmp_work) + UninitializedArray(cfg)> *tmp_work) { - finalize_reduce_computation( + finalize_reduce_computation( size, work, result, [](const ValueType &x, const ValueType &y) { return x + y; }, [](const ValueType &x) { return x; }, item_ct1, tmp_work); } -template +template void finalize_dot_computation(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const ValueType *work, ValueType *result) { + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + std::cout << "finalize " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { - sycl::accessor, 0, + sycl::accessor, 0, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_dot_computation( + finalize_dot_computation( size, work, result, item_ct1, - (UninitializedArray *) + (UninitializedArray *) tmp_work_acc_ct1.get_pointer()); }); }); } +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense *x, + const matrix::Dense *y, + matrix::Dense *result) +{ + constexpr auto work_per_thread = 32; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + std::cout << "dot " << cfg << " " << wg_size << " " << sg_size << std::endl; + constexpr auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + compute_partial_dot(grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, + x->get_stride(), y->get_const_values() + col, + y->get_stride(), work.get_data()); + finalize_dot_computation(1, block_dim, 0, exec->get_queue(), + grid_dim.x, work.get_const_data(), + result->get_values() + col); + } +} -template +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_dot_config, compute_dot) + +template +void compute_dot_call(std::shared_ptr exec, + const matrix::Dense *x, + const matrix::Dense *y, + matrix::Dense *result) +{ + auto queue = exec->get_queue(); + compute_dot_config( + kcfg_1d_list, + [&queue](::gko::ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }, + ::gko::syn::value_list(), ::gko::syn::value_list(), + ::gko::syn::value_list(), ::gko::syn::type_list<>(), + exec, x, y, result); +} + + +template void compute_partial_norm2( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray, block_size> *tmp_work) + UninitializedArray, KCFG_1D::decode<0>(cfg)> + *tmp_work) { using norm_type = remove_complex; - compute_partial_reduce( + compute_partial_reduce( num_rows, work, [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, [](const norm_type &x, const norm_type &y) { return x + y; }, item_ct1, tmp_work); } -template +template void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, remove_complex *work) { + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); stream->submit([&](sycl::handler &cgh) { - sycl::accessor< - 
UninitializedArray, block_size>, 0, - sycl::access::mode::read_write, sycl::access::target::local> + sycl::accessor, wg_size>, + 0, sycl::access::mode::read_write, + sycl::access::target::local> tmp_work_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2( + compute_partial_norm2( num_rows, x, stride_x, work, item_ct1, - (UninitializedArray, block_size> - *)tmp_work_acc_ct1.get_pointer()); + (UninitializedArray, wg_size> *) + tmp_work_acc_ct1.get_pointer()); }); }); } -template +template void finalize_norm2_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, - UninitializedArray *tmp_work) + UninitializedArray(cfg)> *tmp_work) { - finalize_reduce_computation( + finalize_reduce_computation( size, work, result, [](const ValueType &x, const ValueType &y) { return x + y; }, [](const ValueType &x) { return sqrt(x); }, item_ct1, tmp_work); } -template +template void finalize_norm2_computation(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const ValueType *work, ValueType *result) { + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); stream->submit([&](sycl::handler &cgh) { - sycl::accessor, 0, + sycl::accessor, 0, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); @@ -347,15 +415,63 @@ void finalize_norm2_computation(dim3 grid, dim3 block, cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_norm2_computation( + finalize_norm2_computation( size, work, result, item_ct1, - (UninitializedArray *) + (UninitializedArray *) tmp_work_acc_ct1.get_pointer()); }); }); } +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense *x, + matrix::Dense> *result) +{ + using norm_type = remove_complex; + // // TODO: these are tuning parameters obtained experimentally, once + // // we decide how to handle this uniformly, they should be modified + // // appropriately + constexpr auto work_per_thread = 32; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + + constexpr auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + compute_partial_norm2( + grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), work.get_data()); + finalize_norm2_computation(1, block_dim, 0, exec->get_queue(), + grid_dim.x, work.get_const_data(), + result->get_values() + col); + } +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_norm2_config, compute_norm2) + +template +void compute_norm2_call(std::shared_ptr exec, + const matrix::Dense *x, + matrix::Dense> *result) +{ + auto queue = exec->get_queue(); + compute_norm2_config( + kcfg_1d_list, + [&queue](::gko::ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }, + ::gko::syn::value_list(), ::gko::syn::value_list(), + ::gko::syn::value_list(), ::gko::syn::type_list<>(), + exec, x, result); +} + + template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type *__restrict__ row_ptrs, @@ -382,20 +498,20 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, 
GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) -template +template void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, const ValueType *__restrict__ work, IndexType *__restrict__ result, sycl::nd_item<3> item_ct1) { - constexpr auto warp_size = config::warp_size; - const auto row_idx = thread::get_subwarp_id_flat(item_ct1); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto row_idx = thread::get_subwarp_id_flat(item_ct1); auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); + group::tiled_partition(group::this_thread_block(item_ct1)); if (row_idx < num_rows) { IndexType part_result{}; - for (auto i = warp_tile.thread_rank(); i < num_cols; i += warp_size) { + for (auto i = warp_tile.thread_rank(); i < num_cols; i += sg_size) { if (work[stride * row_idx + i] != zero()) { part_result += 1; } @@ -406,20 +522,10 @@ void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, } } -template -void count_nnz_per_row(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, size_type stride, - const ValueType *work, IndexType *result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - count_nnz_per_row(num_rows, num_cols, stride, work, - result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row) +GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, + KCFG_1D, kcfg_1d_list) template @@ -479,6 +585,7 @@ void fill_in_ell(size_type num_rows, size_type num_cols, GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) +template void calculate_slice_lengths(size_type num_rows, size_type slice_size, int slice_num, size_type stride_factor, const size_type *__restrict__ nnz_per_row, @@ -486,21 +593,21 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, size_type *__restrict__ slice_sets, sycl::nd_item<3> item_ct1) { - constexpr auto warp_size = config::warp_size; + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); const auto sliceid = item_ct1.get_group(2); const auto tid_in_warp = item_ct1.get_local_id(2); if (sliceid * slice_size + tid_in_warp < num_rows) { size_type thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { thread_result = (i + slice_size * sliceid < num_rows) ? 
max(thread_result, nnz_per_row[sliceid * slice_size + i]) : thread_result; } - auto warp_tile = group::tiled_partition( - group::this_thread_block(item_ct1)); + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); auto warp_result = ::gko::kernels::dpcpp::reduce( warp_tile, thread_result, [](const size_type &a, const size_type &b) { return max(a, b); }); @@ -514,22 +621,11 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, } } -void calculate_slice_lengths(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue *stream, - size_type num_rows, size_type slice_size, - int slice_num, size_type stride_factor, - const size_type *nnz_per_row, - size_type *slice_lengths, size_type *slice_sets) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - calculate_slice_lengths(num_rows, slice_size, slice_num, - stride_factor, nnz_per_row, - slice_lengths, slice_sets, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, + calculate_slice_lengths) +GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, + calculate_slice_lengths, KCFG_1D, kcfg_1d_list) template @@ -567,14 +663,15 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) - +template void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result, sycl::nd_item<3> item_ct1, uint8_t *dpct_local) { + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); auto block_max = (size_type *)dpct_local; - reduce_array( + reduce_array( size, nnz_per_row, block_max, item_ct1, [](const size_type &x, const size_type &y) { return max(x, y); }); @@ -583,6 +680,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, } } +template void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const size_type *nnz_per_row, size_type *result) @@ -593,30 +691,34 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - reduce_max_nnz(size, nnz_per_row, result, item_ct1, - dpct_local_acc_ct1.get_pointer()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz(size, nnz_per_row, result, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, KCFG_1D, + kcfg_1d_list) +template void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, size_type stride_factor, const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result, sycl::nd_item<3> item_ct1) { - constexpr auto warp_size = config::warp_size; + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - const auto warpid = thread::get_subwarp_id_flat(item_ct1); + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto warpid = thread::get_subwarp_id_flat(item_ct1); const auto tid_in_warp = warp_tile.thread_rank(); const auto slice_num = ceildiv(num_rows, slice_size); size_type 
thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { if (warpid * slice_size + i < num_rows) { thread_result = max(thread_result, nnz_per_row[warpid * slice_size + i]); @@ -632,37 +734,32 @@ void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, } } -void reduce_max_nnz_per_slice(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue *stream, - size_type num_rows, size_type slice_size, - size_type stride_factor, - const size_type *nnz_per_row, size_type *result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_max_nnz_per_slice(num_rows, slice_size, stride_factor, - nnz_per_row, result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice, + reduce_max_nnz_per_slice) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice, + reduce_max_nnz_per_slice) +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, + reduce_max_nnz_per_slice, KCFG_1D, kcfg_1d_list) +template void reduce_total_cols(size_type num_slices, const size_type *__restrict__ max_nnz_per_slice, size_type *__restrict__ result, sycl::nd_item<3> item_ct1, uint8_t *dpct_local) { auto block_result = (size_type *)dpct_local; - - reduce_array(num_slices, max_nnz_per_slice, block_result, item_ct1, - [](const size_type &x, const size_type &y) { return x + y; }); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + reduce_array( + num_slices, max_nnz_per_slice, block_result, item_ct1, + [](const size_type &x, const size_type &y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { result[item_ct1.get_group(2)] = block_result[0]; } } +template void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_slices, const size_type *max_nnz_per_slice, size_type *result) @@ -674,12 +771,18 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_total_cols(num_slices, max_nnz_per_slice, result, - item_ct1, dpct_local_acc_ct1.get_pointer()); + reduce_total_cols(num_slices, max_nnz_per_slice, result, + item_ct1, + dpct_local_acc_ct1.get_pointer()); }); }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols, + reduce_total_cols); +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, + KCFG_1D, kcfg_1d_list) + template void symm_permute(size_type num_rows, size_type num_cols, @@ -952,24 +1055,7 @@ void compute_dot(std::shared_ptr exec, // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified // appropriately - constexpr auto work_per_thread = 32; - constexpr auto block_size = default_block_size; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot( - grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_dot_computation( - 1, block_dim, 0, 
exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } + kernel::compute_dot_call(exec, x, y, result); } } @@ -998,27 +1084,7 @@ void compute_norm2(std::shared_ptr exec, result->get_values() + col); } } else { - using norm_type = remove_complex; - // // TODO: these are tuning parameters obtained experimentally, once - // // we decide how to handle this uniformly, they should be modified - // // appropriately - constexpr auto work_per_thread = 32; - constexpr auto block_size = default_block_size; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2( - grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), work.get_data()); - kernel::finalize_norm2_computation( - 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } + kernel::compute_norm2_call(exec, x, result); } } @@ -1073,9 +1139,9 @@ void convert_to_csr(std::shared_ptr exec, const auto rows_per_block = ceildiv(default_block_size, config::warp_size); const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, stride, - source->get_const_values(), row_ptrs); + kernel::count_nnz_per_row_call( + grid_dim_nnz, default_block_size, 0, exec->get_queue(), num_rows, + num_cols, stride, source->get_const_values(), row_ptrs); components::prefix_sum(exec, row_ptrs, num_rows + 1); @@ -1157,7 +1223,7 @@ void convert_to_sellp(std::shared_ptr exec, if (grid_dim > 0) { std::cout << "calculate_slice_lengths" << std::endl; - kernel::calculate_slice_lengths( + kernel::calculate_slice_lengths_call( grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), slice_lengths, slice_sets); @@ -1218,24 +1284,30 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, auto nnz_per_row = Array(exec, num_rows); calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - const auto n = ceildiv(num_rows, default_block_size); - const size_type grid_dim = - (n <= default_block_size) ? n : default_block_size; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + std::cout << "wg_size " << wg_size << "sg_size " << KCFG_1D::decode<1>(cfg) + << std::endl; + const auto n = ceildiv(num_rows, wg_size); + const size_type grid_dim = (n <= wg_size) ? 
n : wg_size; auto block_results = Array(exec, grid_dim); - kernel::reduce_max_nnz( - grid_dim, default_block_size, default_block_size * sizeof(size_type), - exec->get_queue(), num_rows, nnz_per_row.get_const_data(), - block_results.get_data()); + kernel::reduce_max_nnz_call( + grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + num_rows, nnz_per_row.get_const_data(), block_results.get_data()); auto d_result = Array(exec, 1); - kernel::reduce_max_nnz(1, default_block_size, - default_block_size * sizeof(size_type), - exec->get_queue(), grid_dim, - block_results.get_const_data(), d_result.get_data()); + kernel::reduce_max_nnz_call( + 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, + block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } @@ -1249,12 +1321,21 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const matrix::Dense *source, Array *result) { - const dim3 block_size(default_block_size, 1, 1); - auto rows_per_block = ceildiv(default_block_size, config::warp_size); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + const dim3 block_size(wg_size, 1, 1); + auto rows_per_block = ceildiv(wg_size, sg_size); const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); const dim3 grid_size(grid_x, 1, 1); if (grid_x > 0) { - kernel::count_nnz_per_row( + kernel::count_nnz_per_row_call( grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], source->get_size()[1], source->get_stride(), source->get_const_values(), result->get_data()); @@ -1286,28 +1367,36 @@ void calculate_total_cols(std::shared_ptr exec, calculate_nonzeros_per_row(exec, source, &nnz_per_row); auto max_nnz_per_slice = Array(exec, slice_num); - - auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); - - kernel::reduce_max_nnz_per_slice( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - slice_size, stride_factor, nnz_per_row.get_const_data(), + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + auto grid_dim = ceildiv(slice_num * sg_size, wg_size); + + kernel::reduce_max_nnz_per_slice_call( + grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, + stride_factor, nnz_per_row.get_const_data(), max_nnz_per_slice.get_data()); - grid_dim = ceildiv(slice_num, default_block_size); + grid_dim = ceildiv(slice_num, wg_size); auto block_results = Array(exec, grid_dim); - kernel::reduce_total_cols( - grid_dim, default_block_size, default_block_size * sizeof(size_type), - exec->get_queue(), slice_num, max_nnz_per_slice.get_const_data(), - block_results.get_data()); + kernel::reduce_total_cols(grid_dim, wg_size, wg_size * sizeof(size_type), + exec->get_queue(), slice_num, + max_nnz_per_slice.get_const_data(), + block_results.get_data()); auto d_result = Array(exec, 1); kernel::reduce_total_cols( - 1, default_block_size, default_block_size * sizeof(size_type), - 
exec->get_queue(), grid_dim, block_results.get_const_data(), - d_result.get_data()); + 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, + block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } From 20061b64477142be4b834b733002553879b9ef04 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 25 May 2021 22:02:18 +0200 Subject: [PATCH 11/22] move to new style for config selection --- dpcpp/components/prefix_sum.dp.cpp | 38 ++++- dpcpp/components/prefix_sum.dp.hpp | 18 +- dpcpp/components/reduction.dp.hpp | 42 ++--- dpcpp/matrix/dense_kernels.dp.cpp | 256 ++++++++++++++--------------- 4 files changed, 181 insertions(+), 173 deletions(-) diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index b4961809a8b..62a9700473d 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -36,6 +36,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + +#include "dpcpp/base/helper.hpp" #include "dpcpp/components/prefix_sum.dp.hpp" @@ -45,7 +49,20 @@ namespace dpcpp { namespace components { -constexpr int prefix_sum_block_size = 256; +using BlockCfg = ConfigSet<11>; + +constexpr auto block_cfg_list = + ::gko::syn::value_list(); + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) +GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, + BlockCfg, block_cfg_list) + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_prefix_sum, + finalize_prefix_sum) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_prefix_sum_call, finalize_prefix_sum, + BlockCfg, block_cfg_list) template @@ -54,18 +71,23 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, { // prefix_sum should be on the valid array if (num_entries > 0) { - auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); + auto queue = exec->get_queue(); + constexpr auto block_cfg_array = as_array(block_cfg_list); + const ConfigSetType cfg = + get_first_cfg(block_cfg_array, [&queue](ConfigSetType cfg) { + return validate(queue, BlockCfg::decode<0>(cfg), 16); + }); + const auto wg_size = BlockCfg::decode<0>(cfg); + auto num_blocks = ceildiv(num_entries, wg_size); Array block_sum_array(exec, num_blocks - 1); auto block_sums = block_sum_array.get_data(); - start_prefix_sum( - num_blocks, prefix_sum_block_size, 0, exec->get_queue(), - num_entries, counts, block_sums); + start_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), cfg, + num_entries, counts, block_sums); // add the total sum of the previous block only when the number of block // is larger than 1. if (num_blocks > 1) { - finalize_prefix_sum( - num_blocks, prefix_sum_block_size, 0, exec->get_queue(), - num_entries, counts, block_sums); + finalize_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), + cfg, num_entries, counts, block_sums); } } } diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index c6f7c7cfb20..fd9ff2ac263 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -125,7 +125,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, * @note To calculate the prefix sum over an array of size bigger than * `block_size`, `finalize_prefix_sum` has to be used as well. 
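// Aside (illustrative sketch, not part of the patch): a minimal host-side
// model of the two-phase prefix sum that start_prefix_sum and
// finalize_prefix_sum implement on the device. Phase one scans each block
// of `block_size` elements independently and records the block totals;
// phase two offsets every block by the accumulated totals of all preceding
// blocks, which is only needed when more than one block exists. All names
// below are illustrative, not the library's definitions.
#include <algorithm>
#include <cstddef>
#include <vector>

void two_phase_prefix_sum(std::vector<int> &v, std::size_t block_size)
{
    const auto num_blocks = (v.size() + block_size - 1) / block_size;
    std::vector<int> block_sum(num_blocks, 0);
    // Phase 1 (start_prefix_sum): exclusive scan inside each block,
    // remembering each block's total separately.
    for (std::size_t b = 0; b < num_blocks; ++b) {
        const auto begin = b * block_size;
        const auto end = std::min(begin + block_size, v.size());
        int total = 0;
        for (auto i = begin; i < end; ++i) {
            const auto value = v[i];
            v[i] = total;
            total += value;
        }
        block_sum[b] = total;
    }
    // Phase 2 (finalize_prefix_sum): add the running total of all previous
    // blocks; with a single block the carry stays zero, matching the
    // `num_blocks > 1` guard in the caller above.
    int carry = 0;
    for (std::size_t b = 0; b < num_blocks; ++b) {
        const auto begin = b * block_size;
        const auto end = std::min(begin + block_size, v.size());
        for (auto i = begin; i < end; ++i) {
            v[i] += carry;
        }
        carry += block_sum[b];
    }
}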
*/ -template +template void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, ValueType *__restrict__ block_sum, sycl::nd_item<3> item_ct1, @@ -178,7 +178,7 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, } } -template +template void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_elements, ValueType *elements, ValueType *block_sum) @@ -189,10 +189,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> prefix_helper_acc_ct1(cgh); - auto local_range = block.get_range(); - auto global_range = grid.get_range() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, @@ -217,7 +214,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, * * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. */ -template +template void finalize_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, const ValueType *__restrict__ block_sum, @@ -234,16 +231,13 @@ void finalize_prefix_sum(size_type num_elements, } } -template +template void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_elements, ValueType *elements, const ValueType *block_sum) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.get_range(); - auto global_range = grid.get_range() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { finalize_prefix_sum( num_elements, elements, block_sum, item_ct1); diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index bc9937ddb1a..2626d40b314 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -261,31 +261,33 @@ ValueType reduce_add_array(std::shared_ptr exec, size_type grid_dim = size; auto block_results = Array(exec); ValueType answer = zero(); - for (auto &cfg : kcfg_1d_array) { - const auto block_size = KCFG_1D::decode<0>(cfg); - const auto warp_size = KCFG_1D::decode<1>(cfg); - if (!validate(exec->get_queue(), block_size, warp_size)) { - continue; - } - if (size > block_size) { - const auto n = ceildiv(size, block_size); - grid_dim = (n <= block_size) ? n : block_size; - - block_results.resize_and_reset(grid_dim); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); - reduce_add_array_call(grid_dim, block_size, 0, exec->get_queue(), - size, source, block_results.get_data()); + if (size > wg_size) { + const auto n = ceildiv(size, wg_size); + grid_dim = (n <= wg_size) ? 
n : wg_size; - block_results_val = block_results.get_const_data(); - } + block_results.resize_and_reset(grid_dim); - auto d_result = Array(exec, 1); + reduce_add_array_call(grid_dim, wg_size, 0, exec->get_queue(), cfg, + size, source, block_results.get_data()); - reduce_add_array_call(1, block_size, 0, exec->get_queue(), grid_dim, - block_results_val, d_result.get_data()); - answer = exec->copy_val_to_host(d_result.get_const_data()); - break; + block_results_val = block_results.get_const_data(); } + + auto d_result = Array(exec, 1); + + reduce_add_array_call(1, wg_size, 0, exec->get_queue(), cfg, grid_dim, + block_results_val, d_result.get_data()); + answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 0d1be735e17..3e4fee0a2d5 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -264,6 +264,11 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, + compute_partial_dot) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, + KCFG_1D, kcfg_1d_list) + template void finalize_dot_computation( @@ -301,51 +306,10 @@ void finalize_dot_computation(dim3 grid, dim3 block, }); } -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense *x, - const matrix::Dense *y, - matrix::Dense *result) -{ - constexpr auto work_per_thread = 32; - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - std::cout << "dot " << cfg << " " << wg_size << " " << sg_size << std::endl; - constexpr auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - compute_partial_dot(grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, - x->get_stride(), y->get_const_values() + col, - y->get_stride(), work.get_data()); - finalize_dot_computation(1, block_dim, 0, exec->get_queue(), - grid_dim.x, work.get_const_data(), - result->get_values() + col); - } -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_dot_config, compute_dot) - -template -void compute_dot_call(std::shared_ptr exec, - const matrix::Dense *x, - const matrix::Dense *y, - matrix::Dense *result) -{ - auto queue = exec->get_queue(); - compute_dot_config( - kcfg_1d_list, - [&queue](::gko::ConfigSetType cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }, - ::gko::syn::value_list(), ::gko::syn::value_list(), - ::gko::syn::value_list(), ::gko::syn::type_list<>(), - exec, x, y, result); -} +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation, + finalize_dot_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call, + finalize_dot_computation, KCFG_1D, kcfg_1d_list) template @@ -386,6 +350,11 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, + compute_partial_norm2) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, + compute_partial_norm2, KCFG_1D, kcfg_1d_list) + template void finalize_norm2_computation( @@ -423,53 +392,11 @@ void finalize_norm2_computation(dim3 
grid, dim3 block, }); } - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense *x, - matrix::Dense> *result) -{ - using norm_type = remove_complex; - // // TODO: these are tuning parameters obtained experimentally, once - // // we decide how to handle this uniformly, they should be modified - // // appropriately - constexpr auto work_per_thread = 32; - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - constexpr auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - compute_partial_norm2( - grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), work.get_data()); - finalize_norm2_computation(1, block_dim, 0, exec->get_queue(), - grid_dim.x, work.get_const_data(), - result->get_values() + col); - } -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_norm2_config, compute_norm2) - -template -void compute_norm2_call(std::shared_ptr exec, - const matrix::Dense *x, - matrix::Dense> *result) -{ - auto queue = exec->get_queue(); - compute_norm2_config( - kcfg_1d_list, - [&queue](::gko::ConfigSetType cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }, - ::gko::syn::value_list(), ::gko::syn::value_list(), - ::gko::syn::value_list(), ::gko::syn::type_list<>(), - exec, x, result); -} +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation, + finalize_norm2_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call, + finalize_norm2_computation, KCFG_1D, + kcfg_1d_list) template @@ -1055,7 +982,32 @@ void compute_dot(std::shared_ptr exec, // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified // appropriately - kernel::compute_dot_call(exec, x, y, result); + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + std::cout << "dot " << cfg << " " << wg_size << " " << sg_size + << std::endl; + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_dot_call( + grid_dim, block_dim, 0, exec->get_queue(), cfg, + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_dot_computation_call( + 1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x, + work.get_const_data(), result->get_values() + col); + } } } @@ -1084,7 +1036,35 @@ void compute_norm2(std::shared_ptr exec, result->get_values() + col); } } else { - kernel::compute_norm2_call(exec, x, result); + using norm_type = remove_complex; + // TODO: these are tuning parameters obtained 
experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_norm2_call( + grid_dim, block_dim, 0, exec->get_queue(), cfg, + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + work.get_data()); + kernel::finalize_norm2_computation_call( + 1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x, + work.get_const_data(), result->get_values() + col); + } } } @@ -1127,6 +1107,16 @@ void convert_to_csr(std::shared_ptr exec, const matrix::Dense *source, matrix::Csr *result) { + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -1136,20 +1126,20 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const auto rows_per_block = ceildiv(wg_size, sg_size); const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - kernel::count_nnz_per_row_call( - grid_dim_nnz, default_block_size, 0, exec->get_queue(), num_rows, - num_cols, stride, source->get_const_values(), row_ptrs); + kernel::count_nnz_per_row_call(grid_dim_nnz, wg_size, 0, exec->get_queue(), + cfg, num_rows, num_cols, stride, + source->get_const_values(), row_ptrs); components::prefix_sum(exec, row_ptrs, num_rows + 1); - size_type grid_dim = ceildiv(num_rows, default_block_size); + size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, source->get_const_values(), - row_ptrs, col_idxs, values); + kernel::fill_in_csr(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, stride, source->get_const_values(), row_ptrs, + col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1197,6 +1187,16 @@ void convert_to_sellp(std::shared_ptr exec, const matrix::Dense *source, matrix::Sellp *result) { + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + const auto stride = source->get_stride(); const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -1215,34 +1215,24 @@ void 
convert_to_sellp(std::shared_ptr exec, const int slice_num = ceildiv(num_rows, slice_size); auto nnz_per_row = Array(exec, num_rows); - std::cout << "calculate_nonzeros_per_row" << std::endl; + calculate_nonzeros_per_row(exec, source, &nnz_per_row); - exec->synchronize(); - std::cout << "calculate_nonzeros_per_row finish" << std::endl; + auto grid_dim = slice_num; if (grid_dim > 0) { - std::cout << "calculate_slice_lengths" << std::endl; kernel::calculate_slice_lengths_call( - grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, - slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), + grid_dim, sg_size, 0, exec->get_queue(), cfg, num_rows, slice_size, + slice_num, stride_factor, nnz_per_row.get_const_data(), slice_lengths, slice_sets); - exec->synchronize(); - std::cout << "calculate_slice_lengths finish" << std::endl; } - std::cout << "prefix_sum" << std::endl; components::prefix_sum(exec, slice_sets, slice_num + 1); - // exec->synchronize(); - std::cout << "prefix_sum finish" << std::endl; - grid_dim = ceildiv(num_rows, default_block_size); + grid_dim = ceildiv(num_rows, wg_size); if (grid_dim > 0) { - std::cout << "fill_in_sellp" << std::endl; - kernel::fill_in_sellp(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, slice_size, - stride, source->get_const_values(), slice_lengths, + kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, slice_size, stride, + source->get_const_values(), slice_lengths, slice_sets, col_idxs, vals); - exec->synchronize(); - std::cout << "fill_in_sellp finish" << std::endl; } } @@ -1300,14 +1290,14 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, auto block_results = Array(exec, grid_dim); kernel::reduce_max_nnz_call( - grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, num_rows, nnz_per_row.get_const_data(), block_results.get_data()); auto d_result = Array(exec, 1); kernel::reduce_max_nnz_call( - 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, - block_results.get_const_data(), d_result.get_data()); + 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + grid_dim, block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } @@ -1336,8 +1326,8 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const dim3 grid_size(grid_x, 1, 1); if (grid_x > 0) { kernel::count_nnz_per_row_call( - grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], - source->get_size()[1], source->get_stride(), + grid_size, block_size, 0, exec->get_queue(), cfg, + source->get_size()[0], source->get_size()[1], source->get_stride(), source->get_const_values(), result->get_data()); } } @@ -1380,23 +1370,23 @@ void calculate_total_cols(std::shared_ptr exec, auto grid_dim = ceildiv(slice_num * sg_size, wg_size); kernel::reduce_max_nnz_per_slice_call( - grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, + grid_dim, wg_size, 0, exec->get_queue(), cfg, num_rows, slice_size, stride_factor, nnz_per_row.get_const_data(), max_nnz_per_slice.get_data()); grid_dim = ceildiv(slice_num, wg_size); auto block_results = Array(exec, grid_dim); - kernel::reduce_total_cols(grid_dim, wg_size, wg_size * sizeof(size_type), - exec->get_queue(), slice_num, - max_nnz_per_slice.get_const_data(), - block_results.get_data()); + kernel::reduce_total_cols_call( + grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, 
+ slice_num, max_nnz_per_slice.get_const_data(), + block_results.get_data()); auto d_result = Array(exec, 1); - kernel::reduce_total_cols( - 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, - block_results.get_const_data(), d_result.get_data()); + kernel::reduce_total_cols_call( + 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + grid_dim, block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } From 295ea5d079f9f4692b90e82e0bd9f69fde3e45bc Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 25 May 2021 23:07:31 +0200 Subject: [PATCH 12/22] mv config to the first argument --- dev_tools/scripts/regroup | 2 +- dpcpp/base/helper.dp.cpp | 33 ++++++++++++++++ dpcpp/components/prefix_sum.dp.cpp | 13 ++++--- dpcpp/components/reduction.dp.hpp | 8 ++-- dpcpp/matrix/dense_kernels.dp.cpp | 56 +++++++++++++--------------- dpcpp/test/components/prefix_sum.cpp | 3 -- dpcpp/test/matrix/dense_kernels.cpp | 1 - dpcpp/test/utils.hpp | 54 --------------------------- 8 files changed, 71 insertions(+), 99 deletions(-) delete mode 100644 dpcpp/test/utils.hpp diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup index 9e8a5172d05..1756481e2e4 100644 --- a/dev_tools/scripts/regroup +++ b/dev_tools/scripts/regroup @@ -2,7 +2,7 @@ IncludeBlocks: Regroup IncludeCategories: - Regex: '^<(rapidjson|gflags|gtest|papi).*' Priority: 3 - - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative).*' + - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi).*' Priority: 2 - Regex: '^****************************** +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #include + #include "dpcpp/base/helper.hpp" diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 62a9700473d..330fa297e58 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
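// Aside (illustrative sketch, not part of the patch): the shape of the
// runtime-to-compile-time dispatch that the GKO_ENABLE_DEFAULT_CONFIG_CALL
// machinery generates, reduced to a hypothetical stand-alone example. A
// runtime `cfg` value is matched against a compile-time candidate list so
// the corresponding template instantiation is invoked; this patch moves
// `cfg` to the front of the generated wrappers' argument lists so it
// precedes the launch parameters (grid, block, shared memory, queue).
#include <cstdint>
#include <iostream>

template <std::uint32_t cfg>
void kernel_impl(int arg)  // stand-in for one compiled kernel variant
{
    std::cout << "launching variant " << cfg << " with " << arg << '\n';
}

// Walk the candidate list and call the instantiation matching `cfg`.
template <std::uint32_t first, std::uint32_t... rest>
void kernel_call(std::uint32_t cfg, int arg)
{
    if (cfg == first) {
        kernel_impl<first>(arg);
        return;
    }
    if constexpr (sizeof...(rest) > 0) {
        kernel_call<rest...>(cfg, arg);
    }  // else: no matching configuration was compiled in
}

int main()
{
    // In the library, cfg comes from get_first_cfg() after validating the
    // candidates against the device; 256 here is just an example value.
    kernel_call<512, 256, 128>(256, 42);
}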
#include -#include +#include #include "dpcpp/base/helper.hpp" @@ -57,12 +57,12 @@ constexpr auto block_cfg_list = GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, - BlockCfg, block_cfg_list) + block_cfg_list) GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_prefix_sum, finalize_prefix_sum) GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_prefix_sum_call, finalize_prefix_sum, - BlockCfg, block_cfg_list) + block_cfg_list) template @@ -81,13 +81,14 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, auto num_blocks = ceildiv(num_entries, wg_size); Array block_sum_array(exec, num_blocks - 1); auto block_sums = block_sum_array.get_data(); - start_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), cfg, + start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, block_sums); // add the total sum of the previous block only when the number of block // is larger than 1. if (num_blocks > 1) { - finalize_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), - cfg, num_entries, counts, block_sums); + finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0, + exec->get_queue(), num_entries, counts, + block_sums); } } } diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 2626d40b314..d3e925ee4ba 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -42,9 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include -#include #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -241,7 +241,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, reduce_add_array); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, - KCFG_1D, kcfg_1d_list); + kcfg_1d_list); /** @@ -277,7 +277,7 @@ ValueType reduce_add_array(std::shared_ptr exec, block_results.resize_and_reset(grid_dim); - reduce_add_array_call(grid_dim, wg_size, 0, exec->get_queue(), cfg, + reduce_add_array_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), size, source, block_results.get_data()); block_results_val = block_results.get_const_data(); @@ -285,7 +285,7 @@ ValueType reduce_add_array(std::shared_ptr exec, auto d_result = Array(exec, 1); - reduce_add_array_call(1, wg_size, 0, exec->get_queue(), cfg, grid_dim, + reduce_add_array_call(cfg, 1, wg_size, 0, exec->get_queue(), grid_dim, block_results_val, d_result.get_data()); answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 3e4fee0a2d5..e83c5dba5cc 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" +#include + + #include #include @@ -45,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include #include "core/components/prefix_sum.hpp" @@ -188,11 +190,6 @@ void compute_partial_reduce( } } -// GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_reduce_config, -// compute_partial_reduce); -// GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_reduce_call, -// compute_partial_reduce_config, -// KCFG_1D, kcfg_1d_list); template @@ -267,7 +264,7 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, compute_partial_dot) GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, - KCFG_1D, kcfg_1d_list) + kcfg_1d_list) template @@ -309,7 +306,7 @@ void finalize_dot_computation(dim3 grid, dim3 block, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation, finalize_dot_computation) GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call, - finalize_dot_computation, KCFG_1D, kcfg_1d_list) + finalize_dot_computation, kcfg_1d_list) template @@ -353,7 +350,7 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, compute_partial_norm2) GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, - compute_partial_norm2, KCFG_1D, kcfg_1d_list) + compute_partial_norm2, kcfg_1d_list) template @@ -395,8 +392,7 @@ void finalize_norm2_computation(dim3 grid, dim3 block, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation, finalize_norm2_computation) GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call, - finalize_norm2_computation, KCFG_1D, - kcfg_1d_list) + finalize_norm2_computation, kcfg_1d_list) template @@ -452,7 +448,7 @@ void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row) GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row) GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, - KCFG_1D, kcfg_1d_list) + kcfg_1d_list) template @@ -552,7 +548,7 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, calculate_slice_lengths) GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, - calculate_slice_lengths, KCFG_1D, kcfg_1d_list) + calculate_slice_lengths, kcfg_1d_list) template @@ -627,7 +623,7 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, } GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); -GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, KCFG_1D, +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, kcfg_1d_list) template @@ -666,7 +662,7 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice, reduce_max_nnz_per_slice) GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, - reduce_max_nnz_per_slice, KCFG_1D, kcfg_1d_list) + reduce_max_nnz_per_slice, kcfg_1d_list) template @@ -708,7 +704,7 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols, reduce_total_cols); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, - KCFG_1D, kcfg_1d_list) + kcfg_1d_list) template @@ -1001,11 +997,11 @@ void compute_dot(std::shared_ptr exec, // TODO: write a kernel which does this more efficiently for (size_type 
col = 0; col < x->get_size()[1]; ++col) { kernel::compute_partial_dot_call( - grid_dim, block_dim, 0, exec->get_queue(), cfg, + cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), y->get_const_values() + col, y->get_stride(), work.get_data()); kernel::finalize_dot_computation_call( - 1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x, + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } } @@ -1058,11 +1054,11 @@ void compute_norm2(std::shared_ptr exec, // TODO: write a kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { kernel::compute_partial_norm2_call( - grid_dim, block_dim, 0, exec->get_queue(), cfg, + cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), work.get_data()); kernel::finalize_norm2_computation_call( - 1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x, + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } } @@ -1129,9 +1125,9 @@ void convert_to_csr(std::shared_ptr exec, const auto rows_per_block = ceildiv(wg_size, sg_size); const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - kernel::count_nnz_per_row_call(grid_dim_nnz, wg_size, 0, exec->get_queue(), - cfg, num_rows, num_cols, stride, - source->get_const_values(), row_ptrs); + kernel::count_nnz_per_row_call( + cfg, grid_dim_nnz, wg_size, 0, exec->get_queue(), num_rows, num_cols, + stride, source->get_const_values(), row_ptrs); components::prefix_sum(exec, row_ptrs, num_rows + 1); @@ -1222,7 +1218,7 @@ void convert_to_sellp(std::shared_ptr exec, if (grid_dim > 0) { kernel::calculate_slice_lengths_call( - grid_dim, sg_size, 0, exec->get_queue(), cfg, num_rows, slice_size, + cfg, grid_dim, sg_size, 0, exec->get_queue(), num_rows, slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), slice_lengths, slice_sets); } @@ -1290,13 +1286,13 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, auto block_results = Array(exec, grid_dim); kernel::reduce_max_nnz_call( - grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), num_rows, nnz_per_row.get_const_data(), block_results.get_data()); auto d_result = Array(exec, 1); kernel::reduce_max_nnz_call( - 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); @@ -1326,7 +1322,7 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const dim3 grid_size(grid_x, 1, 1); if (grid_x > 0) { kernel::count_nnz_per_row_call( - grid_size, block_size, 0, exec->get_queue(), cfg, + cfg, grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], source->get_size()[1], source->get_stride(), source->get_const_values(), result->get_data()); } @@ -1370,7 +1366,7 @@ void calculate_total_cols(std::shared_ptr exec, auto grid_dim = ceildiv(slice_num * sg_size, wg_size); kernel::reduce_max_nnz_per_slice_call( - grid_dim, wg_size, 0, exec->get_queue(), cfg, num_rows, slice_size, + cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, stride_factor, nnz_per_row.get_const_data(), max_nnz_per_slice.get_data()); @@ -1378,14 +1374,14 @@ void calculate_total_cols(std::shared_ptr exec, auto 
block_results = Array(exec, grid_dim); kernel::reduce_total_cols_call( - grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), slice_num, max_nnz_per_slice.get_const_data(), block_results.get_data()); auto d_result = Array(exec, 1); kernel::reduce_total_cols_call( - 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp index 3e2e7ca9d64..2ae72880443 100644 --- a/dpcpp/test/components/prefix_sum.cpp +++ b/dpcpp/test/components/prefix_sum.cpp @@ -44,9 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "dpcpp/test/utils.hpp" - - namespace { diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 3cd080313cf..e47de0a6487 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -50,7 +50,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array.hpp" #include "core/matrix/dense_kernels.hpp" -#include "dpcpp/test/utils.hpp" namespace { diff --git a/dpcpp/test/utils.hpp b/dpcpp/test/utils.hpp deleted file mode 100644 index 88d98f0d9f6..00000000000 --- a/dpcpp/test/utils.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2020, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*************************************************************/ -#ifndef GKO_DPCPP_TEST_UTILS_HPP_ -#define GKO_DPCPP_TEST_UTILS_HPP_ - - -#include "core/test/utils.hpp" - - -#include - - -namespace { - - -// prevent device reset after each test -auto no_reset_exec = - gko::DpcppExecutor::create(0, gko::ReferenceExecutor::create()); - - -} // namespace - - -#endif // GKO_DPCPP_TEST_UTILS_HPP_ From e0ed0c3145855c7c33317d0ca510f10fdc1f0cf3 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 1 Jun 2021 17:40:52 +0200 Subject: [PATCH 13/22] update latest ConfigSet and dense kernel/test --- dpcpp/base/helper.hpp | 3 +- dpcpp/components/prefix_sum.dp.cpp | 7 +- dpcpp/components/prefix_sum.dp.hpp | 9 +- dpcpp/components/reduction.dp.hpp | 11 +- dpcpp/matrix/dense_kernels.dp.cpp | 374 ++++++++++++++--------- dpcpp/test/components/prefix_sum.cpp | 3 + dpcpp/test/matrix/dense_kernels.cpp | 436 ++++++++++++++++++++++----- 7 files changed, 622 insertions(+), 221 deletions(-) diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 3979caa905c..8c7f45e5174 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/types.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -142,7 +143,7 @@ bool validate(sycl::queue *queue, unsigned workgroup_size, template -ConfigSetType get_first_cfg(IterArr &arr, Validate verify) +std::uint32_t get_first_cfg(IterArr &arr, Validate verify) { for (auto &cfg : arr) { if (verify(cfg)) { diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 330fa297e58..07cdb5b38aa 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/types.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/prefix_sum.dp.hpp" @@ -52,7 +53,7 @@ namespace components { using BlockCfg = ConfigSet<11>; constexpr auto block_cfg_list = - ::gko::syn::value_list(); GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) @@ -73,8 +74,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, if (num_entries > 0) { auto queue = exec->get_queue(); constexpr auto block_cfg_array = as_array(block_cfg_list); - const ConfigSetType cfg = - get_first_cfg(block_cfg_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) { return validate(queue, BlockCfg::decode<0>(cfg), 16); }); const auto wg_size = BlockCfg::decode<0>(cfg); diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index fd9ff2ac263..f76f85135eb 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/types.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" @@ -125,7 +126,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, * @note To calculate the prefix sum over an array of size bigger than * `block_size`, `finalize_prefix_sum` has to be used as well.
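// Aside (illustrative sketch, not part of the patch): the idea behind
// ConfigSet and validate(), in simplified form. ConfigSet<11, 7> packs a
// workgroup size and a subgroup size into a single std::uint32_t (field
// widths in bits), and get_first_cfg() returns the first packed candidate
// the device accepts. The exact bit layout and validate_sketch() below are
// assumptions for illustration, not the library's definitions; the SYCL
// header path may also differ (<sycl/sycl.hpp> on newer toolchains).
#include <algorithm>
#include <cstdint>
#include <CL/sycl.hpp>

constexpr std::uint32_t encode(std::uint32_t wg, std::uint32_t sg)
{
    return (wg << 7) | sg;  // assumed layout: low 7 bits hold the subgroup
}                           // size, the next 11 bits the workgroup size
constexpr std::uint32_t decode_wg(std::uint32_t cfg) { return cfg >> 7; }
constexpr std::uint32_t decode_sg(std::uint32_t cfg) { return cfg & 0x7f; }

// A configuration is usable when the device allows the workgroup size and
// offers the requested subgroup size.
bool validate_sketch(sycl::queue *queue, std::uint32_t wg, std::uint32_t sg)
{
    const auto device = queue->get_device();
    const auto max_wg =
        device.get_info<sycl::info::device::max_work_group_size>();
    const auto sub_sizes =
        device.get_info<sycl::info::device::sub_group_sizes>();
    return wg <= max_wg && std::find(sub_sizes.begin(), sub_sizes.end(),
                                     sg) != sub_sizes.end();
}

static_assert(decode_wg(encode(512, 32)) == 512, "wg round-trips");
static_assert(decode_sg(encode(512, 32)) == 32, "sg round-trips");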
*/ -template +template void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, ValueType *__restrict__ block_sum, sycl::nd_item<3> item_ct1, @@ -178,7 +179,7 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, } } -template +template void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_elements, ValueType *elements, ValueType *block_sum) @@ -214,7 +215,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, * * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. */ -template +template void finalize_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, const ValueType *__restrict__ block_sum, @@ -231,7 +232,7 @@ void finalize_prefix_sum(size_type num_elements, } } -template +template void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_elements, ValueType *elements, const ValueType *block_sum) diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index d3e925ee4ba..9c2387a7113 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -63,7 +64,7 @@ namespace dpcpp { constexpr int default_block_size = 256; using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); @@ -201,7 +202,7 @@ void reduce_array(size_type size, const ValueType *__restrict__ source, * `source` of any size. Has to be called a second time on `result` to reduce * an array larger than `block_size`. */ -template +template void reduce_add_array( size_type size, const ValueType *__restrict__ source, ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, @@ -216,7 +217,7 @@ void reduce_add_array( } } -template +template void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const ValueType *source, ValueType *result) @@ -263,8 +264,8 @@ ValueType reduce_add_array(std::shared_ptr exec, ValueType answer = zero(); auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index e83c5dba5cc..58f688951c9 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/dense_kernels.hpp" -#include - - #include #include @@ -70,12 +67,14 @@ namespace dpcpp { */ namespace dense { + using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); +constexpr auto subgroup_list = syn::value_list(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); constexpr auto default_block_size = 256; @@ -119,6 +118,7 @@ void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, GKO_ENABLE_DEFAULT_HOST(scale, scale) + template void add_scaled(size_type num_rows, size_type num_cols, size_type num_alpha_cols, const ValueType *__restrict__ alpha, @@ -157,7 +157,7 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) -template void compute_partial_reduce( size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value, @@ -191,7 +191,7 @@ void compute_partial_reduce( } -template void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, @@ -220,7 +220,7 @@ void finalize_reduce_computation( } -template +template void compute_partial_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, @@ -230,13 +230,13 @@ void compute_partial_dot( compute_partial_reduce( num_rows, work, [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * conj(y[i * stride_y]); + return x[i * stride_x] * y[i * stride_y]; }, [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, tmp_work); } -template +template void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -244,7 +244,6 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *work) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - std::cout << "partial " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { sycl::accessor, 0, sycl::access::mode::read_write, @@ -267,8 +266,54 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, kcfg_1d_list) -template -void finalize_dot_computation( +template +void compute_partial_conj_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return conj(x[i * stride_x]) * y[i * stride_y]; + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_conj_dot(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, const ValueType *x, + size_type stride_x, const ValueType *y, + size_type stride_y, ValueType *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_conj_dot( + num_rows, x, stride_x, y, stride_y, work, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, + compute_partial_conj_dot) 
+GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, + compute_partial_conj_dot, kcfg_1d_list) + + +template +void finalize_sum_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, UninitializedArray(cfg)> *tmp_work) @@ -279,14 +324,13 @@ void finalize_dot_computation( [](const ValueType &x) { return x; }, item_ct1, tmp_work); } -template -void finalize_dot_computation(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue *stream, - size_type size, const ValueType *work, - ValueType *result) +template +void finalize_sum_reduce_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - std::cout << "finalize " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { sycl::accessor, 0, sycl::access::mode::read_write, @@ -295,7 +339,7 @@ void finalize_dot_computation(dim3 grid, dim3 block, cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_dot_computation( + finalize_sum_reduce_computation( size, work, result, item_ct1, (UninitializedArray *) tmp_work_acc_ct1.get_pointer()); @@ -303,13 +347,13 @@ void finalize_dot_computation(dim3 grid, dim3 block, }); } -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation, - finalize_dot_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call, - finalize_dot_computation, kcfg_1d_list) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation, + finalize_sum_reduce_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, + finalize_sum_reduce_computation, kcfg_1d_list) -template +template void compute_partial_norm2( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, @@ -324,7 +368,7 @@ void compute_partial_norm2( tmp_work); } -template +template void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -353,8 +397,8 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, compute_partial_norm2, kcfg_1d_list) -template -void finalize_norm2_computation( +template +void finalize_sqrt_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, UninitializedArray(cfg)> *tmp_work) @@ -362,14 +406,14 @@ void finalize_norm2_computation( finalize_reduce_computation( size, work, result, [](const ValueType &x, const ValueType &y) { return x + y; }, - [](const ValueType &x) { return sqrt(x); }, item_ct1, tmp_work); + [](const ValueType &x) { return std::sqrt(x); }, item_ct1, tmp_work); } -template -void finalize_norm2_computation(dim3 grid, dim3 block, - size_t dynamic_shared_memory, - sycl::queue *stream, size_type size, - const ValueType *work, ValueType *result) +template +void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); stream->submit([&](sycl::handler &cgh) { @@ -381,7 +425,7 @@ void finalize_norm2_computation(dim3 grid, dim3 block, cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_norm2_computation( + finalize_sqrt_reduce_computation( size, work, result, item_ct1, 
(UninitializedArray *) tmp_work_acc_ct1.get_pointer()); @@ -389,13 +433,13 @@ void finalize_norm2_computation(dim3 grid, dim3 block, }); } -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation, - finalize_norm2_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call, - finalize_norm2_computation, kcfg_1d_list) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation, + finalize_sqrt_reduce_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, + finalize_sqrt_reduce_computation, kcfg_1d_list) -template +template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type *__restrict__ row_ptrs, const ValueType *__restrict__ source, @@ -418,10 +462,12 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, } } -GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) +GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_coo, fill_in_coo) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_coo, fill_in_coo) +GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_coo_call, fill_in_coo, kcfg_1d_list) -template +template void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, const ValueType *__restrict__ work, IndexType *__restrict__ result, @@ -451,7 +497,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, kcfg_1d_list) -template +template void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, const ValueType *__restrict__ source, IndexType *__restrict__ row_ptrs, @@ -472,10 +518,12 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, } } -GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) +GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_csr, fill_in_csr) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_csr, fill_in_csr) +GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_csr_call, fill_in_csr, kcfg_1d_list) -template +template void fill_in_ell(size_type num_rows, size_type num_cols, size_type source_stride, const ValueType *__restrict__ source, size_type max_nnz_per_row, size_type result_stride, @@ -505,10 +553,12 @@ void fill_in_ell(size_type num_rows, size_type num_cols, } } -GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) +GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_ell, fill_in_ell) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_ell, fill_in_ell) +GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_ell_call, fill_in_ell, kcfg_1d_list) -template +template void calculate_slice_lengths(size_type num_rows, size_type slice_size, int slice_num, size_type stride_factor, const size_type *__restrict__ nnz_per_row, @@ -516,7 +566,7 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, size_type *__restrict__ slice_sets, sycl::nd_item<3> item_ct1) { - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + constexpr auto sg_size = cfg; const auto sliceid = item_ct1.get_group(2); const auto tid_in_warp = item_ct1.get_local_id(2); @@ -548,10 +598,10 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, calculate_slice_lengths) GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, - calculate_slice_lengths, kcfg_1d_list) + calculate_slice_lengths, subgroup_list) -template +template void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, size_type stride, const ValueType *__restrict__ source, size_type *__restrict__ slice_lengths, @@ -584,9 +634,12 @@ void fill_in_sellp(size_type num_rows, 
size_type num_cols, size_type slice_size, } } -GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) +GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_sellp, fill_in_sellp) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_sellp, fill_in_sellp) +GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_sellp_call, fill_in_sellp, kcfg_1d_list) -template + +template void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result, sycl::nd_item<3> item_ct1, uint8_t *dpct_local) @@ -603,7 +656,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, } } -template +template void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const size_type *nnz_per_row, size_type *result) @@ -626,7 +679,8 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, kcfg_1d_list) -template + +template void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, size_type stride_factor, const size_type *__restrict__ nnz_per_row, @@ -665,7 +719,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, reduce_max_nnz_per_slice, kcfg_1d_list) -template +template void reduce_total_cols(size_type num_slices, const size_type *__restrict__ max_nnz_per_slice, size_type *__restrict__ result, @@ -682,7 +736,7 @@ void reduce_total_cols(size_type num_slices, } } -template +template void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_slices, const size_type *max_nnz_per_slice, size_type *result) @@ -960,6 +1014,34 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); +namespace { + + +#define GKO_BIND_DOT(ValueType, Name, Func) \ + void Name(::cl::sycl::queue &exec_queue, std::int64_t n, \ + const ValueType *x, std::int64_t incx, const ValueType *y, \ + std::int64_t incy, ValueType *result) \ + { \ + Func(exec_queue, n, x, incx, y, incy, result); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(std::complex, conj_dot, + oneapi::mkl::blas::row_major::dotc); + + +} // namespace + + template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, @@ -981,15 +1063,13 @@ void compute_dot(std::shared_ptr exec, constexpr auto work_per_thread = 32; auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); - std::cout << "dot " << cfg << " " << wg_size << " " << sg_size - << std::endl; const auto work_per_block = work_per_thread * wg_size; const dim3 grid_dim = ceildiv(x->get_size()[0], 
work_per_block); const dim3 block_dim{sg_size, 1, wg_size / sg_size}; @@ -1000,7 +1080,7 @@ void compute_dot(std::shared_ptr exec, cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_dot_computation_call( + kernel::finalize_sum_reduce_computation_call( cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } @@ -1014,7 +1094,47 @@ template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if (0) { + // TODO: write a custom kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); + } + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_conj_dot_call( + cfg, grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_sum_reduce_computation_call( + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); @@ -1039,8 +1159,8 @@ void compute_norm2(std::shared_ptr exec, constexpr auto work_per_thread = 32; auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1057,7 +1177,7 @@ void compute_norm2(std::shared_ptr exec, cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), work.get_data()); - kernel::finalize_norm2_computation_call( + kernel::finalize_sqrt_reduce_computation_call( cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } @@ -1086,12 +1206,21 @@ void convert_to_coo(std::shared_ptr exec, components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); - size_type grid_dim = ceildiv(num_rows, default_block_size); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + 
get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, - nnz_prefix_sum.get_const_data(), - source->get_const_values(), row_idxs, col_idxs, values); + kernel::fill_in_coo_call( + cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, num_cols, + stride, nnz_prefix_sum.get_const_data(), source->get_const_values(), + row_idxs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1105,8 +1234,8 @@ void convert_to_csr(std::shared_ptr exec, { auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1133,9 +1262,10 @@ void convert_to_csr(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_csr(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, stride, source->get_const_values(), row_ptrs, - col_idxs, values); + kernel::fill_in_csr_call(cfg, grid_dim, default_block_size, 0, + exec->get_queue(), num_rows, num_cols, stride, + source->get_const_values(), row_ptrs, col_idxs, + values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1157,11 +1287,20 @@ void convert_to_ell(std::shared_ptr exec, auto source_stride = source->get_stride(); auto result_stride = result->get_stride(); - auto grid_dim = ceildiv(result_stride, default_block_size); - kernel::fill_in_ell(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, source_stride, - source->get_const_values(), max_nnz_per_row, - result_stride, col_ptrs, values); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + auto grid_dim = ceildiv(result_stride, wg_size); + kernel::fill_in_ell_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), + num_rows, num_cols, source_stride, + source->get_const_values(), max_nnz_per_row, + result_stride, col_ptrs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1185,8 +1324,8 @@ void convert_to_sellp(std::shared_ptr exec, { auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1211,24 +1350,25 @@ void convert_to_sellp(std::shared_ptr exec, const int slice_num = ceildiv(num_rows, slice_size); auto nnz_per_row = Array(exec, num_rows); - calculate_nonzeros_per_row(exec, source, &nnz_per_row); auto grid_dim = slice_num; if (grid_dim > 0) { kernel::calculate_slice_lengths_call( - cfg, grid_dim, sg_size, 0, exec->get_queue(), num_rows, slice_size, - slice_num, stride_factor, nnz_per_row.get_const_data(), + sg_size, 
grid_dim, sg_size, 0, exec->get_queue(), num_rows, + slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), slice_lengths, slice_sets); } + components::prefix_sum(exec, slice_sets, slice_num + 1); + grid_dim = ceildiv(num_rows, wg_size); if (grid_dim > 0) { - kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, slice_size, stride, - source->get_const_values(), slice_lengths, - slice_sets, col_idxs, vals); + kernel::fill_in_sellp_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), + num_rows, num_cols, slice_size, stride, + source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); } } @@ -1272,14 +1412,12 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, calculate_nonzeros_per_row(exec, source, &nnz_per_row); auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); const auto wg_size = KCFG_1D::decode<0>(cfg); - std::cout << "wg_size " << wg_size << "sg_size " << KCFG_1D::decode<1>(cfg) - << std::endl; const auto n = ceildiv(num_rows, wg_size); const size_type grid_dim = (n <= wg_size) ? n : wg_size; @@ -1309,8 +1447,8 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, { auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1355,8 +1493,8 @@ void calculate_total_cols(std::shared_ptr exec, auto max_nnz_per_slice = Array(exec, slice_num); auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1394,27 +1532,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void transpose(std::shared_ptr exec, const matrix::Dense *orig, - matrix::Dense *trans) -{ - // if (cublas::is_supported::value) { - // auto handle = exec->get_cublas_handle(); - // { - // cublas::pointer_mode_guard pm_guard(handle); - // auto alpha = one(); - // auto beta = zero(); - // cublas::geam( - // handle, oneapi::mkl::transpose::trans, - // oneapi::mkl::transpose::nontrans, orig->get_size()[0], - // orig->get_size()[1], &alpha, orig->get_const_values(), - // orig->get_stride(), &beta, static_cast(nullptr), trans->get_size()[1], trans->get_values(), - // trans->get_stride()); - // } - // } else { - // GKO_NOT_IMPLEMENTED; - // } - GKO_NOT_IMPLEMENTED; -}; + matrix::Dense *trans) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); @@ -1422,27 +1540,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, const matrix::Dense *orig, - matrix::Dense *trans) -{ - // if (cublas::is_supported::value) { - // auto handle = exec->get_cublas_handle(); - // { - // cublas::pointer_mode_guard pm_guard(handle); - // auto alpha = one(); - // auto beta = zero(); - // cublas::geam( - // handle, 
oneapi::mkl::transpose::conjtrans, - // oneapi::mkl::transpose::nontrans, orig->get_size()[0], - // orig->get_size()[1], &alpha, orig->get_const_values(), - // orig->get_stride(), &beta, static_cast(nullptr), trans->get_size()[1], trans->get_values(), - // trans->get_stride()); - // } - // } else { - // GKO_NOT_IMPLEMENTED; - // } - GKO_NOT_IMPLEMENTED; -} + matrix::Dense *trans) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp index 2ae72880443..402192d0b77 100644 --- a/dpcpp/test/components/prefix_sum.cpp +++ b/dpcpp/test/components/prefix_sum.cpp @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index e47de0a6487..2b9af16732a 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/test/utils.hpp" namespace { @@ -64,9 +65,12 @@ class Dense : public ::testing::Test { using vtype = double; #endif // GINKGO_DPCPP_SINGLE_MODE using Mtx = gko::matrix::Dense; + using MixedMtx = gko::matrix::Dense>; using NormVector = gko::matrix::Dense>; using Arr = gko::Array; - // using ComplexMtx = gko::matrix::Dense>; + using ComplexMtx = gko::matrix::Dense>; + using MixedComplexMtx = + gko::matrix::Dense>>; Dense() : rand_engine(15) {} @@ -116,15 +120,16 @@ class Dense : public ::testing::Test { void set_up_apply_data() { x = gen_mtx(65, 25); - // c_x = gen_mtx(65, 25); + c_x = gen_mtx(65, 25); y = gen_mtx(25, 35); expected = gen_mtx(65, 35); alpha = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + square = gen_mtx(x->get_size()[0], x->get_size()[0]); dx = Mtx::create(dpcpp); dx->copy_from(x.get()); - // dc_x = ComplexMtx::create(dpcpp); - // dc_x->copy_from(c_x.get()); + dc_x = ComplexMtx::create(dpcpp); + dc_x->copy_from(c_x.get()); dy = Mtx::create(dpcpp); dy->copy_from(y.get()); dresult = Mtx::create(dpcpp); @@ -133,6 +138,8 @@ class Dense : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Mtx::create(dpcpp); dbeta->copy_from(beta.get()); + dsquare = Mtx::create(dpcpp); + dsquare->copy_from(square.get()); std::vector tmp(x->get_size()[0], 0); auto rng = std::default_random_engine{}; @@ -141,14 +148,25 @@ class Dense : public ::testing::Test { std::vector tmp2(x->get_size()[1], 0); std::iota(tmp2.begin(), tmp2.end(), 0); std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector tmp3(x->get_size()[0] / 10); + std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + for (auto &i : tmp3) { + i = row_dist(rng); + } rpermute_idxs = std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); - drpermute_idxs = - std::unique_ptr(new Arr{dpcpp, tmp.begin(), tmp.end()}); cpermute_idxs = std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); - dcpermute_idxs = - std::unique_ptr(new Arr{dpcpp, tmp2.begin(), tmp2.end()}); + rgather_idxs = + std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + } + + template + std::unique_ptr convert(InputType &&input) + { + auto result = ConvertedType::create(input->get_executor()); + input->convert_to(result.get()); + return result; } std::shared_ptr ref; @@ -157,21 +175,22 @@ class Dense : public 
::testing::Test { std::ranlux48 rand_engine; std::unique_ptr x; - // std::unique_ptr c_x; + std::unique_ptr c_x; std::unique_ptr y; std::unique_ptr alpha; std::unique_ptr beta; std::unique_ptr expected; + std::unique_ptr square; std::unique_ptr dresult; std::unique_ptr dx; - // std::unique_ptr dc_x; + std::unique_ptr dc_x; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; + std::unique_ptr dsquare; std::unique_ptr rpermute_idxs; - std::unique_ptr drpermute_idxs; std::unique_ptr cpermute_idxs; - std::unique_ptr dcpermute_idxs; + std::unique_ptr rgather_idxs; }; @@ -201,7 +220,7 @@ TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) dx->fill(42); result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, r::value); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } @@ -318,6 +337,28 @@ TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) } +TEST_F(Dense, SingleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, MultipleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef) { set_up_vector_data(20); @@ -343,6 +384,23 @@ TEST_F(Dense, SimpleApplyIsEquivalentToRef) } +#if !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(convert(y).get(), convert(expected).get()); + dx->apply(convert(dy).get(), convert(dresult).get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7); +} + + +#endif // !GINKGO_DPCPP_SINGLE_MODE + + TEST_F(Dense, AdvancedApplyIsEquivalentToRef) { set_up_apply_data(); @@ -354,38 +412,142 @@ TEST_F(Dense, AdvancedApplyIsEquivalentToRef) } -// TEST_F(Dense, ApplyToComplexIsEquivalentToRef) -// { -// set_up_apply_data(); -// auto complex_b = gen_mtx(25, 1); -// auto dcomplex_b = ComplexMtx::create(dpcpp); -// dcomplex_b->copy_from(complex_b.get()); -// auto complex_x = gen_mtx(65, 1); -// auto dcomplex_x = ComplexMtx::create(dpcpp); -// dcomplex_x->copy_from(complex_x.get()); +#if !GINKGO_DPCPP_SINGLE_MODE -// x->apply(complex_b.get(), complex_x.get()); -// dx->apply(dcomplex_b.get(), dcomplex_x.get()); -// GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); -// } +TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef) +{ + set_up_apply_data(); + x->apply(convert(alpha).get(), convert(y).get(), + convert(beta).get(), convert(expected).get()); + dx->apply(convert(dalpha).get(), convert(dy).get(), + convert(dbeta).get(), convert(dresult).get()); -// TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) -// { -// set_up_apply_data(); -// auto complex_b = gen_mtx(25, 1); -// auto dcomplex_b = ComplexMtx::create(dpcpp); -// dcomplex_b->copy_from(complex_b.get()); -// auto complex_x = gen_mtx(65, 1); -// auto dcomplex_x = ComplexMtx::create(dpcpp); -// dcomplex_x->copy_from(complex_x.get()); + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7); +} -// x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); -// dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); -// GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); -// } +#endif // !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, ApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = 
gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r::value); +} + + +#if !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = MixedComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = MixedComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7); +} + +#endif // !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); + dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r::value); +} + + +#if !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = MixedComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = MixedComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(convert(alpha).get(), complex_b.get(), + convert(beta).get(), complex_x.get()); + dx->apply(convert(dalpha).get(), dcomplex_b.get(), + convert(dbeta).get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7); +} + + +#endif // !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, ComputeDotComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(1234, 2); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(1234, 2); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2}); + auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2}); + + complex_b->compute_dot(complex_x.get(), result.get()); + dcomplex_b->compute_dot(dcomplex_x.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(result, dresult, r::value); +} + + +TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(1234, 2); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(1234, 2); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2}); + auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2}); + + complex_b->compute_conj_dot(complex_x.get(), result.get()); + dcomplex_b->compute_conj_dot(dcomplex_x.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(result, dresult, r::value); +} // TEST_F(Dense, IsTransposable) @@ -494,42 +656,42 @@ TEST_F(Dense, MoveToEllIsEquivalentToRef) } -// 
TEST_F(Dense, ConvertToSellpIsEquivalentToRef) -// { -// set_up_apply_data(); -// auto sellp_mtx = gko::matrix::Sellp<>::create(ref); -// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp::create(ref); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); -// x->convert_to(sellp_mtx.get()); -// dx->convert_to(dsellp_mtx.get()); + x->convert_to(sellp_mtx.get()); + dx->convert_to(dsellp_mtx.get()); -// GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6); -// } + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); +} -// TEST_F(Dense, MoveToSellpIsEquivalentToRef) -// { -// set_up_apply_data(); -// auto sellp_mtx = gko::matrix::Sellp<>::create(ref); -// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +TEST_F(Dense, MoveToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp::create(ref); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); -// x->move_to(sellp_mtx.get()); -// dx->move_to(dsellp_mtx.get()); + x->move_to(sellp_mtx.get()); + dx->move_to(dsellp_mtx.get()); -// GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6); -// } + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); +} -// TEST_F(Dense, ConvertsEmptyToSellp) -// { -// auto dempty_mtx = Mtx::create(dpcpp); -// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(dpcpp); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); -// dempty_mtx->convert_to(dsellp_mtx.get()); + dempty_mtx->convert_to(dsellp_mtx.get()); -// ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), -// 0); ASSERT_FALSE(dsellp_mtx->get_size()); -// } + ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} TEST_F(Dense, CountNNZIsEquivalentToRef) @@ -595,12 +757,63 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) } +TEST_F(Dense, CanGatherRows) +{ + set_up_apply_data(); + + auto r_gather = x->row_gather(rgather_idxs.get()); + auto dr_gather = dx->row_gather(rgather_idxs.get()); + + GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); +} + + +TEST_F(Dense, CanGatherRowsIntoDense) +{ + set_up_apply_data(); + auto gather_size = + gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]}; + auto r_gather = Mtx::create(ref, gather_size); + // test make_temporary_clone and non-default stride + auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2); + + x->row_gather(rgather_idxs.get(), r_gather.get()); + dx->row_gather(rgather_idxs.get(), dr_gather.get()); + + GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); +} + + +TEST_F(Dense, IsPermutable) +{ + set_up_apply_data(); + + auto permuted = square->permute(rpermute_idxs.get()); + auto dpermuted = dsquare->permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), + static_cast(dpermuted.get()), 0); +} + + +TEST_F(Dense, IsInversePermutable) +{ + set_up_apply_data(); + + auto permuted = square->inverse_permute(rpermute_idxs.get()); + auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), + static_cast(dpermuted.get()), 0); +} + + TEST_F(Dense, IsRowPermutable) { set_up_apply_data(); auto r_permute = x->row_permute(rpermute_idxs.get()); - auto dr_permute = dx->row_permute(drpermute_idxs.get()); + auto dr_permute = dx->row_permute(rpermute_idxs.get()); 
GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), static_cast(dr_permute.get()), 0); @@ -612,7 +825,7 @@ TEST_F(Dense, IsColPermutable) set_up_apply_data(); auto c_permute = x->column_permute(cpermute_idxs.get()); - auto dc_permute = dx->column_permute(dcpermute_idxs.get()); + auto dc_permute = dx->column_permute(cpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), static_cast(dc_permute.get()), 0); @@ -624,7 +837,7 @@ TEST_F(Dense, IsInverseRowPermutable) set_up_apply_data(); auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); - auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), static_cast(d_inverse_r_permute.get()), 0); @@ -636,14 +849,14 @@ TEST_F(Dense, IsInverseColPermutable) set_up_apply_data(); auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); - auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get()); + auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), static_cast(d_inverse_c_permute.get()), 0); } -TEST_F(Dense, ExtractDiagonalIsEquivalentToRef) +TEST_F(Dense, ExtractDiagonalOnTallSkinnyIsEquivalentToRef) { set_up_apply_data(); @@ -654,6 +867,17 @@ TEST_F(Dense, ExtractDiagonalIsEquivalentToRef) } +TEST_F(Dense, ExtractDiagonalOnShortFatIsEquivalentToRef) +{ + set_up_apply_data(); + + auto diag = y->extract_diagonal(); + auto ddiag = dy->extract_diagonal(); + + GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0); +} + + TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef) { set_up_apply_data(); @@ -676,4 +900,76 @@ TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef) } +TEST_F(Dense, MakeComplexIsEquivalentToRef) +{ + set_up_apply_data(); + + auto complex_x = x->make_complex(); + auto dcomplex_x = dx->make_complex(); + + GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); +} + + +TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto complex_x = ComplexMtx::create(ref, x->get_size()); + x->make_complex(complex_x.get()); + auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size()); + dx->make_complex(dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); +} + + +TEST_F(Dense, GetRealIsEquivalentToRef) +{ + set_up_apply_data(); + + auto real_x = x->get_real(); + auto dreal_x = dx->get_real(); + + GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); +} + + +TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto real_x = Mtx::create(ref, x->get_size()); + x->get_real(real_x.get()); + auto dreal_x = Mtx::create(dpcpp, dx->get_size()); + dx->get_real(dreal_x.get()); + + GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); +} + + +TEST_F(Dense, GetImagIsEquivalentToRef) +{ + set_up_apply_data(); + + auto imag_x = x->get_imag(); + auto dimag_x = dx->get_imag(); + + GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); +} + + +TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto imag_x = Mtx::create(ref, x->get_size()); + x->get_imag(imag_x.get()); + auto dimag_x = Mtx::create(dpcpp, dx->get_size()); + dx->get_imag(dimag_x.get()); + + GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); +} + + } // namespace From 9583b1bca76e7715d36c737928d7d8e8e14c5168 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Wed, 2 Jun 2021 19:46:54 +0200 Subject: [PATCH 14/22] fix sellp stuck --- dpcpp/matrix/dense_kernels.dp.cpp | 37 ++++++++++++++----------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 58f688951c9..41a283d71bd 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -569,28 +569,25 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, constexpr auto sg_size = cfg; const auto sliceid = item_ct1.get_group(2); const auto tid_in_warp = item_ct1.get_local_id(2); + const bool runable = sliceid * slice_size + tid_in_warp < num_rows; + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { + thread_result = + (i + slice_size * sliceid < num_rows) + ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) + : thread_result; + } - if (sliceid * slice_size + tid_in_warp < num_rows) { - size_type thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; - } + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + auto warp_result = ::gko::kernels::dpcpp::reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - auto warp_result = ::gko::kernels::dpcpp::reduce( - warp_tile, thread_result, - [](const size_type &a, const size_type &b) { return max(a, b); }); - - if (tid_in_warp == 0) { - auto slice_length = - ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; - } + if (tid_in_warp == 0 && runable) { + auto slice_length = ceildiv(warp_result, stride_factor) * stride_factor; + slice_lengths[sliceid] = slice_length; + slice_sets[sliceid] = slice_length; } } From 485934083e5e321b794d085fea355bd1e844a59f Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Thu, 8 Jul 2021 13:44:52 +0200 Subject: [PATCH 15/22] add cp, update doc, mv mkl bind Co-authored-by: Terry Cojean --- dpcpp/CMakeLists.txt | 4 +- dpcpp/base/config.hpp | 2 +- dpcpp/base/onemkl_bindings.hpp | 128 ++++++++++++++++++ dpcpp/components/prefix_sum.dp.hpp | 1 - dpcpp/components/reduction.dp.hpp | 3 +- dpcpp/components/thread_ids.dp.hpp | 1 - dpcpp/components/uninitialized_array.hpp | 4 +- dpcpp/matrix/dense_kernels.dp.cpp | 63 +++++---- .../ginkgo/core/synthesizer/containers.hpp | 85 ++++++++++++ 9 files changed, 259 insertions(+), 32 deletions(-) create mode 100644 dpcpp/base/onemkl_bindings.hpp diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 48addebaf5f..8755b424433 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -55,13 +55,13 @@ target_sources(ginkgo_dpcpp ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) -set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -fsycl) +set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -DMKL_ILP64) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) -target_link_libraries(ginkgo_dpcpp PRIVATE "mkl_sycl;mkl_intel_ilp64;mkl_sequential;mkl_core") +target_link_libraries(ginkgo_dpcpp PRIVATE "mkl_sycl;mkl_intel_ilp64;mkl_tbb_thread;mkl_core;sycl;OpenCL;pthread;m;dl") target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp index 78fdcc2b819..abb84d9b7ff 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp @@ -53,7 +53,7 @@ struct config { /** * The number of threads within a CUDA warp. */ - static constexpr uint32 warp_size = 32; + static constexpr uint32 warp_size = 16; /** * The bitmask of the entire warp. diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp new file mode 100644 index 00000000000..6456a048d23 --- /dev/null +++ b/dpcpp/base/onemkl_bindings.hpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ +#define GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ + + +#include +#include + + +namespace gko { +/** + * @brief The device specific kernels namespace. + * + * @ingroup kernels + */ +namespace kernels { +/** + * @brief The DPCPP namespace. + * + * @ingroup dpcpp + */ +namespace dpcpp { +/** + * @brief The ONEMKL namespace. + * + * @ingroup onemkl + */ +namespace onemkl { +/** + * @brief The detail namespace. + * + * @ingroup detail + */ +namespace detail { + + +template +inline void not_implemented(Args &&...) GKO_NOT_IMPLEMENTED; + + +} // namespace detail + + +template +struct is_supported : std::false_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; + + +#define GKO_BIND_DOT(ValueType, Name, Func) \ + void Name(::cl::sycl::queue &exec_queue, std::int64_t n, \ + const ValueType *x, std::int64_t incx, const ValueType *y, \ + std::int64_t incy, ValueType *result) \ + { \ + Func(exec_queue, n, x, incx, y, incy, result); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +// Bind the dot for x^T * y +GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +template +GKO_BIND_DOT(ValueType, dot, detail::not_implemented); + +// Bind the conj_dot for x' * y +GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(std::complex, conj_dot, + oneapi::mkl::blas::row_major::dotc); +template +GKO_BIND_DOT(ValueType, conj_dot, detail::not_implemented); + +#undef GKO_BIND_DOT + + +} // namespace onemkl +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index f76f85135eb..22e6139dd84 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -53,7 +53,6 @@ namespace kernels { namespace dpcpp { -// #include "common/components/prefix_sum.hpp.inc" /** * @internal * Computes the prefix sum and total sum of `element` over a subwarp. 
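Aside on the GKO_BIND_DOT macro defined in onemkl_bindings.hpp above: each
invocation is a mechanical expansion of the macro body into a plain wrapper
function. For illustration only, the first instantiation expands to:

    // GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot) yields:
    void dot(::cl::sycl::queue &exec_queue, std::int64_t n, const float *x,
             std::int64_t incx, const float *y, std::int64_t incy,
             float *result)
    {
        // forwards directly to the oneMKL row-major BLAS dot routine
        oneapi::mkl::blas::row_major::dot(exec_queue, n, x, incx, y, incy,
                                          result);
    }

The trailing templated invocations bind every remaining ValueType to
detail::not_implemented, so calling dot or conj_dot with a value type oneMKL
does not support raises Ginkgo's NotImplemented error at run time instead of
failing to build.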
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index 9c2387a7113..e0678f6cf7a 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -70,7 +70,6 @@ constexpr auto kcfg_1d_list = KCFG_1D::encode(256, 8)>();
 constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);

-// #include "common/components/reduction.hpp.inc"
 /**
  * @internal
  *
@@ -217,7 +216,7 @@ void reduce_add_array(
     }
 }

-template
+template
 void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                       sycl::queue *stream, size_type size,
                       const ValueType *source, ValueType *result)
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
index 5b656c5e0db..9eda077381c 100644
--- a/dpcpp/components/thread_ids.dp.hpp
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -52,7 +52,6 @@ namespace dpcpp {
 namespace thread {

-// #include "common/components/thread_ids.hpp.inc"
 /**
  * @internal
  *
diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp
index 415126b8ed3..b10457df217 100644
--- a/dpcpp/components/uninitialized_array.hpp
+++ b/dpcpp/components/uninitialized_array.hpp
@@ -45,7 +45,6 @@ namespace kernels {
 namespace dpcpp {

-// #include "common/components/uninitialized_array.hpp.inc"
 /**
  * Stores an array with uninitialized contents.
  *
@@ -105,7 +104,8 @@ class UninitializedArray {
     }

 private:
-    // unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
+    // If dpcpp uses char to represent the data, compilation gives an error.
+    // Thankfully, dpcpp supports complex data allocation directly.
     ValueType data_[size];
 };

diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 41a283d71bd..a4e061c3f98 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" @@ -79,10 +80,26 @@ constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); constexpr auto default_block_size = 256; -// #include "common/matrix/dense_kernels.hpp.inc" namespace kernel { +template +void strided_copy(size_type num_rows, size_type num_cols, size_type in_stride, + size_type out_stride, const InValueType *__restrict__ input, + OutValueType *__restrict__ output, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + output[row_id * out_stride + col_id] = + static_cast(input[row_id * in_stride + col_id]); + } +} + +GKO_ENABLE_DEFAULT_HOST(strided_copy, strided_copy) + + template void strided_fill(size_type num_rows, size_type num_cols, size_type stride, ValueType *__restrict__ mat, ValueType value, @@ -157,7 +174,7 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) -template void compute_partial_reduce( size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value, @@ -191,7 +208,7 @@ void compute_partial_reduce( } -template void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, @@ -220,7 +237,7 @@ void finalize_reduce_computation( } -template +template void compute_partial_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, @@ -236,7 +253,7 @@ void compute_partial_dot( tmp_work); } -template +template void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -266,7 +283,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, kcfg_1d_list) -template +template void compute_partial_conj_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, @@ -282,7 +299,7 @@ void compute_partial_conj_dot( tmp_work); } -template +template void compute_partial_conj_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, @@ -312,7 +329,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, compute_partial_conj_dot, kcfg_1d_list) -template +template void finalize_sum_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, @@ -324,7 +341,7 @@ void finalize_sum_reduce_computation( [](const ValueType &x) { return x; }, item_ct1, tmp_work); } -template +template void finalize_sum_reduce_computation(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, @@ -353,7 +370,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, finalize_sum_reduce_computation, kcfg_1d_list) -template +template void compute_partial_norm2( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, @@ -368,7 +385,7 @@ void compute_partial_norm2( tmp_work); } -template +template void compute_partial_norm2(dim3 grid, dim3 block, size_t 
dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -397,7 +414,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, compute_partial_norm2, kcfg_1d_list) -template +template void finalize_sqrt_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, @@ -409,7 +426,7 @@ void finalize_sqrt_reduce_computation( [](const ValueType &x) { return std::sqrt(x); }, item_ct1, tmp_work); } -template +template void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, @@ -653,7 +670,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, } } -template +template void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const size_type *nnz_per_row, size_type *result) @@ -733,7 +750,7 @@ void reduce_total_cols(size_type num_slices, } } -template +template void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_slices, const size_type *max_nnz_per_slice, size_type *result) @@ -1048,10 +1065,10 @@ void compute_dot(std::shared_ptr exec, if (0) { // TODO: write a custom kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { - dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); + onemkl::dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); } } else { // TODO: these are tuning parameters obtained experimentally, once @@ -1096,10 +1113,10 @@ void compute_conj_dot(std::shared_ptr exec, if (0) { // TODO: write a custom kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { - conj_dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); + onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); } } else { // TODO: these are tuning parameters obtained experimentally, once diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index 3c79c7b7455..10e8c1031a1 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -47,14 +47,32 @@ namespace gko { namespace syn { +/** + * value_list records several values with the same type in template. + * + * @tparam T the value type of the list + * @tparam T... the values in the list + */ template struct value_list {}; +/** + * type_list records several types in template + * + * @tparam ...Types the types in the list + */ template struct type_list {}; +/** + * range records start, end, step in template + * + * @tparam int start of range + * @tparam int end of range + * @tparam int step of range. 
Default is 1.
+ */
 template <int Start, int End, int Step = 1>
 struct range {};
@@ -62,9 +80,22 @@ struct range {};
 namespace detail {

+/**
+ * concatenate_impl base declaration.
+ *
+ * @tparam List1  the first list
+ * @tparam List2  the second list
+ */
 template <typename List1, typename List2>
 struct concatenate_impl;

+/**
+ * concatenate_impl specialization for two value_list with the same value type.
+ *
+ * @tparam T     the value type of the two value_list
+ * @tparam T...  the values of the first list
+ * @tparam T...  the values of the second list
+ */
 template <typename T, T... Values1, T... Values2>
 struct concatenate_impl<value_list<T, Values1...>, value_list<T, Values2...>> {
     using type = value_list<T, Values1..., Values2...>;
@@ -74,6 +105,12 @@ struct concatenate_impl<value_list<T, Values1...>, value_list<T, Values2...>> {
 } // namespace detail

+/**
+ * concatenate combines two value_list into one value_list.
+ *
+ * @tparam List1  the first list
+ * @tparam List2  the second list
+ */
 template <typename List1, typename List2>
 using concatenate = typename detail::concatenate_impl<List1, List2>::type;
@@ -81,19 +118,43 @@ using concatenate = typename detail::concatenate_impl<List1, List2>::type;
 namespace detail {

+/**
+ * as_list_impl base declaration.
+ *
+ * @tparam T  the input type
+ */
 template <typename T, typename = void>
 struct as_list_impl;

+/**
+ * as_list_impl specialization for value_list.
+ *
+ * @tparam T     the value type of the value_list
+ * @tparam T...  the values of the value_list
+ */
 template <typename T, T... Values>
 struct as_list_impl<value_list<T, Values...>> {
     using type = value_list<T, Values...>;
 };

+/**
+ * as_list_impl specialization for type_list.
+ *
+ * @tparam ...Types  the types of the type_list
+ */
 template <typename... Types>
 struct as_list_impl<type_list<Types...>> {
     using type = type_list<Types...>;
 };

+/**
+ * as_list_impl specialization for range. This is the recursive case: it
+ * concatenates Start with the list generated from the rest of the range.
+ *
+ * @tparam int  the start of the range
+ * @tparam int  the end of the range
+ * @tparam int  the step of the range
+ */
 template <int Start, int End, int Step>
 struct as_list_impl<range<Start, End, Step>, std::enable_if_t<(Start < End)>> {
     using type = concatenate<
         value_list<int, Start>,
         typename as_list_impl<range<Start + Step, End, Step>>::type>;
 };

+/**
+ * as_list_impl specialization for range. This is the base case of the
+ * recursion.
+ *
+ * @tparam int  the start of the range
+ * @tparam int  the end of the range
+ * @tparam int  the step of the range
+ */
 template <int Start, int End, int Step>
 struct as_list_impl<range<Start, End, Step>, std::enable_if_t<(Start >= End)>> {
     using type = value_list<int>;
@@ -110,10 +178,27 @@ struct as_list_impl<range<Start, End, Step>, std::enable_if_t<(Start >= End)>> {
 } // namespace detail

+/**
+ * as_list is an alias for as_list_impl<T>::type. It yields the input list
+ * itself if the input is already a list, or generates the corresponding list
+ * type from a range input.
+ *
+ * @tparam T  a list or a range
+ */
 template <typename T>
 using as_list = typename detail::as_list_impl<T>::type;

+/**
+ * as_array returns a std::array holding the values of a value_list, which is
+ * helpful for iterating over the values at run time.
+ *
+ * @tparam T     the value type of the value_list
+ * @tparam T...  the values of the value_list
+ *
+ * @param vl  the input value_list
+ *
+ * @return a std::array containing the values of the value_list
+ */
 template <typename T, T... Value>
 constexpr std::array<T, sizeof...(Value)> as_array(value_list<T, Value...> vl)
 {
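To make the list utilities above concrete, a small usage sketch (illustrative
code, not part of the patch):

    // range<1, 9, 2> unrolls recursively to value_list<int, 1, 3, 5, 7>
    using list = gko::syn::as_list<gko::syn::range<1, 9, 2>>;
    // as_array turns the compile-time value_list into a runtime std::array
    constexpr auto arr = gko::syn::as_array(list{});
    static_assert(arr.size() == 4, "arr holds {1, 3, 5, 7}");

This is the mechanism the DPC++ kernels use to turn the compile-time
kcfg_1d_list of encoded (workgroup size, subgroup size) configurations into
the runtime kcfg_1d_array that get_first_cfg searches for the first
configuration the device supports.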
Tsai" Date: Tue, 13 Jul 2021 14:06:27 +0200 Subject: [PATCH 16/22] MKL cmake, delete unused, simplify func, add job --- cmake/create_test.cmake | 8 +- dpcpp/CMakeLists.txt | 8 +- dpcpp/matrix/dense_kernels.dp.cpp | 386 ++---------------------------- 3 files changed, 33 insertions(+), 369 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index ebf70232dd3..9d36b49911e 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -42,6 +42,10 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}") target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_name} ${test_target_name}) + # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. + if (MKL_ENV) + set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") + endif() endfunction(ginkgo_create_dpcpp_test) function(ginkgo_create_thread_test test_name) @@ -165,7 +169,7 @@ function(ginkgo_create_common_test test_name) # use float for DPC++ if necessary if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1) - endif() + endif() ginkgo_set_test_target_properties(${test_name}_${exec} ${test_target_name}) endforeach() -endfunction(ginkgo_create_common_test) \ No newline at end of file +endfunction(ginkgo_create_common_test) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 8755b424433..d30810cf12c 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -6,6 +6,8 @@ endif() ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION) set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE) +find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}") + add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp PRIVATE @@ -55,14 +57,16 @@ target_sources(ginkgo_dpcpp ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) -set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -DMKL_ILP64) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") +# Note. 
add MKL via PRIVATE not PUBLIC (MKL example shows) to avoid find_package(MKL) everywhere when link ginkgo +target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) +target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) -target_link_libraries(ginkgo_dpcpp PRIVATE "mkl_sycl;mkl_intel_ilp64;mkl_tbb_thread;mkl_core;sycl;OpenCL;pthread;m;dl") target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) +target_link_libraries(ginkgo_dpcpp PRIVATE $) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index a4e061c3f98..f264f970cac 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -83,97 +83,6 @@ constexpr auto default_block_size = 256; namespace kernel { -template -void strided_copy(size_type num_rows, size_type num_cols, size_type in_stride, - size_type out_stride, const InValueType *__restrict__ input, - OutValueType *__restrict__ output, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - output[row_id * out_stride + col_id] = - static_cast(input[row_id * in_stride + col_id]); - } -} - -GKO_ENABLE_DEFAULT_HOST(strided_copy, strided_copy) - - -template -void strided_fill(size_type num_rows, size_type num_cols, size_type stride, - ValueType *__restrict__ mat, ValueType value, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - mat[row_id * stride + col_id] = value; - } -} - -GKO_ENABLE_DEFAULT_HOST(strided_fill, strided_fill) - - -template -void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, - const ValueType *__restrict__ alpha, ValueType *__restrict__ x, - size_type stride_x, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; - if (row_id < num_rows) { - x[row_id * stride_x + col_id] = - alpha[alpha_id] == zero() - ? zero() - : x[row_id * stride_x + col_id] * alpha[alpha_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(scale, scale) - - -template -void add_scaled(size_type num_rows, size_type num_cols, - size_type num_alpha_cols, const ValueType *__restrict__ alpha, - const ValueType *__restrict__ x, size_type stride_x, - ValueType *__restrict__ y, size_type stride_y, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - const auto alpha_id = num_alpha_cols == 1 ? 
0 : col_id; - if (row_id < num_rows && alpha[alpha_id] != zero()) { - y[row_id * stride_y + col_id] += - x[row_id * stride_x + col_id] * alpha[alpha_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(add_scaled, add_scaled) - - -template -void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, - const ValueType *__restrict__ diag, - ValueType *__restrict__ y, size_type stride_y, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - - if (tidx >= size) { - return; - } - - y[tidx * stride_y + tidx] += alpha[0] * diag[tidx]; -} - -GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) - - template void compute_partial_reduce( @@ -456,7 +365,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, finalize_sqrt_reduce_computation, kcfg_1d_list) -template +template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type *__restrict__ row_ptrs, const ValueType *__restrict__ source, @@ -479,9 +388,7 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, } } -GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_coo, fill_in_coo) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_coo, fill_in_coo) -GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_coo_call, fill_in_coo, kcfg_1d_list) +GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) template @@ -514,7 +421,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, kcfg_1d_list) -template +template void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, const ValueType *__restrict__ source, IndexType *__restrict__ row_ptrs, @@ -535,12 +442,10 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, } } -GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_csr, fill_in_csr) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_csr, fill_in_csr) -GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_csr_call, fill_in_csr, kcfg_1d_list) +GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) -template +template void fill_in_ell(size_type num_rows, size_type num_cols, size_type source_stride, const ValueType *__restrict__ source, size_type max_nnz_per_row, size_type result_stride, @@ -570,9 +475,7 @@ void fill_in_ell(size_type num_rows, size_type num_cols, } } -GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_ell, fill_in_ell) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_ell, fill_in_ell) -GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_ell_call, fill_in_ell, kcfg_1d_list) +GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) template @@ -615,7 +518,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, calculate_slice_lengths, subgroup_list) -template +template void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, size_type stride, const ValueType *__restrict__ source, size_type *__restrict__ slice_lengths, @@ -648,9 +551,7 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, } } -GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_sellp, fill_in_sellp) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_sellp, fill_in_sellp) -GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_sellp_call, fill_in_sellp, kcfg_1d_list) +GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) template @@ -775,220 +676,6 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, kcfg_1d_list) -template -void symm_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ result, size_type 
stride_result, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[row_id * stride_result + col_id] = - orig[perm_idxs[row_id] * stride_orig + perm_idxs[col_id]]; - } -} - -GKO_ENABLE_DEFAULT_HOST(symm_permute, symm_permute) - - -template -void inv_symm_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ result, size_type stride_result, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[perm_idxs[row_id] * stride_result + perm_idxs[col_id]] = - orig[row_id * stride_orig + col_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(inv_symm_permute, inv_symm_permute) - - -template -void row_gather(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ result, size_type stride_result, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[row_id * stride_result + col_id] = - orig[perm_idxs[row_id] * stride_orig + col_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(row_gather, row_gather) - - -template -void column_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ result, size_type stride_result, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[row_id * stride_result + col_id] = - orig[row_id * stride_orig + perm_idxs[col_id]]; - } -} - -GKO_ENABLE_DEFAULT_HOST(column_permute, column_permute) - - -template -void inverse_row_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, - size_type stride_orig, ValueType *__restrict__ result, - size_type stride_result, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[perm_idxs[row_id] * stride_result + col_id] = - orig[row_id * stride_orig + col_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(inverse_row_permute, inverse_row_permute) - - -template -void inverse_column_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, - size_type stride_orig, - ValueType *__restrict__ result, - size_type stride_result, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[row_id * stride_result + perm_idxs[col_id]] = - orig[row_id * stride_orig + col_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(inverse_column_permute, inverse_column_permute) - - -template -void extract_diagonal(size_type problem_size, - const 
ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ diag, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < problem_size) { - diag[tidx] = orig[tidx * stride_orig + tidx]; - } -} - -GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal) - - -template -void inplace_absolute_dense(size_type num_rows, size_type num_cols, - ValueType *__restrict__ data, size_type stride, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - data[row * stride + col] = std::abs(data[row * stride + col]); - } -} - -GKO_ENABLE_DEFAULT_HOST(inplace_absolute_dense, inplace_absolute_dense) - - -template -void outplace_absolute_dense(size_type num_rows, size_type num_cols, - const ValueType *__restrict__ in, - size_type stride_in, - remove_complex *__restrict__ out, - size_type stride_out, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - out[row * stride_out + col] = std::abs(in[row * stride_in + col]); - } -} - -GKO_ENABLE_DEFAULT_HOST(outplace_absolute_dense, outplace_absolute_dense) - - -template -void make_complex(size_type num_rows, size_type num_cols, - const ValueType *__restrict__ in, size_type stride_in, - ComplexType *__restrict__ out, size_type stride_out, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - out[row * stride_out + col] = in[row * stride_in + col]; - } -} - -GKO_ENABLE_DEFAULT_HOST(make_complex, make_complex) - - -template -void get_real(size_type num_rows, size_type num_cols, - const ValueType *__restrict__ in, size_type stride_in, - remove_complex *__restrict__ out, size_type stride_out, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - out[row * stride_out + col] = real(in[row * stride_in + col]); - } -} - -GKO_ENABLE_DEFAULT_HOST(get_real, get_real) - - -template -void get_imag(size_type num_rows, size_type num_cols, - const ValueType *__restrict__ in, size_type stride_in, - remove_complex *__restrict__ out, size_type stride_out, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - out[row * stride_out + col] = imag(in[row * stride_in + col]); - } -} - -GKO_ENABLE_DEFAULT_HOST(get_imag, get_imag) - - } // namespace kernel @@ -1028,34 +715,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -namespace { - - -#define GKO_BIND_DOT(ValueType, Name, Func) \ - void Name(::cl::sycl::queue &exec_queue, std::int64_t n, \ - const ValueType *x, std::int64_t incx, const ValueType *y, \ - std::int64_t incy, ValueType *result) \ - { \ - Func(exec_queue, n, x, incx, y, incy, result); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - -GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); -GKO_BIND_DOT(std::complex, dot, 
oneapi::mkl::blas::row_major::dotu); -GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); -GKO_BIND_DOT(std::complex, conj_dot, - oneapi::mkl::blas::row_major::dotc); - - -} // namespace - - template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, @@ -1231,10 +890,9 @@ void convert_to_coo(std::shared_ptr exec, const auto sg_size = KCFG_1D::decode<1>(cfg); size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_coo_call( - cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, num_cols, - stride, nnz_prefix_sum.get_const_data(), source->get_const_values(), - row_idxs, col_idxs, values); + kernel::fill_in_coo(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, stride, nnz_prefix_sum.get_const_data(), + source->get_const_values(), row_idxs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1276,10 +934,9 @@ void convert_to_csr(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_csr_call(cfg, grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, stride, - source->get_const_values(), row_ptrs, col_idxs, - values); + kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, stride, source->get_const_values(), + row_ptrs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1311,10 +968,9 @@ void convert_to_ell(std::shared_ptr exec, const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); auto grid_dim = ceildiv(result_stride, wg_size); - kernel::fill_in_ell_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), - num_rows, num_cols, source_stride, - source->get_const_values(), max_nnz_per_row, - result_stride, col_ptrs, values); + kernel::fill_in_ell(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, source_stride, source->get_const_values(), + max_nnz_per_row, result_stride, col_ptrs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1379,10 +1035,10 @@ void convert_to_sellp(std::shared_ptr exec, grid_dim = ceildiv(num_rows, wg_size); if (grid_dim > 0) { - kernel::fill_in_sellp_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), - num_rows, num_cols, slice_size, stride, - source->get_const_values(), slice_lengths, - slice_sets, col_idxs, vals); + kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, slice_size, stride, + source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); } } From 0b562fdf49f3a0cfa3196c65f6f4d39b4cf78dc5 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Fri, 16 Jul 2021 10:46:25 +0200 Subject: [PATCH 17/22] MKL static cmake - public per_kernel link option when static - add find(MKL) when ginkgo is static --- cmake/GinkgoConfig.cmake.in | 5 +++++ dpcpp/CMakeLists.txt | 9 ++++++++- dpcpp/matrix/dense_kernels.dp.cpp | 5 +++-- dpcpp/test/matrix/dense_kernels.cpp | 24 ------------------------ 4 files changed, 16 insertions(+), 27 deletions(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 61da405cf6f..4675bcb4781 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -78,6 +78,7 @@ set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@) set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@) set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@) +set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@) set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@) @@ -168,4 +169,8 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) find_package(rocrand REQUIRED) endif() +if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP) + find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}") +endif() + include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index d30810cf12c..443d180b172 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -7,6 +7,7 @@ ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION) set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE) find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}") +set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE) add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp @@ -64,7 +65,13 @@ target_compile_options(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) -target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +# When building ginkgo as a static library, we need to use dpcpp and per_kernel +# link option when the program uses dpcpp related function. 
+if (BUILD_SHARED_LIBS) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +else () + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) +endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) target_link_libraries(ginkgo_dpcpp PRIVATE $) if (GINKGO_DPCPP_SINGLE_MODE) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index f264f970cac..c2326be9c82 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -75,8 +75,9 @@ constexpr auto kcfg_1d_list = KCFG_1D::encode(512, 32), KCFG_1D::encode(512, 16), KCFG_1D::encode(256, 32), KCFG_1D::encode(256, 16), KCFG_1D::encode(256, 8)>(); -constexpr auto subgroup_list = syn::value_list(); -constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); +constexpr auto subgroup_list = + syn::value_list(); +constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); constexpr auto default_block_size = 256; diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 2b9af16732a..43ce9bad547 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -550,30 +550,6 @@ TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef) } -// TEST_F(Dense, IsTransposable) -// { -// set_up_apply_data(); - -// auto trans = x->transpose(); -// auto dtrans = dx->transpose(); - -// GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), -// static_cast(trans.get()), 0); -// } - - -// TEST_F(Dense, IsConjugateTransposable) -// { -// set_up_apply_data(); - -// auto trans = c_x->conj_transpose(); -// auto dtrans = dc_x->conj_transpose(); - -// GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), -// static_cast(trans.get()), 0); -// } - - TEST_F(Dense, ConvertToCooIsEquivalentToRef) { set_up_apply_data(); From 7ac6d0909f006b52c5516e63b50cf0dff6b18012 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Sun, 18 Jul 2021 23:33:35 +0200 Subject: [PATCH 18/22] use mkl in ncols=1, add cuda descp in dpcpp thread --- dpcpp/components/thread_ids.dp.hpp | 26 ++++++++++++---------- dpcpp/matrix/dense_kernels.dp.cpp | 35 ++++++++++++------------------ 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 9eda077381c..47abf3c7b72 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -59,7 +59,7 @@ namespace thread { * * @return the ID of the block group this thread belongs to * - * @note Assumes that grid dimensions are in standard format: + * @note Assumes that grid dimensions are in cuda standard format: * `(block_group_size, first_grid_dimension, second grid_dimension)` */ __dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1) @@ -76,7 +76,7 @@ __dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1) * * @return the ID of the block this thread belongs to * - * @note Assumes that grid dimensions are in standard format: + * @note Assumes that grid dimensions are in cuda standard format: * `(block_group_size, first_grid_dimension, second grid_dimension)` */ __dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1) @@ -95,7 +95,7 @@ __dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1) * @return the local ID of the warp (relative to the block) this thread belongs * to * - * @note Assumes that block dimensions are in standard format: + * @note Assumes that block dimensions are in cuda standard format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` */ @@ -116,7 +116,7 @@ __dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1) * @return the local ID of the sub-warp (relative to the block) this thread * belongs to * - * @note Assumes that block dimensions are in standard format: + * @note Assumes that block dimensions are in cuda standard format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` */ @@ -140,7 +140,7 @@ __dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1) * * @return the local ID of the thread (relative to the block) * - * @note Assumes that block dimensions are in standard format: + * @note Assumes that block dimensions are in cuda standard format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` */ @@ -161,7 +161,7 @@ __dpct_inline__ size_type get_local_thread_id(sycl::nd_item<3> item_ct1) * * @return the global ID of the warp this thread belongs to. * - * @note Assumes that block dimensions and grid dimensions are in standard + * @note Assumes that block dimensions and grid dimensions are in cuda standard * format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` and @@ -185,7 +185,7 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) * * @return the global ID of the sub-warp this thread belongs to. 
* - * @note Assumes that block dimensions and grid dimensions are in standard + * @note Assumes that block dimensions and grid dimensions are in cuda standard * format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` and @@ -211,7 +211,7 @@ __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) * * @tparam subwarp_size size of the subwarp * - * @note Assumes that block dimensions and grid dimensions are in standard + * @note Assumes that block dimensions and grid dimensions are in cuda standard * format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` and @@ -231,7 +231,8 @@ __dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1) * @internal * * Returns the global ID of the thread in the given index type. - * This function assumes one-dimensional thread and block indexing. + * This function assumes one-dimensional thread and block indexing in cuda + * sense. It uses the third position infomation to get the information. * * @return the global ID of the thread in the given index type. * @@ -250,7 +251,8 @@ __dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1) * @internal * * Returns the total number of threads in the given index type. - * This function assumes one-dimensional thread and block indexing. + * This function assumes one-dimensional thread and block indexing in cuda + * sense. It uses the third position infomation to get the information. * * @return the total number of threads in the given index type. * @@ -268,7 +270,7 @@ __dpct_inline__ IndexType get_thread_num_flat(sycl::nd_item<3> item_ct1) * @internal * * Returns the global ID of the subwarp in the given index type. - * This function assumes one-dimensional thread and block indexing + * This function assumes one-dimensional thread and block indexing in cuda sense * with a power of two block size of at least subwarp_size. * * @return the global ID of the subwarp in the given index type. @@ -292,7 +294,7 @@ __dpct_inline__ IndexType get_subwarp_id_flat(sycl::nd_item<3> item_ct1) * @internal * * Returns the total number of subwarps in the given index type. - * This function assumes one-dimensional thread and block indexing + * This function assumes one-dimensional thread and block indexing in cuda sense * with a power of two block size of at least subwarp_size. * * @return the total number of subwarps in the given index type. 
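As an aside to the notes above, here is a minimal sketch of the flat-indexing
convention they describe (illustrative only, assuming the cuda-style ordering
is mapped onto sycl::nd_item<3> so that dimension 2 varies fastest; this is
not the library's exact code):

#include <CL/sycl.hpp>

// Sketch: the cuda expression blockIdx.x * blockDim.x + threadIdx.x turns
// into a read of the third position (index 2) of the nd_item.
template <typename IndexType>
IndexType flat_thread_id_sketch(sycl::nd_item<3> item)
{
    return static_cast<IndexType>(item.get_group(2)) *
               static_cast<IndexType>(item.get_local_range(2)) +
           static_cast<IndexType>(item.get_local_id(2));
}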
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index c2326be9c82..32eef01af63 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -722,14 +722,11 @@ void compute_dot(std::shared_ptr exec, const matrix::Dense *y, matrix::Dense *result) { - if (0) { + if (x->get_size()[1] == 1) { // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - onemkl::dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } + onemkl::dot(*exec->get_queue(), x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); } else { // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified @@ -770,14 +767,13 @@ void compute_conj_dot(std::shared_ptr exec, const matrix::Dense *y, matrix::Dense *result) { - if (0) { + if (x->get_size()[1] == 1) { // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } + onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values(), x->get_stride(), + y->get_const_values(), y->get_stride(), + result->get_values()); + } else { // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified @@ -818,13 +814,10 @@ void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, matrix::Dense> *result) { - if (0) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - oneapi::mkl::blas::row_major::nrm2( - *exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } + if (x->get_size()[1] == 1) { + oneapi::mkl::blas::row_major::nrm2( + *exec->get_queue(), x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); } else { using norm_type = remove_complex; // TODO: these are tuning parameters obtained experimentally, once From 6169c2ff4a6caf20a0b4213b62119bb8ce0e3890 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Tue, 20 Jul 2021 15:56:24 +0200 Subject: [PATCH 19/22] improve document, fix auto usage in for, shared_memory usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- common/components/prefix_sum.hpp.inc | 2 +- common/components/sorting.hpp.inc | 12 ++-- common/components/uninitialized_array.hpp.inc | 7 +- .../par_ilut_filter_kernels.hpp.inc | 4 +- .../par_ilut_select_kernels.hpp.inc | 10 +-- common/matrix/dense_kernels.hpp.inc | 2 +- common/matrix/ell_kernels.hpp.inc | 4 +- common/matrix/hybrid_kernels.hpp.inc | 2 +- cuda/components/prefix_sum.cu | 6 +- cuda/test/components/sorting_kernels.cu | 2 +- cuda/test/matrix/dense_kernels.cpp | 2 +- cuda/test/matrix/ell_kernels.cpp | 2 +- dpcpp/CMakeLists.txt | 6 +- dpcpp/base/config.hpp | 6 -- dpcpp/base/helper.dp.cpp | 20 +++--- dpcpp/base/helper.hpp | 71 ++++++++++++------- dpcpp/base/onemkl_bindings.hpp | 3 + dpcpp/components/prefix_sum.dp.cpp | 6 +- dpcpp/components/prefix_sum.dp.hpp | 5 +- dpcpp/components/reduction.dp.hpp | 6 +- dpcpp/components/thread_ids.dp.hpp | 2 +- dpcpp/components/uninitialized_array.hpp | 6 +- dpcpp/matrix/dense_kernels.dp.cpp | 35 ++++----- dpcpp/test/matrix/dense_kernels.cpp | 2 +- hip/components/prefix_sum.hip.cpp | 6 +- .../ginkgo/core/synthesizer/containers.hpp | 18 ++--- 26 files changed, 131 insertions(+), 116 deletions(-) diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc index 8f759b1dc95..1d57c20b2e5 100644 --- a/common/components/prefix_sum.hpp.inc +++ b/common/components/prefix_sum.hpp.inc @@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ? 
neighbor : 0; diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc index ef5bd690937..cd772e08adb 100644 --- a/common/components/sorting.hpp.inc +++ b/common/components/sorting.hpp.inc @@ -70,7 +70,7 @@ struct bitonic_local { bool reverse) { auto els_mid = els + (num_elements / 2); - for (auto i = 0; i < num_elements / 2; ++i) { + for (int i = 0; i < num_elements / 2; ++i) { bitonic_cas(els[i], els_mid[i], reverse); } half::merge(els, reverse); @@ -131,7 +131,7 @@ struct bitonic_warp { auto tile = group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto other = tile.shfl_xor(els[i], num_threads / 2); bitonic_cas(els[i], other, new_reverse); } @@ -206,7 +206,7 @@ struct bitonic_global { auto upper_shared_els = shared_els + (num_groups * num_threads / 2); // only the lower group executes the CAS if (!upper_half()) { - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto j = shared_idx(i); bitonic_cas(shared_els[j], upper_shared_els[j], reverse); } @@ -241,11 +241,11 @@ struct bitonic_global { bool reverse) { group::this_thread_block().sync(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { local_els[i] = shared_els[shared_idx(i)]; } warp::merge(local_els, reverse); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } @@ -258,7 +258,7 @@ struct bitonic_global { // This is the first step, so we don't need to load from shared memory warp::sort(local_els, reverse); // store the sorted elements in shared memory - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc index 3a8b3796c12..e951cf06860 100644 --- a/common/components/uninitialized_array.hpp.inc +++ b/common/components/uninitialized_array.hpp.inc @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -49,7 +49,7 @@ public: * * @return the constexpr pointer to the first entry of the array. */ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -70,7 +70,8 @@ public: * * @return a reference to the array entry at the given index. 
*/ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const + noexcept { return reinterpret_cast(data_)[pos]; } diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc index 25b43e789ee..b5f7d43db67 100644 --- a/common/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/factorization/par_ilut_filter_kernels.hpp.inc @@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs, auto end = row_ptrs[row + 1]; begin_cb(row); auto num_steps = ceildiv(end - begin, subwarp_size); - for (auto step = 0; step < num_steps; ++step) { + for (IndexType step = 0; step < num_steps; ++step) { auto idx = begin + lane + step * subwarp_size; auto keep = idx < end && pred(idx, begin, end); auto mask = subwarp.ballot(keep); @@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } -} // namespace kernel \ No newline at end of file +} // namespace kernel diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc index 059069faf41..9b4897d1766 100644 --- a/common/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/factorization/par_ilut_select_kernels.hpp.inc @@ -62,7 +62,8 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree( // assuming rounding towards zero auto stride = double(size) / sample_size; #pragma unroll - for (auto i = 0; i < sampleselect_oversampling; ++i) { + for (auto i = decltype(sampleselect_oversampling){0}; + i < sampleselect_oversampling; ++i) { auto lidx = idx * sampleselect_oversampling + i; auto val = input[static_cast(lidx * stride)]; samples[i] = abs(val); @@ -119,7 +120,8 @@ __global__ __launch_bounds__(default_block_size) void count_buckets( auto el = abs(input[i]); IndexType tree_idx{}; #pragma unroll - for (auto level = 0; level < sampleselect_searchtree_height; ++level) { + for (auto level = decltype(sampleselect_searchtree_height){0}; + level < sampleselect_searchtree_height; ++level) { auto cmp = !(el < sh_tree[tree_idx]); tree_idx = 2 * tree_idx + 1 + cmp; } @@ -168,7 +170,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // compute prefix sum over warp-sized blocks IndexType total{}; auto base_idx = warp_idx * work_per_warp * warp.size(); - for (auto step = 0; step < work_per_warp; ++step) { + for (auto step = decltype(work_per_warp){0}; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? local_counters[idx] : zero(); IndexType warp_total{}; @@ -207,7 +209,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // add block prefix sum to each warp's block of data block.sync(); auto warp_prefixsum = warp_sums[warp_idx]; - for (auto step = 0; step < work_per_warp; ++step) { + for (IndexType step = 0; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? 
local_counters[idx] : zero(); if (idx < num_blocks) { diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc index d46b202a8ff..c7ebafd0627 100644 --- a/common/matrix/dense_kernels.hpp.inc +++ b/common/matrix/dense_kernels.hpp.inc @@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 2323d512258..399dd5070ac 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( { const auto tidx = thread::get_thread_id_flat(); if (tidx < num_rows) { - for (auto col = 0; col < nnz; col++) { + for (size_type col = 0; col < nnz; col++) { result[tidx * result_stride + col_idxs[tidx + col * source_stride]] += values[tidx + col * source_stride]; @@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (source_values[source_idx] != zero()) { result_values[write_to] = source_values[source_idx]; diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc index b6af7c2be36..c7c192189e0 100644 --- a/common/matrix/hybrid_kernels.hpp.inc +++ b/common/matrix/hybrid_kernels.hpp.inc @@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (ell_val[source_idx] != zero()) { result_values[write_to] = ell_val[source_idx]; diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu index 54739c783c8..ce108fa8cf9 100644 --- a/cuda/components/prefix_sum.cu +++ b/cuda/components/prefix_sum.cu @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -57,8 +57,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, start_prefix_sum <<>>(num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. 
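        // (Illustration, not part of this patch: start_prefix_sum writes an
        // independent scan per block plus each block's total into block_sums;
        // finalize_prefix_sum then adds the sum of all preceding block totals,
        //     counts[j] += block_sums[0] + ... + block_sums[block_of(j) - 1],
        // to every element. With a single block there is nothing to add,
        // hence the guard below.)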
if (num_blocks > 1) { finalize_prefix_sum <<>>(num_entries, counts, diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu index e973cc0f650..f61bbd0694e 100644 --- a/cuda/test/components/sorting_kernels.cu +++ b/cuda/test/components/sorting_kernels.cu @@ -99,7 +99,7 @@ protected: { // we want some duplicate elements std::uniform_int_distribution dist(0, num_elements / 2); - for (auto i = 0; i < num_elements; ++i) { + for (auto i = decltype(num_elements){0}; i < num_elements; ++i) { ref_shared.get_data()[i] = dist(rng); } ddata = gko::Array{cuda, ref_shared}; diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 6e40ce5b5a3..de96d27d823 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 2df1c397f4c..51c12fab531 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 443d180b172..7729588d363 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -60,13 +60,15 @@ target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") -# Note. add MKL via PRIVATE not PUBLIC (MKL example shows) to avoid find_package(MKL) everywhere when link ginkgo +# Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating +# find_package(MKL) everywhere when linking ginkgo (see the MKL example +# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) # When building ginkgo as a static library, we need to use dpcpp and per_kernel -# link option when the program uses dpcpp related function. +# link option when the program uses a dpcpp related function. if (BUILD_SHARED_LIBS) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) else () diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp index abb84d9b7ff..78fe25978a7 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp @@ -49,12 +49,6 @@ struct config { */ using lane_mask_type = uint64; - - /** - * The number of threads within a CUDA warp. - */ - static constexpr uint32 warp_size = 16; - /** * The bitmask of the entire warp. 
*/ diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp index ae453dd937d..5e6c1a579f5 100644 --- a/dpcpp/base/helper.dp.cpp +++ b/dpcpp/base/helper.dp.cpp @@ -44,18 +44,16 @@ namespace dpcpp { bool validate(sycl::queue *queue, unsigned int workgroup_size, unsigned int subgroup_size) { - { - auto device = queue->get_device(); - auto subgroup_size_list = - device.get_info(); - auto max_workgroup_size = - device.get_info(); - bool allowed = false; - for (auto &i : subgroup_size_list) { - allowed |= (i == subgroup_size); - } - return allowed && (workgroup_size <= max_workgroup_size); + auto device = queue->get_device(); + auto subgroup_size_list = + device.get_info(); + auto max_workgroup_size = + device.get_info(); + bool allowed = false; + for (auto &i : subgroup_size_list) { + allowed |= (i == subgroup_size); } + return allowed && (workgroup_size <= max_workgroup_size); } diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 8c7f45e5174..16d91c2ef8d 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -51,44 +51,44 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ - template \ - void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ - inline void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template \ + inline void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -138,10 +138,33 @@ namespace kernels { namespace dpcpp { +/** + * This is the validate function for common check. It checks the workgroup size + * is below device max workgroup size and subgroup size is in the supported + * subgroup size. 
+ * + * @param queue the sycl queue pointer + * @param workgroup_size the workgroup size (block size in cuda sense) + * @param subgroup_size the subgroup size (warp size in cuda sense) + * + * @return the given arguments are valid or not in given queue. + */ bool validate(sycl::queue *queue, unsigned workgroup_size, unsigned subgroup_size); +/** + * get_first_cfg will return the first valid config by validate function from + * given config array. + * + * @tparam IterArr the iteratable array type + * @tparam Validate the validate function type + * + * @param arr the config array + * @param verify the validate function + * + * @return the first valid config + */ template std::uint32_t get_first_cfg(IterArr &arr, Validate verify) { diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp index 6456a048d23..1c9a8dabb30 100644 --- a/dpcpp/base/onemkl_bindings.hpp +++ b/dpcpp/base/onemkl_bindings.hpp @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ +#include + + #include #include diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 07cdb5b38aa..63f33e9ba35 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -70,7 +70,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto queue = exec->get_queue(); constexpr auto block_cfg_array = as_array(block_cfg_list); @@ -84,8 +84,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, auto block_sums = block_sum_array.get_data(); start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 22e6139dd84..334d4239c56 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -78,7 +78,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ? 
neighbor : 0; @@ -193,8 +193,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, [=](sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, - (UninitializedArray *) - prefix_helper_acc_ct1.get_pointer()); + prefix_helper_acc_ct1.get_pointer().get()); }); }); } diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index e0678f6cf7a..094f2093a95 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -229,10 +229,8 @@ void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_add_array( - size, source, result, item_ct1, - (UninitializedArray(cfg)> *) - block_sum_acc_ct1.get_pointer()); + reduce_add_array(size, source, result, item_ct1, + block_sum_acc_ct1.get_pointer().get()); }); }); } diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 47abf3c7b72..f9decfd989d 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -195,7 +195,7 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) template __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) { - // dpcpp dose not have subwarp + // dpcpp does not have subwarp constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; return get_warp_id(item_ct1) * subwarps_per_warp + item_ct1.get_local_id(1); diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index b10457df217..d9d423c9c94 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -48,7 +48,7 @@ namespace dpcpp { /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -63,7 +63,7 @@ class UninitializedArray { * * @return the constexpr pointer to the first entry of the array. */ - constexpr __dpct_inline__ operator ValueType *() const noexcept + constexpr __dpct_inline__ operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -84,7 +84,7 @@ class UninitializedArray { * * @return a reference to the array entry at the given index. 
*/ - constexpr __dpct_inline__ ValueType &operator[](size_type pos) const + constexpr __dpct_inline__ const ValueType &operator[](size_type pos) const noexcept { return data_[pos]; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 32eef01af63..b17e44f9706 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -179,10 +179,9 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_dot( - num_rows, x, stride_x, y, stride_y, work, item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); + compute_partial_dot(num_rows, x, stride_x, y, stride_y, + work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); }); }); } @@ -227,8 +226,7 @@ void compute_partial_conj_dot(dim3 grid, dim3 block, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_conj_dot( num_rows, x, stride_x, y, stride_y, work, item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); + tmp_work_acc_ct1.get_pointer().get()); }); }); } @@ -268,8 +266,7 @@ void finalize_sum_reduce_computation(dim3 grid, dim3 block, [=](sycl::nd_item<3> item_ct1) { finalize_sum_reduce_computation( size, work, result, item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); + tmp_work_acc_ct1.get_pointer().get()); }); }); } @@ -308,13 +305,12 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> tmp_work_acc_ct1(cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2( - num_rows, x, stride_x, work, item_ct1, - (UninitializedArray, wg_size> *) - tmp_work_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + compute_partial_norm2( + num_rows, x, stride_x, work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); }); } @@ -354,8 +350,7 @@ void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, [=](sycl::nd_item<3> item_ct1) { finalize_sqrt_reduce_computation( size, work, result, item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); + tmp_work_acc_ct1.get_pointer().get()); }); }); } @@ -433,7 +428,7 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; @@ -586,7 +581,7 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_max_nnz(size, nnz_per_row, result, item_ct1, - dpct_local_acc_ct1.get_pointer()); + dpct_local_acc_ct1.get_pointer().get()); }); }); } @@ -666,7 +661,7 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_total_cols(num_slices, max_nnz_per_slice, result, item_ct1, - dpct_local_acc_ct1.get_pointer()); + dpct_local_acc_ct1.get_pointer().get()); }); }); } diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 43ce9bad547..257ee6fbc6a 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -697,7 +697,7 @@ TEST_F(Dense, 
CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp index 28cd01b4fb5..9302fc07b9a 100644 --- a/hip/components/prefix_sum.hip.cpp +++ b/hip/components/prefix_sum.hip.cpp @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -58,8 +58,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, HIP_KERNEL_NAME(start_prefix_sum), dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { hipLaunchKernelGGL( HIP_KERNEL_NAME(finalize_prefix_sum), diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index 10e8c1031a1..0e9570540fa 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -51,7 +51,7 @@ namespace syn { * value_list records several values with the same type in template. * * @tparam T the value type of the list - * @tparam T... the values in the list + * @tparam Values the values in the list */ template struct value_list {}; @@ -60,7 +60,7 @@ struct value_list {}; /** * type_list records several types in template * - * @tparam ...Types the types in the list + * @tparam Types the types in the list */ template struct type_list {}; @@ -69,9 +69,9 @@ struct type_list {}; /** * range records start, end, step in template * - * @tparam int start of range - * @tparam int end of range - * @tparam int step of range. default is 1 + * @tparam Start start of range + * @tparam End end of range + * @tparam Step step of range. default is 1 */ template struct range {}; @@ -93,8 +93,8 @@ struct concatenate_impl; * concatenate_impl specializes for two value_list with the same value type. * * @tparam T the value type of two value_list - * @tparam T... the values of the first list - * @tparam T... the values of the second list + * @tparam Values the values of the first list + * @tparam Values the values of the second list */ template struct concatenate_impl, value_list> { @@ -130,7 +130,7 @@ struct as_list_impl; * as_list_impl specializes for the value_list * * @tparam T the value_list type - * @tparam T... the values of value_list + * @tparam Values the values of value_list */ template struct as_list_impl> { @@ -193,7 +193,7 @@ using as_list = typename detail::as_list_impl::type; * for in runtime on the array. * * @tparam T the type of value_list - * @tparam T... the values of value_list + * @tparam Value the values of value_list * * @param value_list the input value_list * From a6b4ccc461e78df3bd2d7082184b6dc5ffb69a45 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Tue, 20 Jul 2021 18:08:59 +0200 Subject: [PATCH 20/22] add some note to indicate the porting TODO need to revisit these TODO when we are close to fully porting ginkgo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Terry Cojean Co-authored-by: Thomas Grützmacher Co-authored-by: Tobias Ribizel --- dpcpp/base/helper.hpp | 47 ++++++++++++------------ dpcpp/components/thread_ids.dp.hpp | 6 +++ dpcpp/components/uninitialized_array.hpp | 3 ++ 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 16d91c2ef8d..cb98e4c511e 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -56,17 +56,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ - template \ - void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ - InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, gko::size_type, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } @@ -78,17 +78,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ - inline void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ - InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template \ + inline void name_(dim3 grid, dim3 block, gko::size_type, \ + sycl::queue *queue, InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -106,7 +106,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ template \ void name_(std::uint32_t desired_cfg, dim3 grid, dim3 block, \ - size_t dynamic_shared_memory, sycl::queue *queue, \ + gko::size_type dynamic_shared_memory, sycl::queue *queue, \ InferredArgs... args) \ { \ callable_( \ @@ -174,7 +174,6 @@ std::uint32_t get_first_cfg(IterArr &arr, Validate verify) } } GKO_NOT_SUPPORTED(arr); - return 0; } diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index f9decfd989d..70ad76d9ccb 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -52,6 +52,12 @@ namespace dpcpp { namespace thread { +// TODO: porting - need to refine functions and their name in this file +// the grid/block description uses the cuda dim3 to represent. i.e. using dim3 +// to launch dpcpp kernel, the kernel will reverse the ordering to keep the same +// linear memory usage as cuda. 
+ + /** * @internal * diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index d9d423c9c94..eb8a36770d7 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -45,6 +45,9 @@ namespace kernels { namespace dpcpp { +// TODO: porting - consider directly use the array as shared memory + + /** * Stores an array with uninitialized contents. * From 4464c4db84ed78715b5a4bccd92de140e2b5aecd Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 21 Jul 2021 12:04:00 +0200 Subject: [PATCH 21/22] explict type in for, use func not macro to skip, remove dup test note: gtest_skip can not be in another function call Co-authored-by: Tobias Ribizel --- .../par_ilut_select_kernels.hpp.inc | 8 +- cuda/test/components/sorting_kernels.cu | 2 +- dpcpp/CMakeLists.txt | 2 +- dpcpp/test/matrix/dense_kernels.cpp | 365 +----------------- dpcpp/test/utils.hpp | 56 +++ 5 files changed, 67 insertions(+), 366 deletions(-) create mode 100644 dpcpp/test/utils.hpp diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc index 9b4897d1766..e443d7b6ba7 100644 --- a/common/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/factorization/par_ilut_select_kernels.hpp.inc @@ -62,8 +62,7 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree( // assuming rounding towards zero auto stride = double(size) / sample_size; #pragma unroll - for (auto i = decltype(sampleselect_oversampling){0}; - i < sampleselect_oversampling; ++i) { + for (int i = 0; i < sampleselect_oversampling; ++i) { auto lidx = idx * sampleselect_oversampling + i; auto val = input[static_cast(lidx * stride)]; samples[i] = abs(val); @@ -120,8 +119,7 @@ __global__ __launch_bounds__(default_block_size) void count_buckets( auto el = abs(input[i]); IndexType tree_idx{}; #pragma unroll - for (auto level = decltype(sampleselect_searchtree_height){0}; - level < sampleselect_searchtree_height; ++level) { + for (int level = 0; level < sampleselect_searchtree_height; ++level) { auto cmp = !(el < sh_tree[tree_idx]); tree_idx = 2 * tree_idx + 1 + cmp; } @@ -170,7 +168,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // compute prefix sum over warp-sized blocks IndexType total{}; auto base_idx = warp_idx * work_per_warp * warp.size(); - for (auto step = decltype(work_per_warp){0}; step < work_per_warp; ++step) { + for (IndexType step = 0; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? 
         auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
         IndexType warp_total{};
diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu
index f61bbd0694e..e2b7abc51d7 100644
--- a/cuda/test/components/sorting_kernels.cu
+++ b/cuda/test/components/sorting_kernels.cu
@@ -99,7 +99,7 @@ protected:
     {
         // we want some duplicate elements
         std::uniform_int_distribution dist(0, num_elements / 2);
-        for (auto i = decltype(num_elements){0}; i < num_elements; ++i) {
+        for (int i = 0; i < num_elements; ++i) {
             ref_shared.get_data()[i] = dist(rng);
         }
         ddata = gko::Array{cuda, ref_shared};
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 7729588d363..e2d476164e8 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -75,7 +75,7 @@ else ()
     target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel)
 endif()
 target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
-target_link_libraries(ginkgo_dpcpp PRIVATE $)
+target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP)
 if (GINKGO_DPCPP_SINGLE_MODE)
     target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
 endif()
diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp
index 257ee6fbc6a..cf1bbe26cd4 100644
--- a/dpcpp/test/matrix/dense_kernels.cpp
+++ b/dpcpp/test/matrix/dense_kernels.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/components/fill_array.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/test/utils.hpp"
+#include "dpcpp/test/utils.hpp"


 namespace {
@@ -194,127 +195,6 @@ class Dense : public ::testing::Test {
 };


-TEST_F(Dense, DpcppFillIsEquivalentToRef)
-{
-    set_up_vector_data(3);
-    auto result = Mtx::create(ref);
-
-    x->fill(42);
-    dx->fill(42);
-    result->copy_from(dx.get());
-
-    GKO_ASSERT_MTX_NEAR(result, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, DpcppStridedFillIsEquivalentToRef)
-{
-    using T = vtype;
-    auto x = gko::initialize<gko::matrix::Dense<T>>(
-        4, {I<T>{1.0, 2.0}, I<T>{3.0, 4.0}, I<T>{5.0, 6.0}}, ref);
-    auto dx = gko::initialize<gko::matrix::Dense<T>>(
-        4, {I<T>{1.0, 2.0}, I<T>{3.0, 4.0}, I<T>{5.0, 6.0}}, dpcpp);
-    auto result = Mtx::create(ref);
-
-    x->fill(42);
-    dx->fill(42);
-    result->copy_from(dx.get());
-
-    GKO_ASSERT_MTX_NEAR(result, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, SingleVectorDpcppScaleIsEquivalentToRef)
-{
-    set_up_vector_data(1);
-    auto result = Mtx::create(ref);
-
-    x->scale(alpha.get());
-    dx->scale(dalpha.get());
-    result->copy_from(dx.get());
-
-    GKO_ASSERT_MTX_NEAR(result, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MultipleVectorDpcppScaleIsEquivalentToRef)
-{
-    set_up_vector_data(20);
-
-    x->scale(alpha.get());
-    dx->scale(dalpha.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MultipleVectorDpcppScaleWithDifferentAlphaIsEquivalentToRef)
-{
-    set_up_vector_data(20, true);
-
-    x->scale(alpha.get());
-    dx->scale(dalpha.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, SingleVectorDpcppAddScaledIsEquivalentToRef)
-{
-    set_up_vector_data(1);
-
-    x->add_scaled(alpha.get(), y.get());
-    dx->add_scaled(dalpha.get(), dy.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MultipleVectorDpcppAddScaledIsEquivalentToRef)
-{
-    set_up_vector_data(20);
-
-    x->add_scaled(alpha.get(), y.get());
-    dx->add_scaled(dalpha.get(), dy.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MultipleVectorDpcppAddScaledWithDifferentAlphaIsEquivalentToRef)
-{
-    set_up_vector_data(20);
-
-    x->add_scaled(alpha.get(), y.get());
-    dx->add_scaled(dalpha.get(), dy.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, AddsScaledDiagIsEquivalentToRef)
-{
-    auto mat = gen_mtx<Mtx>(532, 532);
-    gko::Array<Mtx::value_type> diag_values(ref, 532);
-    gko::kernels::reference::components::fill_array(ref, diag_values.get_data(),
-                                                    532, Mtx::value_type{2.0});
-    auto diag =
-        gko::matrix::Diagonal<Mtx::value_type>::create(ref, 532, diag_values);
-    alpha = gko::initialize<Mtx>({2.0}, ref);
-    auto dmat = Mtx::create(dpcpp);
-    dmat->copy_from(mat.get());
-    auto ddiag = gko::matrix::Diagonal<Mtx::value_type>::create(dpcpp);
-    ddiag->copy_from(diag.get());
-    dalpha = Mtx::create(dpcpp);
-    dalpha->copy_from(alpha.get());
-
-    mat->add_scaled(alpha.get(), diag.get());
-    dmat->add_scaled(dalpha.get(), ddiag.get());
-
-    GKO_ASSERT_MTX_NEAR(mat, dmat, r<vtype>::value);
-}
-
-
 TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef)
 {
     set_up_vector_data(1);
@@ -384,11 +264,9 @@ TEST_F(Dense, SimpleApplyIsEquivalentToRef)
 }


-#if !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();

     x->apply(convert<MixedMtx>(y).get(), convert<MixedMtx>(expected).get());

@@ -398,9 +276,6 @@ TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef)
 }


-#endif  // !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -412,11 +287,9 @@ TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
 }


-#if !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();

     x->apply(convert<MixedMtx>(alpha).get(), convert<MixedMtx>(y).get(),
@@ -428,11 +301,9 @@ TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef)
 }


-#endif  // !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();
     auto complex_b = gen_mtx<ComplexMtx>(25, 1);
     auto dcomplex_b = ComplexMtx::create(dpcpp);
@@ -448,11 +319,9 @@ TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
 }


-#if !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();
     auto complex_b = gen_mtx<MixedComplexMtx>(25, 1);
     auto dcomplex_b = MixedComplexMtx::create(dpcpp);
@@ -467,8 +336,6 @@ TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7);
 }

-#endif  // !GINKGO_DPCPP_SINGLE_MODE
-

 TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef)
 {
@@ -487,11 +354,9 @@ TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef)
 }


-#if !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();
     auto complex_b = gen_mtx<MixedComplexMtx>(25, 1);
     auto dcomplex_b = MixedComplexMtx::create(dpcpp);
@@ -509,9 +374,6 @@ TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef)
 }


-#endif  // !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, ComputeDotComplexIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -733,219 +595,4 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef)
 }


-TEST_F(Dense, CanGatherRows)
-{
-    set_up_apply_data();
-
-    auto r_gather = x->row_gather(rgather_idxs.get());
-    auto dr_gather = dx->row_gather(rgather_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0);
-}
-
-
-TEST_F(Dense, CanGatherRowsIntoDense)
-{
-    set_up_apply_data();
-    auto gather_size =
-        gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]};
-    auto r_gather = Mtx::create(ref, gather_size);
-    // test make_temporary_clone and non-default stride
-    auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2);
-
-    x->row_gather(rgather_idxs.get(), r_gather.get());
-    dx->row_gather(rgather_idxs.get(), dr_gather.get());
-
-    GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0);
-}
-
-
-TEST_F(Dense, IsPermutable)
-{
-    set_up_apply_data();
-
-    auto permuted = square->permute(rpermute_idxs.get());
-    auto dpermuted = dsquare->permute(rpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(permuted.get()),
-                        static_cast<Mtx *>(dpermuted.get()), 0);
-}
-
-
-TEST_F(Dense, IsInversePermutable)
-{
-    set_up_apply_data();
-
-    auto permuted = square->inverse_permute(rpermute_idxs.get());
-    auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(permuted.get()),
-                        static_cast<Mtx *>(dpermuted.get()), 0);
-}
-
-
-TEST_F(Dense, IsRowPermutable)
-{
-    set_up_apply_data();
-
-    auto r_permute = x->row_permute(rpermute_idxs.get());
-    auto dr_permute = dx->row_permute(rpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(r_permute.get()),
-                        static_cast<Mtx *>(dr_permute.get()), 0);
-}
-
-
-TEST_F(Dense, IsColPermutable)
-{
-    set_up_apply_data();
-
-    auto c_permute = x->column_permute(cpermute_idxs.get());
-    auto dc_permute = dx->column_permute(cpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(c_permute.get()),
-                        static_cast<Mtx *>(dc_permute.get()), 0);
-}
-
-
-TEST_F(Dense, IsInverseRowPermutable)
-{
-    set_up_apply_data();
-
-    auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get());
-    auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_r_permute.get()),
-                        static_cast<Mtx *>(d_inverse_r_permute.get()), 0);
-}
-
-
-TEST_F(Dense, IsInverseColPermutable)
-{
-    set_up_apply_data();
-
-    auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get());
-    auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_c_permute.get()),
-                        static_cast<Mtx *>(d_inverse_c_permute.get()), 0);
-}
-
-
-TEST_F(Dense, ExtractDiagonalOnTallSkinnyIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto diag = x->extract_diagonal();
-    auto ddiag = dx->extract_diagonal();
-
-    GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0);
-}
-
-
-TEST_F(Dense, ExtractDiagonalOnShortFatIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto diag = y->extract_diagonal();
-    auto ddiag = dy->extract_diagonal();
-
-    GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0);
-}
-
-
-TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    x->compute_absolute_inplace();
-    dx->compute_absolute_inplace();
-
-    GKO_ASSERT_MTX_NEAR(x, dx, r<vtype>::value);
-}
-
-
-TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto abs_x = x->compute_absolute();
-    auto dabs_x = dx->compute_absolute();
-
-    GKO_ASSERT_MTX_NEAR(abs_x, dabs_x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MakeComplexIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto complex_x = x->make_complex();
-    auto dcomplex_x = dx->make_complex();
-
-    GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0);
-}
-
-
-TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto complex_x = ComplexMtx::create(ref, x->get_size());
-    x->make_complex(complex_x.get());
-    auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size());
-    dx->make_complex(dcomplex_x.get());
-
-    GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0);
-}
-
-
-TEST_F(Dense, GetRealIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto real_x = x->get_real();
-    auto dreal_x = dx->get_real();
-
-    GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0);
-}
-
-
-TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto real_x = Mtx::create(ref, x->get_size());
-    x->get_real(real_x.get());
-    auto dreal_x = Mtx::create(dpcpp, dx->get_size());
-    dx->get_real(dreal_x.get());
-
-    GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0);
-}
-
-
-TEST_F(Dense, GetImagIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto imag_x = x->get_imag();
-    auto dimag_x = dx->get_imag();
-
-    GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0);
-}
-
-
-TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto imag_x = Mtx::create(ref, x->get_size());
-    x->get_imag(imag_x.get());
-    auto dimag_x = Mtx::create(dpcpp, dx->get_size());
-    dx->get_imag(dimag_x.get());
-
-    GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0);
-}
-
-
 }  // namespace
diff --git a/dpcpp/test/utils.hpp b/dpcpp/test/utils.hpp
new file mode 100644
index 00000000000..57e703b8ef1
--- /dev/null
+++ b/dpcpp/test/utils.hpp
@@ -0,0 +1,56 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_DPCPP_TEST_UTILS_HPP_
+#define GKO_DPCPP_TEST_UTILS_HPP_
+
+
+#include <gtest/gtest.h>
+
+
+namespace {
+
+
+#if GINKGO_DPCPP_SINGLE_MODE
+#define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode"
+#else
+#define SKIP_IF_SINGLE_MODE                                                  \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#endif
+
+
+}  // namespace
+
+
+#endif  // GKO_DPCPP_TEST_UTILS_HPP_
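For reference, a minimal usage sketch for the SKIP_IF_SINGLE_MODE helper above (illustrative only; the test name and body below are invented, not taken from the patches). GTEST_SKIP() works by returning from the enclosing test body, which is why the helper is a macro expanded directly inside the test rather than a function wrapping the call:

#include <gtest/gtest.h>

#include "dpcpp/test/utils.hpp"

// Hypothetical test: in GINKGO_DPCPP_SINGLE_MODE builds, GTEST_SKIP() marks
// the test as skipped and returns immediately; otherwise the macro expands
// to a harmless static_assert and the body runs normally.
TEST(DpcppExample, RunsOnlyWithDoublePrecision)
{
    SKIP_IF_SINGLE_MODE;

    double third = 1.0 / 3.0;  // double-precision work, skipped in single mode
    ASSERT_NEAR(third, 0.3333333333333333, 1e-15);
}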
Tsai" Date: Wed, 21 Jul 2021 16:44:49 +0200 Subject: [PATCH 22/22] debug/static dpcpp -> debug/shared due to memory --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index df3265cb58c..1a884c2b408 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -754,7 +754,7 @@ build/dpcpp/opencl_igpu/release/static: SYCL_DEVICE_FILTER: "OpenCL" SYCL_DEVICE_TYPE: "GPU" -build/dpcpp/level_zero_igpu/debug/static: +build/dpcpp/level_zero_igpu/debug/shared: <<: *default_build_with_test extends: - .full_test_condition @@ -765,7 +765,7 @@ build/dpcpp/level_zero_igpu/debug/static: CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" BUILD_TYPE: "Debug" - BUILD_SHARED_LIBS: "OFF" + BUILD_SHARED_LIBS: "ON" DPCPP_SINGLE_MODE: "ON" SYCL_DEVICE_FILTER: "Level_Zero:GPU"