From 8259e01843b0e38cf629ae8c8bdf7413a450a512 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Fri, 19 Feb 2021 21:20:50 +0800 Subject: [PATCH 01/22] update format header --- dev_tools/scripts/format_header.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh index cbe1b2e8ab9..7a24f50bef6 100755 --- a/dev_tools/scripts/format_header.sh +++ b/dev_tools/scripts/format_header.sh @@ -121,7 +121,6 @@ GINKGO_LICENSE_BEACON="***************************************** CONTENT="content.cpp" # Store the residual part (start from namespace) BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file -BEGIN="begin.cpp" # Store the header before license HAS_HIP_RUNTIME="false" DURING_LICENSE="false" INCLUDE_REGEX="^#include.*" From fa90ab8cea0fd6dbab8c28a9d46fd0db8a0b1433 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 20 Feb 2021 00:28:42 +0800 Subject: [PATCH 02/22] auto --- dpcpp/components/reduction.dp.hpp | 278 ++++++++++++++++++++ dpcpp/components/thread_ids.dp.hpp | 317 +++++++++++++++++++++++ dpcpp/components/uninitialized_array.hpp | 113 ++++++++ 3 files changed, 708 insertions(+) create mode 100644 dpcpp/components/reduction.dp.hpp create mode 100644 dpcpp/components/thread_ids.dp.hpp create mode 100644 dpcpp/components/uninitialized_array.hpp diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp new file mode 100644 index 00000000000..2fe6c516e9c --- /dev/null +++ b/dpcpp/components/reduction.dp.hpp @@ -0,0 +1,278 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/
+
+#ifndef GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_
+#define GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_
+
+
+#include <type_traits>
+
+
+#include <CL/sycl.hpp>
+
+
+#include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/components/uninitialized_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+constexpr int default_block_size = 512;
+
+
+// #include "common/components/reduction.hpp.inc"
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on a group
+ * `group`. Each thread contributes with one element `local_data`. The local
+ * thread element is always passed as the first parameter to the `reduce_op`.
+ * The function returns the result of the reduction on all threads.
+ *
+ * @note The function is guaranteed to return the correct value on all threads
+ *       only if `reduce_op` is commutative (in addition to being associative).
+ *       Otherwise, the correct value is returned only to the thread with
+ *       subwarp index 0.
+ */
+template <
+    typename Group, typename ValueType, typename Operator,
+    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
+__dpct_inline__ ValueType reduce(const Group &group, ValueType local_data,
+                                 Operator reduce_op = Operator{})
+{
+#pragma unroll
+    for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) {
+        const auto remote_data = group.shfl_xor(local_data, bitmask);
+        local_data = reduce_op(local_data, remote_data);
+    }
+    return local_data;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the index of the thread that has the element with the largest
+ * magnitude among all the threads in the group.
+ * Only the values from threads which set `is_pivoted` to `false` will be
+ * considered.
+ */
+template <
+    typename Group, typename ValueType,
+    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
+__dpct_inline__ int choose_pivot(const Group &group, ValueType local_data,
+                                 bool is_pivoted)
+{
+    using real = remove_complex<ValueType>;
+    real lmag = is_pivoted ? -one<real>() : abs(local_data);
+    const auto pivot = std::reduce(
+        oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+        group, group.thread_rank(), [&](int lidx, int ridx) {
+            const auto rmag = group.shfl(lmag, ridx);
+            if (rmag > lmag) {
+                lmag = rmag;
+                lidx = ridx;
+            }
+            return lidx;
+        });
+    // pivot operator not commutative, make sure everyone has the same pivot
+    return group.shfl(pivot, 0);
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on entire block.
+ * The data for the reduction is taken from the `data` array which has to be of
+ * size `block_size` and accessible from all threads. The `data` array is also
+ * used as work space (so its content will be destroyed in the process), as
+ * well as to store the return value - which is stored in the 0-th position of
+ * the array.
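+ *
+ * A usage sketch for illustration (editorial addition, not from the original
+ * patch), assuming `data` points to `block_size` elements in local memory:
+ *
+ *     // sums data[0..block_size) into data[0]
+ *     reduce(group::this_thread_block(item_ct1), data,
+ *            [](const ValueType &x, const ValueType &y) { return x + y; });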
+ */
+template <
+    typename Group, typename ValueType, typename Operator,
+    typename = std::enable_if_t<group::is_synchronizable_group<Group>::value>>
+void reduce(const Group &__restrict__ group, ValueType *__restrict__ data,
+            Operator reduce_op = Operator{})
+{
+    const auto local_id = group.thread_rank();
+
+    for (int k = group.size() / 2; k >= config::warp_size; k /= 2) {
+        group.sync();
+        if (local_id < k) {
+            data[local_id] = reduce_op(data[local_id], data[local_id + k]);
+        }
+    }
+
+    const auto warp = group::tiled_partition<config::warp_size>(group);
+    const auto warp_id = group.thread_rank() / warp.size();
+    if (warp_id > 0) {
+        return;
+    }
+    auto result = std::reduce(
+        oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+        warp, data[warp.thread_rank()], reduce_op);
+    if (warp.thread_rank() == 0) {
+        data[0] = result;
+    }
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the binary operation `reduce_op` on an array
+ * `source` of any size. Has to be called a second time on `result` to reduce
+ * an array larger than `block_size`.
+ */
+template <typename Operator, typename ValueType>
+void reduce_array(size_type size, const ValueType *__restrict__ source,
+                  ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
+                  Operator reduce_op = Operator{})
+{
+    const auto tidx = thread::get_thread_id_flat(item_ct1);
+    auto thread_result = zero<ValueType>();
+    for (auto i = tidx; i < size;
+         i += item_ct1.get_local_range().get(2) * item_ct1.get_group_range(2)) {
+        thread_result = reduce_op(thread_result, source[i]);
+    }
+    result[item_ct1.get_local_id(2)] = thread_result;
+
+    group::this_thread_block(item_ct1).sync();
+
+    // Stores the result of the reduction inside `result[0]`
+    reduce(group::this_thread_block(item_ct1), result, reduce_op);
+}
+
+
+/**
+ * @internal
+ *
+ * Computes a reduction using the add operation (+) on an array
+ * `source` of any size. Has to be called a second time on `result` to reduce
+ * an array larger than `default_block_size`.
+ */
+template <typename ValueType>
+void reduce_add_array(
+    size_type size, const ValueType *__restrict__ source,
+    ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
+    UninitializedArray<ValueType, default_block_size> *block_sum)
+{
+    reduce_array(size, source, static_cast<ValueType *>((*block_sum)),
+                 item_ct1,
+                 [](const ValueType &x, const ValueType &y) { return x + y; });
+
+    if (item_ct1.get_local_id(2) == 0) {
+        result[item_ct1.get_group(2)] = (*block_sum)[0];
+    }
+}
+
+template <typename ValueType>
+void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory,
+                      sycl::queue *stream, size_type size,
+                      const ValueType *source, ValueType *result)
+{
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::accessor<UninitializedArray<ValueType, default_block_size>, 0,
+                       sycl::access::mode::read_write,
+                       sycl::access::target::local>
+            block_sum_acc_ct1(cgh);
+
+        auto local_range = block.reverse();
+        auto global_range = grid.reverse() * local_range;
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(global_range, local_range),
+            [=](sycl::nd_item<3> item_ct1) {
+                reduce_add_array(
+                    size, source, result, item_ct1,
+                    (UninitializedArray<ValueType, default_block_size> *)
+                        block_sum_acc_ct1.get_pointer());
+            });
+    });
+}
+
+
+/**
+ * Compute a reduction using add operation (+).
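+ *
+ * Illustrative call (editorial sketch, not from the original patch; assumes a
+ * DpcppExecutor `exec` and a device array `vals` holding `n` values):
+ *
+ *     auto sum = reduce_add_array(exec, n, vals.get_const_data());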
+ *
+ * @param exec  Executor associated to the array
+ * @param size  size of the array
+ * @param source  the pointer of the array
+ *
+ * @return the reduction result
+ */
+template <typename ValueType>
+ValueType reduce_add_array(std::shared_ptr<const DpcppExecutor> exec,
+                           size_type size, const ValueType *source)
+{
+    auto block_results_val = source;
+    size_type grid_dim = size;
+    auto block_results = Array<ValueType>(exec);
+    if (size > default_block_size) {
+        const auto n = ceildiv(size, default_block_size);
+        grid_dim = (n <= default_block_size) ? n : default_block_size;
+
+        block_results.resize_and_reset(grid_dim);
+
+        reduce_add_array(grid_dim, default_block_size, 0, exec->get_queue(),
+                         size, source, block_results.get_data());
+
+        block_results_val = block_results.get_const_data();
+    }
+
+    auto d_result = Array<ValueType>(exec, 1);
+
+    reduce_add_array(1, default_block_size, 0, exec->get_queue(), grid_dim,
+                     block_results_val, d_result.get_data());
+    auto answer = exec->copy_val_to_host(d_result.get_const_data());
+    return answer;
+}
+
+
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
new file mode 100644
index 00000000000..4f27302dbc5
--- /dev/null
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -0,0 +1,317 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_
+#define GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_
+
+
+#include <CL/sycl.hpp>
+
+
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dpct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+/**
+ * @brief The DPCPP thread namespace.
+ *
+ * @ingroup dpcpp_thread
+ */
+namespace thread {
+
+
+// #include "common/components/thread_ids.hpp.inc"
+/**
+ * @internal
+ *
+ * Returns the ID of the block group this thread belongs to.
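+ * For illustration (editorial addition): with a group range `(G0, G1, G2)`,
+ * the work-group at index `(i, j, k)` belongs to block group `i * G1 + j`;
+ * the fastest dimension, 2, does not enter the computation.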
+ *
+ * @return the ID of the block group this thread belongs to
+ *
+ * @note Assumes that grid dimensions are in standard format:
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`
+ */
+__dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1)
+{
+    return static_cast<size_type>(item_ct1.get_group(0)) *
+               item_ct1.get_group_range(1) +
+           item_ct1.get_group(1);
+}
+
+/**
+ * @internal
+ *
+ * Returns the ID of the block this thread belongs to.
+ *
+ * @return the ID of the block this thread belongs to
+ *
+ * @note Assumes that grid dimensions are in standard format:
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`
+ */
+__dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1)
+{
+    return get_block_group_id(item_ct1) * item_ct1.get_group_range(2) +
+           item_ct1.get_group(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the warp (relative to the block) this thread belongs
+ * to.
+ *
+ * @return the local ID of the warp (relative to the block) this thread belongs
+ * to
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)`
+ */
+__dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1)
+{
+    return static_cast<size_type>(item_ct1.get_local_id(0));
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the sub-warp (relative to the block) this thread
+ * belongs to.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @return the local ID of the sub-warp (relative to the block) this thread
+ * belongs to
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)`
+ */
+template <int subwarp_size>
+__dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1)
+{
+    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    return get_local_warp_id(item_ct1) * subwarps_per_warp +
+           item_ct1.get_local_id(1);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the local ID of the thread (relative to the block).
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @return the local ID of the thread (relative to the block)
+ *
+ * @note Assumes that block dimensions are in standard format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)`
+ */
+template <int subwarp_size>
+__dpct_inline__ size_type get_local_thread_id(sycl::nd_item<3> item_ct1)
+{
+    return get_local_subwarp_id<subwarp_size>(item_ct1) * subwarp_size +
+           item_ct1.get_local_id(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the warp this thread belongs to.
+ *
+ * @tparam warps_per_block  number of warps within each block
+ *
+ * @return the global ID of the warp this thread belongs to.
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <size_type warps_per_block>
+__dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1)
+{
+    return get_block_id(item_ct1) * warps_per_block +
+           get_local_warp_id(item_ct1);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the sub-warp this thread belongs to.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @return the global ID of the sub-warp this thread belongs to.
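+ * (computed as the warp's global ID times `config::warp_size / subwarp_size`
+ * plus the subwarp's index inside its warp, so subwarps of one warp receive
+ * consecutive IDs)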
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <int subwarp_size, size_type warps_per_block>
+__dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1)
+{
+    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    return get_warp_id<warps_per_block>(item_ct1) * subwarps_per_warp +
+           item_ct1.get_local_id(1);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the thread.
+ *
+ * @return the global ID of the thread.
+ *
+ * @tparam subwarp_size  size of the subwarp
+ *
+ * @note Assumes that block dimensions and grid dimensions are in standard
+ *       format:
+ *       `(subwarp_size, config::warp_size / subwarp_size, block_size /
+ *        config::warp_size)` and
+ *       `(block_group_size, first_grid_dimension, second_grid_dimension)`,
+ *       respectively.
+ */
+template <int subwarp_size, size_type warps_per_block>
+__dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1)
+{
+    return get_subwarp_id<subwarp_size, warps_per_block>(item_ct1) *
+               subwarp_size +
+           item_ct1.get_local_id(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the thread in the given index type.
+ * This function assumes one-dimensional thread and block indexing.
+ *
+ * @return the global ID of the thread in the given index type.
+ *
+ * @tparam IndexType  the index type
+ */
+template <typename IndexType = size_type>
+__dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1)
+{
+    return item_ct1.get_local_id(2) +
+           static_cast<IndexType>(item_ct1.get_local_range().get(2)) *
+               item_ct1.get_group(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the total number of threads in the given index type.
+ * This function assumes one-dimensional thread and block indexing.
+ *
+ * @return the total number of threads in the given index type.
+ *
+ * @tparam IndexType  the index type
+ */
+template <typename IndexType = size_type>
+__dpct_inline__ IndexType get_thread_num_flat(sycl::nd_item<3> item_ct1)
+{
+    return item_ct1.get_local_range().get(2) *
+           static_cast<IndexType>(item_ct1.get_group_range(2));
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the subwarp in the given index type.
+ * This function assumes one-dimensional thread and block indexing
+ * with a power of two block size of at least subwarp_size.
+ *
+ * @return the global ID of the subwarp in the given index type.
+ *
+ * @tparam subwarp_size  the size of the subwarp. Must be a power of two!
+ * @tparam IndexType  the index type
+ */
+template <int subwarp_size, typename IndexType = size_type>
+__dpct_inline__ IndexType get_subwarp_id_flat(sycl::nd_item<3> item_ct1)
+{
+    static_assert(!(subwarp_size & (subwarp_size - 1)),
+                  "subwarp_size must be a power of two");
+    return item_ct1.get_local_id(2) / subwarp_size +
+           static_cast<IndexType>(item_ct1.get_local_range().get(2) /
+                                  subwarp_size) *
+               item_ct1.get_group(2);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the total number of subwarps in the given index type.
+ * This function assumes one-dimensional thread and block indexing
+ * with a power of two block size of at least subwarp_size.
+ *
+ * @return the total number of subwarps in the given index type.
+ *
+ * @tparam subwarp_size  the size of the subwarp. Must be a power of two!
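+ *        (a power of two has a single set bit, so the expression
+ *        `subwarp_size & (subwarp_size - 1)` in the static_assert below is
+ *        zero exactly for powers of two)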
+ * @tparam IndexType  the index type
+ */
+template <int subwarp_size, typename IndexType = size_type>
+__dpct_inline__ IndexType get_subwarp_num_flat(sycl::nd_item<3> item_ct1)
+{
+    static_assert(!(subwarp_size & (subwarp_size - 1)),
+                  "subwarp_size must be a power of two");
+    return item_ct1.get_local_range().get(2) / subwarp_size *
+           static_cast<IndexType>(item_ct1.get_group_range(2));
+}
+
+}  // namespace thread
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_
diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp
new file mode 100644
index 00000000000..b8d3006007d
--- /dev/null
+++ b/dpcpp/components/uninitialized_array.hpp
@@ -0,0 +1,113 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
+#define GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
+
+
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+// #include "common/components/uninitialized_array.hpp.inc"
+/**
+ * Stores an array with uninitialized contents.
+ *
+ * This class is needed for datatypes that do have a non-empty constructor when
+ * using them as shared memory, for example `thrust::complex<float>`.
+ *
+ * @tparam ValueType  the type of values
+ * @tparam size  the size of the array
+ */
+template <typename ValueType, size_type size>
+class UninitializedArray {
+public:
+    /**
+     * Operator for casting an UninitializedArray into its constexpr value
+     * pointer.
+     *
+     * @return the constexpr pointer to the first entry of the array.
+     */
+    constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept
+    {
+        return &(*this)[0];
+    }
+
+    /**
+     * Operator for casting an UninitializedArray into its non-const value
+     * pointer.
+     *
+     * @return the non-const pointer to the first entry of the array.
+     */
+    GKO_ATTRIBUTES operator ValueType *() noexcept { return &(*this)[0]; }
+
+    /**
+     * constexpr array access operator.
+     *
+     * @param pos The array index.
Using a value outside [0, size) is undefined + * behavior. + * + * @return a reference to the array entry at the given index. + */ + constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + { + return reinterpret_cast(data_)[pos]; + } + + /** + * Non-const array access operator. + * + * @param pos The array index. Using a value outside [0, size) is undefined + * behavior. + * + * @return a reference to the array entry at the given index. + */ + GKO_ATTRIBUTES ValueType &operator[](size_type pos) noexcept + { + return reinterpret_cast(data_)[pos]; + } + +private: + unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; +}; + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ From 12f669ba0c9e69887ecabc3d505ebb570e2a6104 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 20 Feb 2021 00:37:10 +0800 Subject: [PATCH 03/22] manual modification --- dpcpp/components/reduction.dp.hpp | 12 +++++------- dpcpp/components/thread_ids.dp.hpp | 1 + dpcpp/components/uninitialized_array.hpp | 10 +++++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 2fe6c516e9c..4caf46229c8 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -57,7 +57,7 @@ namespace kernels { namespace dpcpp { -constexpr int default_block_size = 512; +constexpr int default_block_size = 256; // #include "common/components/reduction.hpp.inc" @@ -105,9 +105,8 @@ __dpct_inline__ int choose_pivot(const Group &group, ValueType local_data, { using real = remove_complex; real lmag = is_pivoted ? -one() : abs(local_data); - const auto pivot = std::reduce( - oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), - group, group.thread_rank(), [&](int lidx, int ridx) { + const auto pivot = + reduce(group, group.thread_rank(), [&](int lidx, int ridx) { const auto rmag = group.shfl(lmag, ridx); if (rmag > lmag) { lmag = rmag; @@ -150,9 +149,8 @@ void reduce(const Group &__restrict__ group, ValueType *__restrict__ data, if (warp_id > 0) { return; } - auto result = std::reduce( - oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), - warp, data[warp.thread_rank()], reduce_op); + auto result = ::gko::kernels::dpcpp::reduce(warp, data[warp.thread_rank()], + reduce_op); if (warp.thread_rank() == 0) { data[0] = result; } diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 4f27302dbc5..8694d6a88c9 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -308,6 +308,7 @@ __dpct_inline__ IndexType get_subwarp_num_flat(sycl::nd_item<3> item_ct1) static_cast(item_ct1.get_group_range(2)); } + } // namespace thread } // namespace dpcpp } // namespace kernels diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index b8d3006007d..fb7575bc202 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -37,6 +37,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "dpcpp/base/dpct.hpp" + + namespace gko { namespace kernels { namespace dpcpp { @@ -61,7 +64,7 @@ class UninitializedArray { * * @return the constexpr pointer to the first entry of the array. 
*/ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + constexpr __dpct_inline__ operator ValueType *() const noexcept { return &(*this)[0]; } @@ -72,7 +75,7 @@ class UninitializedArray { * * @return the non-const pointer to the first entry of the array. */ - GKO_ATTRIBUTES operator ValueType *() noexcept { return &(*this)[0]; } + __dpct_inline__ operator ValueType *() noexcept { return &(*this)[0]; } /** * constexpr array access operator. @@ -82,7 +85,8 @@ class UninitializedArray { * * @return a reference to the array entry at the given index. */ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + constexpr __dpct_inline__ ValueType &operator[](size_type pos) const + noexcept { return reinterpret_cast(data_)[pos]; } From 0e222387fc7a90274583e73135be2947723ce0be Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 20 Feb 2021 01:04:38 +0800 Subject: [PATCH 04/22] auto dense --- dpcpp/matrix/dense_kernels.dp.cpp | 1574 ++++++++++++++++++++++++++++- 1 file changed, 1555 insertions(+), 19 deletions(-) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 531cfd52259..36935b1a4d6 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,25 +33,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" -#include +#include +#include #include -#include #include #include #include #include #include #include -#include #include #include #include "core/components/prefix_sum.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" namespace gko { @@ -65,11 +70,1245 @@ namespace dpcpp { namespace dense { +constexpr auto default_block_size = 512; + + +// #include "common/matrix/dense_kernels.hpp.inc" +namespace kernel { + + +template +void strided_fill(size_type num_rows, size_type num_cols, size_type stride, + ValueType *__restrict__ mat, ValueType value, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + mat[row_id * stride + col_id] = value; + } +} + +template +void strided_fill(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type stride, ValueType *mat, ValueType value) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + strided_fill(num_rows, num_cols, stride, mat, + value, item_ct1); + }); + }); +} + + +template +void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, + const ValueType *__restrict__ alpha, ValueType *__restrict__ x, + size_type stride_x, sycl::nd_item<3> item_ct1) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; + if (row_id < num_rows) { + x[row_id * stride_x + col_id] = + alpha[alpha_id] == zero() + ? 
zero() + : x[row_id * stride_x + col_id] * alpha[alpha_id]; + } +} + +template +void scale(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type num_alpha_cols, const ValueType *alpha, ValueType *x, + size_type stride_x) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + scale(num_rows, num_cols, + num_alpha_cols, alpha, x, + stride_x, item_ct1); + }); + }); +} + + +template +void add_scaled(size_type num_rows, size_type num_cols, + size_type num_alpha_cols, const ValueType *__restrict__ alpha, + const ValueType *__restrict__ x, size_type stride_x, + ValueType *__restrict__ y, size_type stride_y, + sycl::nd_item<3> item_ct1) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + const auto global_id = + thread::get_thread_id(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; + if (row_id < num_rows && alpha[alpha_id] != zero()) { + y[row_id * stride_y + col_id] += + x[row_id * stride_x + col_id] * alpha[alpha_id]; + } +} + +template +void add_scaled(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type num_alpha_cols, const ValueType *alpha, + const ValueType *x, size_type stride_x, ValueType *y, + size_type stride_y) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + add_scaled( + num_rows, num_cols, num_alpha_cols, alpha, x, + stride_x, y, stride_y, item_ct1); + }); + }); +} + + +template +void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, + const ValueType *__restrict__ diag, + ValueType *__restrict__ y, size_type stride_y, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx >= size) { + return; + } + + y[tidx * stride_y + tidx] += alpha[0] * diag[tidx]; +} + +template +void add_scaled_diag(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *alpha, const ValueType *diag, + ValueType *y, size_type stride_y) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + add_scaled_diag(size, alpha, diag, y, stride_y, + item_ct1); + }); + }); +} + + +template +void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work, + CallableGetValue get_value, + CallableReduce reduce_op, sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + constexpr auto warps_per_block = block_size / config::warp_size; + + const auto num_blocks = item_ct1.get_group_range(2); + const auto local_id = + thread::get_local_thread_id(item_ct1); + const auto global_id = + thread::get_thread_id(item_ct1); + + auto tmp = zero(); + for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { + tmp = reduce_op(tmp, get_value(i)); + } + + (*tmp_work)[local_id] = tmp; + + reduce(group::this_thread_block(item_ct1), + static_cast((*tmp_work)), reduce_op); 
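+    // at this point (*tmp_work)[0] holds this block's partial result;
+    // thread 0 publishes it into `work` below, and a second kernel
+    // invocation (finalize_reduce_computation) combines the per-block values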
+ + if (local_id == 0) { + work[thread::get_block_id(item_ct1)] = (*tmp_work)[0]; + } +} + + +template +void finalize_reduce_computation( + size_type size, const ValueType *work, ValueType *result, + CallableReduce reduce_op, CallableFinalize finalize_op, + sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + const auto local_id = + thread::get_local_thread_id(item_ct1); + + ValueType tmp = zero(); + for (auto i = local_id; i < size; i += block_size) { + tmp = reduce_op(tmp, work[i]); + } + + (*tmp_work)[local_id] = tmp; + + reduce(group::this_thread_block(item_ct1), + static_cast((*tmp_work)), reduce_op); + + if (local_id == 0) { + *result = finalize_op((*tmp_work)[0]); + } +} + + +template +void compute_partial_dot(size_type num_rows, const ValueType *__restrict__ x, + size_type stride_x, const ValueType *__restrict__ y, + size_type stride_y, ValueType *__restrict__ work, + sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + compute_partial_reduce( + /* + DPCT1007:4: Migration of this CUDA API is not supported by the Intel(R) + DPC++ Compatibility Tool. + */ + num_rows, + work, + [x, stride_x, y, stride_y](size_type i) { + return x[i * stride_x] * conj(y[i * stride_y]); + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + const ValueType *x, size_type stride_x, + const ValueType *y, size_type stride_y, + ValueType *work) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + compute_partial_dot( + num_rows, x, stride_x, y, stride_y, work, + item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + + +template +void finalize_dot_computation( + size_type size, const ValueType *work, ValueType *result, + sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return x; }, item_ct1, tmp_work); +} + +template +void finalize_dot_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type size, const ValueType *work, + ValueType *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + finalize_dot_computation( + size, work, result, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + + +template +void compute_partial_norm2( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray, block_size> *tmp_work) +{ + using norm_type = remove_complex; + compute_partial_reduce( + num_rows, work, + [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, + [](const norm_type &x, const norm_type &y) { return x + y; }, item_ct1, + tmp_work); +} + +template 
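+// launch helper for the partial norm2 reduction; it allocates one
+// local-memory accumulator array of `block_size` entries per work-group,
+// mirroring the compute_partial_dot launcher above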
+void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + const ValueType *x, size_type stride_x, + remove_complex *work) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor< + UninitializedArray, block_size>, 0, + sycl::access::mode::read_write, sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for( + sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + compute_partial_norm2( + num_rows, x, stride_x, work, item_ct1, + (UninitializedArray, block_size> + *)tmp_work_acc_ct1.get_pointer()); + }); + }); +} + + +template +void finalize_norm2_computation( + size_type size, const ValueType *work, ValueType *result, + sycl::nd_item<3> item_ct1, + UninitializedArray *tmp_work) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return sycl::sqrt((float)x); }, item_ct1, + tmp_work); +} + +template +void finalize_norm2_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + finalize_norm2_computation( + size, work, result, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + + +template +void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, + const size_type *__restrict__ row_ptrs, + const ValueType *__restrict__ source, + IndexType *__restrict__ row_idxs, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + if (tidx < num_rows) { + size_type write_to = row_ptrs[tidx]; + + for (size_type i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + row_idxs[write_to] = tidx; + write_to++; + } + } + } +} + +template +void fill_in_coo(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type stride, const size_type *row_ptrs, + const ValueType *source, IndexType *row_idxs, + IndexType *col_idxs, ValueType *values) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + fill_in_coo(num_rows, num_cols, stride, row_ptrs, + source, row_idxs, col_idxs, values, + item_ct1); + }); + }); +} + + +template +void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ work, + IndexType *__restrict__ result, + sycl::nd_item<3> item_ct1) +{ + constexpr auto warp_size = config::warp_size; + const auto row_idx = thread::get_subwarp_id_flat(item_ct1); + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + + if (row_idx < num_rows) { + IndexType part_result{}; + for (auto i = 
warp_tile.thread_rank(); i < num_cols; i += warp_size) { + if (work[stride * row_idx + i] != zero()) { + part_result += 1; + } + } + result[row_idx] = std::reduce( + oneapi::dpl::execution::make_device_policy( + dpct::get_default_queue()), + warp_tile, part_result, + [](const size_type &a, const size_type &b) { return a + b; }); + } +} + +template +void count_nnz_per_row(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, size_type stride, + const ValueType *work, IndexType *result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + count_nnz_per_row(num_rows, num_cols, stride, work, + result, item_ct1); + }); + }); +} + + +template +void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ source, + IndexType *__restrict__ row_ptrs, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < num_rows) { + auto write_to = row_ptrs[tidx]; + for (auto i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + write_to++; + } + } + } +} + +template +void fill_in_csr(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type stride, const ValueType *source, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *values) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + fill_in_csr(num_rows, num_cols, stride, source, + row_ptrs, col_idxs, values, item_ct1); + }); + }); +} + + +template +void fill_in_ell(size_type num_rows, size_type num_cols, + size_type source_stride, const ValueType *__restrict__ source, + size_type max_nnz_per_row, size_type result_stride, + IndexType *__restrict__ col_ptrs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + if (tidx < num_rows) { + IndexType col_idx = 0; + for (size_type col = 0; col < num_cols; col++) { + if (source[tidx * source_stride + col] != zero()) { + col_ptrs[col_idx * result_stride + tidx] = col; + values[col_idx * result_stride + tidx] = + source[tidx * source_stride + col]; + col_idx++; + } + } + for (size_type j = col_idx; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } else if (tidx < result_stride) { + for (size_type j = 0; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } +} + +template +void fill_in_ell(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type source_stride, const ValueType *source, + size_type max_nnz_per_row, size_type result_stride, + IndexType *col_ptrs, ValueType *values) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + 
[=](sycl::nd_item<3> item_ct1) { + fill_in_ell(num_rows, num_cols, source_stride, + source, max_nnz_per_row, result_stride, + col_ptrs, values, item_ct1); + }); + }); +} + + +void calculate_slice_lengths(size_type num_rows, size_type slice_size, + int slice_num, size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ slice_lengths, + size_type *__restrict__ slice_sets, + sycl::nd_item<3> item_ct1) +{ + constexpr auto warp_size = config::warp_size; + const auto sliceid = item_ct1.get_group(2); + const auto tid_in_warp = item_ct1.get_local_id(2); + + if (sliceid * slice_size + tid_in_warp < num_rows) { + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + thread_result = + (i + slice_size * sliceid < num_rows) + ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) + : thread_result; + } + + auto warp_tile = group::tiled_partition( + group::this_thread_block(item_ct1)); + auto warp_result = reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0) { + auto slice_length = + ceildiv(warp_result, stride_factor) * stride_factor; + slice_lengths[sliceid] = slice_length; + slice_sets[sliceid] = slice_length; + } + } +} + +void calculate_slice_lengths(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, size_type slice_size, + int slice_num, size_type stride_factor, + const size_type *nnz_per_row, + size_type *slice_lengths, size_type *slice_sets) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + calculate_slice_lengths(num_rows, slice_size, + slice_num, stride_factor, + nnz_per_row, slice_lengths, + slice_sets, item_ct1); + }); + }); +} + + +template +void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, + size_type stride, const ValueType *__restrict__ source, + size_type *__restrict__ slice_lengths, + size_type *__restrict__ slice_sets, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ vals, sycl::nd_item<3> item_ct1) +{ + const auto global_row = thread::get_thread_id_flat(item_ct1); + const auto row = global_row % slice_size; + const auto sliceid = global_row / slice_size; + + if (global_row < num_rows) { + size_type sellp_ind = slice_sets[sliceid] * slice_size + row; + + for (size_type col = 0; col < num_cols; col++) { + auto val = source[global_row * stride + col]; + if (val != zero()) { + col_idxs[sellp_ind] = col; + vals[sellp_ind] = val; + sellp_ind += slice_size; + } + } + for (size_type i = sellp_ind; + i < + (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; + i += slice_size) { + col_idxs[i] = 0; + vals[i] = zero(); + } + } +} + +template +void fill_in_sellp(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + size_type slice_size, size_type stride, + const ValueType *source, size_type *slice_lengths, + size_type *slice_sets, IndexType *col_idxs, ValueType *vals) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + fill_in_sellp(num_rows, num_cols, slice_size, + stride, source, 
slice_lengths, + slice_sets, col_idxs, vals, + item_ct1); + }); + }); +} + + +void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result, sycl::nd_item<3> item_ct1, + uint8_t *dpct_local) +{ + auto block_max = (size_type *)dpct_local; + + reduce_array( + size, nnz_per_row, block_max, + [](const size_type &x, const size_type &y) { return max(x, y); }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = block_max[0]; + } +} + +void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const size_type *nnz_per_row, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor + dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz(size, nnz_per_row, result, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); + }); +} + + +void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, + size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result, + sycl::nd_item<3> item_ct1) +{ + constexpr auto warp_size = config::warp_size; + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto warpid = thread::get_subwarp_id_flat(item_ct1); + const auto tid_in_warp = warp_tile.thread_rank(); + const auto slice_num = ceildiv(num_rows, slice_size); + + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + if (warpid * slice_size + i < num_rows) { + thread_result = + max(thread_result, nnz_per_row[warpid * slice_size + i]); + } + } + + auto warp_result = reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0 && warpid < slice_num) { + result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; + } +} + +void reduce_max_nnz_per_slice(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, size_type slice_size, + size_type stride_factor, + const size_type *nnz_per_row, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz_per_slice( + num_rows, slice_size, stride_factor, + nnz_per_row, result, item_ct1); + }); + }); +} + + +void reduce_total_cols(size_type num_slices, + const size_type *__restrict__ max_nnz_per_slice, + size_type *__restrict__ result, + sycl::nd_item<3> item_ct1, uint8_t *dpct_local) +{ + auto block_result = (size_type *)dpct_local; + + reduce_array(num_slices, max_nnz_per_slice, block_result, + [](const size_type &x, const size_type &y) { return x + y; }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = block_result[0]; + } +} + +void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_slices, + const size_type *max_nnz_per_slice, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor + dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + 
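+        // dim3 stores sizes CUDA-style with x varying fastest; reverse()
+        // flips the order because sycl::range<3> varies fastest in its last
+        // dimension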
cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + reduce_total_cols( + num_slices, max_nnz_per_slice, result, + item_ct1, dpct_local_acc_ct1.get_pointer()); + }); + }); +} + + +template +void symm_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ result, size_type stride_result, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + col_id] = + orig[perm_idxs[row_id] * stride_orig + perm_idxs[col_id]]; + } +} + +template +void symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const IndexType *perm_idxs, const ValueType *orig, + size_type stride_orig, ValueType *result, + size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + symm_permute(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, + item_ct1); + }); + }); +} + + +template +void inv_symm_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ result, size_type stride_result, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[perm_idxs[row_id] * stride_result + perm_idxs[col_id]] = + orig[row_id * stride_orig + col_id]; + } +} + +template +void inv_symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, const IndexType *perm_idxs, + const ValueType *orig, size_type stride_orig, + ValueType *result, size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + inv_symm_permute(num_rows, num_cols, perm_idxs, + orig, stride_orig, result, + stride_result, item_ct1); + }); + }); +} + + +template +void row_gather(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ result, size_type stride_result, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + col_id] = + orig[perm_idxs[row_id] * stride_orig + col_id]; + } +} + +template +void row_gather(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const IndexType *perm_idxs, const ValueType *orig, + size_type stride_orig, ValueType *result, + size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; 
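+        // one work-item per output entry: work-item i writes entry
+        // (i / num_cols, i % num_cols) of `result`, gathered from row
+        // perm_idxs[i / num_cols] of `orig`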
+ + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + row_gather(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, + item_ct1); + }); + }); +} + + +template +void column_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ result, size_type stride_result, + sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + col_id] = + orig[row_id * stride_orig + perm_idxs[col_id]]; + } +} + +template +void column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const IndexType *perm_idxs, const ValueType *orig, + size_type stride_orig, ValueType *result, + size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + column_permute(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, + item_ct1); + }); + }); +} + + +template +void inverse_row_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, + size_type stride_orig, ValueType *__restrict__ result, + size_type stride_result, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[perm_idxs[row_id] * stride_result + col_id] = + orig[row_id * stride_orig + col_id]; + } +} + +template +void inverse_row_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, const IndexType *perm_idxs, + const ValueType *orig, size_type stride_orig, + ValueType *result, size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + inverse_row_permute(num_rows, num_cols, perm_idxs, + orig, stride_orig, result, + stride_result, item_ct1); + }); + }); +} + + +template +void inverse_column_permute(size_type num_rows, size_type num_cols, + const IndexType *__restrict__ perm_idxs, + const ValueType *__restrict__ orig, + size_type stride_orig, + ValueType *__restrict__ result, + size_type stride_result, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + result[row_id * stride_result + perm_idxs[col_id]] = + orig[row_id * stride_orig + col_id]; + } +} + +template +void inverse_column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, const IndexType *perm_idxs, + const ValueType *orig, size_type stride_orig, + ValueType *result, size_type stride_result) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() 
* local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + inverse_column_permute( + num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, item_ct1); + }); + }); +} + + +template +void extract_diagonal(size_type problem_size, + const ValueType *__restrict__ orig, size_type stride_orig, + ValueType *__restrict__ diag, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < problem_size) { + diag[tidx] = orig[tidx * stride_orig + tidx]; + } +} + +template +void extract_diagonal(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type problem_size, + const ValueType *orig, size_type stride_orig, + ValueType *diag) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + extract_diagonal(problem_size, orig, stride_orig, + diag, item_ct1); + }); + }); +} + + +template +void inplace_absolute_dense(size_type num_rows, size_type num_cols, + ValueType *__restrict__ data, size_type stride, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + data[row * stride + col] = dpcpp::abs(data[row * stride + col]); + } +} + +template +void inplace_absolute_dense(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + size_type num_cols, ValueType *data, + size_type stride) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + inplace_absolute_dense(num_rows, num_cols, data, + stride, item_ct1); + }); + }); +} + + +template +void outplace_absolute_dense(size_type num_rows, size_type num_cols, + const ValueType *__restrict__ in, + size_type stride_in, + remove_complex *__restrict__ out, + size_type stride_out, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + out[row * stride_out + col] = dpcpp::abs(in[row * stride_in + col]); + } +} + +template +void outplace_absolute_dense(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, size_type num_cols, + const ValueType *in, size_type stride_in, + remove_complex *out, + size_type stride_out) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + outplace_absolute_dense(num_rows, num_cols, in, + stride_in, out, stride_out, + item_ct1); + }); + }); +} + + +template +void make_complex(size_type num_rows, size_type num_cols, + const ValueType *__restrict__ in, size_type stride_in, + ComplexType *__restrict__ out, size_type stride_out, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + out[row * stride_out + col] = in[row * stride_in + col]; + } +} + +template +void make_complex(dim3 grid, dim3 block, size_t 
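// Editorial note: the elementwise kernels above all decompose a flat thread
// id into a (row, col) pair and bounds-check only the row, since
// col = tidx % num_cols already lies in [0, num_cols). The shared shape,
// with `op` standing in for the elementwise operation:
//
//     const auto tidx = thread::get_thread_id_flat(item_ct1);
//     const auto row = tidx / num_cols;  // can overshoot -> needs the guard
//     const auto col = tidx % num_cols;  // always valid
//     if (row < num_rows) {
//         out[row * stride_out + col] = op(in[row * stride_in + col]);
//     }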
dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const ValueType *in, size_type stride_in, ComplexType *out, + size_type stride_out) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + make_complex(num_rows, num_cols, in, stride_in, + out, stride_out, item_ct1); + }); + }); +} + + +template +void get_real(size_type num_rows, size_type num_cols, + const ValueType *__restrict__ in, size_type stride_in, + remove_complex *__restrict__ out, size_type stride_out, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + out[row * stride_out + col] = real(in[row * stride_in + col]); + } +} + +template +void get_real(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const ValueType *in, size_type stride_in, + remove_complex *out, size_type stride_out) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + get_real(num_rows, num_cols, in, stride_in, out, + stride_out, item_ct1); + }); + }); +} + + +template +void get_imag(size_type num_rows, size_type num_cols, + const ValueType *__restrict__ in, size_type stride_in, + remove_complex *__restrict__ out, size_type stride_out, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto row = tidx / num_cols; + auto col = tidx % num_cols; + if (row < num_rows) { + out[row * stride_out + col] = imag(in[row * stride_in + col]); + } +} + +template +void get_imag(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, size_type num_cols, + const ValueType *in, size_type stride_in, + remove_complex *out, size_type stride_out) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + get_imag(num_rows, num_cols, in, stride_in, out, + stride_out, item_ct1); + }); + }); +} + + +} // namespace kernel + + template void simple_apply(std::shared_ptr exec, const matrix::Dense *a, const matrix::Dense *b, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + matrix::Dense *c) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::gemm(handle, oneapi::mkl::transpose::nontrans, + oneapi::mkl::transpose::nontrans, c->get_size()[1], + c->get_size()[0], a->get_size()[1], &alpha, + b->get_const_values(), b->get_stride(), + a->get_const_values(), a->get_stride(), &beta, + c->get_values(), c->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -78,8 +1317,20 @@ template void apply(std::shared_ptr exec, const matrix::Dense *alpha, const matrix::Dense *a, const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + const matrix::Dense *beta, matrix::Dense *c) +{ + if (cublas::is_supported::value) 
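// Editorial note: these cuBLAS-era fallbacks (simple_apply above, apply just
// below) drive a column-major gemm with the operands swapped. For row-major
// matrices, C = A * B is computed as C^T = B^T * A^T on the column-major
// BLAS, which is why m and n are passed as c->get_size()[1] and
// c->get_size()[0] and b precedes a. Schematically, assuming a column-major
// gemm(m, n, k, A, lda, B, ldb, C, ldc) computing C = alpha*A*B + beta*C:
//
//     gemm(handle, nontrans, nontrans,
//          /* m = */ cols(C), /* n = */ rows(C), /* k = */ cols(A),
//          &alpha, B, ldb, A, lda, &beta, C, ldc);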
{ + cublas::gemm( + exec->get_cublas_handle(), oneapi::mkl::transpose::nontrans, + oneapi::mkl::transpose::nontrans, c->get_size()[1], + c->get_size()[0], a->get_size()[1], alpha->get_const_values(), + b->get_const_values(), b->get_stride(), a->get_const_values(), + a->get_stride(), beta->get_const_values(), c->get_values(), + c->get_stride()); + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); @@ -88,7 +1339,40 @@ template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if (cublas::is_supported::value) { + // TODO: write a custom kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + cublas::dot(exec->get_cublas_handle(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); + } + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + constexpr auto block_size = 1024; + + constexpr auto work_per_block = work_per_thread * block_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_dot( + grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_dot_computation( + 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); @@ -106,7 +1390,37 @@ template void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, matrix::Dense> *result) - GKO_NOT_IMPLEMENTED; +{ + if (cublas::is_supported::value) { + for (size_type col = 0; col < x->get_size()[1]; ++col) { + cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + result->get_values() + col); + } + } else { + using norm_type = remove_complex; + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + constexpr auto block_size = 1024; + + constexpr auto work_per_block = work_per_thread * block_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{config::warp_size, 1, + block_size / config::warp_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_norm2( + grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), work.get_data()); + kernel::finalize_norm2_computation( + 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); @@ -115,7 +1429,28 @@ template void convert_to_coo(std::shared_ptr exec, const matrix::Dense 
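// Editorial note: the non-BLAS paths of compute_dot and compute_norm2 above
// use the classic two-kernel reduction: compute_partial_* folds
// work_per_thread elements per thread and writes one partial result per
// work-group into `work`, then finalize_*_computation reduces those
// grid_dim.x partials within a single work-group. Host-side shape of the
// pattern, with the names used in this file:
//
//     Array<ValueType> work(exec, grid_dim.x);
//     kernel::compute_partial_dot(grid_dim, block_dim, 0, exec->get_queue(),
//                                 n, x_vals, stride_x, y_vals, stride_y,
//                                 work.get_data());
//     kernel::finalize_dot_computation(1, block_dim, 0, exec->get_queue(),
//                                      grid_dim.x, work.get_const_data(),
//                                      result_val);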
*source, matrix::Coo *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_idxs = result->get_row_idxs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto stride = source->get_stride(); + + auto nnz_prefix_sum = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); + + components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); + + size_type grid_dim = ceildiv(num_rows, default_block_size); + + kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, stride, + nnz_prefix_sum.get_const_data(), + source->get_const_values(), row_idxs, col_idxs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); @@ -125,7 +1460,31 @@ template void convert_to_csr(std::shared_ptr exec, const matrix::Dense *source, matrix::Csr *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto stride = source->get_stride(); + + const auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); + + kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, + exec->get_queue(), num_rows, num_cols, stride, + source->get_const_values(), row_ptrs); + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + + size_type grid_dim = ceildiv(num_rows, default_block_size); + + kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, stride, source->get_const_values(), + row_ptrs, col_idxs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); @@ -135,7 +1494,23 @@ template void convert_to_ell(std::shared_ptr exec, const matrix::Dense *source, matrix::Ell *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + auto max_nnz_per_row = result->get_num_stored_elements_per_row(); + + auto col_ptrs = result->get_col_idxs(); + auto values = result->get_values(); + + auto source_stride = source->get_stride(); + auto result_stride = result->get_stride(); + + auto grid_dim = ceildiv(result_stride, default_block_size); + kernel::fill_in_ell(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, source_stride, + source->get_const_values(), max_nnz_per_row, + result_stride, col_ptrs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); @@ -155,7 +1530,46 @@ template void convert_to_sellp(std::shared_ptr exec, const matrix::Dense *source, matrix::Sellp *result) - GKO_NOT_IMPLEMENTED; +{ + const auto stride = source->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + + auto vals = result->get_values(); + auto col_idxs = result->get_col_idxs(); + auto slice_lengths = result->get_slice_lengths(); + auto slice_sets = result->get_slice_sets(); + + const auto slice_size = (result->get_slice_size() == 0) + ? matrix::default_slice_size + : result->get_slice_size(); + const auto stride_factor = (result->get_stride_factor() == 0) + ? 
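// Editorial note: the Dense -> Coo/Csr conversions here follow the standard
// three-phase scheme: (1) count the nonzeros of each row, (2) take an
// exclusive prefix sum of the counts to turn them into output offsets,
// (3) fill the index/value arrays at those offsets. For Csr this reads:
//
//     kernel::count_nnz_per_row(...);                        // phase 1
//     components::prefix_sum(exec, row_ptrs, num_rows + 1);  // phase 2
//     kernel::fill_in_csr(...);                              // phase 3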
matrix::default_stride_factor + : result->get_stride_factor(); + const int slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto grid_dim = slice_num; + + if (grid_dim > 0) { + kernel::calculate_slice_lengths( + grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, + slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), + slice_lengths, slice_sets); + } + + components::prefix_sum(exec, slice_sets, slice_num + 1); + + grid_dim = ceildiv(num_rows, default_block_size); + if (grid_dim > 0) { + kernel::fill_in_sellp(grid_dim, default_block_size, 0, + exec->get_queue(), num_rows, num_cols, slice_size, + stride, source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); @@ -173,8 +1587,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzeros(std::shared_ptr exec, - const matrix::Dense *source, - size_type *result) GKO_NOT_IMPLEMENTED; + const matrix::Dense *source, size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); @@ -182,7 +1603,33 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); template void calculate_max_nnz_per_row(std::shared_ptr exec, const matrix::Dense *source, - size_type *result) GKO_NOT_IMPLEMENTED; + size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + const auto n = ceildiv(num_rows, default_block_size); + const size_type grid_dim = + (n <= default_block_size) ? 
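// Editorial note: grid_dim for the max-nnz reduction is deliberately capped
// at default_block_size, so the intermediate block_results array never
// grows beyond what the second, single-work-group reduce_max_nnz launch
// below can sweep in a few strided passes:
//
//     pass 1: grid_dim blocks reduce num_rows counts -> grid_dim partials
//     pass 2: 1 block reduces grid_dim partials      -> final maximum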
n : default_block_size; + + auto block_results = Array(exec, grid_dim); + + kernel::reduce_max_nnz( + grid_dim, default_block_size, default_block_size * sizeof(size_type), + exec->get_queue(), num_rows, nnz_per_row.get_const_data(), + block_results.get_data()); + + auto d_result = Array(exec, 1); + + kernel::reduce_max_nnz(1, default_block_size, + default_block_size * sizeof(size_type), + exec->get_queue(), grid_dim, + block_results.get_const_data(), d_result.get_data()); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); @@ -191,7 +1638,19 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void calculate_nonzeros_per_row(std::shared_ptr exec, const matrix::Dense *source, - Array *result) GKO_NOT_IMPLEMENTED; + Array *result) +{ + const dim3 block_size(default_block_size, 1, 1); + auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); + const dim3 grid_size(grid_x, 1, 1); + if (grid_x > 0) { + kernel::count_nnz_per_row( + grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], + source->get_size()[1], source->get_stride(), + source->get_const_values(), result->get_data()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); @@ -201,7 +1660,48 @@ template void calculate_total_cols(std::shared_ptr exec, const matrix::Dense *source, size_type *result, size_type stride_factor, - size_type slice_size) GKO_NOT_IMPLEMENTED; + size_type slice_size) +{ + const auto num_rows = source->get_size()[0]; + + if (num_rows == 0) { + *result = 0; + return; + } + + const auto num_cols = source->get_size()[1]; + const auto slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto max_nnz_per_slice = Array(exec, slice_num); + + auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); + + kernel::reduce_max_nnz_per_slice( + grid_dim, default_block_size, 0, exec->get_queue(), num_rows, + slice_size, stride_factor, nnz_per_row.get_const_data(), + max_nnz_per_slice.get_data()); + + grid_dim = ceildiv(slice_num, default_block_size); + auto block_results = Array(exec, grid_dim); + + kernel::reduce_total_cols( + grid_dim, default_block_size, default_block_size * sizeof(size_type), + exec->get_queue(), slice_num, max_nnz_per_slice.get_const_data(), + block_results.get_data()); + + auto d_result = Array(exec, 1); + + kernel::reduce_total_cols( + 1, default_block_size, default_block_size * sizeof(size_type), + exec->get_queue(), grid_dim, block_results.get_const_data(), + d_result.get_data()); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); @@ -210,7 +1710,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void transpose(std::shared_ptr exec, const matrix::Dense *orig, - matrix::Dense *trans) GKO_NOT_IMPLEMENTED; + matrix::Dense *trans) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::geam( + handle, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, static_cast(nullptr), + trans->get_size()[1], 
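// Editorial note: this transpose path reuses the BLAS extension geam, which
// computes C = alpha * op(A) + beta * op(B); with alpha = 1, beta = 0 and
// op(A) = A^T it degenerates into an out-of-place transpose, so the nullptr
// passed for B is never dereferenced. conj_transpose below is identical
// except that it requests conjtrans.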
trans->get_values(), trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +}; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); @@ -218,7 +1736,25 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, const matrix::Dense *orig, - matrix::Dense *trans) GKO_NOT_IMPLEMENTED; + matrix::Dense *trans) +{ + if (cublas::is_supported::value) { + auto handle = exec->get_cublas_handle(); + { + cublas::pointer_mode_guard pm_guard(handle); + auto alpha = one(); + auto beta = zero(); + cublas::geam( + handle, oneapi::mkl::transpose::conjtrans, + oneapi::mkl::transpose::nontrans, orig->get_size()[0], + orig->get_size()[1], &alpha, orig->get_const_values(), + orig->get_stride(), &beta, static_cast(nullptr), + trans->get_size()[1], trans->get_values(), trans->get_stride()); + } + } else { + GKO_NOT_IMPLEMENTED; + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); From 2e63c8d321082effa6bdec5ecb8927f59c4d7a62 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 24 Feb 2021 16:09:26 +0800 Subject: [PATCH 05/22] dense, prefix_sum and uninitialized_array --- dpcpp/CMakeLists.txt | 1 + dpcpp/components/prefix_sum.dp.cpp | 33 +- dpcpp/components/prefix_sum.dp.hpp | 260 ++++++++ dpcpp/components/uninitialized_array.hpp | 9 +- dpcpp/matrix/dense_kernels.dp.cpp | 247 +++---- dpcpp/test/components/CMakeLists.txt | 1 + dpcpp/test/components/prefix_sum.cpp | 96 +++ dpcpp/test/matrix/CMakeLists.txt | 3 +- dpcpp/test/matrix/dense_kernels.cpp | 806 +++++++++++++++++++++++ dpcpp/test/utils.hpp | 54 ++ 10 files changed, 1372 insertions(+), 138 deletions(-) create mode 100644 dpcpp/components/prefix_sum.dp.hpp create mode 100644 dpcpp/test/components/prefix_sum.cpp create mode 100644 dpcpp/test/matrix/dense_kernels.cpp create mode 100644 dpcpp/test/utils.hpp diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 97cf8a5daf6..b3101d8b2e2 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -60,6 +60,7 @@ target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +target_link_libraries(ginkgo_dpcpp PRIVATE "mkl_sycl;mkl_intel_ilp64;mkl_sequential;mkl_core") target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 4b7f816b381..b4961809a8b 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include "dpcpp/components/prefix_sum.dp.hpp" namespace gko { @@ -45,22 +45,33 @@ namespace dpcpp { namespace components { +constexpr int prefix_sum_block_size = 256; + + template -void prefix_sum(std::shared_ptr exec, IndexType *counts, +void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // TODO actually implement parallel prefix sum - exec->get_queue()->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{1}, [=](sycl::id<1> idx) { - IndexType sum{}; - for (size_type i = 0; i < num_entries; i++) { - sum += std::exchange(counts[i], sum); - } - }); - }); + // prefix_sum should be on the valid array + if (num_entries > 0) { + auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); + Array block_sum_array(exec, num_blocks - 1); + auto block_sums = block_sum_array.get_data(); + start_prefix_sum( + num_blocks, prefix_sum_block_size, 0, exec->get_queue(), + num_entries, counts, block_sums); + // add the total sum of the previous block only when the number of block + // is larger than 1. + if (num_blocks > 1) { + finalize_prefix_sum( + num_blocks, prefix_sum_block_size, 0, exec->get_queue(), + num_entries, counts, block_sums); + } + } } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); + // instantiate for size_type as well, as this is used in the Sellp format template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp new file mode 100644 index 00000000000..6b3498d1dea --- /dev/null +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -0,0 +1,260 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
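Editorial note: the rewritten prefix_sum above replaces the old
single-work-item serial scan with a blocked two-phase scan: start_prefix_sum
computes an exclusive scan within each block of prefix_sum_block_size
elements and records each block's total in block_sums, and
finalize_prefix_sum then adds the accumulated totals of all preceding blocks
to every element. A serial reference of what the pair computes (the same
exclusive scan the removed kernel performed, useful as a test oracle):

    IndexType sum{};
    for (size_type i = 0; i < num_entries; ++i) {
        auto tmp = counts[i];
        counts[i] = sum;  // counts[i] becomes the sum of counts[0..i-1]
        sum += tmp;
    }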
+*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ + + +#include + + +#include + + +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +// #include "common/components/prefix_sum.hpp.inc" +/** + * @internal + * Computes the prefix sum and total sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param total_sum will be set to the total sum of `element` in this subwarp. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note For this function to work on architectures with independent thread + * scheduling, all threads of the subwarp have to execute it. + */ +template +__dpct_inline__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, + ValueType &total_sum, Group subwarp) +{ + prefix_sum = inclusive ? element : zero(); + total_sum = element; +#pragma unroll + // hypercube prefix sum + for (auto step = 1; step < subwarp.size(); step *= 2) { + auto neighbor = subwarp.shfl_xor(total_sum, step); + total_sum += neighbor; + prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0; + } +} + +/** + * @internal + * Computes the prefix sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note All threads of the subwarp have to execute this function for it to work + * (and not dead-lock on newer architectures). + */ +template +__dpct_inline__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, Group subwarp) +{ + ValueType tmp{}; + subwarp_prefix_sum(element, prefix_sum, tmp, subwarp); +} + + +/** + * @internal + * First step of the calculation of a prefix sum. Calculates the prefix sum + * in-place on parts of the array `elements`. + * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array which stores the total sum of each block, requires at + * least `ceildiv(num_elements, block_size) - 1` elements + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, also size of blocks on + * which this kernel calculates the prefix sum in-place + * + * @note To calculate the prefix sum over an array of size bigger than + * `block_size`, `finalize_prefix_sum` has to be used as well. 
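+ *
+ * Usage sketch, mirroring the host wrapper in prefix_sum.dp.cpp: for
+ * `num_blocks = ceildiv(num_elements, block_size)` blocks, `block_sum`
+ * needs `num_blocks - 1` entries, and the finalize step is only required
+ * when more than one block is involved:
+ *
+ *     start_prefix_sum<block_size>(num_blocks, block_size, 0, queue,
+ *                                  num_elements, elements, block_sums);
+ *     if (num_blocks > 1) {
+ *         finalize_prefix_sum<block_size>(num_blocks, block_size, 0, queue,
+ *                                         num_elements, elements,
+ *                                         block_sums);
+ *     }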
+ */ +template +void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, + ValueType *__restrict__ block_sum, + sycl::nd_item<3> item_ct1, + UninitializedArray *prefix_helper) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + const auto element_id = item_ct1.get_local_id(2); + + // do not need to access the last element when exclusive prefix sum + (*prefix_helper)[element_id] = + (tidx + 1 < num_elements) ? elements[tidx] : zero(); + auto this_block = group::this_thread_block(item_ct1); + this_block.sync(); + + // Do a normal reduction +#pragma unroll + for (int i = 1; i < block_size; i <<= 1) { + const auto ai = i * (2 * element_id + 1) - 1; + const auto bi = i * (2 * element_id + 2) - 1; + if (bi < block_size) { + (*prefix_helper)[bi] += (*prefix_helper)[ai]; + } + this_block.sync(); + } + + if (element_id == 0) { + // Store the total sum except the last block + if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) { + block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1]; + } + (*prefix_helper)[block_size - 1] = zero(); + } + + this_block.sync(); + + // Perform the down-sweep phase to get the true prefix sum +#pragma unroll + for (int i = block_size >> 1; i > 0; i >>= 1) { + const auto ai = i * (2 * element_id + 1) - 1; + const auto bi = i * (2 * element_id + 2) - 1; + if (bi < block_size) { + auto tmp = (*prefix_helper)[ai]; + (*prefix_helper)[ai] = (*prefix_helper)[bi]; + (*prefix_helper)[bi] += tmp; + } + this_block.sync(); + } + if (tidx < num_elements) { + elements[tidx] = (*prefix_helper)[element_id]; + } +} + +template +void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_elements, + ValueType *elements, ValueType *block_sum) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + prefix_helper_acc_ct1(cgh); + + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + start_prefix_sum( + num_elements, elements, block_sum, item_ct1, + (UninitializedArray *) + prefix_helper_acc_ct1.get_pointer()); + }); + }); +} + + +/** + * @internal + * Second step of the calculation of a prefix sum. Increases the value of each + * entry of `elements` by the total sum of all preceding blocks. + * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array storing the total sum of each block + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, has to be the same as + * for `start_prefix_sum` + * + * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. 
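+ *
+ * @note (editorial) Each thread recomputes the running total of all
+ * preceding `block_sum` entries with a serial loop, so this step costs
+ * O(num_blocks) additions per element; in effect
+ *
+ *     elements[i] += block_sum[0] + ... + block_sum[group_id - 1]
+ *
+ * which is simple and race-free, if not asymptotically optimal.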
+ */ +template +void finalize_prefix_sum(size_type num_elements, + ValueType *__restrict__ elements, + const ValueType *__restrict__ block_sum, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < num_elements) { + ValueType prefix_block_sum = zero(); + for (size_type i = 0; i < item_ct1.get_group(2); i++) { + prefix_block_sum += block_sum[i]; + } + elements[tidx] += prefix_block_sum; + } +} + +template +void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_elements, + ValueType *elements, const ValueType *block_sum) +{ + stream->submit([&](sycl::handler &cgh) { + auto local_range = block.reverse(); + auto global_range = grid.reverse() * local_range; + + cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + [=](sycl::nd_item<3> item_ct1) { + finalize_prefix_sum( + num_elements, elements, block_sum, item_ct1); + }); + }); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index fb7575bc202..415126b8ed3 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -88,7 +88,7 @@ class UninitializedArray { constexpr __dpct_inline__ ValueType &operator[](size_type pos) const noexcept { - return reinterpret_cast(data_)[pos]; + return data_[pos]; } /** @@ -99,13 +99,14 @@ class UninitializedArray { * * @return a reference to the array entry at the given index. */ - GKO_ATTRIBUTES ValueType &operator[](size_type pos) noexcept + __dpct_inline__ ValueType &operator[](size_type pos) noexcept { - return reinterpret_cast(data_)[pos]; + return data_[pos]; } private: - unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; + // unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size]; + ValueType data_[size]; }; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 36935b1a4d6..c5074b5cc38 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,11 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" -#include -#include - - #include +#include #include @@ -48,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
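Editorial note: the UninitializedArray change above swaps the raw
`unsigned char` backing buffer, which deliberately avoided running
ValueType's constructors, for a plain `ValueType data_[size]` member (and
switches GKO_ATTRIBUTES to __dpct_inline__ for the DPC++ backend). That
removes the reinterpret_casts, but the array is then only truly
"uninitialized" for trivially constructible value types, which is
presumably acceptable for the arithmetic types it holds in local memory.
The replaced idiom, for reference:

    unsigned char data_[sizeof(ValueType) * size];  // storage, no ctors run
    ValueType &operator[](size_type pos) noexcept
    {
        return reinterpret_cast<ValueType *>(data_)[pos];
    }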
#include #include #include +#include #include "core/components/prefix_sum.hpp" @@ -70,7 +68,7 @@ namespace dpcpp { namespace dense { -constexpr auto default_block_size = 512; +constexpr auto default_block_size = 256; // #include "common/matrix/dense_kernels.hpp.inc" @@ -236,18 +234,20 @@ void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work, const auto global_id = thread::get_thread_id(item_ct1); + OutType *tmp_work_array=*tmp_work; auto tmp = zero(); for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { tmp = reduce_op(tmp, get_value(i)); } - (*tmp_work)[local_id] = tmp; + tmp_work_array[local_id] = tmp; - reduce(group::this_thread_block(item_ct1), - static_cast((*tmp_work)), reduce_op); + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, + reduce_op); if (local_id == 0) { - work[thread::get_block_id(item_ct1)] = (*tmp_work)[0]; + work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; } } @@ -267,14 +267,15 @@ void finalize_reduce_computation( for (auto i = local_id; i < size; i += block_size) { tmp = reduce_op(tmp, work[i]); } + ValueType *tmp_work_array=*tmp_work; + tmp_work_array[local_id] = tmp; - (*tmp_work)[local_id] = tmp; - - reduce(group::this_thread_block(item_ct1), - static_cast((*tmp_work)), reduce_op); + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, + reduce_op); if (local_id == 0) { - *result = finalize_op((*tmp_work)[0]); + *result = finalize_op(tmp_work_array[0]); } } @@ -287,12 +288,7 @@ void compute_partial_dot(size_type num_rows, const ValueType *__restrict__ x, UninitializedArray *tmp_work) { compute_partial_reduce( - /* - DPCT1007:4: Migration of this CUDA API is not supported by the Intel(R) - DPC++ Compatibility Tool. 
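// Editorial note: the reductions in this hunk are now spelled out as
// ::gko::kernels::dpcpp::reduce rather than an unqualified reduce. With the
// oneDPL headers dropped from this file, the full qualification pins the
// cooperative-group reduction and keeps std::reduce (and any ADL candidate)
// out of overload resolution. The call shape used throughout:
//
//     ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1),
//                                   tmp_work_array, reduce_op);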
- */ - num_rows, - work, + num_rows, work, [x, stride_x, y, stride_y](size_type i) { return x[i * stride_x] * conj(y[i * stride_y]); }, @@ -416,8 +412,7 @@ void finalize_norm2_computation( finalize_reduce_computation( size, work, result, [](const ValueType &x, const ValueType &y) { return x + y; }, - [](const ValueType &x) { return sycl::sqrt((float)x); }, item_ct1, - tmp_work); + [](const ValueType &x) { return sqrt(x); }, item_ct1, tmp_work); } template @@ -508,9 +503,7 @@ void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, part_result += 1; } } - result[row_idx] = std::reduce( - oneapi::dpl::execution::make_device_policy( - dpct::get_default_queue()), + result[row_idx] = ::gko::kernels::dpcpp::reduce( warp_tile, part_result, [](const size_type &a, const size_type &b) { return a + b; }); } @@ -648,7 +641,7 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, auto warp_tile = group::tiled_partition( group::this_thread_block(item_ct1)); - auto warp_result = reduce( + auto warp_result = ::gko::kernels::dpcpp::reduce( warp_tile, thread_result, [](const size_type &a, const size_type &b) { return max(a, b); }); @@ -745,7 +738,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, auto block_max = (size_type *)dpct_local; reduce_array( - size, nnz_per_row, block_max, + size, nnz_per_row, block_max, item_ct1, [](const size_type &x, const size_type &y) { return max(x, y); }); if (item_ct1.get_local_id(2) == 0) { @@ -795,7 +788,7 @@ void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, } } - auto warp_result = reduce( + auto warp_result = ::gko::kernels::dpcpp::reduce( warp_tile, thread_result, [](const size_type &a, const size_type &b) { return max(a, b); }); @@ -831,7 +824,7 @@ void reduce_total_cols(size_type num_slices, { auto block_result = (size_type *)dpct_local; - reduce_array(num_slices, max_nnz_per_slice, block_result, + reduce_array(num_slices, max_nnz_per_slice, block_result, item_ct1, [](const size_type &x, const size_type &y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { @@ -1292,22 +1285,13 @@ void simple_apply(std::shared_ptr exec, const matrix::Dense *b, matrix::Dense *c) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - { - cublas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - cublas::gemm(handle, oneapi::mkl::transpose::nontrans, - oneapi::mkl::transpose::nontrans, c->get_size()[1], - c->get_size()[0], a->get_size()[1], &alpha, - b->get_const_values(), b->get_stride(), - a->get_const_values(), a->get_stride(), &beta, - c->get_values(), c->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } + using namespace oneapi::mkl; + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + one(), a->get_const_values(), a->get_stride(), + b->get_const_values(), b->get_stride(), zero(), + c->get_values(), c->get_stride()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -1319,17 +1303,14 @@ void apply(std::shared_ptr exec, const matrix::Dense *a, const matrix::Dense *b, const matrix::Dense *beta, matrix::Dense *c) { - if (cublas::is_supported::value) { - cublas::gemm( - exec->get_cublas_handle(), oneapi::mkl::transpose::nontrans, - oneapi::mkl::transpose::nontrans, c->get_size()[1], - c->get_size()[0], a->get_size()[1], alpha->get_const_values(), - b->get_const_values(), 
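// Editorial note: simple_apply now calls oneapi::mkl::blas::row_major::gemm
// directly, which retires the swapped-operand trick above: the row-major
// entry point takes C's dimensions in their natural (rows, cols) order. The
// USM gemm also takes alpha and beta by value on the host, which is why the
// advanced apply below fetches them with exec->copy_val_to_host() instead
// of passing device pointers:
//
//     oneapi::mkl::blas::row_major::gemm(
//         *exec->get_queue(), transpose::nontrans, transpose::nontrans,
//         m, n, k, exec->copy_val_to_host(alpha->get_const_values()),
//         a_vals, lda, b_vals, ldb,
//         exec->copy_val_to_host(beta->get_const_values()), c_vals, ldc);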
b->get_stride(), a->get_const_values(), - a->get_stride(), beta->get_const_values(), c->get_values(), - c->get_stride()); - } else { - GKO_NOT_IMPLEMENTED; - } + using namespace oneapi::mkl; + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); @@ -1341,20 +1322,20 @@ void compute_dot(std::shared_ptr exec, const matrix::Dense *y, matrix::Dense *result) { - if (cublas::is_supported::value) { + if (0) { // TODO: write a custom kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); + dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); } } else { // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified // appropriately constexpr auto work_per_thread = 32; - constexpr auto block_size = 1024; + constexpr auto block_size = default_block_size; constexpr auto work_per_block = work_per_thread * block_size; const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); @@ -1366,7 +1347,8 @@ void compute_dot(std::shared_ptr exec, kernel::compute_partial_dot( grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); + y->get_const_values() + col, y->get_stride(), + work.get_data()); kernel::finalize_dot_computation( 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); @@ -1391,19 +1373,20 @@ void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, matrix::Dense> *result) { - if (cublas::is_supported::value) { + if (0) { for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); + oneapi::mkl::blas::row_major::nrm2( + *exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + result->get_values() + col); } } else { using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately + // // TODO: these are tuning parameters obtained experimentally, once + // // we decide how to handle this uniformly, they should be modified + // // appropriately constexpr auto work_per_thread = 32; - constexpr auto block_size = 1024; + constexpr auto block_size = default_block_size; constexpr auto work_per_block = work_per_thread * block_size; const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); @@ -1414,7 +1397,8 @@ void compute_norm2(std::shared_ptr exec, for (size_type col = 0; col < x->get_size()[1]; ++col) { kernel::compute_partial_norm2( grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), 
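// Editorial note: compute_norm2 reuses the dot-product reduction machinery:
// the partial kernel accumulates squared magnitudes and the finalize kernel
// sums them and applies the square root as its finalize_op, i.e.
//
//     result = sqrt(sum_i |x_i|^2)
//
// Two details from this hunk worth flagging: the finalize step now calls
// sqrt(x) instead of sycl::sqrt((float)x), so double-precision norms are no
// longer truncated through float, and the `if (0)` guards here and in
// compute_dot above keep the oneMKL dot/nrm2 paths compiled but
// unreachable, presumably until the per-column BLAS calls are validated.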
work.get_data()); + x->get_const_values() + col, x->get_stride(), + work.get_data()); kernel::finalize_norm2_computation( 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); @@ -1449,7 +1433,8 @@ void convert_to_coo(std::shared_ptr exec, kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(), num_rows, num_cols, stride, nnz_prefix_sum.get_const_data(), - source->get_const_values(), row_idxs, col_idxs, values); + source->get_const_values(), row_idxs, col_idxs, + values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1470,8 +1455,9 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(default_block_size, config::warp_size); - const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); + const auto rows_per_block = ceildiv(default_block_size, + config::warp_size); const auto grid_dim_nnz = + ceildiv(source->get_size()[0], rows_per_block); kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, exec->get_queue(), num_rows, num_cols, stride, @@ -1482,8 +1468,9 @@ void convert_to_csr(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, source->get_const_values(), - row_ptrs, col_idxs, values); + num_rows, num_cols, stride, + source->get_const_values(), row_ptrs, col_idxs, + values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1549,25 +1536,34 @@ void convert_to_sellp(std::shared_ptr exec, const int slice_num = ceildiv(num_rows, slice_size); auto nnz_per_row = Array(exec, num_rows); + std::cout << "calculate_nonzeros_per_row" << std::endl; calculate_nonzeros_per_row(exec, source, &nnz_per_row); - + exec->synchronize(); + std::cout << "calculate_nonzeros_per_row finish" << std::endl; auto grid_dim = slice_num; if (grid_dim > 0) { + std::cout << "calculate_slice_lengths" << std::endl; kernel::calculate_slice_lengths( grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, - slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), - slice_lengths, slice_sets); + slice_size, slice_num, stride_factor, + nnz_per_row.get_const_data(), slice_lengths, slice_sets); + exec->synchronize(); + std::cout << "calculate_slice_lengths finish" << std::endl; } - + std::cout << "prefix_sum" << std::endl; components::prefix_sum(exec, slice_sets, slice_num + 1); - + // exec->synchronize(); + std::cout << "prefix_sum finish" << std::endl; grid_dim = ceildiv(num_rows, default_block_size); if (grid_dim > 0) { + std::cout << "fill_in_sellp" << std::endl; kernel::fill_in_sellp(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, slice_size, - stride, source->get_const_values(), slice_lengths, - slice_sets, col_idxs, vals); + exec->get_queue(), num_rows, num_cols, + slice_size, stride, source->get_const_values(), + slice_lengths, slice_sets, col_idxs, vals); + exec->synchronize(); + std::cout << "fill_in_sellp finish" << std::endl; } } @@ -1626,7 +1622,8 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, kernel::reduce_max_nnz(1, default_block_size, default_block_size * sizeof(size_type), exec->get_queue(), grid_dim, - block_results.get_const_data(), d_result.get_data()); + block_results.get_const_data(), + d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } @@ -1646,9 +1643,10 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const dim3 
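// Editorial note: the std::cout progress markers and the extra
// exec->synchronize() calls in convert_to_sellp above read as temporary
// debugging instrumentation (narrowing down which kernel launch fails);
// they serialize the queue on every conversion and would normally be
// stripped, or moved behind a debug-logging facility, before this patch is
// merged.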
grid_size(grid_x, 1, 1); if (grid_x > 0) { kernel::count_nnz_per_row( - grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], - source->get_size()[1], source->get_stride(), - source->get_const_values(), result->get_data()); + grid_size, block_size, 0, exec->get_queue(), + source->get_size()[0], source->get_size()[1], + source->get_stride(), source->get_const_values(), + result->get_data()); } } @@ -1678,7 +1676,8 @@ void calculate_total_cols(std::shared_ptr exec, auto max_nnz_per_slice = Array(exec, slice_num); - auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); + auto grid_dim = ceildiv(slice_num * config::warp_size, + default_block_size); kernel::reduce_max_nnz_per_slice( grid_dim, default_block_size, 0, exec->get_queue(), num_rows, @@ -1712,22 +1711,24 @@ void transpose(std::shared_ptr exec, const matrix::Dense *orig, matrix::Dense *trans) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - { - cublas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - cublas::geam( - handle, oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, static_cast(nullptr), - trans->get_size()[1], trans->get_values(), trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } + // if (cublas::is_supported::value) { + // auto handle = exec->get_cublas_handle(); + // { + // cublas::pointer_mode_guard pm_guard(handle); + // auto alpha = one(); + // auto beta = zero(); + // cublas::geam( + // handle, oneapi::mkl::transpose::trans, + // oneapi::mkl::transpose::nontrans, orig->get_size()[0], + // orig->get_size()[1], &alpha, orig->get_const_values(), + // orig->get_stride(), &beta, static_cast(nullptr), trans->get_size()[1], trans->get_values(), + // trans->get_stride()); + // } + // } else { + // GKO_NOT_IMPLEMENTED; + // } + GKO_NOT_IMPLEMENTED; }; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); @@ -1738,22 +1739,24 @@ void conj_transpose(std::shared_ptr exec, const matrix::Dense *orig, matrix::Dense *trans) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - { - cublas::pointer_mode_guard pm_guard(handle); - auto alpha = one(); - auto beta = zero(); - cublas::geam( - handle, oneapi::mkl::transpose::conjtrans, - oneapi::mkl::transpose::nontrans, orig->get_size()[0], - orig->get_size()[1], &alpha, orig->get_const_values(), - orig->get_stride(), &beta, static_cast(nullptr), - trans->get_size()[1], trans->get_values(), trans->get_stride()); - } - } else { - GKO_NOT_IMPLEMENTED; - } + // if (cublas::is_supported::value) { + // auto handle = exec->get_cublas_handle(); + // { + // cublas::pointer_mode_guard pm_guard(handle); + // auto alpha = one(); + // auto beta = zero(); + // cublas::geam( + // handle, oneapi::mkl::transpose::conjtrans, + // oneapi::mkl::transpose::nontrans, orig->get_size()[0], + // orig->get_size()[1], &alpha, orig->get_const_values(), + // orig->get_stride(), &beta, static_cast(nullptr), trans->get_size()[1], trans->get_values(), + // trans->get_stride()); + // } + // } else { + // GKO_NOT_IMPLEMENTED; + // } + GKO_NOT_IMPLEMENTED; } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); diff --git a/dpcpp/test/components/CMakeLists.txt b/dpcpp/test/components/CMakeLists.txt index 87a034a64df..77ad6684840 100644 --- a/dpcpp/test/components/CMakeLists.txt +++ 
b/dpcpp/test/components/CMakeLists.txt @@ -2,3 +2,4 @@ ginkgo_create_test(absolute_array) ginkgo_create_dpcpp_test(cooperative_groups_kernels) ginkgo_create_test(fill_array) ginkgo_create_test(precision_conversion) +ginkgo_create_test(prefix_sum) diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp new file mode 100644 index 00000000000..3e2e7ca9d64 --- /dev/null +++ b/dpcpp/test/components/prefix_sum.cpp @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "dpcpp/test/utils.hpp" + + +namespace { + + +class PrefixSum : public ::testing::Test { +protected: + using index_type = gko::int32; + PrefixSum() + : ref(gko::ReferenceExecutor::create()), + exec(gko::DpcppExecutor::create(0, ref)), + rand(293), + total_size(42793), + vals(ref, total_size), + dvals(exec) + { + std::uniform_int_distribution dist(0, 1000); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + } + dvals = vals; + } + + void test(gko::size_type size) + { + gko::kernels::reference::components::prefix_sum(ref, vals.get_data(), + size); + gko::kernels::dpcpp::components::prefix_sum(exec, dvals.get_data(), + size); + + GKO_ASSERT_ARRAY_EQ(vals, dvals); + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; +}; + + +TEST_F(PrefixSum, SmallEqualsReference) { test(100); } + + +TEST_F(PrefixSum, BigEqualsReference) { test(total_size); } + + +} // namespace diff --git a/dpcpp/test/matrix/CMakeLists.txt b/dpcpp/test/matrix/CMakeLists.txt index af64b693718..ba8f0fb70fe 100644 --- a/dpcpp/test/matrix/CMakeLists.txt +++ b/dpcpp/test/matrix/CMakeLists.txt @@ -1 +1,2 @@ -ginkgo_create_test(csr_kernels) \ No newline at end of file +ginkgo_create_test(csr_kernels) +ginkgo_create_test(dense_kernels) diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp new file mode 100644 index 00000000000..7c65e8b0f84 --- /dev/null +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -0,0 +1,806 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/components/fill_array.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "dpcpp/test/utils.hpp" + + +namespace { + + +class Dense : public ::testing::Test { +protected: + using itype = int; + using vtype = double; + using Mtx = gko::matrix::Dense; + using NormVector = gko::matrix::Dense>; + using Arr = gko::Array; + using ComplexMtx = gko::matrix::Dense>; + + Dense() : rand_engine(15) {} + + void SetUp() + { + ASSERT_GT(gko::DpcppExecutor::get_num_devices("gpu"), 0); + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + } + + void TearDown() + { + if (dpcpp != nullptr) { + ASSERT_NO_THROW(dpcpp->synchronize()); + } + } + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void set_up_vector_data(gko::size_type num_vecs, + bool different_alpha = false) + { + x = gen_mtx(1000, num_vecs); + y = gen_mtx(1000, num_vecs); + if (different_alpha) { + alpha = gen_mtx(1, num_vecs); + } else { + alpha = gko::initialize({2.0}, ref); + } + dx = Mtx::create(dpcpp); + dx->copy_from(x.get()); + dy = Mtx::create(dpcpp); + dy->copy_from(y.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + expected = Mtx::create(ref, gko::dim<2>{1, num_vecs}); + dresult = Mtx::create(dpcpp, gko::dim<2>{1, num_vecs}); + } + + void set_up_apply_data() + { + x = gen_mtx(65, 25); + c_x = gen_mtx(65, 25); + y = gen_mtx(25, 35); + expected = gen_mtx(65, 35); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + square = gen_mtx(x->get_size()[0], x->get_size()[0]); + dx = Mtx::create(dpcpp); + dx->copy_from(x.get()); + dc_x = ComplexMtx::create(dpcpp); + dc_x->copy_from(c_x.get()); + dy = Mtx::create(dpcpp); + dy->copy_from(y.get()); + dresult = Mtx::create(dpcpp); + dresult->copy_from(expected.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + dbeta = Mtx::create(dpcpp); + dbeta->copy_from(beta.get()); + dsquare = Mtx::create(dpcpp); + dsquare->copy_from(square.get()); + + std::vector tmp(x->get_size()[0], 0); + auto rng = std::default_random_engine{}; + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rng); + std::vector tmp2(x->get_size()[1], 0); + std::iota(tmp2.begin(), tmp2.end(), 0); + std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector tmp3(x->get_size()[0] / 10); + std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + for (auto &i : tmp3) { + i = row_dist(rng); + } + rpermute_idxs = + std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + cpermute_idxs = + std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); + rgather_idxs = + std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + } + + std::shared_ptr ref; + std::shared_ptr dpcpp; + + std::ranlux48 rand_engine; + + std::unique_ptr x; + std::unique_ptr c_x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr square; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dc_x; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; + std::unique_ptr dsquare; + std::unique_ptr rpermute_idxs; + std::unique_ptr 
cpermute_idxs; + std::unique_ptr rgather_idxs; +}; + + +TEST_F(Dense, DpcppFillIsEquivalentToRef) +{ + set_up_vector_data(3); + auto result = Mtx::create(ref); + + x->fill(42); + dx->fill(42); + result->copy_from(dx.get()); + + GKO_ASSERT_MTX_NEAR(result, x, 1e-14); +} + + +TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) +{ + using T = double; + auto x = gko::initialize>( + 4, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}, ref); + auto dx = gko::initialize>( + 4, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}, dpcpp); + auto result = Mtx::create(ref); + + x->fill(42); + dx->fill(42); + result->copy_from(dx.get()); + + GKO_ASSERT_MTX_NEAR(result, x, 1e-14); +} + + +TEST_F(Dense, SingleVectorDpcppScaleIsEquivalentToRef) +{ + set_up_vector_data(1); + auto result = Mtx::create(ref); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + result->copy_from(dx.get()); + + GKO_ASSERT_MTX_NEAR(result, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppScaleIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppScaleWithDifferentAlphaIsEquivalentToRef) +{ + set_up_vector_data(20, true); + + x->scale(alpha.get()); + dx->scale(dalpha.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, SingleVectorDpcppAddScaledIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppAddScaledIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppAddScaledWithDifferentAlphaIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->add_scaled(alpha.get(), y.get()); + dx->add_scaled(dalpha.get(), dy.get()); + + GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); +} + + +TEST_F(Dense, AddsScaledDiagIsEquivalentToRef) +{ + auto mat = gen_mtx(532, 532); + gko::Array diag_values(ref, 532); + gko::kernels::reference::components::fill_array(ref, diag_values.get_data(), + 532, Mtx::value_type{2.0}); + auto diag = + gko::matrix::Diagonal::create(ref, 532, diag_values); + alpha = gko::initialize({2.0}, ref); + auto dmat = Mtx::create(dpcpp); + dmat->copy_from(mat.get()); + auto ddiag = gko::matrix::Diagonal::create(dpcpp); + ddiag->copy_from(diag.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + + mat->add_scaled(alpha.get(), diag.get()); + dmat->add_scaled(dalpha.get(), ddiag.get()); + + GKO_ASSERT_MTX_NEAR(mat, dmat, 1e-14); +} + + +TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef) +{ + set_up_vector_data(20); + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->dpcpp, norm_size); + + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14); +} + + 
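// A minimal host-side sketch of the quantity DpcppComputeNorm2IsEquivalentToRef
// checks above: the column-wise Euclidean norm of a row-major matrix with a
// stride. Plain C++ with a hypothetical helper name (not part of this test
// file); double storage is assumed purely for illustration.
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<double> column_norm2(const std::vector<double> &mat,
                                 std::size_t num_rows, std::size_t num_cols,
                                 std::size_t stride)
{
    std::vector<double> result(num_cols);
    for (std::size_t col = 0; col < num_cols; ++col) {
        double sum = 0.0;
        for (std::size_t row = 0; row < num_rows; ++row) {
            const auto value = mat[row * stride + col];
            sum += value * value;  // accumulate the squared column entries
        }
        result[col] = std::sqrt(sum);  // 2-norm of column `col`
    }
    return result;
}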
+TEST_F(Dense, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(y.get(), expected.get()); + dx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); +} + + +TEST_F(Dense, ApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +} + + +TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); + dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +} + + +TEST_F(Dense, IsTransposable) +{ + set_up_apply_data(); + + auto trans = x->transpose(); + auto dtrans = dx->transpose(); + + GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), + static_cast(trans.get()), 0); +} + + +TEST_F(Dense, IsConjugateTransposable) +{ + set_up_apply_data(); + + auto trans = c_x->conj_transpose(); + auto dtrans = dc_x->conj_transpose(); + + GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), + static_cast(trans.get()), 0); +} + + +TEST_F(Dense, ConvertToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo<>::create(ref); + auto dcoo_mtx = gko::matrix::Coo<>::create(dpcpp); + + x->convert_to(coo_mtx.get()); + dx->convert_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo<>::create(ref); + auto dcoo_mtx = gko::matrix::Coo<>::create(dpcpp); + + x->move_to(coo_mtx.get()); + dx->move_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(dpcpp); + + x->convert_to(csr_mtx.get()); + dx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(dpcpp); + + x->move_to(csr_mtx.get()); + dx->move_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell<>::create(ref); + auto dell_mtx = gko::matrix::Ell<>::create(dpcpp); + + 
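    // For orientation, a worked ELL example (illustrative comment only, not
    // part of the conversion under test): ELL keeps at most max_nnz_per_row
    // entries per row in column-major, zero-padded arrays, so row i's k-th
    // stored entry lives at values[k * stride + i]. The dense matrix
    //     [1 0 2]
    //     [0 3 0]
    // with max_nnz_per_row = 2 and stride = 2 becomes
    //     values   = [1, 3, 2, 0]
    //     col_idxs = [0, 1, 2, 0]
    // where the trailing (0, 0) pair pads the shorter second row.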
x->convert_to(ell_mtx.get()); + dx->convert_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); +} + + +TEST_F(Dense, MoveToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell<>::create(ref); + auto dell_mtx = gko::matrix::Ell<>::create(dpcpp); + + x->move_to(ell_mtx.get()); + dx->move_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); +} + + +TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp<>::create(ref); + auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); + + x->convert_to(sellp_mtx.get()); + dx->convert_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); +} + + +TEST_F(Dense, MoveToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp<>::create(ref); + auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); + + x->move_to(sellp_mtx.get()); + dx->move_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); +} + + +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(dpcpp); + auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + +TEST_F(Dense, CountNNZIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type nnz; + gko::size_type dnnz; + + gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz); + gko::kernels::dpcpp::dense::count_nonzeros(dpcpp, dx.get(), &dnnz); + + ASSERT_EQ(nnz, dnnz); +} + + +TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::Array nnz_per_row(ref); + nnz_per_row.resize_and_reset(x->get_size()[0]); + gko::Array dnnz_per_row(dpcpp); + dnnz_per_row.resize_and_reset(dx->get_size()[0]); + + gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), + &nnz_per_row); + gko::kernels::dpcpp::dense::calculate_nonzeros_per_row(dpcpp, dx.get(), + &dnnz_per_row); + + auto tmp = gko::Array(ref, dnnz_per_row); + for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); + } +} + + +TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type max_nnz; + gko::size_type dmax_nnz; + + gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(), + &max_nnz); + gko::kernels::dpcpp::dense::calculate_max_nnz_per_row(dpcpp, dx.get(), + &dmax_nnz); + + ASSERT_EQ(max_nnz, dmax_nnz); +} + + +TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type total_cols; + gko::size_type dtotal_cols; + + gko::kernels::reference::dense::calculate_total_cols( + ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size); + gko::kernels::dpcpp::dense::calculate_total_cols( + dpcpp, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); + + ASSERT_EQ(total_cols, dtotal_cols); +} + + +TEST_F(Dense, CanGatherRows) +{ + set_up_apply_data(); + + auto r_gather = x->row_gather(rgather_idxs.get()); + auto dr_gather = dx->row_gather(rgather_idxs.get()); + + GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); +} + + +TEST_F(Dense, CanGatherRowsIntoDense) +{ + set_up_apply_data(); + auto gather_size = + gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]}; + auto r_gather = Mtx::create(ref, gather_size); + // test make_temporary_clone and non-default stride + 
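    // Allocating dr_gather on the reference executor (instead of dpcpp)
    // forces the device kernel to go through make_temporary_clone, and the
    // stride of num_cols + 2 leaves two unused entries per row, so the
    // kernel must honor the stride rather than assume contiguous storage.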
auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2); + + x->row_gather(rgather_idxs.get(), r_gather.get()); + dx->row_gather(rgather_idxs.get(), dr_gather.get()); + + GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); +} + + +TEST_F(Dense, IsPermutable) +{ + set_up_apply_data(); + + auto permuted = square->permute(rpermute_idxs.get()); + auto dpermuted = dsquare->permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), + static_cast(dpermuted.get()), 0); +} + + +TEST_F(Dense, IsInversePermutable) +{ + set_up_apply_data(); + + auto permuted = square->inverse_permute(rpermute_idxs.get()); + auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), + static_cast(dpermuted.get()), 0); +} + + +TEST_F(Dense, IsRowPermutable) +{ + set_up_apply_data(); + + auto r_permute = x->row_permute(rpermute_idxs.get()); + auto dr_permute = dx->row_permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), + static_cast(dr_permute.get()), 0); +} + + +TEST_F(Dense, IsColPermutable) +{ + set_up_apply_data(); + + auto c_permute = x->column_permute(cpermute_idxs.get()); + auto dc_permute = dx->column_permute(cpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), + static_cast(dc_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseRowPermutable) +{ + set_up_apply_data(); + + auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), + static_cast(d_inverse_r_permute.get()), 0); +} + + +TEST_F(Dense, IsInverseColPermutable) +{ + set_up_apply_data(); + + auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); + auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), + static_cast(d_inverse_c_permute.get()), 0); +} + + +TEST_F(Dense, ExtractDiagonalIsEquivalentToRef) +{ + set_up_apply_data(); + + auto diag = x->extract_diagonal(); + auto ddiag = dx->extract_diagonal(); + + GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0); +} + + +TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + x->compute_absolute_inplace(); + dx->compute_absolute_inplace(); + + GKO_ASSERT_MTX_NEAR(x, dx, 1e-14); +} + + +TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef) +{ + set_up_apply_data(); + + auto abs_x = x->compute_absolute(); + auto dabs_x = dx->compute_absolute(); + + GKO_ASSERT_MTX_NEAR(abs_x, dabs_x, 1e-14); +} + + +TEST_F(Dense, MakeComplexIsEquivalentToRef) +{ + set_up_apply_data(); + + auto complex_x = x->make_complex(); + auto dcomplex_x = dx->make_complex(); + + GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); +} + + +TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto complex_x = ComplexMtx::create(ref, x->get_size()); + x->make_complex(complex_x.get()); + auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size()); + dx->make_complex(dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); +} + + +TEST_F(Dense, GetRealIsEquivalentToRef) +{ + set_up_apply_data(); + + auto real_x = x->get_real(); + auto dreal_x = dx->get_real(); + + GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); +} + + +TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto real_x = Mtx::create(ref, x->get_size()); + x->get_real(real_x.get()); + auto 
dreal_x = Mtx::create(dpcpp, dx->get_size()); + dx->get_real(dreal_x.get()); + + GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); +} + + +TEST_F(Dense, GetImagIsEquivalentToRef) +{ + set_up_apply_data(); + + auto imag_x = x->get_imag(); + auto dimag_x = dx->get_imag(); + + GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); +} + + +TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto imag_x = Mtx::create(ref, x->get_size()); + x->get_imag(imag_x.get()); + auto dimag_x = Mtx::create(dpcpp, dx->get_size()); + dx->get_imag(dimag_x.get()); + + GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); +} + + +} // namespace diff --git a/dpcpp/test/utils.hpp b/dpcpp/test/utils.hpp new file mode 100644 index 00000000000..88d98f0d9f6 --- /dev/null +++ b/dpcpp/test/utils.hpp @@ -0,0 +1,54 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_TEST_UTILS_HPP_ +#define GKO_DPCPP_TEST_UTILS_HPP_ + + +#include "core/test/utils.hpp" + + +#include + + +namespace { + + +// prevent device reset after each test +auto no_reset_exec = + gko::DpcppExecutor::create(0, gko::ReferenceExecutor::create()); + + +} // namespace + + +#endif // GKO_DPCPP_TEST_UTILS_HPP_ From ccc609853574a902d4312d7f59eab80a90fc6b32 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 20 May 2021 11:42:51 +0200 Subject: [PATCH 06/22] use warp_size 32 to check --- dpcpp/base/config.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp index 78fe25978a7..78fdcc2b819 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp @@ -49,6 +49,12 @@ struct config { */ using lane_mask_type = uint64; + + /** + * The number of threads within a CUDA warp. + */ + static constexpr uint32 warp_size = 32; + /** * The bitmask of the entire warp. */ From 8e8f9c7c1802766fc539fdc8913caacdff29a50a Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Thu, 20 May 2021 16:06:36 +0200 Subject: [PATCH 07/22] fix extract_diag, try default, and single test --- dpcpp/base/helper.hpp | 22 ++ dpcpp/components/prefix_sum.dp.hpp | 8 +- dpcpp/components/reduction.dp.hpp | 4 +- dpcpp/matrix/dense_kernels.dp.cpp | 363 +++++++++------------------ dpcpp/test/matrix/dense_kernels.cpp | 374 +++++++++------------------- 5 files changed, 274 insertions(+), 497 deletions(-) diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index c888eb9d99d..f8eee93f25b 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -46,6 +46,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/base/dim3.dp.hpp" +/** + * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those + * kernels which require encoded config but do not need explicit template + * parameter and share memory + * + * @param name_ the name of the host function with config + * @param kernel_ the kernel name + */ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ + sycl::queue *queue, InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ + } + + /** * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those * kernels which require encoded config but do not need explicit template diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 6b3498d1dea..c6f7c7cfb20 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -189,8 +189,8 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> prefix_helper_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; + auto local_range = block.get_range(); + auto global_range = grid.get_range() * local_range; cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), [=](sycl::nd_item<3> item_ct1) { @@ -240,8 +240,8 @@ void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *elements, const ValueType *block_sum) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; + auto local_range = block.get_range(); + auto global_range = grid.get_range() * local_range; cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), [=](sycl::nd_item<3> item_ct1) { diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 4caf46229c8..e47d9038af3 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -216,8 +216,8 @@ void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> block_sum_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; + auto local_range = block.get_range(); + auto global_range = grid.get_range() * local_range; cgh.parallel_for( sycl::nd_range<3>(global_range, local_range), diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index c5074b5cc38..494caff94c2 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
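// A minimal sketch of what the GKO_ENABLE_DEFAULT_HOST macro (added in
// dpcpp/base/helper.hpp above) generates, written out for a hypothetical
// kernel `my_fill`; dim3, size_type and sycl_nd_range are the Ginkgo dpcpp
// helpers used throughout this file. GKO_ENABLE_DEFAULT_HOST(my_fill,
// my_fill) emits a host-side launcher overload with the CUDA-style
// (grid, block, shared_memory, queue, args...) signature that forwards its
// arguments into a SYCL parallel_for:
template <typename ValueType>
void my_fill(size_type n, ValueType *data, ValueType value,
             sycl::nd_item<3> item_ct1);  // the device kernel (assumed)

template <typename... InferredArgs>
void my_fill(dim3 grid, dim3 block, size_t dynamic_shared_memory,
             sycl::queue *queue, InferredArgs... args)
{
    queue->submit([&](sycl::handler &cgh) {
        cgh.parallel_for(sycl_nd_range(grid, block),
                         [=](sycl::nd_item<3> item_ct1) {
                             // dispatches to the device kernel overload
                             my_fill(args..., item_ct1);
                         });
    });
}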
#include "core/components/prefix_sum.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" @@ -88,22 +89,7 @@ void strided_fill(size_type num_rows, size_type num_cols, size_type stride, } } -template -void strided_fill(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type stride, ValueType *mat, ValueType value) -{ - stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - strided_fill(num_rows, num_cols, stride, mat, - value, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(strided_fill, strided_fill) template @@ -132,15 +118,11 @@ void scale(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_x) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - scale(num_rows, num_cols, - num_alpha_cols, alpha, x, - stride_x, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + scale(num_rows, num_cols, num_alpha_cols, alpha, x, + stride_x, item_ct1); + }); }); } @@ -172,15 +154,12 @@ void add_scaled(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_y) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - add_scaled( - num_rows, num_cols, num_alpha_cols, alpha, x, - stride_x, y, stride_y, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + add_scaled(num_rows, num_cols, num_alpha_cols, + alpha, x, stride_x, y, stride_y, + item_ct1); + }); }); } @@ -207,14 +186,10 @@ void add_scaled_diag(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *y, size_type stride_y) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - add_scaled_diag(size, alpha, diag, y, stride_y, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + add_scaled_diag(size, alpha, diag, y, stride_y, item_ct1); + }); }); } @@ -234,7 +209,7 @@ void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work, const auto global_id = thread::get_thread_id(item_ct1); - OutType *tmp_work_array=*tmp_work; + OutType *tmp_work_array = *tmp_work; auto tmp = zero(); for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { tmp = reduce_op(tmp, get_value(i)); @@ -243,8 +218,7 @@ void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work, tmp_work_array[local_id] = tmp; ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, - reduce_op); + tmp_work_array, reduce_op); if (local_id == 0) { work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; @@ -267,12 +241,11 @@ void finalize_reduce_computation( for (auto i 
= local_id; i < size; i += block_size) { tmp = reduce_op(tmp, work[i]); } - ValueType *tmp_work_array=*tmp_work; + ValueType *tmp_work_array = *tmp_work; tmp_work_array[local_id] = tmp; ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, - reduce_op); + tmp_work_array, reduce_op); if (local_id == 0) { *result = finalize_op(tmp_work_array[0]); @@ -309,17 +282,14 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> tmp_work_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - compute_partial_dot( - num_rows, x, stride_x, y, stride_y, work, - item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_dot( + num_rows, x, stride_x, y, stride_y, work, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); }); } @@ -348,10 +318,8 @@ void finalize_dot_computation(dim3 grid, dim3 block, sycl::access::target::local> tmp_work_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { finalize_dot_computation( size, work, result, item_ct1, @@ -388,12 +356,9 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; cgh.parallel_for( - sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_norm2( num_rows, x, stride_x, work, item_ct1, (UninitializedArray, block_size> @@ -427,10 +392,8 @@ void finalize_norm2_computation(dim3 grid, dim3 block, sycl::access::target::local> tmp_work_acc_ct1(cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { finalize_norm2_computation( size, work, result, item_ct1, @@ -472,15 +435,11 @@ void fill_in_coo(dim3 grid, dim3 block, size_t dynamic_shared_memory, IndexType *col_idxs, ValueType *values) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - fill_in_coo(num_rows, num_cols, stride, row_ptrs, - source, row_idxs, col_idxs, values, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + fill_in_coo(num_rows, num_cols, stride, row_ptrs, source, + row_idxs, col_idxs, values, item_ct1); + }); }); } @@ -516,10 +475,7 @@ void count_nnz_per_row(dim3 grid, dim3 block, size_t dynamic_shared_memory, const ValueType *work, IndexType *result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> 
item_ct1) { count_nnz_per_row(num_rows, num_cols, stride, work, result, item_ct1); @@ -556,10 +512,7 @@ void fill_in_csr(dim3 grid, dim3 block, size_t dynamic_shared_memory, IndexType *col_idxs, ValueType *values) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { fill_in_csr(num_rows, num_cols, stride, source, row_ptrs, col_idxs, values, item_ct1); @@ -606,10 +559,7 @@ void fill_in_ell(dim3 grid, dim3 block, size_t dynamic_shared_memory, IndexType *col_ptrs, ValueType *values) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { fill_in_ell(num_rows, num_cols, source_stride, source, max_nnz_per_row, result_stride, @@ -662,16 +612,12 @@ void calculate_slice_lengths(dim3 grid, dim3 block, size_type *slice_lengths, size_type *slice_sets) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - calculate_slice_lengths(num_rows, slice_size, - slice_num, stride_factor, - nnz_per_row, slice_lengths, - slice_sets, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + calculate_slice_lengths(num_rows, slice_size, slice_num, + stride_factor, nnz_per_row, + slice_lengths, slice_sets, item_ct1); + }); }); } @@ -717,16 +663,12 @@ void fill_in_sellp(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type *slice_sets, IndexType *col_idxs, ValueType *vals) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - fill_in_sellp(num_rows, num_cols, slice_size, - stride, source, slice_lengths, - slice_sets, col_idxs, vals, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + fill_in_sellp(num_rows, num_cols, slice_size, stride, source, + slice_lengths, slice_sets, col_idxs, vals, + item_ct1); + }); }); } @@ -755,10 +697,8 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_max_nnz(size, nnz_per_row, result, item_ct1, dpct_local_acc_ct1.get_pointer()); @@ -804,15 +744,11 @@ void reduce_max_nnz_per_slice(dim3 grid, dim3 block, const size_type *nnz_per_row, size_type *result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - reduce_max_nnz_per_slice( - num_rows, slice_size, stride_factor, - nnz_per_row, result, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, 
block), [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz_per_slice(num_rows, slice_size, stride_factor, + nnz_per_row, result, item_ct1); + }); }); } @@ -841,15 +777,12 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - reduce_total_cols( - num_slices, max_nnz_per_slice, result, - item_ct1, dpct_local_acc_ct1.get_pointer()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_total_cols(num_slices, max_nnz_per_slice, result, + item_ct1, dpct_local_acc_ct1.get_pointer()); + }); }); } @@ -878,15 +811,11 @@ void symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - symm_permute(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + symm_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, + result, stride_result, item_ct1); + }); }); } @@ -915,15 +844,11 @@ void inv_symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *result, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - inv_symm_permute(num_rows, num_cols, perm_idxs, - orig, stride_orig, result, - stride_result, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + inv_symm_permute(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, item_ct1); + }); }); } @@ -952,15 +877,11 @@ void row_gather(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - row_gather(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + row_gather(num_rows, num_cols, perm_idxs, orig, stride_orig, + result, stride_result, item_ct1); + }); }); } @@ -989,15 +910,11 @@ void column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - column_permute(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, - item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + column_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, + result, stride_result, item_ct1); + }); }); } @@ -1026,10 +943,7 @@ void 
inverse_row_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *result, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { inverse_row_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, result, @@ -1064,26 +978,21 @@ void inverse_column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *result, size_type stride_result) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - inverse_column_permute( - num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + inverse_column_permute(num_rows, num_cols, perm_idxs, orig, + stride_orig, result, stride_result, + item_ct1); + }); }); } - template void extract_diagonal(size_type problem_size, const ValueType *__restrict__ orig, size_type stride_orig, ValueType *__restrict__ diag, sycl::nd_item<3> item_ct1) { - const auto tidx = thread::get_thread_id_flat(item_ct1); - + const auto tidx = thread::get_thread_id_flat(item_ct1); if (tidx < problem_size) { diag[tidx] = orig[tidx * stride_orig + tidx]; } @@ -1096,10 +1005,7 @@ void extract_diagonal(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *diag) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { extract_diagonal(problem_size, orig, stride_orig, diag, item_ct1); @@ -1117,7 +1023,7 @@ void inplace_absolute_dense(size_type num_rows, size_type num_cols, auto row = tidx / num_cols; auto col = tidx % num_cols; if (row < num_rows) { - data[row * stride + col] = dpcpp::abs(data[row * stride + col]); + data[row * stride + col] = std::abs(data[row * stride + col]); } } @@ -1128,10 +1034,7 @@ void inplace_absolute_dense(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { inplace_absolute_dense(num_rows, num_cols, data, stride, item_ct1); @@ -1151,7 +1054,7 @@ void outplace_absolute_dense(size_type num_rows, size_type num_cols, auto row = tidx / num_cols; auto col = tidx % num_cols; if (row < num_rows) { - out[row * stride_out + col] = dpcpp::abs(in[row * stride_in + col]); + out[row * stride_out + col] = std::abs(in[row * stride_in + col]); } } @@ -1164,15 +1067,11 @@ void outplace_absolute_dense(dim3 grid, dim3 block, size_type stride_out) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - outplace_absolute_dense(num_rows, num_cols, in, - stride_in, out, stride_out, - item_ct1); - }); + cgh.parallel_for( 
+ sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + outplace_absolute_dense(num_rows, num_cols, in, stride_in, out, + stride_out, item_ct1); + }); }); } @@ -1198,10 +1097,7 @@ void make_complex(dim3 grid, dim3 block, size_t dynamic_shared_memory, size_type stride_out) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { make_complex(num_rows, num_cols, in, stride_in, out, stride_out, item_ct1); @@ -1231,10 +1127,7 @@ void get_real(dim3 grid, dim3 block, size_t dynamic_shared_memory, remove_complex *out, size_type stride_out) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { get_real(num_rows, num_cols, in, stride_in, out, stride_out, item_ct1); @@ -1264,10 +1157,7 @@ void get_imag(dim3 grid, dim3 block, size_t dynamic_shared_memory, remove_complex *out, size_type stride_out) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.reverse(); - auto global_range = grid.reverse() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { get_imag(num_rows, num_cols, in, stride_in, out, stride_out, item_ct1); @@ -1288,10 +1178,9 @@ void simple_apply(std::shared_ptr exec, using namespace oneapi::mkl; oneapi::mkl::blas::row_major::gemm( *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - one(), a->get_const_values(), a->get_stride(), - b->get_const_values(), b->get_stride(), zero(), - c->get_values(), c->get_stride()); + c->get_size()[0], c->get_size()[1], a->get_size()[1], one(), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), zero(), c->get_values(), c->get_stride()); } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -1347,8 +1236,7 @@ void compute_dot(std::shared_ptr exec, kernel::compute_partial_dot( grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - work.get_data()); + y->get_const_values() + col, y->get_stride(), work.get_data()); kernel::finalize_dot_computation( 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); @@ -1397,8 +1285,7 @@ void compute_norm2(std::shared_ptr exec, for (size_type col = 0; col < x->get_size()[1]; ++col) { kernel::compute_partial_norm2( grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - work.get_data()); + x->get_const_values() + col, x->get_stride(), work.get_data()); kernel::finalize_norm2_computation( 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); @@ -1433,8 +1320,7 @@ void convert_to_coo(std::shared_ptr exec, kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(), num_rows, num_cols, stride, nnz_prefix_sum.get_const_data(), - source->get_const_values(), row_idxs, col_idxs, - values); + source->get_const_values(), row_idxs, col_idxs, values); } 
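// A sequential sketch of the three-step strategy convert_to_coo implements
// with the kernels above: count the nonzeros per row, run a prefix sum over
// the counts to get each row's write offset, then fill the index/value
// arrays. Plain C++ with double/int types and a hypothetical helper name,
// for illustration only.
#include <vector>

struct coo_data {
    std::vector<int> row_idxs, col_idxs;
    std::vector<double> values;
};

coo_data dense_to_coo(const std::vector<double> &mat, int num_rows,
                      int num_cols, int stride)
{
    // step 1: count the nonzeros of each row
    std::vector<int> offsets(num_rows + 1, 0);
    for (int row = 0; row < num_rows; ++row) {
        for (int col = 0; col < num_cols; ++col) {
            offsets[row + 1] += (mat[row * stride + col] != 0.0);
        }
    }
    // step 2: prefix sum turns the counts into per-row write offsets
    for (int row = 0; row < num_rows; ++row) {
        offsets[row + 1] += offsets[row];
    }
    // step 3: every row writes its entries starting at its own offset,
    // which is what makes the device version safely parallel over rows
    coo_data result;
    result.row_idxs.resize(offsets[num_rows]);
    result.col_idxs.resize(offsets[num_rows]);
    result.values.resize(offsets[num_rows]);
    for (int row = 0; row < num_rows; ++row) {
        auto idx = offsets[row];
        for (int col = 0; col < num_cols; ++col) {
            const auto val = mat[row * stride + col];
            if (val != 0.0) {
                result.row_idxs[idx] = row;
                result.col_idxs[idx] = col;
                result.values[idx] = val;
                ++idx;
            }
        }
    }
    return result;
}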
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1455,9 +1341,8 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(default_block_size, - config::warp_size); const auto grid_dim_nnz = - ceildiv(source->get_size()[0], rows_per_block); + const auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, exec->get_queue(), num_rows, num_cols, stride, @@ -1468,9 +1353,8 @@ void convert_to_csr(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, default_block_size); kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, - source->get_const_values(), row_ptrs, col_idxs, - values); + num_rows, num_cols, stride, source->get_const_values(), + row_ptrs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1546,8 +1430,8 @@ void convert_to_sellp(std::shared_ptr exec, std::cout << "calculate_slice_lengths" << std::endl; kernel::calculate_slice_lengths( grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, - slice_size, slice_num, stride_factor, - nnz_per_row.get_const_data(), slice_lengths, slice_sets); + slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), + slice_lengths, slice_sets); exec->synchronize(); std::cout << "calculate_slice_lengths finish" << std::endl; } @@ -1559,9 +1443,9 @@ void convert_to_sellp(std::shared_ptr exec, if (grid_dim > 0) { std::cout << "fill_in_sellp" << std::endl; kernel::fill_in_sellp(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, - slice_size, stride, source->get_const_values(), - slice_lengths, slice_sets, col_idxs, vals); + exec->get_queue(), num_rows, num_cols, slice_size, + stride, source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); exec->synchronize(); std::cout << "fill_in_sellp finish" << std::endl; } @@ -1622,8 +1506,7 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, kernel::reduce_max_nnz(1, default_block_size, default_block_size * sizeof(size_type), exec->get_queue(), grid_dim, - block_results.get_const_data(), - d_result.get_data()); + block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } @@ -1643,10 +1526,9 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const dim3 grid_size(grid_x, 1, 1); if (grid_x > 0) { kernel::count_nnz_per_row( - grid_size, block_size, 0, exec->get_queue(), - source->get_size()[0], source->get_size()[1], - source->get_stride(), source->get_const_values(), - result->get_data()); + grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], + source->get_size()[1], source->get_stride(), + source->get_const_values(), result->get_data()); } } @@ -1676,8 +1558,7 @@ void calculate_total_cols(std::shared_ptr exec, auto max_nnz_per_slice = Array(exec, slice_num); - auto grid_dim = ceildiv(slice_num * config::warp_size, - default_block_size); + auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); kernel::reduce_max_nnz_per_slice( grid_dim, default_block_size, 0, exec->get_queue(), num_rows, diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 7c65e8b0f84..3cd080313cf 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -59,17 +59,21 @@ namespace { class Dense : public ::testing::Test { protected: 
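    // Note on the types chosen below: not every DPC++ device supports
    // double precision, so a single-precision build
    // (GINKGO_DPCPP_SINGLE_MODE) switches vtype to float, and the
    // hard-coded 1e-14 tolerances give way to the precision-dependent
    // r<vtype>::value.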
using itype = int; +#if GINKGO_DPCPP_SINGLE_MODE + using vtype = float; +#else using vtype = double; +#endif // GINKGO_DPCPP_SINGLE_MODE using Mtx = gko::matrix::Dense; using NormVector = gko::matrix::Dense>; using Arr = gko::Array; - using ComplexMtx = gko::matrix::Dense>; + // using ComplexMtx = gko::matrix::Dense>; Dense() : rand_engine(15) {} void SetUp() { - ASSERT_GT(gko::DpcppExecutor::get_num_devices("gpu"), 0); + ASSERT_GT(gko::DpcppExecutor::get_num_devices("all"), 0); ref = gko::ReferenceExecutor::create(); dpcpp = gko::DpcppExecutor::create(0, ref); } @@ -113,16 +117,15 @@ class Dense : public ::testing::Test { void set_up_apply_data() { x = gen_mtx(65, 25); - c_x = gen_mtx(65, 25); + // c_x = gen_mtx(65, 25); y = gen_mtx(25, 35); expected = gen_mtx(65, 35); alpha = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); - square = gen_mtx(x->get_size()[0], x->get_size()[0]); dx = Mtx::create(dpcpp); dx->copy_from(x.get()); - dc_x = ComplexMtx::create(dpcpp); - dc_x->copy_from(c_x.get()); + // dc_x = ComplexMtx::create(dpcpp); + // dc_x->copy_from(c_x.get()); dy = Mtx::create(dpcpp); dy->copy_from(y.get()); dresult = Mtx::create(dpcpp); @@ -131,8 +134,6 @@ class Dense : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Mtx::create(dpcpp); dbeta->copy_from(beta.get()); - dsquare = Mtx::create(dpcpp); - dsquare->copy_from(square.get()); std::vector tmp(x->get_size()[0], 0); auto rng = std::default_random_engine{}; @@ -141,17 +142,14 @@ class Dense : public ::testing::Test { std::vector tmp2(x->get_size()[1], 0); std::iota(tmp2.begin(), tmp2.end(), 0); std::shuffle(tmp2.begin(), tmp2.end(), rng); - std::vector tmp3(x->get_size()[0] / 10); - std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); - for (auto &i : tmp3) { - i = row_dist(rng); - } rpermute_idxs = std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + drpermute_idxs = + std::unique_ptr(new Arr{dpcpp, tmp.begin(), tmp.end()}); cpermute_idxs = std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); - rgather_idxs = - std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + dcpermute_idxs = + std::unique_ptr(new Arr{dpcpp, tmp2.begin(), tmp2.end()}); } std::shared_ptr ref; @@ -160,22 +158,21 @@ class Dense : public ::testing::Test { std::ranlux48 rand_engine; std::unique_ptr x; - std::unique_ptr c_x; + // std::unique_ptr c_x; std::unique_ptr y; std::unique_ptr alpha; std::unique_ptr beta; std::unique_ptr expected; - std::unique_ptr square; std::unique_ptr dresult; std::unique_ptr dx; - std::unique_ptr dc_x; + // std::unique_ptr dc_x; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; - std::unique_ptr dsquare; std::unique_ptr rpermute_idxs; + std::unique_ptr drpermute_idxs; std::unique_ptr cpermute_idxs; - std::unique_ptr rgather_idxs; + std::unique_ptr dcpermute_idxs; }; @@ -188,13 +185,13 @@ TEST_F(Dense, DpcppFillIsEquivalentToRef) dx->fill(42); result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, 1e-14); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) { - using T = double; + using T = vtype; auto x = gko::initialize>( 4, {I{1.0, 2.0}, I{3.0, 4.0}, I{5.0, 6.0}}, ref); auto dx = gko::initialize>( @@ -205,7 +202,7 @@ TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) dx->fill(42); result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, 1e-14); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } @@ -218,7 +215,7 @@ TEST_F(Dense, SingleVectorDpcppScaleIsEquivalentToRef) dx->scale(dalpha.get()); 
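    // copy the device result back to a host-side matrix for the comparison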
result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, 1e-14); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } @@ -229,7 +226,7 @@ TEST_F(Dense, MultipleVectorDpcppScaleIsEquivalentToRef) x->scale(alpha.get()); dx->scale(dalpha.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -240,7 +237,7 @@ TEST_F(Dense, MultipleVectorDpcppScaleWithDifferentAlphaIsEquivalentToRef) x->scale(alpha.get()); dx->scale(dalpha.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -251,7 +248,7 @@ TEST_F(Dense, SingleVectorDpcppAddScaledIsEquivalentToRef) x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -262,7 +259,7 @@ TEST_F(Dense, MultipleVectorDpcppAddScaledIsEquivalentToRef) x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -273,7 +270,7 @@ TEST_F(Dense, MultipleVectorDpcppAddScaledWithDifferentAlphaIsEquivalentToRef) x->add_scaled(alpha.get(), y.get()); dx->add_scaled(dalpha.get(), dy.get()); - GKO_ASSERT_MTX_NEAR(dx, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx, x, r::value); } @@ -296,7 +293,7 @@ TEST_F(Dense, AddsScaledDiagIsEquivalentToRef) mat->add_scaled(alpha.get(), diag.get()); dmat->add_scaled(dalpha.get(), ddiag.get()); - GKO_ASSERT_MTX_NEAR(mat, dmat, 1e-14); + GKO_ASSERT_MTX_NEAR(mat, dmat, r::value); } @@ -307,7 +304,7 @@ TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef) x->compute_dot(y.get(), expected.get()); dx->compute_dot(dy.get(), dresult.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); } @@ -318,7 +315,7 @@ TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) x->compute_dot(y.get(), expected.get()); dx->compute_dot(dy.get(), dresult.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); } @@ -332,7 +329,7 @@ TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef) x->compute_norm2(norm_expected.get()); dx->compute_norm2(dnorm.get()); - GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14); + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); } @@ -343,7 +340,7 @@ TEST_F(Dense, SimpleApplyIsEquivalentToRef) x->apply(y.get(), expected.get()); dx->apply(dy.get(), dresult.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); } @@ -354,186 +351,186 @@ TEST_F(Dense, AdvancedApplyIsEquivalentToRef) x->apply(alpha.get(), y.get(), beta.get(), expected.get()); dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); - GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14); + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); } -TEST_F(Dense, ApplyToComplexIsEquivalentToRef) -{ - set_up_apply_data(); - auto complex_b = gen_mtx(25, 1); - auto dcomplex_b = ComplexMtx::create(dpcpp); - dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(65, 1); - auto dcomplex_x = ComplexMtx::create(dpcpp); - dcomplex_x->copy_from(complex_x.get()); +// TEST_F(Dense, ApplyToComplexIsEquivalentToRef) +// { +// set_up_apply_data(); +// auto complex_b = gen_mtx(25, 1); +// auto dcomplex_b = ComplexMtx::create(dpcpp); +// dcomplex_b->copy_from(complex_b.get()); +// auto complex_x = gen_mtx(65, 1); +// auto dcomplex_x = ComplexMtx::create(dpcpp); +// dcomplex_x->copy_from(complex_x.get()); - x->apply(complex_b.get(), complex_x.get()); - 
dx->apply(dcomplex_b.get(), dcomplex_x.get()); +// x->apply(complex_b.get(), complex_x.get()); +// dx->apply(dcomplex_b.get(), dcomplex_x.get()); - GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); -} +// GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +// } -TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) -{ - set_up_apply_data(); - auto complex_b = gen_mtx(25, 1); - auto dcomplex_b = ComplexMtx::create(dpcpp); - dcomplex_b->copy_from(complex_b.get()); - auto complex_x = gen_mtx(65, 1); - auto dcomplex_x = ComplexMtx::create(dpcpp); - dcomplex_x->copy_from(complex_x.get()); +// TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) +// { +// set_up_apply_data(); +// auto complex_b = gen_mtx(25, 1); +// auto dcomplex_b = ComplexMtx::create(dpcpp); +// dcomplex_b->copy_from(complex_b.get()); +// auto complex_x = gen_mtx(65, 1); +// auto dcomplex_x = ComplexMtx::create(dpcpp); +// dcomplex_x->copy_from(complex_x.get()); - x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); - dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); +// x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); +// dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); - GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); -} +// GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); +// } -TEST_F(Dense, IsTransposable) -{ - set_up_apply_data(); +// TEST_F(Dense, IsTransposable) +// { +// set_up_apply_data(); - auto trans = x->transpose(); - auto dtrans = dx->transpose(); +// auto trans = x->transpose(); +// auto dtrans = dx->transpose(); - GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), - static_cast(trans.get()), 0); -} +// GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), +// static_cast(trans.get()), 0); +// } -TEST_F(Dense, IsConjugateTransposable) -{ - set_up_apply_data(); +// TEST_F(Dense, IsConjugateTransposable) +// { +// set_up_apply_data(); - auto trans = c_x->conj_transpose(); - auto dtrans = dc_x->conj_transpose(); +// auto trans = c_x->conj_transpose(); +// auto dtrans = dc_x->conj_transpose(); - GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), - static_cast(trans.get()), 0); -} +// GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), +// static_cast(trans.get()), 0); +// } TEST_F(Dense, ConvertToCooIsEquivalentToRef) { set_up_apply_data(); - auto coo_mtx = gko::matrix::Coo<>::create(ref); - auto dcoo_mtx = gko::matrix::Coo<>::create(dpcpp); + auto coo_mtx = gko::matrix::Coo::create(ref); + auto dcoo_mtx = gko::matrix::Coo::create(dpcpp); x->convert_to(coo_mtx.get()); dx->convert_to(dcoo_mtx.get()); ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); } TEST_F(Dense, MoveToCooIsEquivalentToRef) { set_up_apply_data(); - auto coo_mtx = gko::matrix::Coo<>::create(ref); - auto dcoo_mtx = gko::matrix::Coo<>::create(dpcpp); + auto coo_mtx = gko::matrix::Coo::create(ref); + auto dcoo_mtx = gko::matrix::Coo::create(dpcpp); x->move_to(coo_mtx.get()); dx->move_to(dcoo_mtx.get()); ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), coo_mtx->get_num_stored_elements()); - GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); } TEST_F(Dense, ConvertToCsrIsEquivalentToRef) { set_up_apply_data(); - auto csr_mtx = gko::matrix::Csr<>::create(ref); - auto dcsr_mtx = gko::matrix::Csr<>::create(dpcpp); + auto csr_mtx = 
gko::matrix::Csr::create(ref); + auto dcsr_mtx = gko::matrix::Csr::create(dpcpp); x->convert_to(csr_mtx.get()); dx->convert_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); } TEST_F(Dense, MoveToCsrIsEquivalentToRef) { set_up_apply_data(); - auto csr_mtx = gko::matrix::Csr<>::create(ref); - auto dcsr_mtx = gko::matrix::Csr<>::create(dpcpp); + auto csr_mtx = gko::matrix::Csr::create(ref); + auto dcsr_mtx = gko::matrix::Csr::create(dpcpp); x->move_to(csr_mtx.get()); dx->move_to(dcsr_mtx.get()); - GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); } TEST_F(Dense, ConvertToEllIsEquivalentToRef) { set_up_apply_data(); - auto ell_mtx = gko::matrix::Ell<>::create(ref); - auto dell_mtx = gko::matrix::Ell<>::create(dpcpp); + auto ell_mtx = gko::matrix::Ell::create(ref); + auto dell_mtx = gko::matrix::Ell::create(dpcpp); x->convert_to(ell_mtx.get()); dx->convert_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); } TEST_F(Dense, MoveToEllIsEquivalentToRef) { set_up_apply_data(); - auto ell_mtx = gko::matrix::Ell<>::create(ref); - auto dell_mtx = gko::matrix::Ell<>::create(dpcpp); + auto ell_mtx = gko::matrix::Ell::create(ref); + auto dell_mtx = gko::matrix::Ell::create(dpcpp); x->move_to(ell_mtx.get()); dx->move_to(dell_mtx.get()); - GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), 1e-14); + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); } -TEST_F(Dense, ConvertToSellpIsEquivalentToRef) -{ - set_up_apply_data(); - auto sellp_mtx = gko::matrix::Sellp<>::create(ref); - auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +// TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +// { +// set_up_apply_data(); +// auto sellp_mtx = gko::matrix::Sellp<>::create(ref); +// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); - x->convert_to(sellp_mtx.get()); - dx->convert_to(dsellp_mtx.get()); +// x->convert_to(sellp_mtx.get()); +// dx->convert_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); -} +// GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6); +// } -TEST_F(Dense, MoveToSellpIsEquivalentToRef) -{ - set_up_apply_data(); - auto sellp_mtx = gko::matrix::Sellp<>::create(ref); - auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +// TEST_F(Dense, MoveToSellpIsEquivalentToRef) +// { +// set_up_apply_data(); +// auto sellp_mtx = gko::matrix::Sellp<>::create(ref); +// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); - x->move_to(sellp_mtx.get()); - dx->move_to(dsellp_mtx.get()); +// x->move_to(sellp_mtx.get()); +// dx->move_to(dsellp_mtx.get()); - GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-14); -} +// GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6); +// } -TEST_F(Dense, ConvertsEmptyToSellp) -{ - auto dempty_mtx = Mtx::create(dpcpp); - auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +// TEST_F(Dense, ConvertsEmptyToSellp) +// { +// auto dempty_mtx = Mtx::create(dpcpp); +// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); - dempty_mtx->convert_to(dsellp_mtx.get()); +// dempty_mtx->convert_to(dsellp_mtx.get()); - ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); - ASSERT_FALSE(dsellp_mtx->get_size()); -} +// ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), +// 0); ASSERT_FALSE(dsellp_mtx->get_size()); +// } TEST_F(Dense, 
CountNNZIsEquivalentToRef) @@ -599,63 +596,12 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) } -TEST_F(Dense, CanGatherRows) -{ - set_up_apply_data(); - - auto r_gather = x->row_gather(rgather_idxs.get()); - auto dr_gather = dx->row_gather(rgather_idxs.get()); - - GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); -} - - -TEST_F(Dense, CanGatherRowsIntoDense) -{ - set_up_apply_data(); - auto gather_size = - gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]}; - auto r_gather = Mtx::create(ref, gather_size); - // test make_temporary_clone and non-default stride - auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2); - - x->row_gather(rgather_idxs.get(), r_gather.get()); - dx->row_gather(rgather_idxs.get(), dr_gather.get()); - - GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); -} - - -TEST_F(Dense, IsPermutable) -{ - set_up_apply_data(); - - auto permuted = square->permute(rpermute_idxs.get()); - auto dpermuted = dsquare->permute(rpermute_idxs.get()); - - GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), - static_cast(dpermuted.get()), 0); -} - - -TEST_F(Dense, IsInversePermutable) -{ - set_up_apply_data(); - - auto permuted = square->inverse_permute(rpermute_idxs.get()); - auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get()); - - GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), - static_cast(dpermuted.get()), 0); -} - - TEST_F(Dense, IsRowPermutable) { set_up_apply_data(); auto r_permute = x->row_permute(rpermute_idxs.get()); - auto dr_permute = dx->row_permute(rpermute_idxs.get()); + auto dr_permute = dx->row_permute(drpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), static_cast(dr_permute.get()), 0); @@ -667,7 +613,7 @@ TEST_F(Dense, IsColPermutable) set_up_apply_data(); auto c_permute = x->column_permute(cpermute_idxs.get()); - auto dc_permute = dx->column_permute(cpermute_idxs.get()); + auto dc_permute = dx->column_permute(dcpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), static_cast(dc_permute.get()), 0); @@ -679,7 +625,7 @@ TEST_F(Dense, IsInverseRowPermutable) set_up_apply_data(); auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); - auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), static_cast(d_inverse_r_permute.get()), 0); @@ -691,7 +637,7 @@ TEST_F(Dense, IsInverseColPermutable) set_up_apply_data(); auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); - auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get()); + auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), static_cast(d_inverse_c_permute.get()), 0); @@ -716,7 +662,7 @@ TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef) x->compute_absolute_inplace(); dx->compute_absolute_inplace(); - GKO_ASSERT_MTX_NEAR(x, dx, 1e-14); + GKO_ASSERT_MTX_NEAR(x, dx, r::value); } @@ -727,79 +673,7 @@ TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef) auto abs_x = x->compute_absolute(); auto dabs_x = dx->compute_absolute(); - GKO_ASSERT_MTX_NEAR(abs_x, dabs_x, 1e-14); -} - - -TEST_F(Dense, MakeComplexIsEquivalentToRef) -{ - set_up_apply_data(); - - auto complex_x = x->make_complex(); - auto dcomplex_x = dx->make_complex(); - - GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); -} - - -TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef) -{ - 
set_up_apply_data(); - - auto complex_x = ComplexMtx::create(ref, x->get_size()); - x->make_complex(complex_x.get()); - auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size()); - dx->make_complex(dcomplex_x.get()); - - GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); -} - - -TEST_F(Dense, GetRealIsEquivalentToRef) -{ - set_up_apply_data(); - - auto real_x = x->get_real(); - auto dreal_x = dx->get_real(); - - GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); -} - - -TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef) -{ - set_up_apply_data(); - - auto real_x = Mtx::create(ref, x->get_size()); - x->get_real(real_x.get()); - auto dreal_x = Mtx::create(dpcpp, dx->get_size()); - dx->get_real(dreal_x.get()); - - GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); -} - - -TEST_F(Dense, GetImagIsEquivalentToRef) -{ - set_up_apply_data(); - - auto imag_x = x->get_imag(); - auto dimag_x = dx->get_imag(); - - GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); -} - - -TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef) -{ - set_up_apply_data(); - - auto imag_x = Mtx::create(ref, x->get_size()); - x->get_imag(imag_x.get()); - auto dimag_x = Mtx::create(dpcpp, dx->get_size()); - dx->get_imag(dimag_x.get()); - - GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); + GKO_ASSERT_MTX_NEAR(abs_x, dabs_x, r::value); } From 9e54bd2f5e7a07b977b463614882af7330ce627b Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Fri, 21 May 2021 14:31:26 +0200 Subject: [PATCH 08/22] use simple macro --- dpcpp/matrix/dense_kernels.dp.cpp | 319 +++--------------------------- 1 file changed, 24 insertions(+), 295 deletions(-) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 494caff94c2..58199a221d6 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -92,14 +92,12 @@ void strided_fill(size_type num_rows, size_type num_cols, size_type stride, GKO_ENABLE_DEFAULT_HOST(strided_fill, strided_fill) -template +template void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, const ValueType *__restrict__ alpha, ValueType *__restrict__ x, size_type stride_x, sycl::nd_item<3> item_ct1) { - constexpr auto warps_per_block = block_size / config::warp_size; - const auto global_id = - thread::get_thread_id(item_ct1); + const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row_id = global_id / num_cols; const auto col_id = global_id % num_cols; const auto alpha_id = num_alpha_cols == 1 ? 
0 : col_id; @@ -111,32 +109,16 @@ void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, } } -template -void scale(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type num_alpha_cols, const ValueType *alpha, ValueType *x, - size_type stride_x) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - scale(num_rows, num_cols, num_alpha_cols, alpha, x, - stride_x, item_ct1); - }); - }); -} - +GKO_ENABLE_DEFAULT_HOST(scale, scale) -template +template void add_scaled(size_type num_rows, size_type num_cols, size_type num_alpha_cols, const ValueType *__restrict__ alpha, const ValueType *__restrict__ x, size_type stride_x, ValueType *__restrict__ y, size_type stride_y, sycl::nd_item<3> item_ct1) { - constexpr auto warps_per_block = block_size / config::warp_size; - const auto global_id = - thread::get_thread_id(item_ct1); + const auto global_id = thread::get_thread_id_flat(item_ct1); const auto row_id = global_id / num_cols; const auto col_id = global_id % num_cols; const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; @@ -146,22 +128,7 @@ void add_scaled(size_type num_rows, size_type num_cols, } } -template -void add_scaled(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type num_alpha_cols, const ValueType *alpha, - const ValueType *x, size_type stride_x, ValueType *y, - size_type stride_y) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - add_scaled(num_rows, num_cols, num_alpha_cols, - alpha, x, stride_x, y, stride_y, - item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(add_scaled, add_scaled) template @@ -179,19 +146,7 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, y[tidx * stride_y + tidx] += alpha[0] * diag[tidx]; } -template -void add_scaled_diag(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type size, - const ValueType *alpha, const ValueType *diag, - ValueType *y, size_type stride_y) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - add_scaled_diag(size, alpha, diag, y, stride_y, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) template tmp_work_acc_ct1(cgh); - cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_dot( @@ -318,7 +272,6 @@ void finalize_dot_computation(dim3 grid, dim3 block, sycl::access::target::local> tmp_work_acc_ct1(cgh); - cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { finalize_dot_computation( @@ -356,7 +309,6 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); - cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_norm2( @@ -427,21 +379,7 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, } } -template -void fill_in_coo(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type stride, const size_type *row_ptrs, - const ValueType *source, IndexType *row_idxs, - IndexType *col_idxs, ValueType *values) -{ - stream->submit([&](sycl::handler &cgh) 
{ - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - fill_in_coo(num_rows, num_cols, stride, row_ptrs, source, - row_idxs, col_idxs, values, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) template @@ -505,20 +443,7 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, } } -template -void fill_in_csr(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type stride, const ValueType *source, IndexType *row_ptrs, - IndexType *col_idxs, ValueType *values) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - fill_in_csr(num_rows, num_cols, stride, source, - row_ptrs, col_idxs, values, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) template @@ -551,22 +476,7 @@ void fill_in_ell(size_type num_rows, size_type num_cols, } } -template -void fill_in_ell(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type source_stride, const ValueType *source, - size_type max_nnz_per_row, size_type result_stride, - IndexType *col_ptrs, ValueType *values) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - fill_in_ell(num_rows, num_cols, source_stride, - source, max_nnz_per_row, result_stride, - col_ptrs, values, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) void calculate_slice_lengths(size_type num_rows, size_type slice_size, @@ -655,22 +565,7 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, } } -template -void fill_in_sellp(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - size_type slice_size, size_type stride, - const ValueType *source, size_type *slice_lengths, - size_type *slice_sets, IndexType *col_idxs, ValueType *vals) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - fill_in_sellp(num_rows, num_cols, slice_size, stride, source, - slice_lengths, slice_sets, col_idxs, vals, - item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, @@ -777,7 +672,6 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_total_cols(num_slices, max_nnz_per_slice, result, @@ -803,21 +697,7 @@ void symm_permute(size_type num_rows, size_type num_cols, } } -template -void symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const IndexType *perm_idxs, const ValueType *orig, - size_type stride_orig, ValueType *result, - size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - symm_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, - result, stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(symm_permute, symm_permute) template @@ -836,21 +716,7 @@ void inv_symm_permute(size_type num_rows, size_type num_cols, } } 
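// GKO_ENABLE_DEFAULT_HOST generates the host-side launcher that the removed
// wrappers below spelled out by hand. Inferred from those handwritten
// wrappers, the generated code is presumably of roughly this shape (a sketch,
// not the macro's literal definition; `kernel_name` stands for the kernel
// passed to the macro):
//
//     template <typename... InferredArgs>
//     void kernel_name(dim3 grid, dim3 block, size_t, sycl::queue *queue,
//                      InferredArgs... args)
//     {
//         queue->submit([&](sycl::handler &cgh) {
//             cgh.parallel_for(sycl_nd_range(grid, block),
//                              [=](sycl::nd_item<3> item_ct1) {
//                                  kernel_name(args..., item_ct1);
//                              });
//         });
//     }
//
// Because every handwritten wrapper in this file follows exactly this shape,
// the macro can replace them one-for-one.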
-template -void inv_symm_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, const IndexType *perm_idxs, - const ValueType *orig, size_type stride_orig, - ValueType *result, size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - inv_symm_permute(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(inv_symm_permute, inv_symm_permute) template @@ -869,21 +735,7 @@ void row_gather(size_type num_rows, size_type num_cols, } } -template -void row_gather(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const IndexType *perm_idxs, const ValueType *orig, - size_type stride_orig, ValueType *result, - size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - row_gather(num_rows, num_cols, perm_idxs, orig, stride_orig, - result, stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(row_gather, row_gather) template @@ -902,21 +754,7 @@ void column_permute(size_type num_rows, size_type num_cols, } } -template -void column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const IndexType *perm_idxs, const ValueType *orig, - size_type stride_orig, ValueType *result, - size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - column_permute(num_rows, num_cols, perm_idxs, orig, stride_orig, - result, stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(column_permute, column_permute) template @@ -935,22 +773,7 @@ void inverse_row_permute(size_type num_rows, size_type num_cols, } } -template -void inverse_row_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, const IndexType *perm_idxs, - const ValueType *orig, size_type stride_orig, - ValueType *result, size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - inverse_row_permute(num_rows, num_cols, perm_idxs, - orig, stride_orig, result, - stride_result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(inverse_row_permute, inverse_row_permute) template @@ -970,22 +793,8 @@ void inverse_column_permute(size_type num_rows, size_type num_cols, } } -template -void inverse_column_permute(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, const IndexType *perm_idxs, - const ValueType *orig, size_type stride_orig, - ValueType *result, size_type stride_result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - inverse_column_permute(num_rows, num_cols, perm_idxs, orig, - stride_orig, result, stride_result, - item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(inverse_column_permute, inverse_column_permute) + template void extract_diagonal(size_type problem_size, @@ -998,20 +807,7 @@ void extract_diagonal(size_type problem_size, } } -template -void extract_diagonal(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type 
problem_size, - const ValueType *orig, size_type stride_orig, - ValueType *diag) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - extract_diagonal(problem_size, orig, stride_orig, - diag, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal) template @@ -1027,20 +823,7 @@ void inplace_absolute_dense(size_type num_rows, size_type num_cols, } } -template -void inplace_absolute_dense(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, ValueType *data, - size_type stride) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - inplace_absolute_dense(num_rows, num_cols, data, - stride, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(inplace_absolute_dense, inplace_absolute_dense) template @@ -1058,22 +841,7 @@ void outplace_absolute_dense(size_type num_rows, size_type num_cols, } } -template -void outplace_absolute_dense(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue *stream, - size_type num_rows, size_type num_cols, - const ValueType *in, size_type stride_in, - remove_complex *out, - size_type stride_out) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - outplace_absolute_dense(num_rows, num_cols, in, stride_in, out, - stride_out, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(outplace_absolute_dense, outplace_absolute_dense) template @@ -1090,20 +858,7 @@ void make_complex(size_type num_rows, size_type num_cols, } } -template -void make_complex(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const ValueType *in, size_type stride_in, ComplexType *out, - size_type stride_out) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - make_complex(num_rows, num_cols, in, stride_in, - out, stride_out, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(make_complex, make_complex) template @@ -1120,20 +875,7 @@ void get_real(size_type num_rows, size_type num_cols, } } -template -void get_real(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const ValueType *in, size_type stride_in, - remove_complex *out, size_type stride_out) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - get_real(num_rows, num_cols, in, stride_in, out, - stride_out, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(get_real, get_real) template @@ -1150,20 +892,7 @@ void get_imag(size_type num_rows, size_type num_cols, } } -template -void get_imag(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, size_type num_cols, - const ValueType *in, size_type stride_in, - remove_complex *out, size_type stride_out) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - get_imag(num_rows, num_cols, in, stride_in, out, - stride_out, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST(get_imag, get_imag) } // namespace kernel From d9c6f6432d3ac7f1de0c83710b6a5f02f0e4adfd Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Sun, 23 May 2021 00:08:32 +0200 Subject: [PATCH 09/22] add as_array and reduce_add_array for config --- dpcpp/components/reduction.dp.hpp | 89 ++++++++++++------- .../ginkgo/core/synthesizer/containers.hpp | 8 ++ 2 files changed, 63 insertions(+), 34 deletions(-) diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index e47d9038af3..4f1835dfea7 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -44,21 +44,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" #include "dpcpp/components/uninitialized_array.hpp" - namespace gko { namespace kernels { namespace dpcpp { constexpr int default_block_size = 256; - +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::value_list(); +constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); // #include "common/components/reduction.hpp.inc" /** @@ -130,21 +138,22 @@ __dpct_inline__ int choose_pivot(const Group &group, ValueType local_data, * array. */ template < - typename Group, typename ValueType, typename Operator, + unsigned int sg_size = 32, typename Group, typename ValueType, + typename Operator, typename = std::enable_if_t::value>> void reduce(const Group &__restrict__ group, ValueType *__restrict__ data, Operator reduce_op = Operator{}) { const auto local_id = group.thread_rank(); - for (int k = group.size() / 2; k >= config::warp_size; k /= 2) { + for (int k = group.size() / 2; k >= sg_size; k /= 2) { group.sync(); if (local_id < k) { data[local_id] = reduce_op(data[local_id], data[local_id + k]); } } - const auto warp = group::tiled_partition(group); + const auto warp = group::tiled_partition(group); const auto warp_id = group.thread_rank() / warp.size(); if (warp_id > 0) { return; @@ -164,7 +173,7 @@ void reduce(const Group &__restrict__ group, ValueType *__restrict__ data, * `source` of any size. Has to be called a second time on `result` to reduce * an array larger than `block_size`. */ -template +template void reduce_array(size_type size, const ValueType *__restrict__ source, ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, Operator reduce_op = Operator{}) @@ -180,7 +189,7 @@ void reduce_array(size_type size, const ValueType *__restrict__ source, group::this_thread_block(item_ct1).sync(); // Stores the result of the reduction inside `result[0]` - reduce(group::this_thread_block(item_ct1), result, reduce_op); + reduce(group::this_thread_block(item_ct1), result, reduce_op); } @@ -189,47 +198,50 @@ void reduce_array(size_type size, const ValueType *__restrict__ source, * * Computes a reduction using the add operation (+) on an array * `source` of any size. Has to be called a second time on `result` to reduce - * an array larger than `default_block_size`. + * an array larger than `block_size`. 
*/ -template +template void reduce_add_array( size_type size, const ValueType *__restrict__ source, ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, - UninitializedArray *block_sum) + UninitializedArray(cfg)> *block_sum) { - reduce_array(size, source, static_cast((*block_sum)), item_ct1, - [](const ValueType &x, const ValueType &y) { return x + y; }); + reduce_array(cfg)>( + size, source, static_cast((*block_sum)), item_ct1, + [](const ValueType &x, const ValueType &y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { result[item_ct1.get_group(2)] = (*block_sum)[0]; } } -template +template void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const ValueType *source, ValueType *result) { stream->submit([&](sycl::handler &cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, + sycl::accessor(cfg)>, + 0, sycl::access::mode::read_write, sycl::access::target::local> block_sum_acc_ct1(cgh); - auto local_range = block.get_range(); - auto global_range = grid.get_range() * local_range; - cgh.parallel_for( - sycl::nd_range<3>(global_range, local_range), - [=](sycl::nd_item<3> item_ct1) { - reduce_add_array( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_add_array( size, source, result, item_ct1, - (UninitializedArray *) + (UninitializedArray(cfg)> *) block_sum_acc_ct1.get_pointer()); }); }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, + reduce_add_array); + +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, + KCFG_1D, kcfg_1d_list); + /** * Compute a reduction using add operation (+). @@ -247,23 +259,32 @@ ValueType reduce_add_array(std::shared_ptr exec, auto block_results_val = source; size_type grid_dim = size; auto block_results = Array(exec); - if (size > default_block_size) { - const auto n = ceildiv(size, default_block_size); - grid_dim = (n <= default_block_size) ? n : default_block_size; + ValueType answer = zero(); + for (auto &cfg : kcfg_1d_array) { + const auto block_size = KCFG_1D::decode<0>(cfg); + const auto warp_size = KCFG_1D::decode<1>(cfg); + if (!validate(exec->get_queue(), block_size, warp_size)) { + continue; + } + if (size > block_size) { + const auto n = ceildiv(size, block_size); + grid_dim = (n <= block_size) ? 
n : block_size; - block_results.resize_and_reset(grid_dim); + block_results.resize_and_reset(grid_dim); - reduce_add_array(grid_dim, default_block_size, 0, exec->get_queue(), - size, source, block_results.get_data()); + reduce_add_array_call(grid_dim, block_size, 0, exec->get_queue(), + size, source, block_results.get_data()); - block_results_val = block_results.get_const_data(); - } + block_results_val = block_results.get_const_data(); + } - auto d_result = Array(exec, 1); + auto d_result = Array(exec, 1); - reduce_add_array(1, default_block_size, 0, exec->get_queue(), grid_dim, - block_results_val, d_result.get_data()); - auto answer = exec->copy_val_to_host(d_result.get_const_data()); + reduce_add_array_call(1, block_size, 0, exec->get_queue(), grid_dim, + block_results_val, d_result.get_data()); + answer = exec->copy_val_to_host(d_result.get_const_data()); + break; + } return answer; } diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index ebd5b441f6c..3c79c7b7455 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_PUBLIC_CORE_SYNTHESIZER_CONTAINERS_HPP_ +#include #include @@ -113,6 +114,13 @@ template using as_list = typename detail::as_list_impl::type; +template +constexpr std::array as_array(value_list vl) +{ + return std::array{Value...}; +} + + } // namespace syn } // namespace gko From 3d06e1779feb8adff5135c8a299c943d65f767d7 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 25 May 2021 00:46:29 +0200 Subject: [PATCH 10/22] fix dpcpp doesn't have subwarp and use ConfigSet --- dpcpp/CMakeLists.txt | 1 + dpcpp/base/helper.dp.cpp | 31 ++ dpcpp/base/helper.hpp | 22 +- dpcpp/components/reduction.dp.hpp | 1 + dpcpp/components/thread_ids.dp.hpp | 6 +- dpcpp/matrix/dense_kernels.dp.cpp | 485 +++++++++++++++++------------ 6 files changed, 336 insertions(+), 210 deletions(-) create mode 100644 dpcpp/base/helper.dp.cpp diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index b3101d8b2e2..48addebaf5f 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -11,6 +11,7 @@ target_sources(ginkgo_dpcpp PRIVATE base/version.dp.cpp base/executor.dp.cpp + base/helper.dp.cpp components/absolute_array.dp.cpp components/fill_array.dp.cpp components/prefix_sum.dp.cpp diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp new file mode 100644 index 00000000000..fe4395e2534 --- /dev/null +++ b/dpcpp/base/helper.dp.cpp @@ -0,0 +1,31 @@ +#include + +#include "dpcpp/base/helper.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +bool validate(sycl::queue *queue, unsigned int workgroup_size, + unsigned int subgroup_size) +{ + { + auto device = queue->get_device(); + auto subgroup_size_list = + device.get_info(); + auto max_workgroup_size = + device.get_info(); + bool allowed = false; + for (auto &i : subgroup_size_list) { + allowed |= (i == subgroup_size); + } + return allowed && (workgroup_size <= max_workgroup_size); + } +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index f8eee93f25b..3979caa905c 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 #include
+#include
 #include
@@ -137,18 +138,19 @@ namespace dpcpp {
 bool validate(sycl::queue *queue, unsigned workgroup_size,
-              unsigned subgroup_size)
+              unsigned subgroup_size);
+
+
+template <typename IterArr, typename Validate>
+ConfigSetType get_first_cfg(IterArr &arr, Validate verify)
 {
-    auto device = queue->get_device();
-    auto subgroup_size_list =
-        device.get_info<sycl::info::device::sub_group_sizes>();
-    auto max_workgroup_size =
-        device.get_info<sycl::info::device::max_work_group_size>();
-    bool allowed = false;
-    for (auto &i : subgroup_size_list) {
-        allowed |= (i == subgroup_size);
+    for (auto &cfg : arr) {
+        if (verify(cfg)) {
+            return cfg;
+        }
     }
-    return allowed && (workgroup_size <= max_workgroup_size);
+    GKO_NOT_SUPPORTED(arr);
+    return 0;
 }
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index 4f1835dfea7..bc9937ddb1a 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -54,6 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "dpcpp/components/thread_ids.dp.hpp"
 #include "dpcpp/components/uninitialized_array.hpp"

+
 namespace gko {
 namespace kernels {
 namespace dpcpp {
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
index 8694d6a88c9..5b656c5e0db 100644
--- a/dpcpp/components/thread_ids.dp.hpp
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -124,7 +124,8 @@ __dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1)
 template
 __dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1)
 {
-    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    // dpcpp does not have subwarp.
+    constexpr auto subwarps_per_warp = subwarp_size / subwarp_size;
     return get_local_warp_id(item_ct1) * subwarps_per_warp +
            item_ct1.get_local_id(1);
 }
@@ -195,7 +196,8 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1)
 template
 __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1)
 {
-    constexpr auto subwarps_per_warp = config::warp_size / subwarp_size;
+    // dpcpp does not have subwarp.
+    constexpr auto subwarps_per_warp = subwarp_size / subwarp_size;
     return get_warp_id(item_ct1) * subwarps_per_warp + item_ct1.get_local_id(1);
 }
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 58199a221d6..0d1be735e17 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -68,7 +68,13 @@ namespace dpcpp {
  */
 namespace dense {

-
+using KCFG_1D = ConfigSet<11, 7>;
+constexpr auto kcfg_1d_list =
+    syn::value_list();
+constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);
 constexpr auto default_block_size = 256;


@@ -149,58 +155,67 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha,
 GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag)


-template
-void compute_partial_reduce(size_type num_rows, OutType *__restrict__ work,
-                            CallableGetValue get_value,
-                            CallableReduce reduce_op, sycl::nd_item<3> item_ct1,
-                            UninitializedArray<OutType, block_size> *tmp_work)
+template <ConfigSetType cfg, typename OutType, typename CallableGetValue,
+          typename CallableReduce>
+void compute_partial_reduce(
+    size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value,
+    CallableReduce reduce_op, sycl::nd_item<3> item_ct1,
+    UninitializedArray<OutType, KCFG_1D::decode<0>(cfg)> *tmp_work)
 {
-    constexpr auto warps_per_block = block_size / config::warp_size;
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
+
+    constexpr auto warps_per_block = wg_size / sg_size;
     const auto num_blocks = item_ct1.get_group_range(2);
-    const auto local_id = 
thread::get_local_thread_id(item_ct1); const auto global_id = - thread::get_thread_id(item_ct1); + thread::get_thread_id(item_ct1); OutType *tmp_work_array = *tmp_work; auto tmp = zero(); - for (auto i = global_id; i < num_rows; i += block_size * num_blocks) { + for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { tmp = reduce_op(tmp, get_value(i)); } tmp_work_array[local_id] = tmp; - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, reduce_op); if (local_id == 0) { work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; } } +// GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_reduce_config, +// compute_partial_reduce); +// GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_reduce_call, +// compute_partial_reduce_config, +// KCFG_1D, kcfg_1d_list); -template +template void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, CallableReduce reduce_op, CallableFinalize finalize_op, sycl::nd_item<3> item_ct1, - UninitializedArray *tmp_work) + UninitializedArray(cfg)> *tmp_work) { - const auto local_id = - thread::get_local_thread_id(item_ct1); + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto local_id = thread::get_local_thread_id(item_ct1); ValueType tmp = zero(); - for (auto i = local_id; i < size; i += block_size) { + for (auto i = local_id; i < size; i += wg_size) { tmp = reduce_op(tmp, work[i]); } ValueType *tmp_work_array = *tmp_work; tmp_work_array[local_id] = tmp; - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, reduce_op); if (local_id == 0) { *result = finalize_op(tmp_work_array[0]); @@ -208,14 +223,14 @@ void finalize_reduce_computation( } -template -void compute_partial_dot(size_type num_rows, const ValueType *__restrict__ x, - size_type stride_x, const ValueType *__restrict__ y, - size_type stride_y, ValueType *__restrict__ work, - sycl::nd_item<3> item_ct1, - UninitializedArray *tmp_work) +template +void compute_partial_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) { - compute_partial_reduce( + compute_partial_reduce( num_rows, work, [x, stride_x, y, stride_y](size_type i) { return x[i * stride_x] * conj(y[i * stride_y]); @@ -224,122 +239,175 @@ void compute_partial_dot(size_type num_rows, const ValueType *__restrict__ x, tmp_work); } -template +template void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, const ValueType *y, size_type stride_y, ValueType *work) { + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + std::cout << "partial " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { - sycl::accessor, 0, + sycl::accessor, 0, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_dot( + compute_partial_dot( num_rows, x, stride_x, y, stride_y, work, item_ct1, - (UninitializedArray *) + (UninitializedArray *) tmp_work_acc_ct1.get_pointer()); }); }); } -template 
+template void finalize_dot_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, - UninitializedArray *tmp_work) + UninitializedArray(cfg)> *tmp_work) { - finalize_reduce_computation( + finalize_reduce_computation( size, work, result, [](const ValueType &x, const ValueType &y) { return x + y; }, [](const ValueType &x) { return x; }, item_ct1, tmp_work); } -template +template void finalize_dot_computation(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const ValueType *work, ValueType *result) { + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + std::cout << "finalize " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { - sycl::accessor, 0, + sycl::accessor, 0, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_dot_computation( + finalize_dot_computation( size, work, result, item_ct1, - (UninitializedArray *) + (UninitializedArray *) tmp_work_acc_ct1.get_pointer()); }); }); } +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense *x, + const matrix::Dense *y, + matrix::Dense *result) +{ + constexpr auto work_per_thread = 32; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + std::cout << "dot " << cfg << " " << wg_size << " " << sg_size << std::endl; + constexpr auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + compute_partial_dot(grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, + x->get_stride(), y->get_const_values() + col, + y->get_stride(), work.get_data()); + finalize_dot_computation(1, block_dim, 0, exec->get_queue(), + grid_dim.x, work.get_const_data(), + result->get_values() + col); + } +} -template +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_dot_config, compute_dot) + +template +void compute_dot_call(std::shared_ptr exec, + const matrix::Dense *x, + const matrix::Dense *y, + matrix::Dense *result) +{ + auto queue = exec->get_queue(); + compute_dot_config( + kcfg_1d_list, + [&queue](::gko::ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }, + ::gko::syn::value_list(), ::gko::syn::value_list(), + ::gko::syn::value_list(), ::gko::syn::type_list<>(), + exec, x, y, result); +} + + +template void compute_partial_norm2( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray, block_size> *tmp_work) + UninitializedArray, KCFG_1D::decode<0>(cfg)> + *tmp_work) { using norm_type = remove_complex; - compute_partial_reduce( + compute_partial_reduce( num_rows, work, [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, [](const norm_type &x, const norm_type &y) { return x + y; }, item_ct1, tmp_work); } -template +template void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, remove_complex *work) { + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); stream->submit([&](sycl::handler &cgh) { - sycl::accessor< - 
UninitializedArray, block_size>, 0, - sycl::access::mode::read_write, sycl::access::target::local> + sycl::accessor, wg_size>, + 0, sycl::access::mode::read_write, + sycl::access::target::local> tmp_work_acc_ct1(cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2( + compute_partial_norm2( num_rows, x, stride_x, work, item_ct1, - (UninitializedArray, block_size> - *)tmp_work_acc_ct1.get_pointer()); + (UninitializedArray, wg_size> *) + tmp_work_acc_ct1.get_pointer()); }); }); } -template +template void finalize_norm2_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, - UninitializedArray *tmp_work) + UninitializedArray(cfg)> *tmp_work) { - finalize_reduce_computation( + finalize_reduce_computation( size, work, result, [](const ValueType &x, const ValueType &y) { return x + y; }, [](const ValueType &x) { return sqrt(x); }, item_ct1, tmp_work); } -template +template void finalize_norm2_computation(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const ValueType *work, ValueType *result) { + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); stream->submit([&](sycl::handler &cgh) { - sycl::accessor, 0, + sycl::accessor, 0, sycl::access::mode::read_write, sycl::access::target::local> tmp_work_acc_ct1(cgh); @@ -347,15 +415,63 @@ void finalize_norm2_computation(dim3 grid, dim3 block, cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_norm2_computation( + finalize_norm2_computation( size, work, result, item_ct1, - (UninitializedArray *) + (UninitializedArray *) tmp_work_acc_ct1.get_pointer()); }); }); } +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense *x, + matrix::Dense> *result) +{ + using norm_type = remove_complex; + // // TODO: these are tuning parameters obtained experimentally, once + // // we decide how to handle this uniformly, they should be modified + // // appropriately + constexpr auto work_per_thread = 32; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + + constexpr auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + compute_partial_norm2( + grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), work.get_data()); + finalize_norm2_computation(1, block_dim, 0, exec->get_queue(), + grid_dim.x, work.get_const_data(), + result->get_values() + col); + } +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_norm2_config, compute_norm2) + +template +void compute_norm2_call(std::shared_ptr exec, + const matrix::Dense *x, + matrix::Dense> *result) +{ + auto queue = exec->get_queue(); + compute_norm2_config( + kcfg_1d_list, + [&queue](::gko::ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }, + ::gko::syn::value_list(), ::gko::syn::value_list(), + ::gko::syn::value_list(), ::gko::syn::type_list<>(), + exec, x, result); +} + + template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type *__restrict__ row_ptrs, @@ -382,20 +498,20 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, 
GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) -template +template void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, const ValueType *__restrict__ work, IndexType *__restrict__ result, sycl::nd_item<3> item_ct1) { - constexpr auto warp_size = config::warp_size; - const auto row_idx = thread::get_subwarp_id_flat(item_ct1); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto row_idx = thread::get_subwarp_id_flat(item_ct1); auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); + group::tiled_partition(group::this_thread_block(item_ct1)); if (row_idx < num_rows) { IndexType part_result{}; - for (auto i = warp_tile.thread_rank(); i < num_cols; i += warp_size) { + for (auto i = warp_tile.thread_rank(); i < num_cols; i += sg_size) { if (work[stride * row_idx + i] != zero()) { part_result += 1; } @@ -406,20 +522,10 @@ void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, } } -template -void count_nnz_per_row(dim3 grid, dim3 block, size_t dynamic_shared_memory, - sycl::queue *stream, size_type num_rows, - size_type num_cols, size_type stride, - const ValueType *work, IndexType *result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - count_nnz_per_row(num_rows, num_cols, stride, work, - result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row) +GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, + KCFG_1D, kcfg_1d_list) template @@ -479,6 +585,7 @@ void fill_in_ell(size_type num_rows, size_type num_cols, GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) +template void calculate_slice_lengths(size_type num_rows, size_type slice_size, int slice_num, size_type stride_factor, const size_type *__restrict__ nnz_per_row, @@ -486,21 +593,21 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, size_type *__restrict__ slice_sets, sycl::nd_item<3> item_ct1) { - constexpr auto warp_size = config::warp_size; + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); const auto sliceid = item_ct1.get_group(2); const auto tid_in_warp = item_ct1.get_local_id(2); if (sliceid * slice_size + tid_in_warp < num_rows) { size_type thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { thread_result = (i + slice_size * sliceid < num_rows) ? 
max(thread_result, nnz_per_row[sliceid * slice_size + i]) : thread_result; } - auto warp_tile = group::tiled_partition( - group::this_thread_block(item_ct1)); + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); auto warp_result = ::gko::kernels::dpcpp::reduce( warp_tile, thread_result, [](const size_type &a, const size_type &b) { return max(a, b); }); @@ -514,22 +621,11 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, } } -void calculate_slice_lengths(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue *stream, - size_type num_rows, size_type slice_size, - int slice_num, size_type stride_factor, - const size_type *nnz_per_row, - size_type *slice_lengths, size_type *slice_sets) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - calculate_slice_lengths(num_rows, slice_size, slice_num, - stride_factor, nnz_per_row, - slice_lengths, slice_sets, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, + calculate_slice_lengths) +GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, + calculate_slice_lengths, KCFG_1D, kcfg_1d_list) template @@ -567,14 +663,15 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) - +template void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result, sycl::nd_item<3> item_ct1, uint8_t *dpct_local) { + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); auto block_max = (size_type *)dpct_local; - reduce_array( + reduce_array( size, nnz_per_row, block_max, item_ct1, [](const size_type &x, const size_type &y) { return max(x, y); }); @@ -583,6 +680,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, } } +template void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const size_type *nnz_per_row, size_type *result) @@ -593,30 +691,34 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - reduce_max_nnz(size, nnz_per_row, result, item_ct1, - dpct_local_acc_ct1.get_pointer()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz(size, nnz_per_row, result, item_ct1, + dpct_local_acc_ct1.get_pointer()); + }); }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, KCFG_1D, + kcfg_1d_list) +template void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, size_type stride_factor, const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result, sycl::nd_item<3> item_ct1) { - constexpr auto warp_size = config::warp_size; + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - const auto warpid = thread::get_subwarp_id_flat(item_ct1); + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto warpid = thread::get_subwarp_id_flat(item_ct1); const auto tid_in_warp = warp_tile.thread_rank(); const auto slice_num = ceildiv(num_rows, slice_size); size_type 
thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += warp_size) { + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { if (warpid * slice_size + i < num_rows) { thread_result = max(thread_result, nnz_per_row[warpid * slice_size + i]); @@ -632,37 +734,32 @@ void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, } } -void reduce_max_nnz_per_slice(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue *stream, - size_type num_rows, size_type slice_size, - size_type stride_factor, - const size_type *nnz_per_row, size_type *result) -{ - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_max_nnz_per_slice(num_rows, slice_size, stride_factor, - nnz_per_row, result, item_ct1); - }); - }); -} +GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice, + reduce_max_nnz_per_slice) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice, + reduce_max_nnz_per_slice) +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, + reduce_max_nnz_per_slice, KCFG_1D, kcfg_1d_list) +template void reduce_total_cols(size_type num_slices, const size_type *__restrict__ max_nnz_per_slice, size_type *__restrict__ result, sycl::nd_item<3> item_ct1, uint8_t *dpct_local) { auto block_result = (size_type *)dpct_local; - - reduce_array(num_slices, max_nnz_per_slice, block_result, item_ct1, - [](const size_type &x, const size_type &y) { return x + y; }); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + reduce_array( + num_slices, max_nnz_per_slice, block_result, item_ct1, + [](const size_type &x, const size_type &y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { result[item_ct1.get_group(2)] = block_result[0]; } } +template void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_slices, const size_type *max_nnz_per_slice, size_type *result) @@ -674,12 +771,18 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_total_cols(num_slices, max_nnz_per_slice, result, - item_ct1, dpct_local_acc_ct1.get_pointer()); + reduce_total_cols(num_slices, max_nnz_per_slice, result, + item_ct1, + dpct_local_acc_ct1.get_pointer()); }); }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols, + reduce_total_cols); +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, + KCFG_1D, kcfg_1d_list) + template void symm_permute(size_type num_rows, size_type num_cols, @@ -952,24 +1055,7 @@ void compute_dot(std::shared_ptr exec, // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified // appropriately - constexpr auto work_per_thread = 32; - constexpr auto block_size = default_block_size; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot( - grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_dot_computation( - 1, block_dim, 0, 
exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } + kernel::compute_dot_call(exec, x, y, result); } } @@ -998,27 +1084,7 @@ void compute_norm2(std::shared_ptr exec, result->get_values() + col); } } else { - using norm_type = remove_complex; - // // TODO: these are tuning parameters obtained experimentally, once - // // we decide how to handle this uniformly, they should be modified - // // appropriately - constexpr auto work_per_thread = 32; - constexpr auto block_size = default_block_size; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2( - grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), work.get_data()); - kernel::finalize_norm2_computation( - 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } + kernel::compute_norm2_call(exec, x, result); } } @@ -1073,9 +1139,9 @@ void convert_to_csr(std::shared_ptr exec, const auto rows_per_block = ceildiv(default_block_size, config::warp_size); const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - kernel::count_nnz_per_row(grid_dim_nnz, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, stride, - source->get_const_values(), row_ptrs); + kernel::count_nnz_per_row_call( + grid_dim_nnz, default_block_size, 0, exec->get_queue(), num_rows, + num_cols, stride, source->get_const_values(), row_ptrs); components::prefix_sum(exec, row_ptrs, num_rows + 1); @@ -1157,7 +1223,7 @@ void convert_to_sellp(std::shared_ptr exec, if (grid_dim > 0) { std::cout << "calculate_slice_lengths" << std::endl; - kernel::calculate_slice_lengths( + kernel::calculate_slice_lengths_call( grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), slice_lengths, slice_sets); @@ -1218,24 +1284,30 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, auto nnz_per_row = Array(exec, num_rows); calculate_nonzeros_per_row(exec, source, &nnz_per_row); - - const auto n = ceildiv(num_rows, default_block_size); - const size_type grid_dim = - (n <= default_block_size) ? n : default_block_size; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + std::cout << "wg_size " << wg_size << "sg_size " << KCFG_1D::decode<1>(cfg) + << std::endl; + const auto n = ceildiv(num_rows, wg_size); + const size_type grid_dim = (n <= wg_size) ? 
n : wg_size; auto block_results = Array(exec, grid_dim); - kernel::reduce_max_nnz( - grid_dim, default_block_size, default_block_size * sizeof(size_type), - exec->get_queue(), num_rows, nnz_per_row.get_const_data(), - block_results.get_data()); + kernel::reduce_max_nnz_call( + grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + num_rows, nnz_per_row.get_const_data(), block_results.get_data()); auto d_result = Array(exec, 1); - kernel::reduce_max_nnz(1, default_block_size, - default_block_size * sizeof(size_type), - exec->get_queue(), grid_dim, - block_results.get_const_data(), d_result.get_data()); + kernel::reduce_max_nnz_call( + 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, + block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } @@ -1249,12 +1321,21 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const matrix::Dense *source, Array *result) { - const dim3 block_size(default_block_size, 1, 1); - auto rows_per_block = ceildiv(default_block_size, config::warp_size); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + const dim3 block_size(wg_size, 1, 1); + auto rows_per_block = ceildiv(wg_size, sg_size); const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); const dim3 grid_size(grid_x, 1, 1); if (grid_x > 0) { - kernel::count_nnz_per_row( + kernel::count_nnz_per_row_call( grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], source->get_size()[1], source->get_stride(), source->get_const_values(), result->get_data()); @@ -1286,28 +1367,36 @@ void calculate_total_cols(std::shared_ptr exec, calculate_nonzeros_per_row(exec, source, &nnz_per_row); auto max_nnz_per_slice = Array(exec, slice_num); - - auto grid_dim = ceildiv(slice_num * config::warp_size, default_block_size); - - kernel::reduce_max_nnz_per_slice( - grid_dim, default_block_size, 0, exec->get_queue(), num_rows, - slice_size, stride_factor, nnz_per_row.get_const_data(), + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + auto grid_dim = ceildiv(slice_num * sg_size, wg_size); + + kernel::reduce_max_nnz_per_slice_call( + grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, + stride_factor, nnz_per_row.get_const_data(), max_nnz_per_slice.get_data()); - grid_dim = ceildiv(slice_num, default_block_size); + grid_dim = ceildiv(slice_num, wg_size); auto block_results = Array(exec, grid_dim); - kernel::reduce_total_cols( - grid_dim, default_block_size, default_block_size * sizeof(size_type), - exec->get_queue(), slice_num, max_nnz_per_slice.get_const_data(), - block_results.get_data()); + kernel::reduce_total_cols(grid_dim, wg_size, wg_size * sizeof(size_type), + exec->get_queue(), slice_num, + max_nnz_per_slice.get_const_data(), + block_results.get_data()); auto d_result = Array(exec, 1); kernel::reduce_total_cols( - 1, default_block_size, default_block_size * sizeof(size_type), - 
exec->get_queue(), grid_dim, block_results.get_const_data(), - d_result.get_data()); + 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, + block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } From 20061b64477142be4b834b733002553879b9ef04 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 25 May 2021 22:02:18 +0200 Subject: [PATCH 11/22] move to new style for config selection --- dpcpp/components/prefix_sum.dp.cpp | 38 ++++- dpcpp/components/prefix_sum.dp.hpp | 18 +- dpcpp/components/reduction.dp.hpp | 42 ++--- dpcpp/matrix/dense_kernels.dp.cpp | 256 ++++++++++++++--------------- 4 files changed, 181 insertions(+), 173 deletions(-) diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index b4961809a8b..62a9700473d 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -36,6 +36,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + +#include "dpcpp/base/helper.hpp" #include "dpcpp/components/prefix_sum.dp.hpp" @@ -45,7 +49,20 @@ namespace dpcpp { namespace components { -constexpr int prefix_sum_block_size = 256; +using BlockCfg = ConfigSet<11>; + +constexpr auto block_cfg_list = + ::gko::syn::value_list(); + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) +GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, + BlockCfg, block_cfg_list) + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_prefix_sum, + finalize_prefix_sum) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_prefix_sum_call, finalize_prefix_sum, + BlockCfg, block_cfg_list) template @@ -54,18 +71,23 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, { // prefix_sum should be on the valid array if (num_entries > 0) { - auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); + auto queue = exec->get_queue(); + constexpr auto block_cfg_array = as_array(block_cfg_list); + const ConfigSetType cfg = + get_first_cfg(block_cfg_array, [&queue](ConfigSetType cfg) { + return validate(queue, BlockCfg::decode<0>(cfg), 16); + }); + const auto wg_size = BlockCfg::decode<0>(cfg); + auto num_blocks = ceildiv(num_entries, wg_size); Array block_sum_array(exec, num_blocks - 1); auto block_sums = block_sum_array.get_data(); - start_prefix_sum( - num_blocks, prefix_sum_block_size, 0, exec->get_queue(), - num_entries, counts, block_sums); + start_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), cfg, + num_entries, counts, block_sums); // add the total sum of the previous block only when the number of block // is larger than 1. if (num_blocks > 1) { - finalize_prefix_sum( - num_blocks, prefix_sum_block_size, 0, exec->get_queue(), - num_entries, counts, block_sums); + finalize_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), + cfg, num_entries, counts, block_sums); } } } diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index c6f7c7cfb20..fd9ff2ac263 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -125,7 +125,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, * @note To calculate the prefix sum over an array of size bigger than * `block_size`, `finalize_prefix_sum` has to be used as well. 
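// Aside (illustrative sketch, not part of the patch): a minimal host-side
// model of the two-phase prefix sum that start_prefix_sum and
// finalize_prefix_sum implement on the device. Phase one scans each block
// of `block_size` elements independently and records the block totals;
// phase two offsets every block by the accumulated totals of all preceding
// blocks, which is only needed when more than one block exists. All names
// below are illustrative, not the library's definitions.
#include <algorithm>
#include <cstddef>
#include <vector>

void two_phase_prefix_sum(std::vector<int> &v, std::size_t block_size)
{
    const auto num_blocks = (v.size() + block_size - 1) / block_size;
    std::vector<int> block_sum(num_blocks, 0);
    // Phase 1 (start_prefix_sum): exclusive scan inside each block,
    // remembering each block's total separately.
    for (std::size_t b = 0; b < num_blocks; ++b) {
        const auto begin = b * block_size;
        const auto end = std::min(begin + block_size, v.size());
        int total = 0;
        for (auto i = begin; i < end; ++i) {
            const auto value = v[i];
            v[i] = total;
            total += value;
        }
        block_sum[b] = total;
    }
    // Phase 2 (finalize_prefix_sum): add the running total of all previous
    // blocks; with a single block the carry stays zero, matching the
    // `num_blocks > 1` guard in the caller above.
    int carry = 0;
    for (std::size_t b = 0; b < num_blocks; ++b) {
        const auto begin = b * block_size;
        const auto end = std::min(begin + block_size, v.size());
        for (auto i = begin; i < end; ++i) {
            v[i] += carry;
        }
        carry += block_sum[b];
    }
}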
*/ -template +template void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, ValueType *__restrict__ block_sum, sycl::nd_item<3> item_ct1, @@ -178,7 +178,7 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, } } -template +template void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_elements, ValueType *elements, ValueType *block_sum) @@ -189,10 +189,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> prefix_helper_acc_ct1(cgh); - auto local_range = block.get_range(); - auto global_range = grid.get_range() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, @@ -217,7 +214,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, * * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. */ -template +template void finalize_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, const ValueType *__restrict__ block_sum, @@ -234,16 +231,13 @@ void finalize_prefix_sum(size_type num_elements, } } -template +template void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_elements, ValueType *elements, const ValueType *block_sum) { stream->submit([&](sycl::handler &cgh) { - auto local_range = block.get_range(); - auto global_range = grid.get_range() * local_range; - - cgh.parallel_for(sycl::nd_range<3>(global_range, local_range), + cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { finalize_prefix_sum( num_elements, elements, block_sum, item_ct1); diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index bc9937ddb1a..2626d40b314 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -261,31 +261,33 @@ ValueType reduce_add_array(std::shared_ptr exec, size_type grid_dim = size; auto block_results = Array(exec); ValueType answer = zero(); - for (auto &cfg : kcfg_1d_array) { - const auto block_size = KCFG_1D::decode<0>(cfg); - const auto warp_size = KCFG_1D::decode<1>(cfg); - if (!validate(exec->get_queue(), block_size, warp_size)) { - continue; - } - if (size > block_size) { - const auto n = ceildiv(size, block_size); - grid_dim = (n <= block_size) ? n : block_size; - - block_results.resize_and_reset(grid_dim); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); - reduce_add_array_call(grid_dim, block_size, 0, exec->get_queue(), - size, source, block_results.get_data()); + if (size > wg_size) { + const auto n = ceildiv(size, wg_size); + grid_dim = (n <= wg_size) ? 
n : wg_size; - block_results_val = block_results.get_const_data(); - } + block_results.resize_and_reset(grid_dim); - auto d_result = Array(exec, 1); + reduce_add_array_call(grid_dim, wg_size, 0, exec->get_queue(), cfg, + size, source, block_results.get_data()); - reduce_add_array_call(1, block_size, 0, exec->get_queue(), grid_dim, - block_results_val, d_result.get_data()); - answer = exec->copy_val_to_host(d_result.get_const_data()); - break; + block_results_val = block_results.get_const_data(); } + + auto d_result = Array(exec, 1); + + reduce_add_array_call(1, wg_size, 0, exec->get_queue(), cfg, grid_dim, + block_results_val, d_result.get_data()); + answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 0d1be735e17..3e4fee0a2d5 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -264,6 +264,11 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, + compute_partial_dot) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, + KCFG_1D, kcfg_1d_list) + template void finalize_dot_computation( @@ -301,51 +306,10 @@ void finalize_dot_computation(dim3 grid, dim3 block, }); } -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense *x, - const matrix::Dense *y, - matrix::Dense *result) -{ - constexpr auto work_per_thread = 32; - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - std::cout << "dot " << cfg << " " << wg_size << " " << sg_size << std::endl; - constexpr auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - compute_partial_dot(grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, - x->get_stride(), y->get_const_values() + col, - y->get_stride(), work.get_data()); - finalize_dot_computation(1, block_dim, 0, exec->get_queue(), - grid_dim.x, work.get_const_data(), - result->get_values() + col); - } -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_dot_config, compute_dot) - -template -void compute_dot_call(std::shared_ptr exec, - const matrix::Dense *x, - const matrix::Dense *y, - matrix::Dense *result) -{ - auto queue = exec->get_queue(); - compute_dot_config( - kcfg_1d_list, - [&queue](::gko::ConfigSetType cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }, - ::gko::syn::value_list(), ::gko::syn::value_list(), - ::gko::syn::value_list(), ::gko::syn::type_list<>(), - exec, x, y, result); -} +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation, + finalize_dot_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call, + finalize_dot_computation, KCFG_1D, kcfg_1d_list) template @@ -386,6 +350,11 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, }); } +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, + compute_partial_norm2) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, + compute_partial_norm2, KCFG_1D, kcfg_1d_list) + template void finalize_norm2_computation( @@ -423,53 +392,11 @@ void finalize_norm2_computation(dim3 
grid, dim3 block, }); } - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense *x, - matrix::Dense> *result) -{ - using norm_type = remove_complex; - // // TODO: these are tuning parameters obtained experimentally, once - // // we decide how to handle this uniformly, they should be modified - // // appropriately - constexpr auto work_per_thread = 32; - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - constexpr auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - compute_partial_norm2( - grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), work.get_data()); - finalize_norm2_computation(1, block_dim, 0, exec->get_queue(), - grid_dim.x, work.get_const_data(), - result->get_values() + col); - } -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_norm2_config, compute_norm2) - -template -void compute_norm2_call(std::shared_ptr exec, - const matrix::Dense *x, - matrix::Dense> *result) -{ - auto queue = exec->get_queue(); - compute_norm2_config( - kcfg_1d_list, - [&queue](::gko::ConfigSetType cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }, - ::gko::syn::value_list(), ::gko::syn::value_list(), - ::gko::syn::value_list(), ::gko::syn::type_list<>(), - exec, x, result); -} +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation, + finalize_norm2_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call, + finalize_norm2_computation, KCFG_1D, + kcfg_1d_list) template @@ -1055,7 +982,32 @@ void compute_dot(std::shared_ptr exec, // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified // appropriately - kernel::compute_dot_call(exec, x, y, result); + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + std::cout << "dot " << cfg << " " << wg_size << " " << sg_size + << std::endl; + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_dot_call( + grid_dim, block_dim, 0, exec->get_queue(), cfg, + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_dot_computation_call( + 1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x, + work.get_const_data(), result->get_values() + col); + } } } @@ -1084,7 +1036,35 @@ void compute_norm2(std::shared_ptr exec, result->get_values() + col); } } else { - kernel::compute_norm2_call(exec, x, result); + using norm_type = remove_complex; + // TODO: these are tuning parameters obtained 
experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_norm2_call( + grid_dim, block_dim, 0, exec->get_queue(), cfg, + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + work.get_data()); + kernel::finalize_norm2_computation_call( + 1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x, + work.get_const_data(), result->get_values() + col); + } } } @@ -1127,6 +1107,16 @@ void convert_to_csr(std::shared_ptr exec, const matrix::Dense *source, matrix::Csr *result) { + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; @@ -1136,20 +1126,20 @@ void convert_to_csr(std::shared_ptr exec, auto stride = source->get_stride(); - const auto rows_per_block = ceildiv(default_block_size, config::warp_size); + const auto rows_per_block = ceildiv(wg_size, sg_size); const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - kernel::count_nnz_per_row_call( - grid_dim_nnz, default_block_size, 0, exec->get_queue(), num_rows, - num_cols, stride, source->get_const_values(), row_ptrs); + kernel::count_nnz_per_row_call(grid_dim_nnz, wg_size, 0, exec->get_queue(), + cfg, num_rows, num_cols, stride, + source->get_const_values(), row_ptrs); components::prefix_sum(exec, row_ptrs, num_rows + 1); - size_type grid_dim = ceildiv(num_rows, default_block_size); + size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, source->get_const_values(), - row_ptrs, col_idxs, values); + kernel::fill_in_csr(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, stride, source->get_const_values(), row_ptrs, + col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1197,6 +1187,16 @@ void convert_to_sellp(std::shared_ptr exec, const matrix::Dense *source, matrix::Sellp *result) { + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const ConfigSetType cfg = + get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + const auto stride = source->get_stride(); const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; @@ -1215,34 +1215,24 @@ void 
convert_to_sellp(std::shared_ptr exec, const int slice_num = ceildiv(num_rows, slice_size); auto nnz_per_row = Array(exec, num_rows); - std::cout << "calculate_nonzeros_per_row" << std::endl; + calculate_nonzeros_per_row(exec, source, &nnz_per_row); - exec->synchronize(); - std::cout << "calculate_nonzeros_per_row finish" << std::endl; + auto grid_dim = slice_num; if (grid_dim > 0) { - std::cout << "calculate_slice_lengths" << std::endl; kernel::calculate_slice_lengths_call( - grid_dim, config::warp_size, 0, exec->get_queue(), num_rows, - slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), + grid_dim, sg_size, 0, exec->get_queue(), cfg, num_rows, slice_size, + slice_num, stride_factor, nnz_per_row.get_const_data(), slice_lengths, slice_sets); - exec->synchronize(); - std::cout << "calculate_slice_lengths finish" << std::endl; } - std::cout << "prefix_sum" << std::endl; components::prefix_sum(exec, slice_sets, slice_num + 1); - // exec->synchronize(); - std::cout << "prefix_sum finish" << std::endl; - grid_dim = ceildiv(num_rows, default_block_size); + grid_dim = ceildiv(num_rows, wg_size); if (grid_dim > 0) { - std::cout << "fill_in_sellp" << std::endl; - kernel::fill_in_sellp(grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, slice_size, - stride, source->get_const_values(), slice_lengths, + kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, slice_size, stride, + source->get_const_values(), slice_lengths, slice_sets, col_idxs, vals); - exec->synchronize(); - std::cout << "fill_in_sellp finish" << std::endl; } } @@ -1300,14 +1290,14 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, auto block_results = Array(exec, grid_dim); kernel::reduce_max_nnz_call( - grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, num_rows, nnz_per_row.get_const_data(), block_results.get_data()); auto d_result = Array(exec, 1); kernel::reduce_max_nnz_call( - 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, - block_results.get_const_data(), d_result.get_data()); + 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + grid_dim, block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } @@ -1336,8 +1326,8 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const dim3 grid_size(grid_x, 1, 1); if (grid_x > 0) { kernel::count_nnz_per_row_call( - grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], - source->get_size()[1], source->get_stride(), + grid_size, block_size, 0, exec->get_queue(), cfg, + source->get_size()[0], source->get_size()[1], source->get_stride(), source->get_const_values(), result->get_data()); } } @@ -1380,23 +1370,23 @@ void calculate_total_cols(std::shared_ptr exec, auto grid_dim = ceildiv(slice_num * sg_size, wg_size); kernel::reduce_max_nnz_per_slice_call( - grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, + grid_dim, wg_size, 0, exec->get_queue(), cfg, num_rows, slice_size, stride_factor, nnz_per_row.get_const_data(), max_nnz_per_slice.get_data()); grid_dim = ceildiv(slice_num, wg_size); auto block_results = Array(exec, grid_dim); - kernel::reduce_total_cols(grid_dim, wg_size, wg_size * sizeof(size_type), - exec->get_queue(), slice_num, - max_nnz_per_slice.get_const_data(), - block_results.get_data()); + kernel::reduce_total_cols_call( + grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, 
+ slice_num, max_nnz_per_slice.get_const_data(), + block_results.get_data()); auto d_result = Array(exec, 1); - kernel::reduce_total_cols( - 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, - block_results.get_const_data(), d_result.get_data()); + kernel::reduce_total_cols_call( + 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + grid_dim, block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); } From 295ea5d079f9f4692b90e82e0bd9f69fde3e45bc Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 25 May 2021 23:07:31 +0200 Subject: [PATCH 12/22] mv config to the first argument --- dev_tools/scripts/regroup | 2 +- dpcpp/base/helper.dp.cpp | 33 ++++++++++++++++ dpcpp/components/prefix_sum.dp.cpp | 13 ++++--- dpcpp/components/reduction.dp.hpp | 8 ++-- dpcpp/matrix/dense_kernels.dp.cpp | 56 +++++++++++++--------------- dpcpp/test/components/prefix_sum.cpp | 3 -- dpcpp/test/matrix/dense_kernels.cpp | 1 - dpcpp/test/utils.hpp | 54 --------------------------- 8 files changed, 71 insertions(+), 99 deletions(-) delete mode 100644 dpcpp/test/utils.hpp diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup index 9e8a5172d05..1756481e2e4 100644 --- a/dev_tools/scripts/regroup +++ b/dev_tools/scripts/regroup @@ -2,7 +2,7 @@ IncludeBlocks: Regroup IncludeCategories: - Regex: '^<(rapidjson|gflags|gtest|papi).*' Priority: 3 - - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative).*' + - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi).*' Priority: 2 - Regex: '^****************************** +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + #include + #include "dpcpp/base/helper.hpp" diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 62a9700473d..330fa297e58 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
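// Aside (illustrative sketch, not part of the patch): the shape of the
// runtime-to-compile-time dispatch that the GKO_ENABLE_DEFAULT_CONFIG_CALL
// machinery generates, reduced to a hypothetical stand-alone example. A
// runtime `cfg` value is matched against a compile-time candidate list so
// the corresponding template instantiation is invoked; this patch moves
// `cfg` to the front of the generated wrappers' argument lists so it
// precedes the launch parameters (grid, block, shared memory, queue).
#include <cstdint>
#include <iostream>

template <std::uint32_t cfg>
void kernel_impl(int arg)  // stand-in for one compiled kernel variant
{
    std::cout << "launching variant " << cfg << " with " << arg << '\n';
}

// Walk the candidate list and call the instantiation matching `cfg`.
template <std::uint32_t first, std::uint32_t... rest>
void kernel_call(std::uint32_t cfg, int arg)
{
    if (cfg == first) {
        kernel_impl<first>(arg);
        return;
    }
    if constexpr (sizeof...(rest) > 0) {
        kernel_call<rest...>(cfg, arg);
    }  // else: no matching configuration was compiled in
}

int main()
{
    // In the library, cfg comes from get_first_cfg() after validating the
    // candidates against the device; 256 here is just an example value.
    kernel_call<512, 256, 128>(256, 42);
}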
#include -#include +#include #include "dpcpp/base/helper.hpp" @@ -57,12 +57,12 @@ constexpr auto block_cfg_list = GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, - BlockCfg, block_cfg_list) + block_cfg_list) GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_prefix_sum, finalize_prefix_sum) GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_prefix_sum_call, finalize_prefix_sum, - BlockCfg, block_cfg_list) + block_cfg_list) template @@ -81,13 +81,14 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, auto num_blocks = ceildiv(num_entries, wg_size); Array block_sum_array(exec, num_blocks - 1); auto block_sums = block_sum_array.get_data(); - start_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), cfg, + start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, block_sums); // add the total sum of the previous block only when the number of block // is larger than 1. if (num_blocks > 1) { - finalize_prefix_sum_call(num_blocks, wg_size, 0, exec->get_queue(), - cfg, num_entries, counts, block_sums); + finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0, + exec->get_queue(), num_entries, counts, + block_sums); } } } diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 2626d40b314..d3e925ee4ba 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -42,9 +42,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include -#include #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -241,7 +241,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, reduce_add_array); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, - KCFG_1D, kcfg_1d_list); + kcfg_1d_list); /** @@ -277,7 +277,7 @@ ValueType reduce_add_array(std::shared_ptr exec, block_results.resize_and_reset(grid_dim); - reduce_add_array_call(grid_dim, wg_size, 0, exec->get_queue(), cfg, + reduce_add_array_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), size, source, block_results.get_data()); block_results_val = block_results.get_const_data(); @@ -285,7 +285,7 @@ ValueType reduce_add_array(std::shared_ptr exec, auto d_result = Array(exec, 1); - reduce_add_array_call(1, wg_size, 0, exec->get_queue(), cfg, grid_dim, + reduce_add_array_call(cfg, 1, wg_size, 0, exec->get_queue(), grid_dim, block_results_val, d_result.get_data()); answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 3e4fee0a2d5..e83c5dba5cc 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" +#include + + #include #include @@ -45,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include #include "core/components/prefix_sum.hpp" @@ -188,11 +190,6 @@ void compute_partial_reduce( } } -// GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_reduce_config, -// compute_partial_reduce); -// GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_reduce_call, -// compute_partial_reduce_config, -// KCFG_1D, kcfg_1d_list); template @@ -267,7 +264,7 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, compute_partial_dot) GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, - KCFG_1D, kcfg_1d_list) + kcfg_1d_list) template @@ -309,7 +306,7 @@ void finalize_dot_computation(dim3 grid, dim3 block, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation, finalize_dot_computation) GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call, - finalize_dot_computation, KCFG_1D, kcfg_1d_list) + finalize_dot_computation, kcfg_1d_list) template @@ -353,7 +350,7 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, compute_partial_norm2) GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, - compute_partial_norm2, KCFG_1D, kcfg_1d_list) + compute_partial_norm2, kcfg_1d_list) template @@ -395,8 +392,7 @@ void finalize_norm2_computation(dim3 grid, dim3 block, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation, finalize_norm2_computation) GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call, - finalize_norm2_computation, KCFG_1D, - kcfg_1d_list) + finalize_norm2_computation, kcfg_1d_list) template @@ -452,7 +448,7 @@ void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row) GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row) GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, - KCFG_1D, kcfg_1d_list) + kcfg_1d_list) template @@ -552,7 +548,7 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, calculate_slice_lengths) GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, - calculate_slice_lengths, KCFG_1D, kcfg_1d_list) + calculate_slice_lengths, kcfg_1d_list) template @@ -627,7 +623,7 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, } GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); -GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, KCFG_1D, +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, kcfg_1d_list) template @@ -666,7 +662,7 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice, reduce_max_nnz_per_slice) GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, - reduce_max_nnz_per_slice, KCFG_1D, kcfg_1d_list) + reduce_max_nnz_per_slice, kcfg_1d_list) template @@ -708,7 +704,7 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols, reduce_total_cols); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, - KCFG_1D, kcfg_1d_list) + kcfg_1d_list) template @@ -1001,11 +997,11 @@ void compute_dot(std::shared_ptr exec, // TODO: write a kernel which does this more efficiently for (size_type 
col = 0; col < x->get_size()[1]; ++col) { kernel::compute_partial_dot_call( - grid_dim, block_dim, 0, exec->get_queue(), cfg, + cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), y->get_const_values() + col, y->get_stride(), work.get_data()); kernel::finalize_dot_computation_call( - 1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x, + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } } @@ -1058,11 +1054,11 @@ void compute_norm2(std::shared_ptr exec, // TODO: write a kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { kernel::compute_partial_norm2_call( - grid_dim, block_dim, 0, exec->get_queue(), cfg, + cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), work.get_data()); kernel::finalize_norm2_computation_call( - 1, block_dim, 0, exec->get_queue(), cfg, grid_dim.x, + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } } @@ -1129,9 +1125,9 @@ void convert_to_csr(std::shared_ptr exec, const auto rows_per_block = ceildiv(wg_size, sg_size); const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); - kernel::count_nnz_per_row_call(grid_dim_nnz, wg_size, 0, exec->get_queue(), - cfg, num_rows, num_cols, stride, - source->get_const_values(), row_ptrs); + kernel::count_nnz_per_row_call( + cfg, grid_dim_nnz, wg_size, 0, exec->get_queue(), num_rows, num_cols, + stride, source->get_const_values(), row_ptrs); components::prefix_sum(exec, row_ptrs, num_rows + 1); @@ -1222,7 +1218,7 @@ void convert_to_sellp(std::shared_ptr exec, if (grid_dim > 0) { kernel::calculate_slice_lengths_call( - grid_dim, sg_size, 0, exec->get_queue(), cfg, num_rows, slice_size, + cfg, grid_dim, sg_size, 0, exec->get_queue(), num_rows, slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), slice_lengths, slice_sets); } @@ -1290,13 +1286,13 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, auto block_results = Array(exec, grid_dim); kernel::reduce_max_nnz_call( - grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), num_rows, nnz_per_row.get_const_data(), block_results.get_data()); auto d_result = Array(exec, 1); kernel::reduce_max_nnz_call( - 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); @@ -1326,7 +1322,7 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, const dim3 grid_size(grid_x, 1, 1); if (grid_x > 0) { kernel::count_nnz_per_row_call( - grid_size, block_size, 0, exec->get_queue(), cfg, + cfg, grid_size, block_size, 0, exec->get_queue(), source->get_size()[0], source->get_size()[1], source->get_stride(), source->get_const_values(), result->get_data()); } @@ -1370,7 +1366,7 @@ void calculate_total_cols(std::shared_ptr exec, auto grid_dim = ceildiv(slice_num * sg_size, wg_size); kernel::reduce_max_nnz_per_slice_call( - grid_dim, wg_size, 0, exec->get_queue(), cfg, num_rows, slice_size, + cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, stride_factor, nnz_per_row.get_const_data(), max_nnz_per_slice.get_data()); @@ -1378,14 +1374,14 @@ void calculate_total_cols(std::shared_ptr exec, auto 
block_results = Array(exec, grid_dim); kernel::reduce_total_cols_call( - grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), slice_num, max_nnz_per_slice.get_const_data(), block_results.get_data()); auto d_result = Array(exec, 1); kernel::reduce_total_cols_call( - 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), cfg, + cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), grid_dim, block_results.get_const_data(), d_result.get_data()); *result = exec->copy_val_to_host(d_result.get_const_data()); diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp index 3e2e7ca9d64..2ae72880443 100644 --- a/dpcpp/test/components/prefix_sum.cpp +++ b/dpcpp/test/components/prefix_sum.cpp @@ -44,9 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "dpcpp/test/utils.hpp" - - namespace { diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 3cd080313cf..e47de0a6487 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -50,7 +50,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array.hpp" #include "core/matrix/dense_kernels.hpp" -#include "dpcpp/test/utils.hpp" namespace { diff --git a/dpcpp/test/utils.hpp b/dpcpp/test/utils.hpp deleted file mode 100644 index 88d98f0d9f6..00000000000 --- a/dpcpp/test/utils.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2020, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*************************************************************/ -#ifndef GKO_DPCPP_TEST_UTILS_HPP_ -#define GKO_DPCPP_TEST_UTILS_HPP_ - - -#include "core/test/utils.hpp" - - -#include - - -namespace { - - -// prevent device reset after each test -auto no_reset_exec = - gko::DpcppExecutor::create(0, gko::ReferenceExecutor::create()); - - -} // namespace - - -#endif // GKO_DPCPP_TEST_UTILS_HPP_ From e0ed0c3145855c7c33317d0ca510f10fdc1f0cf3 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 1 Jun 2021 17:40:52 +0200 Subject: [PATCH 13/22] update latest ConfigSet and dense kernel/test --- dpcpp/base/helper.hpp | 3 +- dpcpp/components/prefix_sum.dp.cpp | 7 +- dpcpp/components/prefix_sum.dp.hpp | 9 +- dpcpp/components/reduction.dp.hpp | 11 +- dpcpp/matrix/dense_kernels.dp.cpp | 374 ++++++++++++++--------- dpcpp/test/components/prefix_sum.cpp | 3 + dpcpp/test/matrix/dense_kernels.cpp | 436 ++++++++++++++++++++++----- 7 files changed, 622 insertions(+), 221 deletions(-) diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 3979caa905c..8c7f45e5174 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/types.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -142,7 +143,7 @@ bool validate(sycl::queue *queue, unsigned workgroup_size, template -ConfigSetType get_first_cfg(IterArr &arr, Validate verify) +std::uint32_t get_first_cfg(IterArr &arr, Validate verify) { for (auto &cfg : arr) { if (verify(cfg)) { diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 330fa297e58..07cdb5b38aa 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/types.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/prefix_sum.dp.hpp" @@ -52,7 +53,7 @@ namespace components { using BlockCfg = ConfigSet<11>; constexpr auto block_cfg_list = - ::gko::syn::value_list(); GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) @@ -73,8 +74,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, if (num_entries > 0) { auto queue = exec->get_queue(); constexpr auto block_cfg_array = as_array(block_cfg_list); - const ConfigSetType cfg = - get_first_cfg(block_cfg_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) { return validate(queue, BlockCfg::decode<0>(cfg), 16); }); const auto wg_size = BlockCfg::decode<0>(cfg); diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index fd9ff2ac263..f76f85135eb 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/types.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" @@ -125,7 +126,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, * @note To calculate the prefix sum over an array of size bigger than * `block_size`, `finalize_prefix_sum` has to be used as well.
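// Aside (illustrative sketch, not part of the patch): the idea behind
// ConfigSet and validate(), in simplified form. ConfigSet<11, 7> packs a
// workgroup size and a subgroup size into a single std::uint32_t (field
// widths in bits), and get_first_cfg() returns the first packed candidate
// the device accepts. The exact bit layout and validate_sketch() below are
// assumptions for illustration, not the library's definitions; the SYCL
// header path may also differ (<sycl/sycl.hpp> on newer toolchains).
#include <algorithm>
#include <cstdint>
#include <CL/sycl.hpp>

constexpr std::uint32_t encode(std::uint32_t wg, std::uint32_t sg)
{
    return (wg << 7) | sg;  // assumed layout: low 7 bits hold the subgroup
}                           // size, the next 11 bits the workgroup size
constexpr std::uint32_t decode_wg(std::uint32_t cfg) { return cfg >> 7; }
constexpr std::uint32_t decode_sg(std::uint32_t cfg) { return cfg & 0x7f; }

// A configuration is usable when the device allows the workgroup size and
// offers the requested subgroup size.
bool validate_sketch(sycl::queue *queue, std::uint32_t wg, std::uint32_t sg)
{
    const auto device = queue->get_device();
    const auto max_wg =
        device.get_info<sycl::info::device::max_work_group_size>();
    const auto sub_sizes =
        device.get_info<sycl::info::device::sub_group_sizes>();
    return wg <= max_wg && std::find(sub_sizes.begin(), sub_sizes.end(),
                                     sg) != sub_sizes.end();
}

static_assert(decode_wg(encode(512, 32)) == 512, "wg round-trips");
static_assert(decode_sg(encode(512, 32)) == 32, "sg round-trips");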
*/ -template +template void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, ValueType *__restrict__ block_sum, sycl::nd_item<3> item_ct1, @@ -178,7 +179,7 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, } } -template +template void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_elements, ValueType *elements, ValueType *block_sum) @@ -214,7 +215,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, * * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. */ -template +template void finalize_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, const ValueType *__restrict__ block_sum, @@ -231,7 +232,7 @@ void finalize_prefix_sum(size_type num_elements, } } -template +template void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_elements, ValueType *elements, const ValueType *block_sum) diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index d3e925ee4ba..9c2387a7113 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -63,7 +64,7 @@ namespace dpcpp { constexpr int default_block_size = 256; using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); @@ -201,7 +202,7 @@ void reduce_array(size_type size, const ValueType *__restrict__ source, * `source` of any size. Has to be called a second time on `result` to reduce * an array larger than `block_size`. */ -template +template void reduce_add_array( size_type size, const ValueType *__restrict__ source, ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, @@ -216,7 +217,7 @@ void reduce_add_array( } } -template +template void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const ValueType *source, ValueType *result) @@ -263,8 +264,8 @@ ValueType reduce_add_array(std::shared_ptr exec, ValueType answer = zero(); auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index e83c5dba5cc..58f688951c9 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/dense_kernels.hpp" -#include - - #include #include @@ -70,12 +67,14 @@ namespace dpcpp { */ namespace dense { + using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); +constexpr auto subgroup_list = syn::value_list(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); constexpr auto default_block_size = 256; @@ -119,6 +118,7 @@ void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, GKO_ENABLE_DEFAULT_HOST(scale, scale) + template void add_scaled(size_type num_rows, size_type num_cols, size_type num_alpha_cols, const ValueType *__restrict__ alpha, @@ -157,7 +157,7 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) -template void compute_partial_reduce( size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value, @@ -191,7 +191,7 @@ void compute_partial_reduce( } -template void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, @@ -220,7 +220,7 @@ void finalize_reduce_computation( } -template +template void compute_partial_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, @@ -230,13 +230,13 @@ void compute_partial_dot( compute_partial_reduce( num_rows, work, [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * conj(y[i * stride_y]); + return x[i * stride_x] * y[i * stride_y]; }, [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, tmp_work); } -template +template void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -244,7 +244,6 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, ValueType *work) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - std::cout << "partial " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { sycl::accessor, 0, sycl::access::mode::read_write, @@ -267,8 +266,54 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, kcfg_1d_list) -template -void finalize_dot_computation( +template +void compute_partial_conj_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return conj(x[i * stride_x]) * y[i * stride_y]; + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_conj_dot(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, const ValueType *x, + size_type stride_x, const ValueType *y, + size_type stride_y, ValueType *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_conj_dot( + num_rows, x, stride_x, y, stride_y, work, item_ct1, + (UninitializedArray *) + tmp_work_acc_ct1.get_pointer()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, + compute_partial_conj_dot) 
+GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, + compute_partial_conj_dot, kcfg_1d_list) + + +template +void finalize_sum_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, UninitializedArray(cfg)> *tmp_work) @@ -279,14 +324,13 @@ void finalize_dot_computation( [](const ValueType &x) { return x; }, item_ct1, tmp_work); } -template -void finalize_dot_computation(dim3 grid, dim3 block, - size_t dynamic_shared_memory, sycl::queue *stream, - size_type size, const ValueType *work, - ValueType *result) +template +void finalize_sum_reduce_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - std::cout << "finalize " << cfg << std::endl; stream->submit([&](sycl::handler &cgh) { sycl::accessor, 0, sycl::access::mode::read_write, @@ -295,7 +339,7 @@ void finalize_dot_computation(dim3 grid, dim3 block, cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_dot_computation( + finalize_sum_reduce_computation( size, work, result, item_ct1, (UninitializedArray *) tmp_work_acc_ct1.get_pointer()); @@ -303,13 +347,13 @@ void finalize_dot_computation(dim3 grid, dim3 block, }); } -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_dot_computation, - finalize_dot_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_dot_computation_call, - finalize_dot_computation, kcfg_1d_list) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation, + finalize_sum_reduce_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, + finalize_sum_reduce_computation, kcfg_1d_list) -template +template void compute_partial_norm2( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, @@ -324,7 +368,7 @@ void compute_partial_norm2( tmp_work); } -template +template void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -353,8 +397,8 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, compute_partial_norm2, kcfg_1d_list) -template -void finalize_norm2_computation( +template +void finalize_sqrt_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, UninitializedArray(cfg)> *tmp_work) @@ -362,14 +406,14 @@ void finalize_norm2_computation( finalize_reduce_computation( size, work, result, [](const ValueType &x, const ValueType &y) { return x + y; }, - [](const ValueType &x) { return sqrt(x); }, item_ct1, tmp_work); + [](const ValueType &x) { return std::sqrt(x); }, item_ct1, tmp_work); } -template -void finalize_norm2_computation(dim3 grid, dim3 block, - size_t dynamic_shared_memory, - sycl::queue *stream, size_type size, - const ValueType *work, ValueType *result) +template +void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); stream->submit([&](sycl::handler &cgh) { @@ -381,7 +425,7 @@ void finalize_norm2_computation(dim3 grid, dim3 block, cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_norm2_computation( + finalize_sqrt_reduce_computation( size, work, result, item_ct1, 
(UninitializedArray *) tmp_work_acc_ct1.get_pointer()); @@ -389,13 +433,13 @@ void finalize_norm2_computation(dim3 grid, dim3 block, }); } -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_norm2_computation, - finalize_norm2_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_norm2_computation_call, - finalize_norm2_computation, kcfg_1d_list) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation, + finalize_sqrt_reduce_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, + finalize_sqrt_reduce_computation, kcfg_1d_list) -template +template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type *__restrict__ row_ptrs, const ValueType *__restrict__ source, @@ -418,10 +462,12 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, } } -GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) +GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_coo, fill_in_coo) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_coo, fill_in_coo) +GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_coo_call, fill_in_coo, kcfg_1d_list) -template +template void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, const ValueType *__restrict__ work, IndexType *__restrict__ result, @@ -451,7 +497,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, kcfg_1d_list) -template +template void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, const ValueType *__restrict__ source, IndexType *__restrict__ row_ptrs, @@ -472,10 +518,12 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, } } -GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) +GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_csr, fill_in_csr) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_csr, fill_in_csr) +GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_csr_call, fill_in_csr, kcfg_1d_list) -template +template void fill_in_ell(size_type num_rows, size_type num_cols, size_type source_stride, const ValueType *__restrict__ source, size_type max_nnz_per_row, size_type result_stride, @@ -505,10 +553,12 @@ void fill_in_ell(size_type num_rows, size_type num_cols, } } -GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) +GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_ell, fill_in_ell) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_ell, fill_in_ell) +GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_ell_call, fill_in_ell, kcfg_1d_list) -template +template void calculate_slice_lengths(size_type num_rows, size_type slice_size, int slice_num, size_type stride_factor, const size_type *__restrict__ nnz_per_row, @@ -516,7 +566,7 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, size_type *__restrict__ slice_sets, sycl::nd_item<3> item_ct1) { - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + constexpr auto sg_size = cfg; const auto sliceid = item_ct1.get_group(2); const auto tid_in_warp = item_ct1.get_local_id(2); @@ -548,10 +598,10 @@ GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, calculate_slice_lengths) GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, - calculate_slice_lengths, kcfg_1d_list) + calculate_slice_lengths, subgroup_list) -template +template void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, size_type stride, const ValueType *__restrict__ source, size_type *__restrict__ slice_lengths, @@ -584,9 +634,12 @@ void fill_in_sellp(size_type num_rows, 
size_type num_cols, size_type slice_size, } } -GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) +GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_sellp, fill_in_sellp) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_sellp, fill_in_sellp) +GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_sellp_call, fill_in_sellp, kcfg_1d_list) -template + +template void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result, sycl::nd_item<3> item_ct1, uint8_t *dpct_local) @@ -603,7 +656,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, } } -template +template void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const size_type *nnz_per_row, size_type *result) @@ -626,7 +679,8 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, kcfg_1d_list) -template + +template void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, size_type stride_factor, const size_type *__restrict__ nnz_per_row, @@ -665,7 +719,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, reduce_max_nnz_per_slice, kcfg_1d_list) -template +template void reduce_total_cols(size_type num_slices, const size_type *__restrict__ max_nnz_per_slice, size_type *__restrict__ result, @@ -682,7 +736,7 @@ void reduce_total_cols(size_type num_slices, } } -template +template void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_slices, const size_type *max_nnz_per_slice, size_type *result) @@ -960,6 +1014,34 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); +namespace { + + +#define GKO_BIND_DOT(ValueType, Name, Func) \ + void Name(::cl::sycl::queue &exec_queue, std::int64_t n, \ + const ValueType *x, std::int64_t incx, const ValueType *y, \ + std::int64_t incy, ValueType *result) \ + { \ + Func(exec_queue, n, x, incx, y, incy, result); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(std::complex, conj_dot, + oneapi::mkl::blas::row_major::dotc); + + +} // namespace + + template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, @@ -981,15 +1063,13 @@ void compute_dot(std::shared_ptr exec, constexpr auto work_per_thread = 32; auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); - std::cout << "dot " << cfg << " " << wg_size << " " << sg_size - << std::endl; const auto work_per_block = work_per_thread * wg_size; const dim3 grid_dim = ceildiv(x->get_size()[0], 
work_per_block); const dim3 block_dim{sg_size, 1, wg_size / sg_size}; @@ -1000,7 +1080,7 @@ void compute_dot(std::shared_ptr exec, cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_dot_computation_call( + kernel::finalize_sum_reduce_computation_call( cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } @@ -1014,7 +1094,47 @@ template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if (0) { + // TODO: write a custom kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); + } + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_conj_dot_call( + cfg, grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_sum_reduce_computation_call( + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); @@ -1039,8 +1159,8 @@ void compute_norm2(std::shared_ptr exec, constexpr auto work_per_thread = 32; auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1057,7 +1177,7 @@ void compute_norm2(std::shared_ptr exec, cfg, grid_dim, block_dim, 0, exec->get_queue(), x->get_size()[0], x->get_const_values() + col, x->get_stride(), work.get_data()); - kernel::finalize_norm2_computation_call( + kernel::finalize_sqrt_reduce_computation_call( cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, work.get_const_data(), result->get_values() + col); } @@ -1086,12 +1206,21 @@ void convert_to_coo(std::shared_ptr exec, components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); - size_type grid_dim = ceildiv(num_rows, default_block_size); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + 
get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_coo(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, - nnz_prefix_sum.get_const_data(), - source->get_const_values(), row_idxs, col_idxs, values); + kernel::fill_in_coo_call( + cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, num_cols, + stride, nnz_prefix_sum.get_const_data(), source->get_const_values(), + row_idxs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1105,8 +1234,8 @@ void convert_to_csr(std::shared_ptr exec, { auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1133,9 +1262,10 @@ void convert_to_csr(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_csr(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, stride, source->get_const_values(), row_ptrs, - col_idxs, values); + kernel::fill_in_csr_call(cfg, grid_dim, default_block_size, 0, + exec->get_queue(), num_rows, num_cols, stride, + source->get_const_values(), row_ptrs, col_idxs, + values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1157,11 +1287,20 @@ void convert_to_ell(std::shared_ptr exec, auto source_stride = source->get_stride(); auto result_stride = result->get_stride(); - auto grid_dim = ceildiv(result_stride, default_block_size); - kernel::fill_in_ell(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, source_stride, - source->get_const_values(), max_nnz_per_row, - result_stride, col_ptrs, values); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + auto grid_dim = ceildiv(result_stride, wg_size); + kernel::fill_in_ell_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), + num_rows, num_cols, source_stride, + source->get_const_values(), max_nnz_per_row, + result_stride, col_ptrs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1185,8 +1324,8 @@ void convert_to_sellp(std::shared_ptr exec, { auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1211,24 +1350,25 @@ void convert_to_sellp(std::shared_ptr exec, const int slice_num = ceildiv(num_rows, slice_size); auto nnz_per_row = Array(exec, num_rows); - calculate_nonzeros_per_row(exec, source, &nnz_per_row); auto grid_dim = slice_num; if (grid_dim > 0) { kernel::calculate_slice_lengths_call( - cfg, grid_dim, sg_size, 0, exec->get_queue(), num_rows, slice_size, - slice_num, stride_factor, nnz_per_row.get_const_data(), + sg_size, 
grid_dim, sg_size, 0, exec->get_queue(), num_rows, + slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), slice_lengths, slice_sets); } + components::prefix_sum(exec, slice_sets, slice_num + 1); + grid_dim = ceildiv(num_rows, wg_size); if (grid_dim > 0) { - kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, slice_size, stride, - source->get_const_values(), slice_lengths, - slice_sets, col_idxs, vals); + kernel::fill_in_sellp_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), + num_rows, num_cols, slice_size, stride, + source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); } } @@ -1272,14 +1412,12 @@ void calculate_max_nnz_per_row(std::shared_ptr exec, calculate_nonzeros_per_row(exec, source, &nnz_per_row); auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); const auto wg_size = KCFG_1D::decode<0>(cfg); - std::cout << "wg_size " << wg_size << "sg_size " << KCFG_1D::decode<1>(cfg) - << std::endl; const auto n = ceildiv(num_rows, wg_size); const size_type grid_dim = (n <= wg_size) ? n : wg_size; @@ -1309,8 +1447,8 @@ void calculate_nonzeros_per_row(std::shared_ptr exec, { auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1355,8 +1493,8 @@ void calculate_total_cols(std::shared_ptr exec, auto max_nnz_per_slice = Array(exec, slice_num); auto queue = exec->get_queue(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const ConfigSetType cfg = - get_first_cfg(kcfg_1d_array, [&queue](ConfigSetType cfg) { + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -1394,27 +1532,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void transpose(std::shared_ptr exec, const matrix::Dense *orig, - matrix::Dense *trans) -{ - // if (cublas::is_supported::value) { - // auto handle = exec->get_cublas_handle(); - // { - // cublas::pointer_mode_guard pm_guard(handle); - // auto alpha = one(); - // auto beta = zero(); - // cublas::geam( - // handle, oneapi::mkl::transpose::trans, - // oneapi::mkl::transpose::nontrans, orig->get_size()[0], - // orig->get_size()[1], &alpha, orig->get_const_values(), - // orig->get_stride(), &beta, static_cast(nullptr), trans->get_size()[1], trans->get_values(), - // trans->get_stride()); - // } - // } else { - // GKO_NOT_IMPLEMENTED; - // } - GKO_NOT_IMPLEMENTED; -}; + matrix::Dense *trans) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); @@ -1422,27 +1540,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL); template void conj_transpose(std::shared_ptr exec, const matrix::Dense *orig, - matrix::Dense *trans) -{ - // if (cublas::is_supported::value) { - // auto handle = exec->get_cublas_handle(); - // { - // cublas::pointer_mode_guard pm_guard(handle); - // auto alpha = one(); - // auto beta = zero(); - // cublas::geam( - // handle, 
oneapi::mkl::transpose::conjtrans, - // oneapi::mkl::transpose::nontrans, orig->get_size()[0], - // orig->get_size()[1], &alpha, orig->get_const_values(), - // orig->get_stride(), &beta, static_cast(nullptr), trans->get_size()[1], trans->get_values(), - // trans->get_stride()); - // } - // } else { - // GKO_NOT_IMPLEMENTED; - // } - GKO_NOT_IMPLEMENTED; -} + matrix::Dense *trans) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CONJ_TRANSPOSE_KERNEL); diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp index 2ae72880443..402192d0b77 100644 --- a/dpcpp/test/components/prefix_sum.cpp +++ b/dpcpp/test/components/prefix_sum.cpp @@ -44,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/test/utils.hpp" + + namespace { diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index e47de0a6487..2b9af16732a 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/fill_array.hpp" #include "core/matrix/dense_kernels.hpp" +#include "core/test/utils.hpp" namespace { @@ -64,9 +65,12 @@ class Dense : public ::testing::Test { using vtype = double; #endif // GINKGO_DPCPP_SINGLE_MODE using Mtx = gko::matrix::Dense; + using MixedMtx = gko::matrix::Dense>; using NormVector = gko::matrix::Dense>; using Arr = gko::Array; - // using ComplexMtx = gko::matrix::Dense>; + using ComplexMtx = gko::matrix::Dense>; + using MixedComplexMtx = + gko::matrix::Dense>>; Dense() : rand_engine(15) {} @@ -116,15 +120,16 @@ class Dense : public ::testing::Test { void set_up_apply_data() { x = gen_mtx(65, 25); - // c_x = gen_mtx(65, 25); + c_x = gen_mtx(65, 25); y = gen_mtx(25, 35); expected = gen_mtx(65, 35); alpha = gko::initialize({2.0}, ref); beta = gko::initialize({-1.0}, ref); + square = gen_mtx(x->get_size()[0], x->get_size()[0]); dx = Mtx::create(dpcpp); dx->copy_from(x.get()); - // dc_x = ComplexMtx::create(dpcpp); - // dc_x->copy_from(c_x.get()); + dc_x = ComplexMtx::create(dpcpp); + dc_x->copy_from(c_x.get()); dy = Mtx::create(dpcpp); dy->copy_from(y.get()); dresult = Mtx::create(dpcpp); @@ -133,6 +138,8 @@ class Dense : public ::testing::Test { dalpha->copy_from(alpha.get()); dbeta = Mtx::create(dpcpp); dbeta->copy_from(beta.get()); + dsquare = Mtx::create(dpcpp); + dsquare->copy_from(square.get()); std::vector tmp(x->get_size()[0], 0); auto rng = std::default_random_engine{}; @@ -141,14 +148,25 @@ class Dense : public ::testing::Test { std::vector tmp2(x->get_size()[1], 0); std::iota(tmp2.begin(), tmp2.end(), 0); std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector tmp3(x->get_size()[0] / 10); + std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + for (auto &i : tmp3) { + i = row_dist(rng); + } rpermute_idxs = std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); - drpermute_idxs = - std::unique_ptr(new Arr{dpcpp, tmp.begin(), tmp.end()}); cpermute_idxs = std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); - dcpermute_idxs = - std::unique_ptr(new Arr{dpcpp, tmp2.begin(), tmp2.end()}); + rgather_idxs = + std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + } + + template + std::unique_ptr convert(InputType &&input) + { + auto result = ConvertedType::create(input->get_executor()); + input->convert_to(result.get()); + return result; } std::shared_ptr ref; @@ -157,21 +175,22 @@ class Dense : public 
::testing::Test { std::ranlux48 rand_engine; std::unique_ptr x; - // std::unique_ptr c_x; + std::unique_ptr c_x; std::unique_ptr y; std::unique_ptr alpha; std::unique_ptr beta; std::unique_ptr expected; + std::unique_ptr square; std::unique_ptr dresult; std::unique_ptr dx; - // std::unique_ptr dc_x; + std::unique_ptr dc_x; std::unique_ptr dy; std::unique_ptr dalpha; std::unique_ptr dbeta; + std::unique_ptr dsquare; std::unique_ptr rpermute_idxs; - std::unique_ptr drpermute_idxs; std::unique_ptr cpermute_idxs; - std::unique_ptr dcpermute_idxs; + std::unique_ptr rgather_idxs; }; @@ -201,7 +220,7 @@ TEST_F(Dense, DpcppStridedFillIsEquivalentToRef) dx->fill(42); result->copy_from(dx.get()); - GKO_ASSERT_MTX_NEAR(result, x, r::value); + GKO_ASSERT_MTX_NEAR(result, x, r::value); } @@ -318,6 +337,28 @@ TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) } +TEST_F(Dense, SingleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, MultipleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef) { set_up_vector_data(20); @@ -343,6 +384,23 @@ TEST_F(Dense, SimpleApplyIsEquivalentToRef) } +#if !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(convert(y).get(), convert(expected).get()); + dx->apply(convert(dy).get(), convert(dresult).get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7); +} + + +#endif // !GINKGO_DPCPP_SINGLE_MODE + + TEST_F(Dense, AdvancedApplyIsEquivalentToRef) { set_up_apply_data(); @@ -354,38 +412,142 @@ TEST_F(Dense, AdvancedApplyIsEquivalentToRef) } -// TEST_F(Dense, ApplyToComplexIsEquivalentToRef) -// { -// set_up_apply_data(); -// auto complex_b = gen_mtx(25, 1); -// auto dcomplex_b = ComplexMtx::create(dpcpp); -// dcomplex_b->copy_from(complex_b.get()); -// auto complex_x = gen_mtx(65, 1); -// auto dcomplex_x = ComplexMtx::create(dpcpp); -// dcomplex_x->copy_from(complex_x.get()); +#if !GINKGO_DPCPP_SINGLE_MODE -// x->apply(complex_b.get(), complex_x.get()); -// dx->apply(dcomplex_b.get(), dcomplex_x.get()); -// GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); -// } +TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef) +{ + set_up_apply_data(); + x->apply(convert(alpha).get(), convert(y).get(), + convert(beta).get(), convert(expected).get()); + dx->apply(convert(dalpha).get(), convert(dy).get(), + convert(dbeta).get(), convert(dresult).get()); -// TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) -// { -// set_up_apply_data(); -// auto complex_b = gen_mtx(25, 1); -// auto dcomplex_b = ComplexMtx::create(dpcpp); -// dcomplex_b->copy_from(complex_b.get()); -// auto complex_x = gen_mtx(65, 1); -// auto dcomplex_x = ComplexMtx::create(dpcpp); -// dcomplex_x->copy_from(complex_x.get()); + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7); +} -// x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); -// dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); -// GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-14); -// } +#endif // !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, ApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = 
gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r::value); +} + + +#if !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = MixedComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = MixedComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7); +} + +#endif // !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); + dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r::value); +} + + +#if !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = MixedComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = MixedComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(convert(alpha).get(), complex_b.get(), + convert(beta).get(), complex_x.get()); + dx->apply(convert(dalpha).get(), dcomplex_b.get(), + convert(dbeta).get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7); +} + + +#endif // !GINKGO_DPCPP_SINGLE_MODE + + +TEST_F(Dense, ComputeDotComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(1234, 2); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(1234, 2); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2}); + auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2}); + + complex_b->compute_dot(complex_x.get(), result.get()); + dcomplex_b->compute_dot(dcomplex_x.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(result, dresult, r::value); +} + + +TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(1234, 2); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(1234, 2); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2}); + auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2}); + + complex_b->compute_conj_dot(complex_x.get(), result.get()); + dcomplex_b->compute_conj_dot(dcomplex_x.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(result, dresult, r::value); +} // TEST_F(Dense, IsTransposable) @@ -494,42 +656,42 @@ TEST_F(Dense, MoveToEllIsEquivalentToRef) } -// 
TEST_F(Dense, ConvertToSellpIsEquivalentToRef) -// { -// set_up_apply_data(); -// auto sellp_mtx = gko::matrix::Sellp<>::create(ref); -// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp::create(ref); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); -// x->convert_to(sellp_mtx.get()); -// dx->convert_to(dsellp_mtx.get()); + x->convert_to(sellp_mtx.get()); + dx->convert_to(dsellp_mtx.get()); -// GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6); -// } + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); +} -// TEST_F(Dense, MoveToSellpIsEquivalentToRef) -// { -// set_up_apply_data(); -// auto sellp_mtx = gko::matrix::Sellp<>::create(ref); -// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +TEST_F(Dense, MoveToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp::create(ref); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); -// x->move_to(sellp_mtx.get()); -// dx->move_to(dsellp_mtx.get()); + x->move_to(sellp_mtx.get()); + dx->move_to(dsellp_mtx.get()); -// GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, 1e-6); -// } + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); +} -// TEST_F(Dense, ConvertsEmptyToSellp) -// { -// auto dempty_mtx = Mtx::create(dpcpp); -// auto dsellp_mtx = gko::matrix::Sellp<>::create(dpcpp); +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(dpcpp); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); -// dempty_mtx->convert_to(dsellp_mtx.get()); + dempty_mtx->convert_to(dsellp_mtx.get()); -// ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), -// 0); ASSERT_FALSE(dsellp_mtx->get_size()); -// } + ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} TEST_F(Dense, CountNNZIsEquivalentToRef) @@ -595,12 +757,63 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef) } +TEST_F(Dense, CanGatherRows) +{ + set_up_apply_data(); + + auto r_gather = x->row_gather(rgather_idxs.get()); + auto dr_gather = dx->row_gather(rgather_idxs.get()); + + GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); +} + + +TEST_F(Dense, CanGatherRowsIntoDense) +{ + set_up_apply_data(); + auto gather_size = + gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]}; + auto r_gather = Mtx::create(ref, gather_size); + // test make_temporary_clone and non-default stride + auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2); + + x->row_gather(rgather_idxs.get(), r_gather.get()); + dx->row_gather(rgather_idxs.get(), dr_gather.get()); + + GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0); +} + + +TEST_F(Dense, IsPermutable) +{ + set_up_apply_data(); + + auto permuted = square->permute(rpermute_idxs.get()); + auto dpermuted = dsquare->permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), + static_cast(dpermuted.get()), 0); +} + + +TEST_F(Dense, IsInversePermutable) +{ + set_up_apply_data(); + + auto permuted = square->inverse_permute(rpermute_idxs.get()); + auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get()); + + GKO_ASSERT_MTX_NEAR(static_cast(permuted.get()), + static_cast(dpermuted.get()), 0); +} + + TEST_F(Dense, IsRowPermutable) { set_up_apply_data(); auto r_permute = x->row_permute(rpermute_idxs.get()); - auto dr_permute = dx->row_permute(drpermute_idxs.get()); + auto dr_permute = dx->row_permute(rpermute_idxs.get()); 
GKO_ASSERT_MTX_NEAR(static_cast(r_permute.get()), static_cast(dr_permute.get()), 0); @@ -612,7 +825,7 @@ TEST_F(Dense, IsColPermutable) set_up_apply_data(); auto c_permute = x->column_permute(cpermute_idxs.get()); - auto dc_permute = dx->column_permute(dcpermute_idxs.get()); + auto dc_permute = dx->column_permute(cpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(c_permute.get()), static_cast(dc_permute.get()), 0); @@ -624,7 +837,7 @@ TEST_F(Dense, IsInverseRowPermutable) set_up_apply_data(); auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get()); - auto d_inverse_r_permute = dx->inverse_row_permute(drpermute_idxs.get()); + auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(inverse_r_permute.get()), static_cast(d_inverse_r_permute.get()), 0); @@ -636,14 +849,14 @@ TEST_F(Dense, IsInverseColPermutable) set_up_apply_data(); auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get()); - auto d_inverse_c_permute = dx->inverse_column_permute(dcpermute_idxs.get()); + auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get()); GKO_ASSERT_MTX_NEAR(static_cast(inverse_c_permute.get()), static_cast(d_inverse_c_permute.get()), 0); } -TEST_F(Dense, ExtractDiagonalIsEquivalentToRef) +TEST_F(Dense, ExtractDiagonalOnTallSkinnyIsEquivalentToRef) { set_up_apply_data(); @@ -654,6 +867,17 @@ TEST_F(Dense, ExtractDiagonalIsEquivalentToRef) } +TEST_F(Dense, ExtractDiagonalOnShortFatIsEquivalentToRef) +{ + set_up_apply_data(); + + auto diag = y->extract_diagonal(); + auto ddiag = dy->extract_diagonal(); + + GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0); +} + + TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef) { set_up_apply_data(); @@ -676,4 +900,76 @@ TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef) } +TEST_F(Dense, MakeComplexIsEquivalentToRef) +{ + set_up_apply_data(); + + auto complex_x = x->make_complex(); + auto dcomplex_x = dx->make_complex(); + + GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); +} + + +TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto complex_x = ComplexMtx::create(ref, x->get_size()); + x->make_complex(complex_x.get()); + auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size()); + dx->make_complex(dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0); +} + + +TEST_F(Dense, GetRealIsEquivalentToRef) +{ + set_up_apply_data(); + + auto real_x = x->get_real(); + auto dreal_x = dx->get_real(); + + GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); +} + + +TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto real_x = Mtx::create(ref, x->get_size()); + x->get_real(real_x.get()); + auto dreal_x = Mtx::create(dpcpp, dx->get_size()); + dx->get_real(dreal_x.get()); + + GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0); +} + + +TEST_F(Dense, GetImagIsEquivalentToRef) +{ + set_up_apply_data(); + + auto imag_x = x->get_imag(); + auto dimag_x = dx->get_imag(); + + GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); +} + + +TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef) +{ + set_up_apply_data(); + + auto imag_x = Mtx::create(ref, x->get_size()); + x->get_imag(imag_x.get()); + auto dimag_x = Mtx::create(dpcpp, dx->get_size()); + dx->get_imag(dimag_x.get()); + + GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0); +} + + } // namespace From 9583b1bca76e7715d36c737928d7d8e8e14c5168 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Wed, 2 Jun 2021 19:46:54 +0200 Subject: [PATCH 14/22] fix sellp stuck --- dpcpp/matrix/dense_kernels.dp.cpp | 37 ++++++++++++++----------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 58f688951c9..41a283d71bd 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -569,28 +569,25 @@ void calculate_slice_lengths(size_type num_rows, size_type slice_size, constexpr auto sg_size = cfg; const auto sliceid = item_ct1.get_group(2); const auto tid_in_warp = item_ct1.get_local_id(2); + const bool runable = sliceid * slice_size + tid_in_warp < num_rows; + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { + thread_result = + (i + slice_size * sliceid < num_rows) + ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) + : thread_result; + } - if (sliceid * slice_size + tid_in_warp < num_rows) { - size_type thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; - } + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + auto warp_result = ::gko::kernels::dpcpp::reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - auto warp_result = ::gko::kernels::dpcpp::reduce( - warp_tile, thread_result, - [](const size_type &a, const size_type &b) { return max(a, b); }); - - if (tid_in_warp == 0) { - auto slice_length = - ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; - } + if (tid_in_warp == 0 && runable) { + auto slice_length = ceildiv(warp_result, stride_factor) * stride_factor; + slice_lengths[sliceid] = slice_length; + slice_sets[sliceid] = slice_length; } } From 485934083e5e321b794d085fea355bd1e844a59f Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Thu, 8 Jul 2021 13:44:52 +0200 Subject: [PATCH 15/22] add cp, update doc, mv mkl bind Co-authored-by: Terry Cojean --- dpcpp/CMakeLists.txt | 4 +- dpcpp/base/config.hpp | 2 +- dpcpp/base/onemkl_bindings.hpp | 128 ++++++++++++++++++ dpcpp/components/prefix_sum.dp.hpp | 1 - dpcpp/components/reduction.dp.hpp | 3 +- dpcpp/components/thread_ids.dp.hpp | 1 - dpcpp/components/uninitialized_array.hpp | 4 +- dpcpp/matrix/dense_kernels.dp.cpp | 63 +++++---- .../ginkgo/core/synthesizer/containers.hpp | 85 ++++++++++++ 9 files changed, 259 insertions(+), 32 deletions(-) create mode 100644 dpcpp/base/onemkl_bindings.hpp diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 48addebaf5f..8755b424433 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -55,13 +55,13 @@ target_sources(ginkgo_dpcpp ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) -set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -fsycl) +set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -DMKL_ILP64) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) -target_link_libraries(ginkgo_dpcpp PRIVATE "mkl_sycl;mkl_intel_ilp64;mkl_sequential;mkl_core") +target_link_libraries(ginkgo_dpcpp PRIVATE "mkl_sycl;mkl_intel_ilp64;mkl_tbb_thread;mkl_core;sycl;OpenCL;pthread;m;dl") target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp index 78fdcc2b819..abb84d9b7ff 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp @@ -53,7 +53,7 @@ struct config { /** * The number of threads within a CUDA warp. */ - static constexpr uint32 warp_size = 32; + static constexpr uint32 warp_size = 16; /** * The bitmask of the entire warp. diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp new file mode 100644 index 00000000000..6456a048d23 --- /dev/null +++ b/dpcpp/base/onemkl_bindings.hpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ +#define GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ + + +#include +#include + + +namespace gko { +/** + * @brief The device specific kernels namespace. + * + * @ingroup kernels + */ +namespace kernels { +/** + * @brief The DPCPP namespace. + * + * @ingroup dpcpp + */ +namespace dpcpp { +/** + * @brief The ONEMKL namespace. + * + * @ingroup onemkl + */ +namespace onemkl { +/** + * @brief The detail namespace. + * + * @ingroup detail + */ +namespace detail { + + +template +inline void not_implemented(Args &&...) GKO_NOT_IMPLEMENTED; + + +} // namespace detail + + +template +struct is_supported : std::false_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; + + +#define GKO_BIND_DOT(ValueType, Name, Func) \ + void Name(::cl::sycl::queue &exec_queue, std::int64_t n, \ + const ValueType *x, std::int64_t incx, const ValueType *y, \ + std::int64_t incy, ValueType *result) \ + { \ + Func(exec_queue, n, x, incx, y, incy, result); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +// Bind the dot for x^T * y +GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +template +GKO_BIND_DOT(ValueType, dot, detail::not_implemented); + +// Bind the conj_dot for x' * y +GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(std::complex, conj_dot, + oneapi::mkl::blas::row_major::dotc); +template +GKO_BIND_DOT(ValueType, conj_dot, detail::not_implemented); + +#undef GKO_BIND_DOT + + +} // namespace onemkl +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index f76f85135eb..22e6139dd84 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -53,7 +53,6 @@ namespace kernels { namespace dpcpp { -// #include "common/components/prefix_sum.hpp.inc" /** * @internal * Computes the prefix sum and total sum of `element` over a subwarp. 
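Aside on the GKO_BIND_DOT macro defined in onemkl_bindings.hpp above: each
invocation is a mechanical expansion of the macro body into a plain wrapper
function. For illustration only, the first instantiation expands to:

    // GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot) yields:
    void dot(::cl::sycl::queue &exec_queue, std::int64_t n, const float *x,
             std::int64_t incx, const float *y, std::int64_t incy,
             float *result)
    {
        // forwards directly to the oneMKL row-major BLAS dot routine
        oneapi::mkl::blas::row_major::dot(exec_queue, n, x, incx, y, incy,
                                          result);
    }

The trailing templated invocations bind every remaining ValueType to
detail::not_implemented, so calling dot or conj_dot with a value type oneMKL
does not support raises Ginkgo's NotImplemented error at run time instead of
failing to build.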
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index 9c2387a7113..e0678f6cf7a 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -70,7 +70,6 @@ constexpr auto kcfg_1d_list = KCFG_1D::encode(256, 8)>();
 constexpr auto kcfg_1d_array = as_array(kcfg_1d_list);

-// #include "common/components/reduction.hpp.inc"
 /**
  * @internal
  *
@@ -217,7 +216,7 @@ void reduce_add_array(
     }
 }

-template
+template
 void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory,
                       sycl::queue *stream, size_type size,
                       const ValueType *source, ValueType *result)
diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp
index 5b656c5e0db..9eda077381c 100644
--- a/dpcpp/components/thread_ids.dp.hpp
+++ b/dpcpp/components/thread_ids.dp.hpp
@@ -52,7 +52,6 @@ namespace dpcpp {
 namespace thread {

-// #include "common/components/thread_ids.hpp.inc"
 /**
  * @internal
  *
diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp
index 415126b8ed3..b10457df217 100644
--- a/dpcpp/components/uninitialized_array.hpp
+++ b/dpcpp/components/uninitialized_array.hpp
@@ -45,7 +45,6 @@ namespace kernels {
 namespace dpcpp {

-// #include "common/components/uninitialized_array.hpp.inc"
 /**
  * Stores an array with uninitialized contents.
  *
@@ -105,7 +104,8 @@ class UninitializedArray {
     }

 private:
-    // unsigned char data_[sizeof(ValueType) / sizeof(unsigned char) * size];
+    // If dpcpp uses char to represent the data, compilation gives an error.
+    // Thankfully, dpcpp supports complex data allocation directly.
     ValueType data_[size];
 };

diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 41a283d71bd..a4e061c3f98 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" @@ -79,10 +80,26 @@ constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); constexpr auto default_block_size = 256; -// #include "common/matrix/dense_kernels.hpp.inc" namespace kernel { +template +void strided_copy(size_type num_rows, size_type num_cols, size_type in_stride, + size_type out_stride, const InValueType *__restrict__ input, + OutValueType *__restrict__ output, sycl::nd_item<3> item_ct1) +{ + const auto global_id = thread::get_thread_id_flat(item_ct1); + const auto row_id = global_id / num_cols; + const auto col_id = global_id % num_cols; + if (row_id < num_rows) { + output[row_id * out_stride + col_id] = + static_cast(input[row_id * in_stride + col_id]); + } +} + +GKO_ENABLE_DEFAULT_HOST(strided_copy, strided_copy) + + template void strided_fill(size_type num_rows, size_type num_cols, size_type stride, ValueType *__restrict__ mat, ValueType value, @@ -157,7 +174,7 @@ void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) -template void compute_partial_reduce( size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value, @@ -191,7 +208,7 @@ void compute_partial_reduce( } -template void finalize_reduce_computation( size_type size, const ValueType *work, ValueType *result, @@ -220,7 +237,7 @@ void finalize_reduce_computation( } -template +template void compute_partial_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, @@ -236,7 +253,7 @@ void compute_partial_dot( tmp_work); } -template +template void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -266,7 +283,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, kcfg_1d_list) -template +template void compute_partial_conj_dot( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, const ValueType *__restrict__ y, size_type stride_y, @@ -282,7 +299,7 @@ void compute_partial_conj_dot( tmp_work); } -template +template void compute_partial_conj_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, @@ -312,7 +329,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, compute_partial_conj_dot, kcfg_1d_list) -template +template void finalize_sum_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, @@ -324,7 +341,7 @@ void finalize_sum_reduce_computation( [](const ValueType &x) { return x; }, item_ct1, tmp_work); } -template +template void finalize_sum_reduce_computation(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, @@ -353,7 +370,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, finalize_sum_reduce_computation, kcfg_1d_list) -template +template void compute_partial_norm2( size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, @@ -368,7 +385,7 @@ void compute_partial_norm2( tmp_work); } -template +template void compute_partial_norm2(dim3 grid, dim3 block, size_t 
dynamic_shared_memory, sycl::queue *stream, size_type num_rows, const ValueType *x, size_type stride_x, @@ -397,7 +414,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, compute_partial_norm2, kcfg_1d_list) -template +template void finalize_sqrt_reduce_computation( size_type size, const ValueType *work, ValueType *result, sycl::nd_item<3> item_ct1, @@ -409,7 +426,7 @@ void finalize_sqrt_reduce_computation( [](const ValueType &x) { return std::sqrt(x); }, item_ct1, tmp_work); } -template +template void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, @@ -653,7 +670,7 @@ void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, } } -template +template void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type size, const size_type *nnz_per_row, size_type *result) @@ -733,7 +750,7 @@ void reduce_total_cols(size_type num_slices, } } -template +template void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::queue *stream, size_type num_slices, const size_type *max_nnz_per_slice, size_type *result) @@ -1048,10 +1065,10 @@ void compute_dot(std::shared_ptr exec, if (0) { // TODO: write a custom kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { - dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); + onemkl::dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); } } else { // TODO: these are tuning parameters obtained experimentally, once @@ -1096,10 +1113,10 @@ void compute_conj_dot(std::shared_ptr exec, if (0) { // TODO: write a custom kernel which does this more efficiently for (size_type col = 0; col < x->get_size()[1]; ++col) { - conj_dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); + onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), + result->get_values() + col); } } else { // TODO: these are tuning parameters obtained experimentally, once diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index 3c79c7b7455..10e8c1031a1 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -47,14 +47,32 @@ namespace gko { namespace syn { +/** + * value_list records several values with the same type in template. + * + * @tparam T the value type of the list + * @tparam T... the values in the list + */ template struct value_list {}; +/** + * type_list records several types in template + * + * @tparam ...Types the types in the list + */ template struct type_list {}; +/** + * range records start, end, step in template + * + * @tparam int start of range + * @tparam int end of range + * @tparam int step of range. 
Default is 1.
+ */
 template <int Start, int End, int Step = 1>
 struct range {};
@@ -62,9 +80,22 @@ struct range {};
 namespace detail {

+/**
+ * concatenate_impl base declaration.
+ *
+ * @tparam List1  the first list
+ * @tparam List2  the second list
+ */
 template <typename List1, typename List2>
 struct concatenate_impl;

+/**
+ * concatenate_impl specialization for two value_list with the same value type.
+ *
+ * @tparam T     the value type of the two value_list
+ * @tparam T...  the values of the first list
+ * @tparam T...  the values of the second list
+ */
 template <typename T, T... Values1, T... Values2>
 struct concatenate_impl<value_list<T, Values1...>, value_list<T, Values2...>> {
     using type = value_list<T, Values1..., Values2...>;
@@ -74,6 +105,12 @@ struct concatenate_impl<value_list<T, Values1...>, value_list<T, Values2...>> {
 } // namespace detail

+/**
+ * concatenate combines two value_list into one value_list.
+ *
+ * @tparam List1  the first list
+ * @tparam List2  the second list
+ */
 template <typename List1, typename List2>
 using concatenate = typename detail::concatenate_impl<List1, List2>::type;
@@ -81,19 +118,43 @@ using concatenate = typename detail::concatenate_impl<List1, List2>::type;
 namespace detail {

+/**
+ * as_list_impl base declaration.
+ *
+ * @tparam T  the input type
+ */
 template <typename T, typename = void>
 struct as_list_impl;

+/**
+ * as_list_impl specialization for value_list.
+ *
+ * @tparam T     the value type of the value_list
+ * @tparam T...  the values of the value_list
+ */
 template <typename T, T... Values>
 struct as_list_impl<value_list<T, Values...>> {
     using type = value_list<T, Values...>;
 };

+/**
+ * as_list_impl specialization for type_list.
+ *
+ * @tparam ...Types  the types of the type_list
+ */
 template <typename... Types>
 struct as_list_impl<type_list<Types...>> {
     using type = type_list<Types...>;
 };

+/**
+ * as_list_impl specialization for range. This is the recursive case: it
+ * concatenates Start with the list generated from the rest of the range.
+ *
+ * @tparam int  the start of the range
+ * @tparam int  the end of the range
+ * @tparam int  the step of the range
+ */
 template <int Start, int End, int Step>
 struct as_list_impl<range<Start, End, Step>, std::enable_if_t<(Start < End)>> {
     using type = concatenate<
         value_list<int, Start>,
         typename as_list_impl<range<Start + Step, End, Step>>::type>;
 };

+/**
+ * as_list_impl specialization for range. This is the base case of the
+ * recursion.
+ *
+ * @tparam int  the start of the range
+ * @tparam int  the end of the range
+ * @tparam int  the step of the range
+ */
 template <int Start, int End, int Step>
 struct as_list_impl<range<Start, End, Step>, std::enable_if_t<(Start >= End)>> {
     using type = value_list<int>;
@@ -110,10 +178,27 @@ struct as_list_impl<range<Start, End, Step>, std::enable_if_t<(Start >= End)>> {
 } // namespace detail

+/**
+ * as_list is an alias for as_list_impl<T>::type. It yields the input list
+ * itself if the input is already a list, or generates the corresponding list
+ * type from a range input.
+ *
+ * @tparam T  a list or a range
+ */
 template <typename T>
 using as_list = typename detail::as_list_impl<T>::type;

+/**
+ * as_array returns a std::array holding the values of a value_list, which is
+ * helpful for iterating over the values at run time.
+ *
+ * @tparam T     the value type of the value_list
+ * @tparam T...  the values of the value_list
+ *
+ * @param vl  the input value_list
+ *
+ * @return a std::array containing the values of the value_list
+ */
 template <typename T, T... Value>
 constexpr std::array<T, sizeof...(Value)> as_array(value_list<T, Value...> vl)
 {
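To make the list utilities above concrete, a small usage sketch (illustrative
code, not part of the patch):

    // range<1, 9, 2> unrolls recursively to value_list<int, 1, 3, 5, 7>
    using list = gko::syn::as_list<gko::syn::range<1, 9, 2>>;
    // as_array turns the compile-time value_list into a runtime std::array
    constexpr auto arr = gko::syn::as_array(list{});
    static_assert(arr.size() == 4, "arr holds {1, 3, 5, 7}");

This is the mechanism the DPC++ kernels use to turn the compile-time
kcfg_1d_list of encoded (workgroup size, subgroup size) configurations into
the runtime kcfg_1d_array that get_first_cfg searches for the first
configuration the device supports.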
Tsai" Date: Tue, 13 Jul 2021 14:06:27 +0200 Subject: [PATCH 16/22] MKL cmake, delete unused, simplify func, add job --- cmake/create_test.cmake | 8 +- dpcpp/CMakeLists.txt | 8 +- dpcpp/matrix/dense_kernels.dp.cpp | 386 ++---------------------------- 3 files changed, 33 insertions(+), 369 deletions(-) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index ebf70232dd3..9d36b49911e 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -42,6 +42,10 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}") target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_name} ${test_target_name}) + # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. + if (MKL_ENV) + set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") + endif() endfunction(ginkgo_create_dpcpp_test) function(ginkgo_create_thread_test test_name) @@ -165,7 +169,7 @@ function(ginkgo_create_common_test test_name) # use float for DPC++ if necessary if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1) - endif() + endif() ginkgo_set_test_target_properties(${test_name}_${exec} ${test_target_name}) endforeach() -endfunction(ginkgo_create_common_test) \ No newline at end of file +endfunction(ginkgo_create_common_test) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 8755b424433..d30810cf12c 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -6,6 +6,8 @@ endif() ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION) set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE) +find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}") + add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp PRIVATE @@ -55,14 +57,16 @@ target_sources(ginkgo_dpcpp ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) -set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -DMKL_ILP64) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") +# Note. 
add MKL via PRIVATE not PUBLIC (MKL example shows) to avoid find_package(MKL) everywhere when link ginkgo +target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) +target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) -target_link_libraries(ginkgo_dpcpp PRIVATE "mkl_sycl;mkl_intel_ilp64;mkl_tbb_thread;mkl_core;sycl;OpenCL;pthread;m;dl") target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) +target_link_libraries(ginkgo_dpcpp PRIVATE $) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index a4e061c3f98..f264f970cac 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -83,97 +83,6 @@ constexpr auto default_block_size = 256; namespace kernel { -template -void strided_copy(size_type num_rows, size_type num_cols, size_type in_stride, - size_type out_stride, const InValueType *__restrict__ input, - OutValueType *__restrict__ output, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - output[row_id * out_stride + col_id] = - static_cast(input[row_id * in_stride + col_id]); - } -} - -GKO_ENABLE_DEFAULT_HOST(strided_copy, strided_copy) - - -template -void strided_fill(size_type num_rows, size_type num_cols, size_type stride, - ValueType *__restrict__ mat, ValueType value, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - mat[row_id * stride + col_id] = value; - } -} - -GKO_ENABLE_DEFAULT_HOST(strided_fill, strided_fill) - - -template -void scale(size_type num_rows, size_type num_cols, size_type num_alpha_cols, - const ValueType *__restrict__ alpha, ValueType *__restrict__ x, - size_type stride_x, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - const auto alpha_id = num_alpha_cols == 1 ? 0 : col_id; - if (row_id < num_rows) { - x[row_id * stride_x + col_id] = - alpha[alpha_id] == zero() - ? zero() - : x[row_id * stride_x + col_id] * alpha[alpha_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(scale, scale) - - -template -void add_scaled(size_type num_rows, size_type num_cols, - size_type num_alpha_cols, const ValueType *__restrict__ alpha, - const ValueType *__restrict__ x, size_type stride_x, - ValueType *__restrict__ y, size_type stride_y, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - const auto alpha_id = num_alpha_cols == 1 ? 
0 : col_id; - if (row_id < num_rows && alpha[alpha_id] != zero()) { - y[row_id * stride_y + col_id] += - x[row_id * stride_x + col_id] * alpha[alpha_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(add_scaled, add_scaled) - - -template -void add_scaled_diag(size_type size, const ValueType *__restrict__ alpha, - const ValueType *__restrict__ diag, - ValueType *__restrict__ y, size_type stride_y, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - - if (tidx >= size) { - return; - } - - y[tidx * stride_y + tidx] += alpha[0] * diag[tidx]; -} - -GKO_ENABLE_DEFAULT_HOST(add_scaled_diag, add_scaled_diag) - - template void compute_partial_reduce( @@ -456,7 +365,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, finalize_sqrt_reduce_computation, kcfg_1d_list) -template +template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type *__restrict__ row_ptrs, const ValueType *__restrict__ source, @@ -479,9 +388,7 @@ void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, } } -GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_coo, fill_in_coo) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_coo, fill_in_coo) -GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_coo_call, fill_in_coo, kcfg_1d_list) +GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) template @@ -514,7 +421,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, kcfg_1d_list) -template +template void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, const ValueType *__restrict__ source, IndexType *__restrict__ row_ptrs, @@ -535,12 +442,10 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, } } -GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_csr, fill_in_csr) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_csr, fill_in_csr) -GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_csr_call, fill_in_csr, kcfg_1d_list) +GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) -template +template void fill_in_ell(size_type num_rows, size_type num_cols, size_type source_stride, const ValueType *__restrict__ source, size_type max_nnz_per_row, size_type result_stride, @@ -570,9 +475,7 @@ void fill_in_ell(size_type num_rows, size_type num_cols, } } -GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_ell, fill_in_ell) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_ell, fill_in_ell) -GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_ell_call, fill_in_ell, kcfg_1d_list) +GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) template @@ -615,7 +518,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, calculate_slice_lengths, subgroup_list) -template +template void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, size_type stride, const ValueType *__restrict__ source, size_type *__restrict__ slice_lengths, @@ -648,9 +551,7 @@ void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, } } -GKO_ENABLE_DEFAULT_HOST_CONFIG(fill_in_sellp, fill_in_sellp) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(fill_in_sellp, fill_in_sellp) -GKO_ENABLE_DEFAULT_CONFIG_CALL(fill_in_sellp_call, fill_in_sellp, kcfg_1d_list) +GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) template @@ -775,220 +676,6 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, kcfg_1d_list) -template -void symm_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ result, size_type 
stride_result, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[row_id * stride_result + col_id] = - orig[perm_idxs[row_id] * stride_orig + perm_idxs[col_id]]; - } -} - -GKO_ENABLE_DEFAULT_HOST(symm_permute, symm_permute) - - -template -void inv_symm_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ result, size_type stride_result, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[perm_idxs[row_id] * stride_result + perm_idxs[col_id]] = - orig[row_id * stride_orig + col_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(inv_symm_permute, inv_symm_permute) - - -template -void row_gather(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ result, size_type stride_result, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[row_id * stride_result + col_id] = - orig[perm_idxs[row_id] * stride_orig + col_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(row_gather, row_gather) - - -template -void column_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ result, size_type stride_result, - sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[row_id * stride_result + col_id] = - orig[row_id * stride_orig + perm_idxs[col_id]]; - } -} - -GKO_ENABLE_DEFAULT_HOST(column_permute, column_permute) - - -template -void inverse_row_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, - size_type stride_orig, ValueType *__restrict__ result, - size_type stride_result, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[perm_idxs[row_id] * stride_result + col_id] = - orig[row_id * stride_orig + col_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(inverse_row_permute, inverse_row_permute) - - -template -void inverse_column_permute(size_type num_rows, size_type num_cols, - const IndexType *__restrict__ perm_idxs, - const ValueType *__restrict__ orig, - size_type stride_orig, - ValueType *__restrict__ result, - size_type stride_result, sycl::nd_item<3> item_ct1) -{ - const auto global_id = thread::get_thread_id_flat(item_ct1); - const auto row_id = global_id / num_cols; - const auto col_id = global_id % num_cols; - if (row_id < num_rows) { - result[row_id * stride_result + perm_idxs[col_id]] = - orig[row_id * stride_orig + col_id]; - } -} - -GKO_ENABLE_DEFAULT_HOST(inverse_column_permute, inverse_column_permute) - - -template -void extract_diagonal(size_type problem_size, - const 
ValueType *__restrict__ orig, size_type stride_orig, - ValueType *__restrict__ diag, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < problem_size) { - diag[tidx] = orig[tidx * stride_orig + tidx]; - } -} - -GKO_ENABLE_DEFAULT_HOST(extract_diagonal, extract_diagonal) - - -template -void inplace_absolute_dense(size_type num_rows, size_type num_cols, - ValueType *__restrict__ data, size_type stride, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - data[row * stride + col] = std::abs(data[row * stride + col]); - } -} - -GKO_ENABLE_DEFAULT_HOST(inplace_absolute_dense, inplace_absolute_dense) - - -template -void outplace_absolute_dense(size_type num_rows, size_type num_cols, - const ValueType *__restrict__ in, - size_type stride_in, - remove_complex *__restrict__ out, - size_type stride_out, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - out[row * stride_out + col] = std::abs(in[row * stride_in + col]); - } -} - -GKO_ENABLE_DEFAULT_HOST(outplace_absolute_dense, outplace_absolute_dense) - - -template -void make_complex(size_type num_rows, size_type num_cols, - const ValueType *__restrict__ in, size_type stride_in, - ComplexType *__restrict__ out, size_type stride_out, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - out[row * stride_out + col] = in[row * stride_in + col]; - } -} - -GKO_ENABLE_DEFAULT_HOST(make_complex, make_complex) - - -template -void get_real(size_type num_rows, size_type num_cols, - const ValueType *__restrict__ in, size_type stride_in, - remove_complex *__restrict__ out, size_type stride_out, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - out[row * stride_out + col] = real(in[row * stride_in + col]); - } -} - -GKO_ENABLE_DEFAULT_HOST(get_real, get_real) - - -template -void get_imag(size_type num_rows, size_type num_cols, - const ValueType *__restrict__ in, size_type stride_in, - remove_complex *__restrict__ out, size_type stride_out, - sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - auto row = tidx / num_cols; - auto col = tidx % num_cols; - if (row < num_rows) { - out[row * stride_out + col] = imag(in[row * stride_in + col]); - } -} - -GKO_ENABLE_DEFAULT_HOST(get_imag, get_imag) - - } // namespace kernel @@ -1028,34 +715,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -namespace { - - -#define GKO_BIND_DOT(ValueType, Name, Func) \ - void Name(::cl::sycl::queue &exec_queue, std::int64_t n, \ - const ValueType *x, std::int64_t incx, const ValueType *y, \ - std::int64_t incy, ValueType *result) \ - { \ - Func(exec_queue, n, x, incx, y, incy, result); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - -GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); -GKO_BIND_DOT(std::complex, dot, 
oneapi::mkl::blas::row_major::dotu); -GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); -GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); -GKO_BIND_DOT(std::complex, conj_dot, - oneapi::mkl::blas::row_major::dotc); - - -} // namespace - - template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, @@ -1231,10 +890,9 @@ void convert_to_coo(std::shared_ptr exec, const auto sg_size = KCFG_1D::decode<1>(cfg); size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_coo_call( - cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, num_cols, - stride, nnz_prefix_sum.get_const_data(), source->get_const_values(), - row_idxs, col_idxs, values); + kernel::fill_in_coo(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, stride, nnz_prefix_sum.get_const_data(), + source->get_const_values(), row_idxs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1276,10 +934,9 @@ void convert_to_csr(std::shared_ptr exec, size_type grid_dim = ceildiv(num_rows, wg_size); - kernel::fill_in_csr_call(cfg, grid_dim, default_block_size, 0, - exec->get_queue(), num_rows, num_cols, stride, - source->get_const_values(), row_ptrs, col_idxs, - values); + kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, stride, source->get_const_values(), + row_ptrs, col_idxs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1311,10 +968,9 @@ void convert_to_ell(std::shared_ptr exec, const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); auto grid_dim = ceildiv(result_stride, wg_size); - kernel::fill_in_ell_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), - num_rows, num_cols, source_stride, - source->get_const_values(), max_nnz_per_row, - result_stride, col_ptrs, values); + kernel::fill_in_ell(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, source_stride, source->get_const_values(), + max_nnz_per_row, result_stride, col_ptrs, values); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -1379,10 +1035,10 @@ void convert_to_sellp(std::shared_ptr exec, grid_dim = ceildiv(num_rows, wg_size); if (grid_dim > 0) { - kernel::fill_in_sellp_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), - num_rows, num_cols, slice_size, stride, - source->get_const_values(), slice_lengths, - slice_sets, col_idxs, vals); + kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, slice_size, stride, + source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); } } From 0b562fdf49f3a0cfa3196c65f6f4d39b4cf78dc5 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Fri, 16 Jul 2021 10:46:25 +0200 Subject: [PATCH 17/22] MKL static cmake - public per_kernel link option when static - add find(MKL) when ginkgo is static --- cmake/GinkgoConfig.cmake.in | 5 +++++ dpcpp/CMakeLists.txt | 9 ++++++++- dpcpp/matrix/dense_kernels.dp.cpp | 5 +++-- dpcpp/test/matrix/dense_kernels.cpp | 24 ------------------------ 4 files changed, 16 insertions(+), 27 deletions(-) diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 61da405cf6f..4675bcb4781 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -78,6 +78,7 @@ set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@) set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@) set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@) +set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@) set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@) @@ -168,4 +169,8 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) find_package(rocrand REQUIRED) endif() +if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP) + find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}") +endif() + include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index d30810cf12c..443d180b172 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -7,6 +7,7 @@ ginkgo_extract_dpcpp_version(${CMAKE_CXX_COMPILER} GINKGO_DPCPP_VERSION) set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE) find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}") +set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE) add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp @@ -64,7 +65,13 @@ target_compile_options(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) -target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +# When building ginkgo as a static library, we need to use dpcpp and per_kernel +# link option when the program uses dpcpp related function. 
+if (BUILD_SHARED_LIBS) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +else () + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) +endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) target_link_libraries(ginkgo_dpcpp PRIVATE $) if (GINKGO_DPCPP_SINGLE_MODE) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index f264f970cac..c2326be9c82 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -75,8 +75,9 @@ constexpr auto kcfg_1d_list = KCFG_1D::encode(512, 32), KCFG_1D::encode(512, 16), KCFG_1D::encode(256, 32), KCFG_1D::encode(256, 16), KCFG_1D::encode(256, 8)>(); -constexpr auto subgroup_list = syn::value_list(); -constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); +constexpr auto subgroup_list = + syn::value_list(); +constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); constexpr auto default_block_size = 256; diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 2b9af16732a..43ce9bad547 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -550,30 +550,6 @@ TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef) } -// TEST_F(Dense, IsTransposable) -// { -// set_up_apply_data(); - -// auto trans = x->transpose(); -// auto dtrans = dx->transpose(); - -// GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), -// static_cast(trans.get()), 0); -// } - - -// TEST_F(Dense, IsConjugateTransposable) -// { -// set_up_apply_data(); - -// auto trans = c_x->conj_transpose(); -// auto dtrans = dc_x->conj_transpose(); - -// GKO_ASSERT_MTX_NEAR(static_cast(dtrans.get()), -// static_cast(trans.get()), 0); -// } - - TEST_F(Dense, ConvertToCooIsEquivalentToRef) { set_up_apply_data(); From 7ac6d0909f006b52c5516e63b50cf0dff6b18012 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Sun, 18 Jul 2021 23:33:35 +0200 Subject: [PATCH 18/22] use mkl in ncols=1, add cuda descp in dpcpp thread --- dpcpp/components/thread_ids.dp.hpp | 26 ++++++++++++---------- dpcpp/matrix/dense_kernels.dp.cpp | 35 ++++++++++++------------------ 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 9eda077381c..47abf3c7b72 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -59,7 +59,7 @@ namespace thread { * * @return the ID of the block group this thread belongs to * - * @note Assumes that grid dimensions are in standard format: + * @note Assumes that grid dimensions are in cuda standard format: * `(block_group_size, first_grid_dimension, second grid_dimension)` */ __dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1) @@ -76,7 +76,7 @@ __dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1) * * @return the ID of the block this thread belongs to * - * @note Assumes that grid dimensions are in standard format: + * @note Assumes that grid dimensions are in cuda standard format: * `(block_group_size, first_grid_dimension, second grid_dimension)` */ __dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1) @@ -95,7 +95,7 @@ __dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1) * @return the local ID of the warp (relative to the block) this thread belongs * to * - * @note Assumes that block dimensions are in standard format: + * @note Assumes that block dimensions are in cuda standard format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` */ @@ -116,7 +116,7 @@ __dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1) * @return the local ID of the sub-warp (relative to the block) this thread * belongs to * - * @note Assumes that block dimensions are in standard format: + * @note Assumes that block dimensions are in cuda standard format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` */ @@ -140,7 +140,7 @@ __dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1) * * @return the local ID of the thread (relative to the block) * - * @note Assumes that block dimensions are in standard format: + * @note Assumes that block dimensions are in cuda standard format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` */ @@ -161,7 +161,7 @@ __dpct_inline__ size_type get_local_thread_id(sycl::nd_item<3> item_ct1) * * @return the global ID of the warp this thread belongs to. * - * @note Assumes that block dimensions and grid dimensions are in standard + * @note Assumes that block dimensions and grid dimensions are in cuda standard * format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` and @@ -185,7 +185,7 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) * * @return the global ID of the sub-warp this thread belongs to. 
* - * @note Assumes that block dimensions and grid dimensions are in standard + * @note Assumes that block dimensions and grid dimensions are in cuda standard * format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` and @@ -211,7 +211,7 @@ __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) * * @tparam subwarp_size size of the subwarp * - * @note Assumes that block dimensions and grid dimensions are in standard + * @note Assumes that block dimensions and grid dimensions are in cuda standard * format: * `(subwarp_size, config::warp_size / subwarp_size, block_size / * config::warp_size)` and @@ -231,7 +231,8 @@ __dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1) * @internal * * Returns the global ID of the thread in the given index type. - * This function assumes one-dimensional thread and block indexing. + * This function assumes one-dimensional thread and block indexing in cuda + * sense. It uses the third position infomation to get the information. * * @return the global ID of the thread in the given index type. * @@ -250,7 +251,8 @@ __dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1) * @internal * * Returns the total number of threads in the given index type. - * This function assumes one-dimensional thread and block indexing. + * This function assumes one-dimensional thread and block indexing in cuda + * sense. It uses the third position infomation to get the information. * * @return the total number of threads in the given index type. * @@ -268,7 +270,7 @@ __dpct_inline__ IndexType get_thread_num_flat(sycl::nd_item<3> item_ct1) * @internal * * Returns the global ID of the subwarp in the given index type. - * This function assumes one-dimensional thread and block indexing + * This function assumes one-dimensional thread and block indexing in cuda sense * with a power of two block size of at least subwarp_size. * * @return the global ID of the subwarp in the given index type. @@ -292,7 +294,7 @@ __dpct_inline__ IndexType get_subwarp_id_flat(sycl::nd_item<3> item_ct1) * @internal * * Returns the total number of subwarps in the given index type. - * This function assumes one-dimensional thread and block indexing + * This function assumes one-dimensional thread and block indexing in cuda sense * with a power of two block size of at least subwarp_size. * * @return the total number of subwarps in the given index type. 
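As an aside to the notes above, here is a minimal sketch of the flat-indexing
convention they describe (illustrative only, assuming the cuda-style ordering
is mapped onto sycl::nd_item<3> so that dimension 2 varies fastest; this is
not the library's exact code):

#include <CL/sycl.hpp>

// Sketch: the cuda expression blockIdx.x * blockDim.x + threadIdx.x turns
// into a read of the third position (index 2) of the nd_item.
template <typename IndexType>
IndexType flat_thread_id_sketch(sycl::nd_item<3> item)
{
    return static_cast<IndexType>(item.get_group(2)) *
               static_cast<IndexType>(item.get_local_range(2)) +
           static_cast<IndexType>(item.get_local_id(2));
}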
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index c2326be9c82..32eef01af63 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -722,14 +722,11 @@ void compute_dot(std::shared_ptr exec, const matrix::Dense *y, matrix::Dense *result) { - if (0) { + if (x->get_size()[1] == 1) { // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - onemkl::dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } + onemkl::dot(*exec->get_queue(), x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); } else { // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified @@ -770,14 +767,13 @@ void compute_conj_dot(std::shared_ptr exec, const matrix::Dense *y, matrix::Dense *result) { - if (0) { + if (x->get_size()[1] == 1) { // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } + onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values(), x->get_stride(), + y->get_const_values(), y->get_stride(), + result->get_values()); + } else { // TODO: these are tuning parameters obtained experimentally, once // we decide how to handle this uniformly, they should be modified @@ -818,13 +814,10 @@ void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, matrix::Dense> *result) { - if (0) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - oneapi::mkl::blas::row_major::nrm2( - *exec->get_queue(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } + if (x->get_size()[1] == 1) { + oneapi::mkl::blas::row_major::nrm2( + *exec->get_queue(), x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); } else { using norm_type = remove_complex; // TODO: these are tuning parameters obtained experimentally, once From 6169c2ff4a6caf20a0b4213b62119bb8ce0e3890 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Tue, 20 Jul 2021 15:56:24 +0200 Subject: [PATCH 19/22] improve document, fix auto usage in for, shared_memory usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- common/components/prefix_sum.hpp.inc | 2 +- common/components/sorting.hpp.inc | 12 ++-- common/components/uninitialized_array.hpp.inc | 7 +- .../par_ilut_filter_kernels.hpp.inc | 4 +- .../par_ilut_select_kernels.hpp.inc | 10 +-- common/matrix/dense_kernels.hpp.inc | 2 +- common/matrix/ell_kernels.hpp.inc | 4 +- common/matrix/hybrid_kernels.hpp.inc | 2 +- cuda/components/prefix_sum.cu | 6 +- cuda/test/components/sorting_kernels.cu | 2 +- cuda/test/matrix/dense_kernels.cpp | 2 +- cuda/test/matrix/ell_kernels.cpp | 2 +- dpcpp/CMakeLists.txt | 6 +- dpcpp/base/config.hpp | 6 -- dpcpp/base/helper.dp.cpp | 20 +++--- dpcpp/base/helper.hpp | 71 ++++++++++++------- dpcpp/base/onemkl_bindings.hpp | 3 + dpcpp/components/prefix_sum.dp.cpp | 6 +- dpcpp/components/prefix_sum.dp.hpp | 5 +- dpcpp/components/reduction.dp.hpp | 6 +- dpcpp/components/thread_ids.dp.hpp | 2 +- dpcpp/components/uninitialized_array.hpp | 6 +- dpcpp/matrix/dense_kernels.dp.cpp | 35 ++++----- dpcpp/test/matrix/dense_kernels.cpp | 2 +- hip/components/prefix_sum.hip.cpp | 6 +- .../ginkgo/core/synthesizer/containers.hpp | 18 ++--- 26 files changed, 131 insertions(+), 116 deletions(-) diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc index 8f759b1dc95..1d57c20b2e5 100644 --- a/common/components/prefix_sum.hpp.inc +++ b/common/components/prefix_sum.hpp.inc @@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ? 
neighbor : 0; diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc index ef5bd690937..cd772e08adb 100644 --- a/common/components/sorting.hpp.inc +++ b/common/components/sorting.hpp.inc @@ -70,7 +70,7 @@ struct bitonic_local { bool reverse) { auto els_mid = els + (num_elements / 2); - for (auto i = 0; i < num_elements / 2; ++i) { + for (int i = 0; i < num_elements / 2; ++i) { bitonic_cas(els[i], els_mid[i], reverse); } half::merge(els, reverse); @@ -131,7 +131,7 @@ struct bitonic_warp { auto tile = group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto other = tile.shfl_xor(els[i], num_threads / 2); bitonic_cas(els[i], other, new_reverse); } @@ -206,7 +206,7 @@ struct bitonic_global { auto upper_shared_els = shared_els + (num_groups * num_threads / 2); // only the lower group executes the CAS if (!upper_half()) { - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto j = shared_idx(i); bitonic_cas(shared_els[j], upper_shared_els[j], reverse); } @@ -241,11 +241,11 @@ struct bitonic_global { bool reverse) { group::this_thread_block().sync(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { local_els[i] = shared_els[shared_idx(i)]; } warp::merge(local_els, reverse); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } @@ -258,7 +258,7 @@ struct bitonic_global { // This is the first step, so we don't need to load from shared memory warp::sort(local_els, reverse); // store the sorted elements in shared memory - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc index 3a8b3796c12..e951cf06860 100644 --- a/common/components/uninitialized_array.hpp.inc +++ b/common/components/uninitialized_array.hpp.inc @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -49,7 +49,7 @@ public: * * @return the constexpr pointer to the first entry of the array. */ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -70,7 +70,8 @@ public: * * @return a reference to the array entry at the given index. 
*/ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const + noexcept { return reinterpret_cast(data_)[pos]; } diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc index 25b43e789ee..b5f7d43db67 100644 --- a/common/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/factorization/par_ilut_filter_kernels.hpp.inc @@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs, auto end = row_ptrs[row + 1]; begin_cb(row); auto num_steps = ceildiv(end - begin, subwarp_size); - for (auto step = 0; step < num_steps; ++step) { + for (IndexType step = 0; step < num_steps; ++step) { auto idx = begin + lane + step * subwarp_size; auto keep = idx < end && pred(idx, begin, end); auto mask = subwarp.ballot(keep); @@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } -} // namespace kernel \ No newline at end of file +} // namespace kernel diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc index 059069faf41..9b4897d1766 100644 --- a/common/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/factorization/par_ilut_select_kernels.hpp.inc @@ -62,7 +62,8 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree( // assuming rounding towards zero auto stride = double(size) / sample_size; #pragma unroll - for (auto i = 0; i < sampleselect_oversampling; ++i) { + for (auto i = decltype(sampleselect_oversampling){0}; + i < sampleselect_oversampling; ++i) { auto lidx = idx * sampleselect_oversampling + i; auto val = input[static_cast(lidx * stride)]; samples[i] = abs(val); @@ -119,7 +120,8 @@ __global__ __launch_bounds__(default_block_size) void count_buckets( auto el = abs(input[i]); IndexType tree_idx{}; #pragma unroll - for (auto level = 0; level < sampleselect_searchtree_height; ++level) { + for (auto level = decltype(sampleselect_searchtree_height){0}; + level < sampleselect_searchtree_height; ++level) { auto cmp = !(el < sh_tree[tree_idx]); tree_idx = 2 * tree_idx + 1 + cmp; } @@ -168,7 +170,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // compute prefix sum over warp-sized blocks IndexType total{}; auto base_idx = warp_idx * work_per_warp * warp.size(); - for (auto step = 0; step < work_per_warp; ++step) { + for (auto step = decltype(work_per_warp){0}; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? local_counters[idx] : zero(); IndexType warp_total{}; @@ -207,7 +209,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // add block prefix sum to each warp's block of data block.sync(); auto warp_prefixsum = warp_sums[warp_idx]; - for (auto step = 0; step < work_per_warp; ++step) { + for (IndexType step = 0; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? 
local_counters[idx] : zero(); if (idx < num_blocks) { diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc index d46b202a8ff..c7ebafd0627 100644 --- a/common/matrix/dense_kernels.hpp.inc +++ b/common/matrix/dense_kernels.hpp.inc @@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 2323d512258..399dd5070ac 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( { const auto tidx = thread::get_thread_id_flat(); if (tidx < num_rows) { - for (auto col = 0; col < nnz; col++) { + for (size_type col = 0; col < nnz; col++) { result[tidx * result_stride + col_idxs[tidx + col * source_stride]] += values[tidx + col * source_stride]; @@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (source_values[source_idx] != zero()) { result_values[write_to] = source_values[source_idx]; diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc index b6af7c2be36..c7c192189e0 100644 --- a/common/matrix/hybrid_kernels.hpp.inc +++ b/common/matrix/hybrid_kernels.hpp.inc @@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (ell_val[source_idx] != zero()) { result_values[write_to] = ell_val[source_idx]; diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu index 54739c783c8..ce108fa8cf9 100644 --- a/cuda/components/prefix_sum.cu +++ b/cuda/components/prefix_sum.cu @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -57,8 +57,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, start_prefix_sum <<>>(num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. 
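        // (Illustration, not part of this patch: start_prefix_sum writes an
        // independent scan per block plus each block's total into block_sums;
        // finalize_prefix_sum then adds the sum of all preceding block totals,
        //     counts[j] += block_sums[0] + ... + block_sums[block_of(j) - 1],
        // to every element. With a single block there is nothing to add,
        // hence the guard below.)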
if (num_blocks > 1) { finalize_prefix_sum <<>>(num_entries, counts, diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu index e973cc0f650..f61bbd0694e 100644 --- a/cuda/test/components/sorting_kernels.cu +++ b/cuda/test/components/sorting_kernels.cu @@ -99,7 +99,7 @@ protected: { // we want some duplicate elements std::uniform_int_distribution dist(0, num_elements / 2); - for (auto i = 0; i < num_elements; ++i) { + for (auto i = decltype(num_elements){0}; i < num_elements; ++i) { ref_shared.get_data()[i] = dist(rng); } ddata = gko::Array{cuda, ref_shared}; diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 6e40ce5b5a3..de96d27d823 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 2df1c397f4c..51c12fab531 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 443d180b172..7729588d363 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -60,13 +60,15 @@ target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") -# Note. add MKL via PRIVATE not PUBLIC (MKL example shows) to avoid find_package(MKL) everywhere when link ginkgo +# Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating +# find_package(MKL) everywhere when linking ginkgo (see the MKL example +# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) # When building ginkgo as a static library, we need to use dpcpp and per_kernel -# link option when the program uses dpcpp related function. +# link option when the program uses a dpcpp related function. if (BUILD_SHARED_LIBS) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) else () diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp index abb84d9b7ff..78fe25978a7 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp @@ -49,12 +49,6 @@ struct config { */ using lane_mask_type = uint64; - - /** - * The number of threads within a CUDA warp. - */ - static constexpr uint32 warp_size = 16; - /** * The bitmask of the entire warp. 
*/ diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp index ae453dd937d..5e6c1a579f5 100644 --- a/dpcpp/base/helper.dp.cpp +++ b/dpcpp/base/helper.dp.cpp @@ -44,18 +44,16 @@ namespace dpcpp { bool validate(sycl::queue *queue, unsigned int workgroup_size, unsigned int subgroup_size) { - { - auto device = queue->get_device(); - auto subgroup_size_list = - device.get_info(); - auto max_workgroup_size = - device.get_info(); - bool allowed = false; - for (auto &i : subgroup_size_list) { - allowed |= (i == subgroup_size); - } - return allowed && (workgroup_size <= max_workgroup_size); + auto device = queue->get_device(); + auto subgroup_size_list = + device.get_info(); + auto max_workgroup_size = + device.get_info(); + bool allowed = false; + for (auto &i : subgroup_size_list) { + allowed |= (i == subgroup_size); } + return allowed && (workgroup_size <= max_workgroup_size); } diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 8c7f45e5174..16d91c2ef8d 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -51,44 +51,44 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ - template \ - void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ - inline void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template \ + inline void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -138,10 +138,33 @@ namespace kernels { namespace dpcpp { +/** + * This is the validate function for common check. It checks the workgroup size + * is below device max workgroup size and subgroup size is in the supported + * subgroup size. 
+ * + * @param queue the sycl queue pointer + * @param workgroup_size the workgroup size (block size in cuda sense) + * @param subgroup_size the subgroup size (warp size in cuda sense) + * + * @return the given arguments are valid or not in given queue. + */ bool validate(sycl::queue *queue, unsigned workgroup_size, unsigned subgroup_size); +/** + * get_first_cfg will return the first valid config by validate function from + * given config array. + * + * @tparam IterArr the iteratable array type + * @tparam Validate the validate function type + * + * @param arr the config array + * @param verify the validate function + * + * @return the first valid config + */ template std::uint32_t get_first_cfg(IterArr &arr, Validate verify) { diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp index 6456a048d23..1c9a8dabb30 100644 --- a/dpcpp/base/onemkl_bindings.hpp +++ b/dpcpp/base/onemkl_bindings.hpp @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ +#include + + #include #include diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 07cdb5b38aa..63f33e9ba35 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -70,7 +70,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto queue = exec->get_queue(); constexpr auto block_cfg_array = as_array(block_cfg_list); @@ -84,8 +84,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, auto block_sums = block_sum_array.get_data(); start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 22e6139dd84..334d4239c56 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -78,7 +78,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ? 
neighbor : 0; @@ -193,8 +193,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, [=](sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, - (UninitializedArray *) - prefix_helper_acc_ct1.get_pointer()); + prefix_helper_acc_ct1.get_pointer().get()); }); }); } diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index e0678f6cf7a..094f2093a95 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -229,10 +229,8 @@ void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_add_array( - size, source, result, item_ct1, - (UninitializedArray(cfg)> *) - block_sum_acc_ct1.get_pointer()); + reduce_add_array(size, source, result, item_ct1, + block_sum_acc_ct1.get_pointer().get()); }); }); } diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 47abf3c7b72..f9decfd989d 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -195,7 +195,7 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) template __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) { - // dpcpp dose not have subwarp + // dpcpp does not have subwarp constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; return get_warp_id(item_ct1) * subwarps_per_warp + item_ct1.get_local_id(1); diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index b10457df217..d9d423c9c94 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -48,7 +48,7 @@ namespace dpcpp { /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -63,7 +63,7 @@ class UninitializedArray { * * @return the constexpr pointer to the first entry of the array. */ - constexpr __dpct_inline__ operator ValueType *() const noexcept + constexpr __dpct_inline__ operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -84,7 +84,7 @@ class UninitializedArray { * * @return a reference to the array entry at the given index. 
*/ - constexpr __dpct_inline__ ValueType &operator[](size_type pos) const + constexpr __dpct_inline__ const ValueType &operator[](size_type pos) const noexcept { return data_[pos]; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 32eef01af63..b17e44f9706 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -179,10 +179,9 @@ void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_dot( - num_rows, x, stride_x, y, stride_y, work, item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); + compute_partial_dot(num_rows, x, stride_x, y, stride_y, + work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); }); }); } @@ -227,8 +226,7 @@ void compute_partial_conj_dot(dim3 grid, dim3 block, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { compute_partial_conj_dot( num_rows, x, stride_x, y, stride_y, work, item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); + tmp_work_acc_ct1.get_pointer().get()); }); }); } @@ -268,8 +266,7 @@ void finalize_sum_reduce_computation(dim3 grid, dim3 block, [=](sycl::nd_item<3> item_ct1) { finalize_sum_reduce_computation( size, work, result, item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); + tmp_work_acc_ct1.get_pointer().get()); }); }); } @@ -308,13 +305,12 @@ void compute_partial_norm2(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> tmp_work_acc_ct1(cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2( - num_rows, x, stride_x, work, item_ct1, - (UninitializedArray, wg_size> *) - tmp_work_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + compute_partial_norm2( + num_rows, x, stride_x, work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); }); } @@ -354,8 +350,7 @@ void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, [=](sycl::nd_item<3> item_ct1) { finalize_sqrt_reduce_computation( size, work, result, item_ct1, - (UninitializedArray *) - tmp_work_acc_ct1.get_pointer()); + tmp_work_acc_ct1.get_pointer().get()); }); }); } @@ -433,7 +428,7 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; @@ -586,7 +581,7 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_max_nnz(size, nnz_per_row, result, item_ct1, - dpct_local_acc_ct1.get_pointer()); + dpct_local_acc_ct1.get_pointer().get()); }); }); } @@ -666,7 +661,7 @@ void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { reduce_total_cols(num_slices, max_nnz_per_slice, result, item_ct1, - dpct_local_acc_ct1.get_pointer()); + dpct_local_acc_ct1.get_pointer().get()); }); }); } diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 43ce9bad547..257ee6fbc6a 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -697,7 +697,7 @@ TEST_F(Dense, 
CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp index 28cd01b4fb5..9302fc07b9a 100644 --- a/hip/components/prefix_sum.hip.cpp +++ b/hip/components/prefix_sum.hip.cpp @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -58,8 +58,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, HIP_KERNEL_NAME(start_prefix_sum), dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { hipLaunchKernelGGL( HIP_KERNEL_NAME(finalize_prefix_sum), diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index 10e8c1031a1..0e9570540fa 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -51,7 +51,7 @@ namespace syn { * value_list records several values with the same type in template. * * @tparam T the value type of the list - * @tparam T... the values in the list + * @tparam Values the values in the list */ template struct value_list {}; @@ -60,7 +60,7 @@ struct value_list {}; /** * type_list records several types in template * - * @tparam ...Types the types in the list + * @tparam Types the types in the list */ template struct type_list {}; @@ -69,9 +69,9 @@ struct type_list {}; /** * range records start, end, step in template * - * @tparam int start of range - * @tparam int end of range - * @tparam int step of range. default is 1 + * @tparam Start start of range + * @tparam End end of range + * @tparam Step step of range. default is 1 */ template struct range {}; @@ -93,8 +93,8 @@ struct concatenate_impl; * concatenate_impl specializes for two value_list with the same value type. * * @tparam T the value type of two value_list - * @tparam T... the values of the first list - * @tparam T... the values of the second list + * @tparam Values the values of the first list + * @tparam Values the values of the second list */ template struct concatenate_impl, value_list> { @@ -130,7 +130,7 @@ struct as_list_impl; * as_list_impl specializes for the value_list * * @tparam T the value_list type - * @tparam T... the values of value_list + * @tparam Values the values of value_list */ template struct as_list_impl> { @@ -193,7 +193,7 @@ using as_list = typename detail::as_list_impl::type; * for in runtime on the array. * * @tparam T the type of value_list - * @tparam T... the values of value_list + * @tparam Value the values of value_list * * @param value_list the input value_list * From a6b4ccc461e78df3bd2d7082184b6dc5ffb69a45 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Tue, 20 Jul 2021 18:08:59 +0200 Subject: [PATCH 20/22] add some note to indicate the porting TODO need to revisit these TODO when we are close to fully porting ginkgo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Terry Cojean Co-authored-by: Thomas Grützmacher Co-authored-by: Tobias Ribizel --- dpcpp/base/helper.hpp | 47 ++++++++++++------------ dpcpp/components/thread_ids.dp.hpp | 6 +++ dpcpp/components/uninitialized_array.hpp | 3 ++ 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 16d91c2ef8d..cb98e4c511e 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -56,17 +56,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ - template \ - void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ - InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, gko::size_type, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } @@ -78,17 +78,17 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ - inline void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ - InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template \ + inline void name_(dim3 grid, dim3 block, gko::size_type, \ + sycl::queue *queue, InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -106,7 +106,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ template \ void name_(std::uint32_t desired_cfg, dim3 grid, dim3 block, \ - size_t dynamic_shared_memory, sycl::queue *queue, \ + gko::size_type dynamic_shared_memory, sycl::queue *queue, \ InferredArgs... args) \ { \ callable_( \ @@ -174,7 +174,6 @@ std::uint32_t get_first_cfg(IterArr &arr, Validate verify) } } GKO_NOT_SUPPORTED(arr); - return 0; } diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index f9decfd989d..70ad76d9ccb 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -52,6 +52,12 @@ namespace dpcpp { namespace thread { +// TODO: porting - need to refine functions and their name in this file +// the grid/block description uses the cuda dim3 to represent. i.e. using dim3 +// to launch dpcpp kernel, the kernel will reverse the ordering to keep the same +// linear memory usage as cuda. 
+ + /** * @internal * diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index d9d423c9c94..eb8a36770d7 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -45,6 +45,9 @@ namespace kernels { namespace dpcpp { +// TODO: porting - consider directly use the array as shared memory + + /** * Stores an array with uninitialized contents. * From 4464c4db84ed78715b5a4bccd92de140e2b5aecd Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 21 Jul 2021 12:04:00 +0200 Subject: [PATCH 21/22] explict type in for, use func not macro to skip, remove dup test note: gtest_skip can not be in another function call Co-authored-by: Tobias Ribizel --- .../par_ilut_select_kernels.hpp.inc | 8 +- cuda/test/components/sorting_kernels.cu | 2 +- dpcpp/CMakeLists.txt | 2 +- dpcpp/test/matrix/dense_kernels.cpp | 365 +----------------- dpcpp/test/utils.hpp | 56 +++ 5 files changed, 67 insertions(+), 366 deletions(-) create mode 100644 dpcpp/test/utils.hpp diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc index 9b4897d1766..e443d7b6ba7 100644 --- a/common/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/factorization/par_ilut_select_kernels.hpp.inc @@ -62,8 +62,7 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree( // assuming rounding towards zero auto stride = double(size) / sample_size; #pragma unroll - for (auto i = decltype(sampleselect_oversampling){0}; - i < sampleselect_oversampling; ++i) { + for (int i = 0; i < sampleselect_oversampling; ++i) { auto lidx = idx * sampleselect_oversampling + i; auto val = input[static_cast(lidx * stride)]; samples[i] = abs(val); @@ -120,8 +119,7 @@ __global__ __launch_bounds__(default_block_size) void count_buckets( auto el = abs(input[i]); IndexType tree_idx{}; #pragma unroll - for (auto level = decltype(sampleselect_searchtree_height){0}; - level < sampleselect_searchtree_height; ++level) { + for (int level = 0; level < sampleselect_searchtree_height; ++level) { auto cmp = !(el < sh_tree[tree_idx]); tree_idx = 2 * tree_idx + 1 + cmp; } @@ -170,7 +168,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // compute prefix sum over warp-sized blocks IndexType total{}; auto base_idx = warp_idx * work_per_warp * warp.size(); - for (auto step = decltype(work_per_warp){0}; step < work_per_warp; ++step) { + for (IndexType step = 0; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? 
         auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
         IndexType warp_total{};
diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu
index f61bbd0694e..e2b7abc51d7 100644
--- a/cuda/test/components/sorting_kernels.cu
+++ b/cuda/test/components/sorting_kernels.cu
@@ -99,7 +99,7 @@ protected:
     {
         // we want some duplicate elements
         std::uniform_int_distribution dist(0, num_elements / 2);
-        for (auto i = decltype(num_elements){0}; i < num_elements; ++i) {
+        for (int i = 0; i < num_elements; ++i) {
             ref_shared.get_data()[i] = dist(rng);
         }
         ddata = gko::Array{cuda, ref_shared};
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 7729588d363..e2d476164e8 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -75,7 +75,7 @@ else ()
     target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel)
 endif()
 target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
-target_link_libraries(ginkgo_dpcpp PRIVATE $)
+target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP)
 if (GINKGO_DPCPP_SINGLE_MODE)
     target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
 endif()
diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp
index 257ee6fbc6a..cf1bbe26cd4 100644
--- a/dpcpp/test/matrix/dense_kernels.cpp
+++ b/dpcpp/test/matrix/dense_kernels.cpp
@@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/components/fill_array.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/test/utils.hpp"
+#include "dpcpp/test/utils.hpp"


 namespace {
@@ -194,127 +195,6 @@ class Dense : public ::testing::Test {
 };


-TEST_F(Dense, DpcppFillIsEquivalentToRef)
-{
-    set_up_vector_data(3);
-    auto result = Mtx::create(ref);
-
-    x->fill(42);
-    dx->fill(42);
-    result->copy_from(dx.get());
-
-    GKO_ASSERT_MTX_NEAR(result, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, DpcppStridedFillIsEquivalentToRef)
-{
-    using T = vtype;
-    auto x = gko::initialize<gko::matrix::Dense<T>>(
-        4, {I<T>{1.0, 2.0}, I<T>{3.0, 4.0}, I<T>{5.0, 6.0}}, ref);
-    auto dx = gko::initialize<gko::matrix::Dense<T>>(
-        4, {I<T>{1.0, 2.0}, I<T>{3.0, 4.0}, I<T>{5.0, 6.0}}, dpcpp);
-    auto result = Mtx::create(ref);
-
-    x->fill(42);
-    dx->fill(42);
-    result->copy_from(dx.get());
-
-    GKO_ASSERT_MTX_NEAR(result, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, SingleVectorDpcppScaleIsEquivalentToRef)
-{
-    set_up_vector_data(1);
-    auto result = Mtx::create(ref);
-
-    x->scale(alpha.get());
-    dx->scale(dalpha.get());
-    result->copy_from(dx.get());
-
-    GKO_ASSERT_MTX_NEAR(result, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MultipleVectorDpcppScaleIsEquivalentToRef)
-{
-    set_up_vector_data(20);
-
-    x->scale(alpha.get());
-    dx->scale(dalpha.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MultipleVectorDpcppScaleWithDifferentAlphaIsEquivalentToRef)
-{
-    set_up_vector_data(20, true);
-
-    x->scale(alpha.get());
-    dx->scale(dalpha.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, SingleVectorDpcppAddScaledIsEquivalentToRef)
-{
-    set_up_vector_data(1);
-
-    x->add_scaled(alpha.get(), y.get());
-    dx->add_scaled(dalpha.get(), dy.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MultipleVectorDpcppAddScaledIsEquivalentToRef)
-{
-    set_up_vector_data(20);
-
-    x->add_scaled(alpha.get(), y.get());
-    dx->add_scaled(dalpha.get(), dy.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MultipleVectorDpcppAddScaledWithDifferentAlphaIsEquivalentToRef)
-{
-    set_up_vector_data(20);
-
-    x->add_scaled(alpha.get(), y.get());
-    dx->add_scaled(dalpha.get(), dy.get());
-
-    GKO_ASSERT_MTX_NEAR(dx, x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, AddsScaledDiagIsEquivalentToRef)
-{
-    auto mat = gen_mtx<Mtx>(532, 532);
-    gko::Array<Mtx::value_type> diag_values(ref, 532);
-    gko::kernels::reference::components::fill_array(ref, diag_values.get_data(),
-                                                    532, Mtx::value_type{2.0});
-    auto diag =
-        gko::matrix::Diagonal<Mtx::value_type>::create(ref, 532, diag_values);
-    alpha = gko::initialize<Mtx>({2.0}, ref);
-    auto dmat = Mtx::create(dpcpp);
-    dmat->copy_from(mat.get());
-    auto ddiag = gko::matrix::Diagonal<Mtx::value_type>::create(dpcpp);
-    ddiag->copy_from(diag.get());
-    dalpha = Mtx::create(dpcpp);
-    dalpha->copy_from(alpha.get());
-
-    mat->add_scaled(alpha.get(), diag.get());
-    dmat->add_scaled(dalpha.get(), ddiag.get());
-
-    GKO_ASSERT_MTX_NEAR(mat, dmat, r<vtype>::value);
-}
-
-
 TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef)
 {
     set_up_vector_data(1);
@@ -384,11 +264,9 @@ TEST_F(Dense, SimpleApplyIsEquivalentToRef)
 }


-#if !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();

     x->apply(convert<MixedMtx>(y).get(), convert<MixedMtx>(expected).get());

@@ -398,9 +276,6 @@ TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef)
 }


-#endif  // !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -412,11 +287,9 @@ TEST_F(Dense, AdvancedApplyIsEquivalentToRef)
 }


-#if !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();

     x->apply(convert<MixedMtx>(alpha).get(), convert<MixedMtx>(y).get(),
@@ -428,11 +301,9 @@ TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef)
 }


-#endif  // !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();
     auto complex_b = gen_mtx<ComplexMtx>(25, 1);
     auto dcomplex_b = ComplexMtx::create(dpcpp);
@@ -448,11 +319,9 @@ TEST_F(Dense, ApplyToComplexIsEquivalentToRef)
 }


-#if !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();
     auto complex_b = gen_mtx<MixedComplexMtx>(25, 1);
     auto dcomplex_b = MixedComplexMtx::create(dpcpp);
@@ -467,8 +336,6 @@ TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7);
 }

-#endif  // !GINKGO_DPCPP_SINGLE_MODE
-

 TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef)
 {
@@ -487,11 +354,9 @@ TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef)
 }


-#if !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef)
 {
+    SKIP_IF_SINGLE_MODE;
     set_up_apply_data();
     auto complex_b = gen_mtx<MixedComplexMtx>(25, 1);
     auto dcomplex_b = MixedComplexMtx::create(dpcpp);
@@ -509,9 +374,6 @@ TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef)
 }


-#endif  // !GINKGO_DPCPP_SINGLE_MODE
-
-
 TEST_F(Dense, ComputeDotComplexIsEquivalentToRef)
 {
     set_up_apply_data();
@@ -733,219 +595,4 @@ TEST_F(Dense, CalculateTotalColsIsEquivalentToRef)
 }


-TEST_F(Dense, CanGatherRows)
-{
-    set_up_apply_data();
-
-    auto r_gather = x->row_gather(rgather_idxs.get());
-    auto dr_gather = dx->row_gather(rgather_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0);
-}
-
-
-TEST_F(Dense, CanGatherRowsIntoDense)
-{
-    set_up_apply_data();
-    auto gather_size =
-        gko::dim<2>{rgather_idxs->get_num_elems(), x->get_size()[1]};
-    auto r_gather = Mtx::create(ref, gather_size);
-    // test make_temporary_clone and non-default stride
-    auto dr_gather = Mtx::create(ref, gather_size, x->get_size()[1] + 2);
-
-    x->row_gather(rgather_idxs.get(), r_gather.get());
-    dx->row_gather(rgather_idxs.get(), dr_gather.get());
-
-    GKO_ASSERT_MTX_NEAR(r_gather.get(), dr_gather.get(), 0);
-}
-
-
-TEST_F(Dense, IsPermutable)
-{
-    set_up_apply_data();
-
-    auto permuted = square->permute(rpermute_idxs.get());
-    auto dpermuted = dsquare->permute(rpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(permuted.get()),
-                        static_cast<Mtx *>(dpermuted.get()), 0);
-}
-
-
-TEST_F(Dense, IsInversePermutable)
-{
-    set_up_apply_data();
-
-    auto permuted = square->inverse_permute(rpermute_idxs.get());
-    auto dpermuted = dsquare->inverse_permute(rpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(permuted.get()),
-                        static_cast<Mtx *>(dpermuted.get()), 0);
-}
-
-
-TEST_F(Dense, IsRowPermutable)
-{
-    set_up_apply_data();
-
-    auto r_permute = x->row_permute(rpermute_idxs.get());
-    auto dr_permute = dx->row_permute(rpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(r_permute.get()),
-                        static_cast<Mtx *>(dr_permute.get()), 0);
-}
-
-
-TEST_F(Dense, IsColPermutable)
-{
-    set_up_apply_data();
-
-    auto c_permute = x->column_permute(cpermute_idxs.get());
-    auto dc_permute = dx->column_permute(cpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(c_permute.get()),
-                        static_cast<Mtx *>(dc_permute.get()), 0);
-}
-
-
-TEST_F(Dense, IsInverseRowPermutable)
-{
-    set_up_apply_data();
-
-    auto inverse_r_permute = x->inverse_row_permute(rpermute_idxs.get());
-    auto d_inverse_r_permute = dx->inverse_row_permute(rpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_r_permute.get()),
-                        static_cast<Mtx *>(d_inverse_r_permute.get()), 0);
-}
-
-
-TEST_F(Dense, IsInverseColPermutable)
-{
-    set_up_apply_data();
-
-    auto inverse_c_permute = x->inverse_column_permute(cpermute_idxs.get());
-    auto d_inverse_c_permute = dx->inverse_column_permute(cpermute_idxs.get());
-
-    GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(inverse_c_permute.get()),
-                        static_cast<Mtx *>(d_inverse_c_permute.get()), 0);
-}
-
-
-TEST_F(Dense, ExtractDiagonalOnTallSkinnyIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto diag = x->extract_diagonal();
-    auto ddiag = dx->extract_diagonal();
-
-    GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0);
-}
-
-
-TEST_F(Dense, ExtractDiagonalOnShortFatIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto diag = y->extract_diagonal();
-    auto ddiag = dy->extract_diagonal();
-
-    GKO_ASSERT_MTX_NEAR(diag.get(), ddiag.get(), 0);
-}
-
-
-TEST_F(Dense, InplaceAbsoluteMatrixIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    x->compute_absolute_inplace();
-    dx->compute_absolute_inplace();
-
-    GKO_ASSERT_MTX_NEAR(x, dx, r<vtype>::value);
-}
-
-
-TEST_F(Dense, OutplaceAbsoluteMatrixIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto abs_x = x->compute_absolute();
-    auto dabs_x = dx->compute_absolute();
-
-    GKO_ASSERT_MTX_NEAR(abs_x, dabs_x, r<vtype>::value);
-}
-
-
-TEST_F(Dense, MakeComplexIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto complex_x = x->make_complex();
-    auto dcomplex_x = dx->make_complex();
-
-    GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0);
-}
-
-
-TEST_F(Dense, MakeComplexWithGivenResultIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto complex_x = ComplexMtx::create(ref, x->get_size());
-    x->make_complex(complex_x.get());
-    auto dcomplex_x = ComplexMtx::create(dpcpp, x->get_size());
-    dx->make_complex(dcomplex_x.get());
-
-    GKO_ASSERT_MTX_NEAR(complex_x, dcomplex_x, 0);
-}
-
-
-TEST_F(Dense, GetRealIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto real_x = x->get_real();
-    auto dreal_x = dx->get_real();
-
-    GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0);
-}
-
-
-TEST_F(Dense, GetRealWithGivenResultIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto real_x = Mtx::create(ref, x->get_size());
-    x->get_real(real_x.get());
-    auto dreal_x = Mtx::create(dpcpp, dx->get_size());
-    dx->get_real(dreal_x.get());
-
-    GKO_ASSERT_MTX_NEAR(real_x, dreal_x, 0);
-}
-
-
-TEST_F(Dense, GetImagIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto imag_x = x->get_imag();
-    auto dimag_x = dx->get_imag();
-
-    GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0);
-}
-
-
-TEST_F(Dense, GetImagWithGivenResultIsEquivalentToRef)
-{
-    set_up_apply_data();
-
-    auto imag_x = Mtx::create(ref, x->get_size());
-    x->get_imag(imag_x.get());
-    auto dimag_x = Mtx::create(dpcpp, dx->get_size());
-    dx->get_imag(dimag_x.get());
-
-    GKO_ASSERT_MTX_NEAR(imag_x, dimag_x, 0);
-}
-
-
 }  // namespace
diff --git a/dpcpp/test/utils.hpp b/dpcpp/test/utils.hpp
new file mode 100644
index 00000000000..57e703b8ef1
--- /dev/null
+++ b/dpcpp/test/utils.hpp
@@ -0,0 +1,56 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_DPCPP_TEST_UTILS_HPP_
+#define GKO_DPCPP_TEST_UTILS_HPP_
+
+
+#include <gtest/gtest.h>
+
+
+namespace {
+
+
+#if GINKGO_DPCPP_SINGLE_MODE
+#define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode"
+#else
+#define SKIP_IF_SINGLE_MODE                                                  \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
+                  "semi-colon warnings")
+#endif
+
+
+}  // namespace
+
+
+#endif  // GKO_DPCPP_TEST_UTILS_HPP_
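For reference, a minimal usage sketch for the SKIP_IF_SINGLE_MODE helper above (illustrative only; the test name and body below are invented, not taken from the patches). GTEST_SKIP() works by returning from the enclosing test body, which is why the helper is a macro expanded directly inside the test rather than a function wrapping the call:

#include <gtest/gtest.h>

#include "dpcpp/test/utils.hpp"

// Hypothetical test: in GINKGO_DPCPP_SINGLE_MODE builds, GTEST_SKIP() marks
// the test as skipped and returns immediately; otherwise the macro expands
// to a harmless static_assert and the body runs normally.
TEST(DpcppExample, RunsOnlyWithDoublePrecision)
{
    SKIP_IF_SINGLE_MODE;

    double third = 1.0 / 3.0;  // double-precision work, skipped in single mode
    ASSERT_NEAR(third, 0.3333333333333333, 1e-15);
}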
Tsai" Date: Wed, 21 Jul 2021 16:44:49 +0200 Subject: [PATCH 22/22] debug/static dpcpp -> debug/shared due to memory --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index df3265cb58c..1a884c2b408 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -754,7 +754,7 @@ build/dpcpp/opencl_igpu/release/static: SYCL_DEVICE_FILTER: "OpenCL" SYCL_DEVICE_TYPE: "GPU" -build/dpcpp/level_zero_igpu/debug/static: +build/dpcpp/level_zero_igpu/debug/shared: <<: *default_build_with_test extends: - .full_test_condition @@ -765,7 +765,7 @@ build/dpcpp/level_zero_igpu/debug/static: CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" BUILD_TYPE: "Debug" - BUILD_SHARED_LIBS: "OFF" + BUILD_SHARED_LIBS: "ON" DPCPP_SINGLE_MODE: "ON" SYCL_DEVICE_FILTER: "Level_Zero:GPU"