diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index df3265cb58c..1a884c2b408 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -754,7 +754,7 @@ build/dpcpp/opencl_igpu/release/static: SYCL_DEVICE_FILTER: "OpenCL" SYCL_DEVICE_TYPE: "GPU" -build/dpcpp/level_zero_igpu/debug/static: +build/dpcpp/level_zero_igpu/debug/shared: <<: *default_build_with_test extends: - .full_test_condition @@ -765,7 +765,7 @@ build/dpcpp/level_zero_igpu/debug/static: CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" BUILD_TYPE: "Debug" - BUILD_SHARED_LIBS: "OFF" + BUILD_SHARED_LIBS: "ON" DPCPP_SINGLE_MODE: "ON" SYCL_DEVICE_FILTER: "Level_Zero:GPU" diff --git a/cmake/GinkgoConfig.cmake.in b/cmake/GinkgoConfig.cmake.in index 61da405cf6f..4675bcb4781 100644 --- a/cmake/GinkgoConfig.cmake.in +++ b/cmake/GinkgoConfig.cmake.in @@ -78,6 +78,7 @@ set(GINKGO_AMD_ARCH_FLAGS @GINKGO_AMD_ARCH_FLAGS@) set(GINKGO_DPCPP_VERSION @GINKGO_DPCPP_VERSION@) set(GINKGO_DPCPP_FLAGS @GINKGO_DPCPP_FLAGS@) +set(GINKGO_MKL_ROOT @GINKGO_MKL_ROOT@) set(GINKGO_HAVE_PAPI_SDE @GINKGO_HAVE_PAPI_SDE@) @@ -168,4 +169,8 @@ if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_HIP) find_package(rocrand REQUIRED) endif() +if((NOT GINKGO_BUILD_SHARED_LIBS) AND GINKGO_BUILD_DPCPP) + find_package(MKL CONFIG REQUIRED HINTS "${GINKGO_MKL_ROOT}") +endif() + include(${CMAKE_CURRENT_LIST_DIR}/GinkgoTargets.cmake) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index ebf70232dd3..9d36b49911e 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -42,6 +42,10 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}") target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) ginkgo_set_test_target_properties(${test_name} ${test_target_name}) + # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. + if (MKL_ENV) + set_tests_properties(${test_target_name} PROPERTIES ENVIRONMENT "${MKL_ENV}") + endif() endfunction(ginkgo_create_dpcpp_test) function(ginkgo_create_thread_test test_name) @@ -165,7 +169,7 @@ function(ginkgo_create_common_test test_name) # use float for DPC++ if necessary if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(${test_target_name} PRIVATE GINKGO_COMMON_SINGLE_MODE=1) - endif() + endif() ginkgo_set_test_target_properties(${test_name}_${exec} ${test_target_name}) endforeach() -endfunction(ginkgo_create_common_test) \ No newline at end of file +endfunction(ginkgo_create_common_test) diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc index 8f759b1dc95..1d57c20b2e5 100644 --- a/common/components/prefix_sum.hpp.inc +++ b/common/components/prefix_sum.hpp.inc @@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ? 
neighbor : 0; diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc index ef5bd690937..cd772e08adb 100644 --- a/common/components/sorting.hpp.inc +++ b/common/components/sorting.hpp.inc @@ -70,7 +70,7 @@ struct bitonic_local { bool reverse) { auto els_mid = els + (num_elements / 2); - for (auto i = 0; i < num_elements / 2; ++i) { + for (int i = 0; i < num_elements / 2; ++i) { bitonic_cas(els[i], els_mid[i], reverse); } half::merge(els, reverse); @@ -131,7 +131,7 @@ struct bitonic_warp { auto tile = group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto other = tile.shfl_xor(els[i], num_threads / 2); bitonic_cas(els[i], other, new_reverse); } @@ -206,7 +206,7 @@ struct bitonic_global { auto upper_shared_els = shared_els + (num_groups * num_threads / 2); // only the lower group executes the CAS if (!upper_half()) { - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto j = shared_idx(i); bitonic_cas(shared_els[j], upper_shared_els[j], reverse); } @@ -241,11 +241,11 @@ struct bitonic_global { bool reverse) { group::this_thread_block().sync(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { local_els[i] = shared_els[shared_idx(i)]; } warp::merge(local_els, reverse); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } @@ -258,7 +258,7 @@ struct bitonic_global { // This is the first step, so we don't need to load from shared memory warp::sort(local_els, reverse); // store the sorted elements in shared memory - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc index 3a8b3796c12..e951cf06860 100644 --- a/common/components/uninitialized_array.hpp.inc +++ b/common/components/uninitialized_array.hpp.inc @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -49,7 +49,7 @@ public: * * @return the constexpr pointer to the first entry of the array. */ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -70,7 +70,8 @@ public: * * @return a reference to the array entry at the given index. 
*/ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const + noexcept { return reinterpret_cast(data_)[pos]; } diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc index 25b43e789ee..b5f7d43db67 100644 --- a/common/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/factorization/par_ilut_filter_kernels.hpp.inc @@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs, auto end = row_ptrs[row + 1]; begin_cb(row); auto num_steps = ceildiv(end - begin, subwarp_size); - for (auto step = 0; step < num_steps; ++step) { + for (IndexType step = 0; step < num_steps; ++step) { auto idx = begin + lane + step * subwarp_size; auto keep = idx < end && pred(idx, begin, end); auto mask = subwarp.ballot(keep); @@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } -} // namespace kernel \ No newline at end of file +} // namespace kernel diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc index 059069faf41..e443d7b6ba7 100644 --- a/common/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/factorization/par_ilut_select_kernels.hpp.inc @@ -62,7 +62,7 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree( // assuming rounding towards zero auto stride = double(size) / sample_size; #pragma unroll - for (auto i = 0; i < sampleselect_oversampling; ++i) { + for (int i = 0; i < sampleselect_oversampling; ++i) { auto lidx = idx * sampleselect_oversampling + i; auto val = input[static_cast(lidx * stride)]; samples[i] = abs(val); @@ -119,7 +119,7 @@ __global__ __launch_bounds__(default_block_size) void count_buckets( auto el = abs(input[i]); IndexType tree_idx{}; #pragma unroll - for (auto level = 0; level < sampleselect_searchtree_height; ++level) { + for (int level = 0; level < sampleselect_searchtree_height; ++level) { auto cmp = !(el < sh_tree[tree_idx]); tree_idx = 2 * tree_idx + 1 + cmp; } @@ -168,7 +168,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // compute prefix sum over warp-sized blocks IndexType total{}; auto base_idx = warp_idx * work_per_warp * warp.size(); - for (auto step = 0; step < work_per_warp; ++step) { + for (IndexType step = 0; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? local_counters[idx] : zero(); IndexType warp_total{}; @@ -207,7 +207,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( // add block prefix sum to each warp's block of data block.sync(); auto warp_prefixsum = warp_sums[warp_idx]; - for (auto step = 0; step < work_per_warp; ++step) { + for (IndexType step = 0; step < work_per_warp; ++step) { auto idx = warp_lane + step * warp.size() + base_idx; auto val = idx < num_blocks ? 
local_counters[idx] : zero(); if (idx < num_blocks) { diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc index d46b202a8ff..c7ebafd0627 100644 --- a/common/matrix/dense_kernels.hpp.inc +++ b/common/matrix/dense_kernels.hpp.inc @@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 2323d512258..399dd5070ac 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( { const auto tidx = thread::get_thread_id_flat(); if (tidx < num_rows) { - for (auto col = 0; col < nnz; col++) { + for (size_type col = 0; col < nnz; col++) { result[tidx * result_stride + col_idxs[tidx + col * source_stride]] += values[tidx + col * source_stride]; @@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (source_values[source_idx] != zero()) { result_values[write_to] = source_values[source_idx]; diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc index b6af7c2be36..c7c192189e0 100644 --- a/common/matrix/hybrid_kernels.hpp.inc +++ b/common/matrix/hybrid_kernels.hpp.inc @@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (ell_val[source_idx] != zero()) { result_values[write_to] = ell_val[source_idx]; diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu index 54739c783c8..ce108fa8cf9 100644 --- a/cuda/components/prefix_sum.cu +++ b/cuda/components/prefix_sum.cu @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -57,8 +57,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, start_prefix_sum <<>>(num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. 
if (num_blocks > 1) { finalize_prefix_sum <<>>(num_entries, counts, diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu index e973cc0f650..e2b7abc51d7 100644 --- a/cuda/test/components/sorting_kernels.cu +++ b/cuda/test/components/sorting_kernels.cu @@ -99,7 +99,7 @@ protected: { // we want some duplicate elements std::uniform_int_distribution dist(0, num_elements / 2); - for (auto i = 0; i < num_elements; ++i) { + for (int i = 0; i < num_elements; ++i) { ref_shared.get_data()[i] = dist(rng); } ddata = gko::Array{cuda, ref_shared}; diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 6e40ce5b5a3..de96d27d823 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 2df1c397f4c..51c12fab531 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh index cbe1b2e8ab9..7a24f50bef6 100755 --- a/dev_tools/scripts/format_header.sh +++ b/dev_tools/scripts/format_header.sh @@ -121,7 +121,6 @@ GINKGO_LICENSE_BEACON="***************************************** CONTENT="content.cpp" # Store the residual part (start from namespace) BEFORE="before.cpp" # Store the main header and the #ifdef/#define of header file -BEGIN="begin.cpp" # Store the header before license HAS_HIP_RUNTIME="false" DURING_LICENSE="false" INCLUDE_REGEX="^#include.*" diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup index 9e8a5172d05..1756481e2e4 100644 --- a/dev_tools/scripts/regroup +++ b/dev_tools/scripts/regroup @@ -2,7 +2,7 @@ IncludeBlocks: Regroup IncludeCategories: - Regex: '^<(rapidjson|gflags|gtest|papi).*' Priority: 3 - - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative).*' + - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi).*' Priority: 2 - Regex: '^ "") target_sources(ginkgo_dpcpp PRIVATE base/version.dp.cpp base/executor.dp.cpp + base/helper.dp.cpp components/absolute_array.dp.cpp components/fill_array.dp.cpp components/prefix_sum.dp.cpp @@ -54,13 +58,24 @@ target_sources(ginkgo_dpcpp ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) -set(GINKGO_DPCPP_FLAGS ${GINKGO_COMPILER_FLAGS} -fsycl) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") +# Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating +# find_package(MKL) everywhere when linking ginkgo (see the MKL example +# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) +target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp 
PRIVATE cxx_std_17) +target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) -target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +# When building ginkgo as a static library, we need to use dpcpp and per_kernel +# link option when the program uses a dpcpp related function. +if (BUILD_SHARED_LIBS) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) +else () + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) +endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) +target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp new file mode 100644 index 00000000000..5e6c1a579f5 --- /dev/null +++ b/dpcpp/base/helper.dp.cpp @@ -0,0 +1,62 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include "dpcpp/base/helper.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +bool validate(sycl::queue *queue, unsigned int workgroup_size, + unsigned int subgroup_size) +{ + auto device = queue->get_device(); + auto subgroup_size_list = + device.get_info(); + auto max_workgroup_size = + device.get_info(); + bool allowed = false; + for (auto &i : subgroup_size_list) { + allowed |= (i == subgroup_size); + } + return allowed && (workgroup_size <= max_workgroup_size); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index c888eb9d99d..cb98e4c511e 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -40,31 +40,55 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include +#include "core/base/types.hpp" #include "dpcpp/base/dim3.dp.hpp" +/** + * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those + * kernels which require encoded config but do not need explicit template + * parameter and shared memory + * + * @param name_ the name of the host function with config + * @param kernel_ the kernel name + */ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, gko::size_type, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ + } + + /** * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ - inline void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template \ + inline void name_(dim3 grid, dim3 block, gko::size_type, \ + sycl::queue *queue, InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -82,7 +106,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ template \ void name_(std::uint32_t desired_cfg, dim3 grid, dim3 block, \ - size_t dynamic_shared_memory, sycl::queue *queue, \ + gko::size_type dynamic_shared_memory, sycl::queue *queue, \ InferredArgs... args) \ { \ callable_( \ @@ -114,19 +138,42 @@ namespace kernels { namespace dpcpp { +/** + * This is the validate function for common check. It checks the workgroup size + * is below device max workgroup size and subgroup size is in the supported + * subgroup size. + * + * @param queue the sycl queue pointer + * @param workgroup_size the workgroup size (block size in cuda sense) + * @param subgroup_size the subgroup size (warp size in cuda sense) + * + * @return the given arguments are valid or not in given queue. + */ bool validate(sycl::queue *queue, unsigned workgroup_size, - unsigned subgroup_size) + unsigned subgroup_size); + + +/** + * get_first_cfg will return the first valid config by validate function from + * given config array. 
+ * + * @tparam IterArr the iterable array type + * @tparam Validate the validate function type + * + * @param arr the config array + * @param verify the validate function + * + * @return the first valid config + */ +template +std::uint32_t get_first_cfg(IterArr &arr, Validate verify) { - auto device = queue->get_device(); - auto subgroup_size_list = - device.get_info(); - auto max_workgroup_size = - device.get_info(); - bool allowed = false; - for (auto &i : subgroup_size_list) { - allowed |= (i == subgroup_size); + for (auto &cfg : arr) { + if (verify(cfg)) { + return cfg; + } } - return allowed && (workgroup_size <= max_workgroup_size); + GKO_NOT_SUPPORTED(arr); } diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp new file mode 100644 index 00000000000..1c9a8dabb30 --- /dev/null +++ b/dpcpp/base/onemkl_bindings.hpp @@ -0,0 +1,131 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ +#define GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ + + +#include + + +#include +#include + + +namespace gko { +/** + * @brief The device specific kernels namespace. + * + * @ingroup kernels + */ +namespace kernels { +/** + * @brief The DPCPP namespace. + * + * @ingroup dpcpp + */ +namespace dpcpp { +/** + * @brief The ONEMKL namespace. + * + * @ingroup onemkl + */ +namespace onemkl { +/** + * @brief The detail namespace. + * + * @ingroup detail + */ +namespace detail { + + +template +inline void not_implemented(Args &&...)
GKO_NOT_IMPLEMENTED; + + +} // namespace detail + + +template +struct is_supported : std::false_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; + +template <> +struct is_supported> : std::true_type {}; + + +#define GKO_BIND_DOT(ValueType, Name, Func) \ + void Name(::cl::sycl::queue &exec_queue, std::int64_t n, \ + const ValueType *x, std::int64_t incx, const ValueType *y, \ + std::int64_t incy, ValueType *result) \ + { \ + Func(exec_queue, n, x, incx, y, incy, result); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +// Bind the dot for x^T * y +GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +GKO_BIND_DOT(std::complex, dot, oneapi::mkl::blas::row_major::dotu); +template +GKO_BIND_DOT(ValueType, dot, detail::not_implemented); + +// Bind the conj_dot for x' * y +GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot); +GKO_BIND_DOT(std::complex, conj_dot, oneapi::mkl::blas::row_major::dotc); +GKO_BIND_DOT(std::complex, conj_dot, + oneapi::mkl::blas::row_major::dotc); +template +GKO_BIND_DOT(ValueType, conj_dot, detail::not_implemented); + +#undef GKO_BIND_DOT + + +} // namespace onemkl +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_ diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 4b7f816b381..63f33e9ba35 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -36,7 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include +#include + + +#include "core/base/types.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/prefix_sum.dp.hpp" namespace gko { @@ -45,22 +50,52 @@ namespace dpcpp { namespace components { +using BlockCfg = ConfigSet<11>; + +constexpr auto block_cfg_list = + ::gko::syn::value_list(); + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) +GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, + block_cfg_list) + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_prefix_sum, + finalize_prefix_sum) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_prefix_sum_call, finalize_prefix_sum, + block_cfg_list) + + template -void prefix_sum(std::shared_ptr exec, IndexType *counts, +void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // TODO actually implement parallel prefix sum - exec->get_queue()->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{1}, [=](sycl::id<1> idx) { - IndexType sum{}; - for (size_type i = 0; i < num_entries; i++) { - sum += std::exchange(counts[i], sum); - } - }); - }); + // prefix_sum should only be performed on a valid array + if (num_entries > 0) { + auto queue = exec->get_queue(); + constexpr auto block_cfg_array = as_array(block_cfg_list); + const std::uint32_t cfg = + get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) { + return validate(queue, BlockCfg::decode<0>(cfg), 16); + }); + const auto wg_size = BlockCfg::decode<0>(cfg); + auto num_blocks = ceildiv(num_entries, wg_size); + Array block_sum_array(exec, num_blocks - 1); + auto block_sums = block_sum_array.get_data(); + start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), + num_entries, counts, block_sums); + // add the total sum of the previous block only when the number of + // blocks is larger than 1. + if (num_blocks > 1) { + finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0, + exec->get_queue(), num_entries, counts, + block_sums); + } + } } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL); + // instantiate for size_type as well, as this is used in the Sellp format template GKO_DECLARE_PREFIX_SUM_KERNEL(size_type); diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp new file mode 100644 index 00000000000..334d4239c56 --- /dev/null +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -0,0 +1,253 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ + + +#include + + +#include + + +#include "core/base/types.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +/** + * @internal + * Computes the prefix sum and total sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param total_sum will be set to the total sum of `element` in this subwarp. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note For this function to work on architectures with independent thread + * scheduling, all threads of the subwarp have to execute it. + */ +template +__dpct_inline__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, + ValueType &total_sum, Group subwarp) +{ + prefix_sum = inclusive ? element : zero(); + total_sum = element; +#pragma unroll + // hypercube prefix sum + for (int step = 1; step < subwarp.size(); step *= 2) { + auto neighbor = subwarp.shfl_xor(total_sum, step); + total_sum += neighbor; + prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0; + } +} + +/** + * @internal + * Computes the prefix sum of `element` over a subwarp. + * + * @param element the element over which we compute the prefix sum. + * @param prefix_sum will be set to the sum of all `element`s from lower + * lanes, plus the local `element` if `inclusive` is `true`. + * @param subwarp the cooperative group representing the subwarp. + * + * @tparam inclusive if this is true, the computed prefix sum will be + * inclusive, otherwise it will be exclusive. + * + * @note All threads of the subwarp have to execute this function for it to work + * (and not dead-lock on newer architectures). + */ +template +__dpct_inline__ void subwarp_prefix_sum(ValueType element, + ValueType &prefix_sum, Group subwarp) +{ + ValueType tmp{}; + subwarp_prefix_sum(element, prefix_sum, tmp, subwarp); +} + + +/** + * @internal + * First step of the calculation of a prefix sum. Calculates the prefix sum + * in-place on parts of the array `elements`. 
+ * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array which stores the total sum of each block, requires at + * least `ceildiv(num_elements, block_size) - 1` elements + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, also size of blocks on + * which this kernel calculates the prefix sum in-place + * + * @note To calculate the prefix sum over an array of size bigger than + * `block_size`, `finalize_prefix_sum` has to be used as well. + */ +template +void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements, + ValueType *__restrict__ block_sum, + sycl::nd_item<3> item_ct1, + UninitializedArray *prefix_helper) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + const auto element_id = item_ct1.get_local_id(2); + + // do not need to access the last element when exclusive prefix sum + (*prefix_helper)[element_id] = + (tidx + 1 < num_elements) ? elements[tidx] : zero(); + auto this_block = group::this_thread_block(item_ct1); + this_block.sync(); + + // Do a normal reduction +#pragma unroll + for (int i = 1; i < block_size; i <<= 1) { + const auto ai = i * (2 * element_id + 1) - 1; + const auto bi = i * (2 * element_id + 2) - 1; + if (bi < block_size) { + (*prefix_helper)[bi] += (*prefix_helper)[ai]; + } + this_block.sync(); + } + + if (element_id == 0) { + // Store the total sum except the last block + if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) { + block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1]; + } + (*prefix_helper)[block_size - 1] = zero(); + } + + this_block.sync(); + + // Perform the down-sweep phase to get the true prefix sum +#pragma unroll + for (int i = block_size >> 1; i > 0; i >>= 1) { + const auto ai = i * (2 * element_id + 1) - 1; + const auto bi = i * (2 * element_id + 2) - 1; + if (bi < block_size) { + auto tmp = (*prefix_helper)[ai]; + (*prefix_helper)[ai] = (*prefix_helper)[bi]; + (*prefix_helper)[bi] += tmp; + } + this_block.sync(); + } + if (tidx < num_elements) { + elements[tidx] = (*prefix_helper)[element_id]; + } +} + +template +void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_elements, + ValueType *elements, ValueType *block_sum) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + prefix_helper_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + start_prefix_sum( + num_elements, elements, block_sum, item_ct1, + prefix_helper_acc_ct1.get_pointer().get()); + }); + }); +} + + +/** + * @internal + * Second step of the calculation of a prefix sum. Increases the value of each + * entry of `elements` by the total sum of all preceding blocks. + * + * @param elements array on which the prefix sum is to be calculated + * @param block_sum array storing the total sum of each block + * @param num_elements total number of entries in `elements` + * + * @tparam block_size thread block size for this kernel, has to be the same as + * for `start_prefix_sum` + * + * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. 
+ */ +template +void finalize_prefix_sum(size_type num_elements, + ValueType *__restrict__ elements, + const ValueType *__restrict__ block_sum, + sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < num_elements) { + ValueType prefix_block_sum = zero(); + for (size_type i = 0; i < item_ct1.get_group(2); i++) { + prefix_block_sum += block_sum[i]; + } + elements[tidx] += prefix_block_sum; + } +} + +template +void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_elements, + ValueType *elements, const ValueType *block_sum) +{ + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + finalize_prefix_sum( + num_elements, elements, block_sum, item_ct1); + }); + }); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_PREFIX_SUM_DP_HPP_ diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp new file mode 100644 index 00000000000..094f2093a95 --- /dev/null +++ b/dpcpp/components/reduction.dp.hpp @@ -0,0 +1,298 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ + + +#include + + +#include + + +#include +#include +#include + + +#include "core/base/types.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +constexpr int default_block_size = 256; +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::value_list(); +constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on a group + * `group`. Each thread contributes with one element `local_data`. The local + * thread element is always passed as the first parameter to the `reduce_op`. + * The function returns the result of the reduction on all threads. + * + * @note The function is guaranteed to return the correct value on all threads + * only if `reduce_op` is commutative (in addition to being associative). + * Otherwise, the correct value is returned only to the thread with + * subwarp index 0. + */ +template < + typename Group, typename ValueType, typename Operator, + typename = std::enable_if_t::value>> +__dpct_inline__ ValueType reduce(const Group &group, ValueType local_data, + Operator reduce_op = Operator{}) +{ +#pragma unroll + for (int32 bitmask = 1; bitmask < group.size(); bitmask <<= 1) { + const auto remote_data = group.shfl_xor(local_data, bitmask); + local_data = reduce_op(local_data, remote_data); + } + return local_data; +} + + +/** + * @internal + * + * Returns the index of the thread that has the element with the largest + * magnitude among all the threads in the group. + * Only the values from threads which set `is_pivoted` to `false` will be + * considered. + */ +template < + typename Group, typename ValueType, + typename = std::enable_if_t::value>> +__dpct_inline__ int choose_pivot(const Group &group, ValueType local_data, + bool is_pivoted) +{ + using real = remove_complex; + real lmag = is_pivoted ? -one() : abs(local_data); + const auto pivot = + reduce(group, group.thread_rank(), [&](int lidx, int ridx) { + const auto rmag = group.shfl(lmag, ridx); + if (rmag > lmag) { + lmag = rmag; + lidx = ridx; + } + return lidx; + }); + // pivot operator not commutative, make sure everyone has the same pivot + return group.shfl(pivot, 0); +} + + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on entire block. + * The data for the reduction is taken from the `data` array which has to be of + * size `block_size` and accessible from all threads. The `data` array is also + * used as work space (so its content will be destroyed in the process), as well + * as to store the return value - which is stored in the 0-th position of the + * array. 
+ */ +template < + unsigned int sg_size = 32, typename Group, typename ValueType, + typename Operator, + typename = std::enable_if_t::value>> +void reduce(const Group &__restrict__ group, ValueType *__restrict__ data, + Operator reduce_op = Operator{}) +{ + const auto local_id = group.thread_rank(); + + for (int k = group.size() / 2; k >= sg_size; k /= 2) { + group.sync(); + if (local_id < k) { + data[local_id] = reduce_op(data[local_id], data[local_id + k]); + } + } + + const auto warp = group::tiled_partition(group); + const auto warp_id = group.thread_rank() / warp.size(); + if (warp_id > 0) { + return; + } + auto result = ::gko::kernels::dpcpp::reduce(warp, data[warp.thread_rank()], + reduce_op); + if (warp.thread_rank() == 0) { + data[0] = result; + } +} + + +/** + * @internal + * + * Computes a reduction using the binary operation `reduce_op` on an array + * `source` of any size. Has to be called a second time on `result` to reduce + * an array larger than `block_size`. + */ +template +void reduce_array(size_type size, const ValueType *__restrict__ source, + ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, + Operator reduce_op = Operator{}) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + auto thread_result = zero(); + for (auto i = tidx; i < size; + i += item_ct1.get_local_range().get(2) * item_ct1.get_group_range(2)) { + thread_result = reduce_op(thread_result, source[i]); + } + result[item_ct1.get_local_id(2)] = thread_result; + + group::this_thread_block(item_ct1).sync(); + + // Stores the result of the reduction inside `result[0]` + reduce(group::this_thread_block(item_ct1), result, reduce_op); +} + + +/** + * @internal + * + * Computes a reduction using the add operation (+) on an array + * `source` of any size. Has to be called a second time on `result` to reduce + * an array larger than `block_size`. + */ +template +void reduce_add_array( + size_type size, const ValueType *__restrict__ source, + ValueType *__restrict__ result, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *block_sum) +{ + reduce_array(cfg)>( + size, source, static_cast((*block_sum)), item_ct1, + [](const ValueType &x, const ValueType &y) { return x + y; }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = (*block_sum)[0]; + } +} + +template +void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *source, ValueType *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor(cfg)>, + 0, sycl::access::mode::read_write, + sycl::access::target::local> + block_sum_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_add_array(size, source, result, item_ct1, + block_sum_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, + reduce_add_array); + +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, + kcfg_1d_list); + + +/** + * Compute a reduction using add operation (+). 
+ * + * @param exec Executor associated to the array + * @param size size of the array + * @param source the pointer of the array + * + * @return the reduction result + */ +template +ValueType reduce_add_array(std::shared_ptr exec, + size_type size, const ValueType *source) +{ + auto block_results_val = source; + size_type grid_dim = size; + auto block_results = Array(exec); + ValueType answer = zero(); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + if (size > wg_size) { + const auto n = ceildiv(size, wg_size); + grid_dim = (n <= wg_size) ? n : wg_size; + + block_results.resize_and_reset(grid_dim); + + reduce_add_array_call(cfg, grid_dim, wg_size, 0, exec->get_queue(), + size, source, block_results.get_data()); + + block_results_val = block_results.get_const_data(); + } + + auto d_result = Array(exec, 1); + + reduce_add_array_call(cfg, 1, wg_size, 0, exec->get_queue(), grid_dim, + block_results_val, d_result.get_data()); + answer = exec->copy_val_to_host(d_result.get_const_data()); + return answer; +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp new file mode 100644 index 00000000000..70ad76d9ccb --- /dev/null +++ b/dpcpp/components/thread_ids.dp.hpp @@ -0,0 +1,327 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_ +#define GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_ + + +#include + + +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dpct.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +/** + * @brief The DPCPP thread namespace. + * + * @ingroup dpcpp_thread + */ +namespace thread { + + +// TODO: porting - need to refine the functions and their names in this file +// the grid/block description is given with the cuda dim3 type, i.e. dim3 is used +// to launch dpcpp kernels; the kernel reverses the ordering to keep the same +// linear memory usage as cuda. + + +/** + * @internal + * + * Returns the ID of the block group this thread belongs to. + * + * @return the ID of the block group this thread belongs to + * + * @note Assumes that grid dimensions are in cuda standard format: + * `(block_group_size, first_grid_dimension, second grid_dimension)` + */ +__dpct_inline__ size_type get_block_group_id(sycl::nd_item<3> item_ct1) +{ + return static_cast(item_ct1.get_group(0)) * + item_ct1.get_group_range(1) + + item_ct1.get_group(1); +} + +/** + * @internal + * + * Returns the ID of the block this thread belongs to. + * + * @return the ID of the block this thread belongs to + * + * @note Assumes that grid dimensions are in cuda standard format: + * `(block_group_size, first_grid_dimension, second grid_dimension)` + */ +__dpct_inline__ size_type get_block_id(sycl::nd_item<3> item_ct1) +{ + return get_block_group_id(item_ct1) * item_ct1.get_group_range(2) + + item_ct1.get_group(2); +} + + +/** + * @internal + * + * Returns the local ID of the warp (relative to the block) this thread belongs + * to. + * + * @return the local ID of the warp (relative to the block) this thread belongs + * to + * + * @note Assumes that block dimensions are in cuda standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +__dpct_inline__ size_type get_local_warp_id(sycl::nd_item<3> item_ct1) +{ + return static_cast(item_ct1.get_local_id(0)); +} + + +/** + * @internal + * + * Returns the local ID of the sub-warp (relative to the block) this thread + * belongs to. + * + * @tparam subwarp_size size of the subwarp + * + * @return the local ID of the sub-warp (relative to the block) this thread + * belongs to + * + * @note Assumes that block dimensions are in cuda standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +template +__dpct_inline__ size_type get_local_subwarp_id(sycl::nd_item<3> item_ct1) +{ + // dpcpp does not have subwarp. + constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; + return get_local_warp_id(item_ct1) * subwarps_per_warp + + item_ct1.get_local_id(1); +} + + +/** + * @internal + * + * Returns the local ID of the thread (relative to the block). + * + * @tparam subwarp_size size of the subwarp + * + * @return the local ID of the thread (relative to the block) + * + * @note Assumes that block dimensions are in cuda standard format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` + */ +template +__dpct_inline__ size_type get_local_thread_id(sycl::nd_item<3> item_ct1) +{ + return get_local_subwarp_id(item_ct1) * subwarp_size + + item_ct1.get_local_id(2); +} + + +/** + * @internal + * + * Returns the global ID of the warp this thread belongs to.
+ * + * @tparam warps_per_block number of warps within each block + * + * @return the global ID of the warp this thread belongs to. + * + * @note Assumes that block dimensions and grid dimensions are in cuda standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) +{ + return get_block_id(item_ct1) * warps_per_block + + get_local_warp_id(item_ct1); +} + + +/** + * @internal + * + * Returns the global ID of the sub-warp this thread belongs to. + * + * @tparam subwarp_size size of the subwarp + * + * @return the global ID of the sub-warp this thread belongs to. + * + * @note Assumes that block dimensions and grid dimensions are in cuda standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) +{ + // dpcpp does not have subwarp + constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; + return get_warp_id(item_ct1) * subwarps_per_warp + + item_ct1.get_local_id(1); +} + + +/** + * @internal + * + * Returns the global ID of the thread. + * + * @return the global ID of the thread. + * + * @tparam subwarp_size size of the subwarp + * + * @note Assumes that block dimensions and grid dimensions are in cuda standard + * format: + * `(subwarp_size, config::warp_size / subwarp_size, block_size / + * config::warp_size)` and + * `(block_group_size, first_grid_dimension, second grid_dimension)`, + * respectively. + */ +template +__dpct_inline__ size_type get_thread_id(sycl::nd_item<3> item_ct1) +{ + return get_subwarp_id(item_ct1) * + subwarp_size + + item_ct1.get_local_id(2); +} + + +/** + * @internal + * + * Returns the global ID of the thread in the given index type. + * This function assumes one-dimensional thread and block indexing in cuda + * sense. It uses the position information from the third dimension. + * + * @return the global ID of the thread in the given index type. + * + * @tparam IndexType the index type + */ +template +__dpct_inline__ IndexType get_thread_id_flat(sycl::nd_item<3> item_ct1) +{ + return item_ct1.get_local_id(2) + + static_cast(item_ct1.get_local_range().get(2)) * + item_ct1.get_group(2); +} + + +/** + * @internal + * + * Returns the total number of threads in the given index type. + * This function assumes one-dimensional thread and block indexing in cuda + * sense. It uses the position information from the third dimension. + * + * @return the total number of threads in the given index type. + * + * @tparam IndexType the index type + */ +template +__dpct_inline__ IndexType get_thread_num_flat(sycl::nd_item<3> item_ct1) +{ + return item_ct1.get_local_range().get(2) * + static_cast(item_ct1.get_group_range(2)); +} + + +/** + * @internal + * + * Returns the global ID of the subwarp in the given index type. + * This function assumes one-dimensional thread and block indexing in cuda sense + * with a power of two block size of at least subwarp_size. + * + * @return the global ID of the subwarp in the given index type. + * + * @tparam subwarp_size the size of the subwarp. Must be a power of two!
+ * @tparam IndexType the index type + */ +template +__dpct_inline__ IndexType get_subwarp_id_flat(sycl::nd_item<3> item_ct1) +{ + static_assert(!(subwarp_size & (subwarp_size - 1)), + "subwarp_size must be a power of two"); + return item_ct1.get_local_id(2) / subwarp_size + + static_cast(item_ct1.get_local_range().get(2) / + subwarp_size) * + item_ct1.get_group(2); +} + + +/** + * @internal + * + * Returns the total number of subwarps in the given index type. + * This function assumes one-dimensional thread and block indexing in cuda sense + * with a power of two block size of at least subwarp_size. + * + * @return the total number of subwarps in the given index type. + * + * @tparam subwarp_size the size of the subwarp. Must be a power of two! + * @tparam IndexType the index type + */ +template +__dpct_inline__ IndexType get_subwarp_num_flat(sycl::nd_item<3> item_ct1) +{ + static_assert(!(subwarp_size & (subwarp_size - 1)), + "subwarp_size must be a power of two"); + return item_ct1.get_local_range().get(2) / subwarp_size * + static_cast(item_ct1.get_group_range(2)); +} + + +} // namespace thread +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_COMPONENTS_THREAD_IDS_DP_HPP_ diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp new file mode 100644 index 00000000000..eb8a36770d7 --- /dev/null +++ b/dpcpp/components/uninitialized_array.hpp @@ -0,0 +1,121 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ +#define GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_ + + +#include + + +#include "dpcpp/base/dpct.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +// TODO: porting - consider directly use the array as shared memory + + +/** + * Stores an array with uninitialized contents. 
+ *
+ * This class is needed for datatypes that do have a non-empty constructor when
+ * using them as shared memory, for example `thrust::complex`.
+ *
+ * @tparam ValueType  the type of values
+ * @tparam size  the size of the array
+ */
+template <typename ValueType, size_type size>
+class UninitializedArray {
+public:
+    /**
+     * Operator for casting an UninitializedArray into its constexpr value
+     * pointer.
+     *
+     * @return the constexpr pointer to the first entry of the array.
+     */
+    constexpr __dpct_inline__ operator const ValueType *() const noexcept
+    {
+        return &(*this)[0];
+    }
+
+    /**
+     * Operator for casting an UninitializedArray into its non-const value
+     * pointer.
+     *
+     * @return the non-const pointer to the first entry of the array.
+     */
+    __dpct_inline__ operator ValueType *() noexcept { return &(*this)[0]; }
+
+    /**
+     * constexpr array access operator.
+     *
+     * @param pos The array index. Using a value outside [0, size) is undefined
+     * behavior.
+     *
+     * @return a reference to the array entry at the given index.
+     */
+    constexpr __dpct_inline__ const ValueType &operator[](size_type pos) const
+        noexcept
+    {
+        return data_[pos];
+    }
+
+    /**
+     * Non-const array access operator.
+     *
+     * @param pos The array index. Using a value outside [0, size) is undefined
+     * behavior.
+     *
+     * @return a reference to the array entry at the given index.
+     */
+    __dpct_inline__ ValueType &operator[](size_type pos) noexcept
+    {
+        return data_[pos];
+    }
+
+private:
+    // If dpcpp represented the data as a char array, compilation would fail.
+    // Thankfully, dpcpp supports allocating complex data directly.
+    ValueType data_[size];
+};
+
+
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_DPCPP_COMPONENTS_UNINITIALIZED_ARRAY_HPP_
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 531cfd52259..b17e44f9706 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -33,25 +33,29 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/matrix/dense_kernels.hpp" -#include - - #include +#include -#include #include #include #include #include #include #include -#include #include #include #include "core/components/prefix_sum.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/helper.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" namespace gko { @@ -65,11 +69,625 @@ namespace dpcpp { namespace dense { +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::value_list(); +constexpr auto subgroup_list = + syn::value_list(); +constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); +constexpr auto default_block_size = 256; + + +namespace kernel { + + +template +void compute_partial_reduce( + size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value, + CallableReduce reduce_op, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + + constexpr auto warps_per_block = wg_size / sg_size; + + const auto num_blocks = item_ct1.get_group_range(2); + const auto local_id = thread::get_local_thread_id(item_ct1); + const auto global_id = + thread::get_thread_id(item_ct1); + + OutType *tmp_work_array = *tmp_work; + auto tmp = zero(); + for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { + tmp = reduce_op(tmp, get_value(i)); + } + + tmp_work_array[local_id] = tmp; + + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, reduce_op); + + if (local_id == 0) { + work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; + } +} + + +template +void finalize_reduce_computation( + size_type size, const ValueType *work, ValueType *result, + CallableReduce reduce_op, CallableFinalize finalize_op, + sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto local_id = thread::get_local_thread_id(item_ct1); + + ValueType tmp = zero(); + for (auto i = local_id; i < size; i += wg_size) { + tmp = reduce_op(tmp, work[i]); + } + ValueType *tmp_work_array = *tmp_work; + tmp_work_array[local_id] = tmp; + + ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), + tmp_work_array, reduce_op); + + if (local_id == 0) { + *result = finalize_op(tmp_work_array[0]); + } +} + + +template +void compute_partial_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return x[i * stride_x] * y[i * stride_y]; + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_dot(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + const ValueType *x, size_type stride_x, + const ValueType *y, size_type stride_y, + ValueType *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + 
tmp_work_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_dot(num_rows, x, stride_x, y, stride_y, + work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, + compute_partial_dot) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, + kcfg_1d_list) + + +template +void compute_partial_conj_dot( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + const ValueType *__restrict__ y, size_type stride_y, + ValueType *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + compute_partial_reduce( + num_rows, work, + [x, stride_x, y, stride_y](size_type i) { + return conj(x[i * stride_x]) * y[i * stride_y]; + }, + [](const ValueType &x, const ValueType &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_conj_dot(dim3 grid, dim3 block, + size_t dynamic_shared_memory, sycl::queue *stream, + size_type num_rows, const ValueType *x, + size_type stride_x, const ValueType *y, + size_type stride_y, ValueType *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + compute_partial_conj_dot( + num_rows, x, stride_x, y, stride_y, work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, + compute_partial_conj_dot) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, + compute_partial_conj_dot, kcfg_1d_list) + + +template +void finalize_sum_reduce_computation( + size_type size, const ValueType *work, ValueType *result, + sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return x; }, item_ct1, tmp_work); +} + +template +void finalize_sum_reduce_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + finalize_sum_reduce_computation( + size, work, result, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation, + finalize_sum_reduce_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, + finalize_sum_reduce_computation, kcfg_1d_list) + + +template +void compute_partial_norm2( + size_type num_rows, const ValueType *__restrict__ x, size_type stride_x, + remove_complex *__restrict__ work, sycl::nd_item<3> item_ct1, + UninitializedArray, KCFG_1D::decode<0>(cfg)> + *tmp_work) +{ + using norm_type = remove_complex; + compute_partial_reduce( + num_rows, work, + [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, + [](const norm_type &x, const norm_type &y) { return x + y; }, item_ct1, + tmp_work); +} + +template +void compute_partial_norm2(dim3 grid, 
dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_rows, + const ValueType *x, size_type stride_x, + remove_complex *work) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, wg_size>, + 0, sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + compute_partial_norm2( + num_rows, x, stride_x, work, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, + compute_partial_norm2) +GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, + compute_partial_norm2, kcfg_1d_list) + + +template +void finalize_sqrt_reduce_computation( + size_type size, const ValueType *work, ValueType *result, + sycl::nd_item<3> item_ct1, + UninitializedArray(cfg)> *tmp_work) +{ + finalize_reduce_computation( + size, work, result, + [](const ValueType &x, const ValueType &y) { return x + y; }, + [](const ValueType &x) { return std::sqrt(x); }, item_ct1, tmp_work); +} + +template +void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, + size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const ValueType *work, ValueType *result) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + stream->submit([&](sycl::handler &cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + tmp_work_acc_ct1(cgh); + + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + finalize_sqrt_reduce_computation( + size, work, result, item_ct1, + tmp_work_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation, + finalize_sqrt_reduce_computation) +GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, + finalize_sqrt_reduce_computation, kcfg_1d_list) + + +template +void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, + const size_type *__restrict__ row_ptrs, + const ValueType *__restrict__ source, + IndexType *__restrict__ row_idxs, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + if (tidx < num_rows) { + size_type write_to = row_ptrs[tidx]; + + for (size_type i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + row_idxs[write_to] = tidx; + write_to++; + } + } + } +} + +GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) + + +template +void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ work, + IndexType *__restrict__ result, + sycl::nd_item<3> item_ct1) +{ + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto row_idx = thread::get_subwarp_id_flat(item_ct1); + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + + if (row_idx < num_rows) { + IndexType part_result{}; + for (auto i = warp_tile.thread_rank(); i < num_cols; i += sg_size) { + if (work[stride * row_idx + i] != zero()) { + part_result += 1; + } + } + result[row_idx] = ::gko::kernels::dpcpp::reduce( + warp_tile, part_result, + [](const size_type &a, const size_type &b) { return a + b; }); + } +} + +GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row) 
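+// Sketch of how the configurable kernels are exposed: the
+// GKO_ENABLE_DEFAULT_HOST_CONFIG / GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION /
+// GKO_ENABLE_DEFAULT_CONFIG_CALL macros appear to generate, respectively, a
+// config-templated host launcher, one instantiation per entry of
+// kcfg_1d_list, and a runtime dispatch function. The host code further down
+// validates a (workgroup size, subgroup size) encoding for the current queue
+// and then goes through the `_call` entry point, e.g. in
+// calculate_nonzeros_per_row:
+//
+//     kernel::count_nnz_per_row_call(
+//         cfg, grid_size, block_size, 0, exec->get_queue(),
+//         source->get_size()[0], source->get_size()[1], source->get_stride(),
+//         source->get_const_values(), result->get_data());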
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row) +GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, + kcfg_1d_list) + + +template +void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, + const ValueType *__restrict__ source, + IndexType *__restrict__ row_ptrs, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + + if (tidx < num_rows) { + auto write_to = row_ptrs[tidx]; + for (size_type i = 0; i < num_cols; i++) { + if (source[stride * tidx + i] != zero()) { + values[write_to] = source[stride * tidx + i]; + col_idxs[write_to] = i; + write_to++; + } + } + } +} + +GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) + + +template +void fill_in_ell(size_type num_rows, size_type num_cols, + size_type source_stride, const ValueType *__restrict__ source, + size_type max_nnz_per_row, size_type result_stride, + IndexType *__restrict__ col_ptrs, + ValueType *__restrict__ values, sycl::nd_item<3> item_ct1) +{ + const auto tidx = thread::get_thread_id_flat(item_ct1); + if (tidx < num_rows) { + IndexType col_idx = 0; + for (size_type col = 0; col < num_cols; col++) { + if (source[tidx * source_stride + col] != zero()) { + col_ptrs[col_idx * result_stride + tidx] = col; + values[col_idx * result_stride + tidx] = + source[tidx * source_stride + col]; + col_idx++; + } + } + for (size_type j = col_idx; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } else if (tidx < result_stride) { + for (size_type j = 0; j < max_nnz_per_row; j++) { + col_ptrs[j * result_stride + tidx] = 0; + values[j * result_stride + tidx] = zero(); + } + } +} + +GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) + + +template +void calculate_slice_lengths(size_type num_rows, size_type slice_size, + int slice_num, size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ slice_lengths, + size_type *__restrict__ slice_sets, + sycl::nd_item<3> item_ct1) +{ + constexpr auto sg_size = cfg; + const auto sliceid = item_ct1.get_group(2); + const auto tid_in_warp = item_ct1.get_local_id(2); + const bool runable = sliceid * slice_size + tid_in_warp < num_rows; + size_type thread_result = 0; + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { + thread_result = + (i + slice_size * sliceid < num_rows) + ? 
max(thread_result, nnz_per_row[sliceid * slice_size + i]) + : thread_result; + } + + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + auto warp_result = ::gko::kernels::dpcpp::reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0 && runable) { + auto slice_length = ceildiv(warp_result, stride_factor) * stride_factor; + slice_lengths[sliceid] = slice_length; + slice_sets[sliceid] = slice_length; + } +} + +GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, + calculate_slice_lengths) +GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, + calculate_slice_lengths, subgroup_list) + + +template +void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, + size_type stride, const ValueType *__restrict__ source, + size_type *__restrict__ slice_lengths, + size_type *__restrict__ slice_sets, + IndexType *__restrict__ col_idxs, + ValueType *__restrict__ vals, sycl::nd_item<3> item_ct1) +{ + const auto global_row = thread::get_thread_id_flat(item_ct1); + const auto row = global_row % slice_size; + const auto sliceid = global_row / slice_size; + + if (global_row < num_rows) { + size_type sellp_ind = slice_sets[sliceid] * slice_size + row; + + for (size_type col = 0; col < num_cols; col++) { + auto val = source[global_row * stride + col]; + if (val != zero()) { + col_idxs[sellp_ind] = col; + vals[sellp_ind] = val; + sellp_ind += slice_size; + } + } + for (size_type i = sellp_ind; + i < + (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; + i += slice_size) { + col_idxs[i] = 0; + vals[i] = zero(); + } + } +} + +GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) + + +template +void reduce_max_nnz(size_type size, const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result, sycl::nd_item<3> item_ct1, + uint8_t *dpct_local) +{ + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + auto block_max = (size_type *)dpct_local; + + reduce_array( + size, nnz_per_row, block_max, item_ct1, + [](const size_type &x, const size_type &y) { return max(x, y); }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = block_max[0]; + } +} + +template +void reduce_max_nnz(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type size, + const size_type *nnz_per_row, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor + dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); + + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_max_nnz(size, nnz_per_row, result, item_ct1, + dpct_local_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, + kcfg_1d_list) + + +template +void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, + size_type stride_factor, + const size_type *__restrict__ nnz_per_row, + size_type *__restrict__ result, + sycl::nd_item<3> item_ct1) +{ + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + auto warp_tile = + group::tiled_partition(group::this_thread_block(item_ct1)); + const auto warpid = thread::get_subwarp_id_flat(item_ct1); + const auto tid_in_warp = warp_tile.thread_rank(); + const auto slice_num = ceildiv(num_rows, slice_size); + + size_type thread_result 
= 0; + for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { + if (warpid * slice_size + i < num_rows) { + thread_result = + max(thread_result, nnz_per_row[warpid * slice_size + i]); + } + } + + auto warp_result = ::gko::kernels::dpcpp::reduce( + warp_tile, thread_result, + [](const size_type &a, const size_type &b) { return max(a, b); }); + + if (tid_in_warp == 0 && warpid < slice_num) { + result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; + } +} + +GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice, + reduce_max_nnz_per_slice) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice, + reduce_max_nnz_per_slice) +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, + reduce_max_nnz_per_slice, kcfg_1d_list) + + +template +void reduce_total_cols(size_type num_slices, + const size_type *__restrict__ max_nnz_per_slice, + size_type *__restrict__ result, + sycl::nd_item<3> item_ct1, uint8_t *dpct_local) +{ + auto block_result = (size_type *)dpct_local; + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + reduce_array( + num_slices, max_nnz_per_slice, block_result, item_ct1, + [](const size_type &x, const size_type &y) { return x + y; }); + + if (item_ct1.get_local_id(2) == 0) { + result[item_ct1.get_group(2)] = block_result[0]; + } +} + +template +void reduce_total_cols(dim3 grid, dim3 block, size_t dynamic_shared_memory, + sycl::queue *stream, size_type num_slices, + const size_type *max_nnz_per_slice, size_type *result) +{ + stream->submit([&](sycl::handler &cgh) { + sycl::accessor + dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + reduce_total_cols(num_slices, max_nnz_per_slice, result, + item_ct1, + dpct_local_acc_ct1.get_pointer().get()); + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols, + reduce_total_cols); +GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, + kcfg_1d_list) + + +} // namespace kernel + + template void simple_apply(std::shared_ptr exec, const matrix::Dense *a, const matrix::Dense *b, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + matrix::Dense *c) +{ + using namespace oneapi::mkl; + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], one(), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), zero(), c->get_values(), c->get_stride()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL); @@ -78,8 +696,17 @@ template void apply(std::shared_ptr exec, const matrix::Dense *alpha, const matrix::Dense *a, const matrix::Dense *b, - const matrix::Dense *beta, - matrix::Dense *c) GKO_NOT_IMPLEMENTED; + const matrix::Dense *beta, matrix::Dense *c) +{ + using namespace oneapi::mkl; + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, transpose::nontrans, + c->get_size()[0], c->get_size()[1], a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), b->get_const_values(), + b->get_stride(), exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); @@ -88,7 +715,43 @@ template void compute_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if 
(x->get_size()[1] == 1) { + // TODO: write a custom kernel which does this more efficiently + onemkl::dot(*exec->get_queue(), x->get_size()[0], x->get_const_values(), + x->get_stride(), y->get_const_values(), y->get_stride(), + result->get_values()); + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_dot_call( + cfg, grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_sum_reduce_computation_call( + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); @@ -97,7 +760,46 @@ template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense *x, const matrix::Dense *y, - matrix::Dense *result) GKO_NOT_IMPLEMENTED; + matrix::Dense *result) +{ + if (x->get_size()[1] == 1) { + // TODO: write a custom kernel which does this more efficiently + onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], + x->get_const_values(), x->get_stride(), + y->get_const_values(), y->get_stride(), + result->get_values()); + + } else { + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_conj_dot_call( + cfg, grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + y->get_const_values() + col, y->get_stride(), work.get_data()); + kernel::finalize_sum_reduce_computation_call( + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); @@ -106,7 +808,43 @@ template void compute_norm2(std::shared_ptr exec, const matrix::Dense *x, matrix::Dense> 
*result) - GKO_NOT_IMPLEMENTED; +{ + if (x->get_size()[1] == 1) { + oneapi::mkl::blas::row_major::nrm2( + *exec->get_queue(), x->get_size()[0], x->get_const_values(), + x->get_stride(), result->get_values()); + } else { + using norm_type = remove_complex; + // TODO: these are tuning parameters obtained experimentally, once + // we decide how to handle this uniformly, they should be modified + // appropriately + constexpr auto work_per_thread = 32; + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto work_per_block = work_per_thread * wg_size; + const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); + const dim3 block_dim{sg_size, 1, wg_size / sg_size}; + Array work(exec, grid_dim.x); + // TODO: write a kernel which does this more efficiently + for (size_type col = 0; col < x->get_size()[1]; ++col) { + kernel::compute_partial_norm2_call( + cfg, grid_dim, block_dim, 0, exec->get_queue(), + x->get_size()[0], x->get_const_values() + col, x->get_stride(), + work.get_data()); + kernel::finalize_sqrt_reduce_computation_call( + cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, + work.get_const_data(), result->get_values() + col); + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); @@ -115,7 +853,36 @@ template void convert_to_coo(std::shared_ptr exec, const matrix::Dense *source, matrix::Coo *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_idxs = result->get_row_idxs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto stride = source->get_stride(); + + auto nnz_prefix_sum = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum); + + components::prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows); + + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + size_type grid_dim = ceildiv(num_rows, wg_size); + + kernel::fill_in_coo(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, stride, nnz_prefix_sum.get_const_data(), + source->get_const_values(), row_idxs, col_idxs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); @@ -125,7 +892,41 @@ template void convert_to_csr(std::shared_ptr exec, const matrix::Dense *source, matrix::Csr *result) - GKO_NOT_IMPLEMENTED; +{ + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + auto values = result->get_values(); + + auto 
stride = source->get_stride(); + + const auto rows_per_block = ceildiv(wg_size, sg_size); + const auto grid_dim_nnz = ceildiv(source->get_size()[0], rows_per_block); + + kernel::count_nnz_per_row_call( + cfg, grid_dim_nnz, wg_size, 0, exec->get_queue(), num_rows, num_cols, + stride, source->get_const_values(), row_ptrs); + + components::prefix_sum(exec, row_ptrs, num_rows + 1); + + size_type grid_dim = ceildiv(num_rows, wg_size); + + kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), + num_rows, num_cols, stride, source->get_const_values(), + row_ptrs, col_idxs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); @@ -135,7 +936,31 @@ template void convert_to_ell(std::shared_ptr exec, const matrix::Dense *source, matrix::Ell *result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + auto max_nnz_per_row = result->get_num_stored_elements_per_row(); + + auto col_ptrs = result->get_col_idxs(); + auto values = result->get_values(); + + auto source_stride = source->get_stride(); + auto result_stride = result->get_stride(); + + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + auto grid_dim = ceildiv(result_stride, wg_size); + kernel::fill_in_ell(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, source_stride, source->get_const_values(), + max_nnz_per_row, result_stride, col_ptrs, values); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_ELL_KERNEL); @@ -155,7 +980,56 @@ template void convert_to_sellp(std::shared_ptr exec, const matrix::Dense *source, matrix::Sellp *result) - GKO_NOT_IMPLEMENTED; +{ + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + const auto stride = source->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + + auto vals = result->get_values(); + auto col_idxs = result->get_col_idxs(); + auto slice_lengths = result->get_slice_lengths(); + auto slice_sets = result->get_slice_sets(); + + const auto slice_size = (result->get_slice_size() == 0) + ? matrix::default_slice_size + : result->get_slice_size(); + const auto stride_factor = (result->get_stride_factor() == 0) + ? 
matrix::default_stride_factor + : result->get_stride_factor(); + const int slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto grid_dim = slice_num; + + if (grid_dim > 0) { + kernel::calculate_slice_lengths_call( + sg_size, grid_dim, sg_size, 0, exec->get_queue(), num_rows, + slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), + slice_lengths, slice_sets); + } + + components::prefix_sum(exec, slice_sets, slice_num + 1); + + grid_dim = ceildiv(num_rows, wg_size); + if (grid_dim > 0) { + kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, + num_cols, slice_size, stride, + source->get_const_values(), slice_lengths, + slice_sets, col_idxs, vals); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SELLP_KERNEL); @@ -173,8 +1047,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void count_nonzeros(std::shared_ptr exec, - const matrix::Dense *source, - size_type *result) GKO_NOT_IMPLEMENTED; + const matrix::Dense *source, size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); @@ -182,7 +1063,37 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL); template void calculate_max_nnz_per_row(std::shared_ptr exec, const matrix::Dense *source, - size_type *result) GKO_NOT_IMPLEMENTED; + size_type *result) +{ + const auto num_rows = source->get_size()[0]; + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto n = ceildiv(num_rows, wg_size); + const size_type grid_dim = (n <= wg_size) ? 
n : wg_size; + + auto block_results = Array(exec, grid_dim); + + kernel::reduce_max_nnz_call( + cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + num_rows, nnz_per_row.get_const_data(), block_results.get_data()); + + auto d_result = Array(exec, 1); + + kernel::reduce_max_nnz_call( + cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + grid_dim, block_results.get_const_data(), d_result.get_data()); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_MAX_NNZ_PER_ROW_KERNEL); @@ -191,7 +1102,28 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( template void calculate_nonzeros_per_row(std::shared_ptr exec, const matrix::Dense *source, - Array *result) GKO_NOT_IMPLEMENTED; + Array *result) +{ + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + const dim3 block_size(wg_size, 1, 1); + auto rows_per_block = ceildiv(wg_size, sg_size); + const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block); + const dim3 grid_size(grid_x, 1, 1); + if (grid_x > 0) { + kernel::count_nnz_per_row_call( + cfg, grid_size, block_size, 0, exec->get_queue(), + source->get_size()[0], source->get_size()[1], source->get_stride(), + source->get_const_values(), result->get_data()); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_NONZEROS_PER_ROW_KERNEL); @@ -201,7 +1133,56 @@ template void calculate_total_cols(std::shared_ptr exec, const matrix::Dense *source, size_type *result, size_type stride_factor, - size_type slice_size) GKO_NOT_IMPLEMENTED; + size_type slice_size) +{ + const auto num_rows = source->get_size()[0]; + + if (num_rows == 0) { + *result = 0; + return; + } + + const auto num_cols = source->get_size()[1]; + const auto slice_num = ceildiv(num_rows, slice_size); + + auto nnz_per_row = Array(exec, num_rows); + + calculate_nonzeros_per_row(exec, source, &nnz_per_row); + + auto max_nnz_per_slice = Array(exec, slice_num); + auto queue = exec->get_queue(); + constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + const std::uint32_t cfg = + get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + const auto wg_size = KCFG_1D::decode<0>(cfg); + const auto sg_size = KCFG_1D::decode<1>(cfg); + + auto grid_dim = ceildiv(slice_num * sg_size, wg_size); + + kernel::reduce_max_nnz_per_slice_call( + cfg, grid_dim, wg_size, 0, exec->get_queue(), num_rows, slice_size, + stride_factor, nnz_per_row.get_const_data(), + max_nnz_per_slice.get_data()); + + grid_dim = ceildiv(slice_num, wg_size); + auto block_results = Array(exec, grid_dim); + + kernel::reduce_total_cols_call( + cfg, grid_dim, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + slice_num, max_nnz_per_slice.get_const_data(), + block_results.get_data()); + + auto d_result = Array(exec, 1); + + kernel::reduce_total_cols_call( + cfg, 1, wg_size, wg_size * sizeof(size_type), exec->get_queue(), + grid_dim, block_results.get_const_data(), d_result.get_data()); + + *result = exec->copy_val_to_host(d_result.get_const_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_CALCULATE_TOTAL_COLS_KERNEL); diff --git 
a/dpcpp/test/components/CMakeLists.txt b/dpcpp/test/components/CMakeLists.txt index 87a034a64df..77ad6684840 100644 --- a/dpcpp/test/components/CMakeLists.txt +++ b/dpcpp/test/components/CMakeLists.txt @@ -2,3 +2,4 @@ ginkgo_create_test(absolute_array) ginkgo_create_dpcpp_test(cooperative_groups_kernels) ginkgo_create_test(fill_array) ginkgo_create_test(precision_conversion) +ginkgo_create_test(prefix_sum) diff --git a/dpcpp/test/components/prefix_sum.cpp b/dpcpp/test/components/prefix_sum.cpp new file mode 100644 index 00000000000..402192d0b77 --- /dev/null +++ b/dpcpp/test/components/prefix_sum.cpp @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/components/prefix_sum.hpp" + + +#include +#include +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +class PrefixSum : public ::testing::Test { +protected: + using index_type = gko::int32; + PrefixSum() + : ref(gko::ReferenceExecutor::create()), + exec(gko::DpcppExecutor::create(0, ref)), + rand(293), + total_size(42793), + vals(ref, total_size), + dvals(exec) + { + std::uniform_int_distribution dist(0, 1000); + for (gko::size_type i = 0; i < total_size; ++i) { + vals.get_data()[i] = dist(rand); + } + dvals = vals; + } + + void test(gko::size_type size) + { + gko::kernels::reference::components::prefix_sum(ref, vals.get_data(), + size); + gko::kernels::dpcpp::components::prefix_sum(exec, dvals.get_data(), + size); + + GKO_ASSERT_ARRAY_EQ(vals, dvals); + } + + std::shared_ptr ref; + std::shared_ptr exec; + std::default_random_engine rand; + gko::size_type total_size; + gko::Array vals; + gko::Array dvals; +}; + + +TEST_F(PrefixSum, SmallEqualsReference) { test(100); } + + +TEST_F(PrefixSum, BigEqualsReference) { test(total_size); } + + +} // namespace diff --git a/dpcpp/test/matrix/CMakeLists.txt b/dpcpp/test/matrix/CMakeLists.txt index af64b693718..ba8f0fb70fe 100644 --- a/dpcpp/test/matrix/CMakeLists.txt +++ b/dpcpp/test/matrix/CMakeLists.txt @@ -1 +1,2 @@ -ginkgo_create_test(csr_kernels) \ No newline at end of file +ginkgo_create_test(csr_kernels) +ginkgo_create_test(dense_kernels) diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp new file mode 100644 index 00000000000..cf1bbe26cd4 --- /dev/null +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -0,0 +1,598 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include + + +#include "core/components/fill_array.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/test/utils.hpp" +#include "dpcpp/test/utils.hpp" + + +namespace { + + +class Dense : public ::testing::Test { +protected: + using itype = int; +#if GINKGO_DPCPP_SINGLE_MODE + using vtype = float; +#else + using vtype = double; +#endif // GINKGO_DPCPP_SINGLE_MODE + using Mtx = gko::matrix::Dense; + using MixedMtx = gko::matrix::Dense>; + using NormVector = gko::matrix::Dense>; + using Arr = gko::Array; + using ComplexMtx = gko::matrix::Dense>; + using MixedComplexMtx = + gko::matrix::Dense>>; + + Dense() : rand_engine(15) {} + + void SetUp() + { + ASSERT_GT(gko::DpcppExecutor::get_num_devices("all"), 0); + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + } + + void TearDown() + { + if (dpcpp != nullptr) { + ASSERT_NO_THROW(dpcpp->synchronize()); + } + } + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols) + { + return gko::test::generate_random_matrix( + num_rows, num_cols, + std::uniform_int_distribution<>(num_cols, num_cols), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); + } + + void set_up_vector_data(gko::size_type num_vecs, + bool different_alpha = false) + { + x = gen_mtx(1000, num_vecs); + y = gen_mtx(1000, num_vecs); + if (different_alpha) { + alpha = gen_mtx(1, num_vecs); + } else { + alpha = gko::initialize({2.0}, ref); + } + dx = Mtx::create(dpcpp); + dx->copy_from(x.get()); + dy = Mtx::create(dpcpp); + dy->copy_from(y.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + expected = Mtx::create(ref, gko::dim<2>{1, num_vecs}); + dresult = Mtx::create(dpcpp, gko::dim<2>{1, num_vecs}); + } + + void set_up_apply_data() + { + x = gen_mtx(65, 25); + c_x = gen_mtx(65, 25); + y = gen_mtx(25, 35); + expected = gen_mtx(65, 35); + alpha = gko::initialize({2.0}, ref); + beta = gko::initialize({-1.0}, ref); + square = gen_mtx(x->get_size()[0], x->get_size()[0]); + dx = Mtx::create(dpcpp); + dx->copy_from(x.get()); + dc_x = ComplexMtx::create(dpcpp); + dc_x->copy_from(c_x.get()); + dy = Mtx::create(dpcpp); + dy->copy_from(y.get()); + dresult = Mtx::create(dpcpp); + dresult->copy_from(expected.get()); + dalpha = Mtx::create(dpcpp); + dalpha->copy_from(alpha.get()); + dbeta = Mtx::create(dpcpp); + dbeta->copy_from(beta.get()); + dsquare = Mtx::create(dpcpp); + dsquare->copy_from(square.get()); + + std::vector tmp(x->get_size()[0], 0); + auto rng = std::default_random_engine{}; + std::iota(tmp.begin(), tmp.end(), 0); + std::shuffle(tmp.begin(), tmp.end(), rng); + std::vector tmp2(x->get_size()[1], 0); + std::iota(tmp2.begin(), tmp2.end(), 0); + std::shuffle(tmp2.begin(), tmp2.end(), rng); + std::vector tmp3(x->get_size()[0] / 10); + std::uniform_int_distribution row_dist(0, x->get_size()[0] - 1); + for (auto &i : tmp3) { + i = row_dist(rng); + } + rpermute_idxs = + std::unique_ptr(new Arr{ref, tmp.begin(), tmp.end()}); + cpermute_idxs = + std::unique_ptr(new Arr{ref, tmp2.begin(), tmp2.end()}); + rgather_idxs = + std::unique_ptr(new Arr{ref, tmp3.begin(), tmp3.end()}); + } + + template + std::unique_ptr convert(InputType &&input) + { + auto result = ConvertedType::create(input->get_executor()); + input->convert_to(result.get()); + return result; + } + + std::shared_ptr ref; + std::shared_ptr dpcpp; + + std::ranlux48 rand_engine; + + 
std::unique_ptr x; + std::unique_ptr c_x; + std::unique_ptr y; + std::unique_ptr alpha; + std::unique_ptr beta; + std::unique_ptr expected; + std::unique_ptr square; + std::unique_ptr dresult; + std::unique_ptr dx; + std::unique_ptr dc_x; + std::unique_ptr dy; + std::unique_ptr dalpha; + std::unique_ptr dbeta; + std::unique_ptr dsquare; + std::unique_ptr rpermute_idxs; + std::unique_ptr cpermute_idxs; + std::unique_ptr rgather_idxs; +}; + + +TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_dot(y.get(), expected.get()); + dx->compute_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, SingleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(1); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, MultipleVectorDpcppComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(20); + + x->compute_conj_dot(y.get(), expected.get()); + dx->compute_conj_dot(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef) +{ + set_up_vector_data(20); + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(this->ref, norm_size); + auto dnorm = NormVector::create(this->dpcpp, norm_size); + + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); +} + + +TEST_F(Dense, SimpleApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(y.get(), expected.get()); + dx->apply(dy.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, SimpleApplyMixedIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + + x->apply(convert(y).get(), convert(expected).get()); + dx->apply(convert(dy).get(), convert(dresult).get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7); +} + + +TEST_F(Dense, AdvancedApplyIsEquivalentToRef) +{ + set_up_apply_data(); + + x->apply(alpha.get(), y.get(), beta.get(), expected.get()); + dx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, r::value); +} + + +TEST_F(Dense, AdvancedApplyMixedIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + + x->apply(convert(alpha).get(), convert(y).get(), + convert(beta).get(), convert(expected).get()); + dx->apply(convert(dalpha).get(), convert(dy).get(), + convert(dbeta).get(), convert(dresult).get()); + + GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-7); +} + + +TEST_F(Dense, ApplyToComplexIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r::value); +} + + +TEST_F(Dense, ApplyToMixedComplexIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + auto 
complex_b = gen_mtx(25, 1); + auto dcomplex_b = MixedComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = MixedComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(complex_b.get(), complex_x.get()); + dx->apply(dcomplex_b.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7); +} + + +TEST_F(Dense, AdvancedApplyToComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(alpha.get(), complex_b.get(), beta.get(), complex_x.get()); + dx->apply(dalpha.get(), dcomplex_b.get(), dbeta.get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, r::value); +} + + +TEST_F(Dense, AdvancedApplyToMixedComplexIsEquivalentToRef) +{ + SKIP_IF_SINGLE_MODE; + set_up_apply_data(); + auto complex_b = gen_mtx(25, 1); + auto dcomplex_b = MixedComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(65, 1); + auto dcomplex_x = MixedComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + + x->apply(convert(alpha).get(), complex_b.get(), + convert(beta).get(), complex_x.get()); + dx->apply(convert(dalpha).get(), dcomplex_b.get(), + convert(dbeta).get(), dcomplex_x.get()); + + GKO_ASSERT_MTX_NEAR(dcomplex_x, complex_x, 1e-7); +} + + +TEST_F(Dense, ComputeDotComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(1234, 2); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(1234, 2); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2}); + auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2}); + + complex_b->compute_dot(complex_x.get(), result.get()); + dcomplex_b->compute_dot(dcomplex_x.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(result, dresult, r::value); +} + + +TEST_F(Dense, ComputeConjDotComplexIsEquivalentToRef) +{ + set_up_apply_data(); + auto complex_b = gen_mtx(1234, 2); + auto dcomplex_b = ComplexMtx::create(dpcpp); + dcomplex_b->copy_from(complex_b.get()); + auto complex_x = gen_mtx(1234, 2); + auto dcomplex_x = ComplexMtx::create(dpcpp); + dcomplex_x->copy_from(complex_x.get()); + auto result = ComplexMtx::create(ref, gko::dim<2>{1, 2}); + auto dresult = ComplexMtx::create(dpcpp, gko::dim<2>{1, 2}); + + complex_b->compute_conj_dot(complex_x.get(), result.get()); + dcomplex_b->compute_conj_dot(dcomplex_x.get(), dresult.get()); + + GKO_ASSERT_MTX_NEAR(result, dresult, r::value); +} + + +TEST_F(Dense, ConvertToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo::create(ref); + auto dcoo_mtx = gko::matrix::Coo::create(dpcpp); + + x->convert_to(coo_mtx.get()); + dx->convert_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); +} + + +TEST_F(Dense, MoveToCooIsEquivalentToRef) +{ + set_up_apply_data(); + auto coo_mtx = gko::matrix::Coo::create(ref); + auto dcoo_mtx = gko::matrix::Coo::create(dpcpp); + + x->move_to(coo_mtx.get()); + dx->move_to(dcoo_mtx.get()); + + ASSERT_EQ(dcoo_mtx->get_num_stored_elements(), + 
coo_mtx->get_num_stored_elements()); + GKO_ASSERT_MTX_NEAR(dcoo_mtx.get(), coo_mtx.get(), r::value); +} + + +TEST_F(Dense, ConvertToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr::create(ref); + auto dcsr_mtx = gko::matrix::Csr::create(dpcpp); + + x->convert_to(csr_mtx.get()); + dx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); +} + + +TEST_F(Dense, MoveToCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto csr_mtx = gko::matrix::Csr::create(ref); + auto dcsr_mtx = gko::matrix::Csr::create(dpcpp); + + x->move_to(csr_mtx.get()); + dx->move_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dcsr_mtx.get(), csr_mtx.get(), r::value); +} + + +TEST_F(Dense, ConvertToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell::create(ref); + auto dell_mtx = gko::matrix::Ell::create(dpcpp); + + x->convert_to(ell_mtx.get()); + dx->convert_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); +} + + +TEST_F(Dense, MoveToEllIsEquivalentToRef) +{ + set_up_apply_data(); + auto ell_mtx = gko::matrix::Ell::create(ref); + auto dell_mtx = gko::matrix::Ell::create(dpcpp); + + x->move_to(ell_mtx.get()); + dx->move_to(dell_mtx.get()); + + GKO_ASSERT_MTX_NEAR(dell_mtx.get(), ell_mtx.get(), r::value); +} + + +TEST_F(Dense, ConvertToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp::create(ref); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); + + x->convert_to(sellp_mtx.get()); + dx->convert_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); +} + + +TEST_F(Dense, MoveToSellpIsEquivalentToRef) +{ + set_up_apply_data(); + auto sellp_mtx = gko::matrix::Sellp::create(ref); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); + + x->move_to(sellp_mtx.get()); + dx->move_to(dsellp_mtx.get()); + + GKO_ASSERT_MTX_NEAR(sellp_mtx, dsellp_mtx, r::value); +} + + +TEST_F(Dense, ConvertsEmptyToSellp) +{ + auto dempty_mtx = Mtx::create(dpcpp); + auto dsellp_mtx = gko::matrix::Sellp::create(dpcpp); + + dempty_mtx->convert_to(dsellp_mtx.get()); + + ASSERT_EQ(dpcpp->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0); + ASSERT_FALSE(dsellp_mtx->get_size()); +} + + +TEST_F(Dense, CountNNZIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type nnz; + gko::size_type dnnz; + + gko::kernels::reference::dense::count_nonzeros(ref, x.get(), &nnz); + gko::kernels::dpcpp::dense::count_nonzeros(dpcpp, dx.get(), &dnnz); + + ASSERT_EQ(nnz, dnnz); +} + + +TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::Array nnz_per_row(ref); + nnz_per_row.resize_and_reset(x->get_size()[0]); + gko::Array dnnz_per_row(dpcpp); + dnnz_per_row.resize_and_reset(dx->get_size()[0]); + + gko::kernels::reference::dense::calculate_nonzeros_per_row(ref, x.get(), + &nnz_per_row); + gko::kernels::dpcpp::dense::calculate_nonzeros_per_row(dpcpp, dx.get(), + &dnnz_per_row); + + auto tmp = gko::Array(ref, dnnz_per_row); + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { + ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); + } +} + + +TEST_F(Dense, CalculateMaxNNZPerRowIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type max_nnz; + gko::size_type dmax_nnz; + + gko::kernels::reference::dense::calculate_max_nnz_per_row(ref, x.get(), + &max_nnz); + gko::kernels::dpcpp::dense::calculate_max_nnz_per_row(dpcpp, dx.get(), + &dmax_nnz); + + ASSERT_EQ(max_nnz, dmax_nnz); +} + + +TEST_F(Dense, 
CalculateTotalColsIsEquivalentToRef) +{ + set_up_apply_data(); + gko::size_type total_cols; + gko::size_type dtotal_cols; + + gko::kernels::reference::dense::calculate_total_cols( + ref, x.get(), &total_cols, 2, gko::matrix::default_slice_size); + gko::kernels::dpcpp::dense::calculate_total_cols( + dpcpp, dx.get(), &dtotal_cols, 2, gko::matrix::default_slice_size); + + ASSERT_EQ(total_cols, dtotal_cols); +} + + +} // namespace diff --git a/dpcpp/test/utils.hpp b/dpcpp/test/utils.hpp new file mode 100644 index 00000000000..57e703b8ef1 --- /dev/null +++ b/dpcpp/test/utils.hpp @@ -0,0 +1,56 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_TEST_UTILS_HPP_ +#define GKO_DPCPP_TEST_UTILS_HPP_ + + +#include + + +namespace { + + +#if GINKGO_DPCPP_SINGLE_MODE +#define SKIP_IF_SINGLE_MODE GTEST_SKIP() << "Skip due to single mode" +#else +#define SKIP_IF_SINGLE_MODE \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif + + +} // namespace + + +#endif // GKO_DPCPP_TEST_UTILS_HPP_ diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp index 28cd01b4fb5..9302fc07b9a 100644 --- a/hip/components/prefix_sum.hip.cpp +++ b/hip/components/prefix_sum.hip.cpp @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -58,8 +58,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, HIP_KERNEL_NAME(start_prefix_sum), dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. 
+ // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { hipLaunchKernelGGL( HIP_KERNEL_NAME(finalize_prefix_sum), diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index ebd5b441f6c..0e9570540fa 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_PUBLIC_CORE_SYNTHESIZER_CONTAINERS_HPP_ +#include #include @@ -46,14 +47,32 @@ namespace gko { namespace syn { +/** + * value_list records several values of the same type as template parameters. + * + * @tparam T the value type of the list + * @tparam Values the values in the list + */ template struct value_list {}; +/** + * type_list records several types as template parameters. + * + * @tparam Types the types in the list + */ template struct type_list {}; +/** + * range records the start, end, and step as template parameters. + * + * @tparam Start the start of the range + * @tparam End the end of the range + * @tparam Step the step of the range. The default is 1 + */ template struct range {}; @@ -61,9 +80,22 @@ struct range {}; namespace detail { +/** + * concatenate_impl base type. + * + * @tparam List1 the first list + * @tparam List2 the second list + */ template struct concatenate_impl; +/** + * concatenate_impl specializes for two value_list with the same value type. + * + * @tparam T the value type of both value_list + * @tparam Values1 the values of the first list + * @tparam Values2 the values of the second list + */ template struct concatenate_impl, value_list> { using type = value_list; @@ -73,6 +105,12 @@ struct concatenate_impl, value_list> { } // namespace detail +/** + * concatenate combines two value_list into one value_list. + * + * @tparam List1 the first list + * @tparam List2 the second list + */ template using concatenate = typename detail::concatenate_impl::type; @@ -80,19 +118,43 @@ using concatenate = typename detail::concatenate_impl::type; namespace detail { +/** + * as_list_impl base type. + * + * @tparam T the input list or range type + */ template struct as_list_impl; +/** + * as_list_impl specializes for the value_list + * + * @tparam T the value_list type + * @tparam Values the values of the value_list + */ template struct as_list_impl> { using type = value_list; }; +/** + * as_list_impl specializes for the type_list + * + * @tparam ...Types the types of the type_list + */ template struct as_list_impl> { using type = type_list; }; +/** + * as_list_impl specializes for the range. This is the recursive case: it + * concatenates Start with the list generated from the remaining range. + * + * @tparam Start the start of the range + * @tparam End the end of the range + * @tparam Step the step of the range + */ template struct as_list_impl, std::enable_if_t<(Start < End)>> { using type = concatenate< @@ -100,6 +162,13 @@ struct as_list_impl, std::enable_if_t<(Start < End)>> { typename as_list_impl>::type>; }; +/** + * as_list_impl specializes for the range. This is the terminating case, which + * yields an empty value_list. + * + * @tparam Start the start of the range + * @tparam End the end of the range + * @tparam Step the step of the range + */ template struct as_list_impl, std::enable_if_t<(Start >= End)>> { using type = value_list; }; @@ -109,10 +178,34 @@ struct as_list_impl, std::enable_if_t<(Start >= End)>> { } // namespace detail +/** + * as_list is an alias for as_list_impl::type. It yields the list + * (itself) if the input is already a list, or generates the list type from a range input.
+ * + * @tparam T the input list or range + */ template using as_list = typename detail::as_list_impl::type; +/** + * as_array returns a std::array containing the values of a value_list. It is + * helpful when the values need to be iterated over at runtime. + * + * @tparam T the value type of the value_list + * @tparam Value the values of the value_list + * + * @param vl the input value_list + * + * @return a std::array containing the values of the value_list + */ +template +constexpr std::array as_array(value_list vl) +{ + return std::array{Value...}; +} + + } // namespace syn } // namespace gko
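For reference, a minimal usage sketch (not part of the diff) of how the synthesizer containers documented above compose. It assumes the template parameter lists elided in the hunks above read value_list<T, Values...>, range<Start, End, Step = 1>, as_list<T>, and as_array(value_list<T, Values...>), and that the public include path is <ginkgo/core/synthesizer/containers.hpp>; adjust if the header differs.

#include <ginkgo/core/synthesizer/containers.hpp>

#include <type_traits>

int main()
{
    using gko::syn::as_array;
    using gko::syn::as_list;
    using gko::syn::concatenate;
    using gko::syn::range;
    using gko::syn::value_list;

    // A short compile-time list and one generated from a range.
    using small = value_list<int, 1, 2>;
    using generated = as_list<range<4, 10, 2>>;  // 4, 6, 8 (End is exclusive)
    static_assert(std::is_same<generated, value_list<int, 4, 6, 8>>::value,
                  "the Start < End recursion stops before reaching End");

    // concatenate merges two value_list with the same value type.
    using merged = concatenate<small, generated>;
    static_assert(std::is_same<merged, value_list<int, 1, 2, 4, 6, 8>>::value,
                  "concatenation preserves the order of both lists");

    // as_array exposes the compile-time values as a std::array for runtime loops.
    constexpr auto arr = as_array(merged{});
    static_assert(arr.size() == 5, "one array entry per value in the list");
    return arr[0] == 1 ? 0 : 1;
}

The assertions follow directly from the recursion shown in the hunks: Start is prepended while Start < End, so End itself is never included, and as_array is only needed once the values have to be inspected in an ordinary runtime loop.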