update the usage and doc

- inline binding - use queue, dynamic_shared_memory consistently - delete unused library Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@kit.edu> Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
ginkgo-project · Jul 27, 2021 · 03bd66f · 03bd66f
1 parent 5951d47
commit 03bd66f
Show file tree

Hide file tree

Showing 20 changed files with 246 additions and 351 deletions.
diff --git a/dev_tools/oneapi/add_host_function.sh b/dev_tools/oneapi/add_host_function.sh
@@ -158,8 +158,8 @@ while IFS='' read -r line || [ -n "$line" ]; do
                     if [ -n "${TEMPLATE_INPUT}" ]; then
                         TEMPLATE_INPUT="<${TEMPLATE_INPUT}>"
                     fi
-                    echo "${TEMPLATE} void ${NAME}${HOST_SUFFIX} (dim3 grid, dim3 block, gko::size_type dynamic_shared_memory, cudaStream_t stream, ${VARIABLE}) {
-                        /*KEEP*/${NAME}${TEMPLATE_INPUT}<<<grid, block, dynamic_shared_memory, stream>>>(${VAR_INPUT});
+                    echo "${TEMPLATE} void ${NAME}${HOST_SUFFIX} (dim3 grid, dim3 block, size_type dynamic_shared_memory, cudaStream_t queue, ${VARIABLE}) {
+                        /*KEEP*/${NAME}${TEMPLATE_INPUT}<<<grid, block, dynamic_shared_memory, queue>>>(${VAR_INPUT});
                         }"
                     echo "${NAME} -> ${NAME}${HOST_SUFFIX}" >> ${MAP_FILE}
                 fi

diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
@@ -17,7 +17,6 @@ target_sources(ginkgo_dpcpp
     base/version.dp.cpp
     base/executor.dp.cpp
     base/helper.dp.cpp
-    base/onemkl_bindings.dp.cpp
     components/absolute_array.dp.cpp
     components/fill_array.dp.cpp
     components/prefix_sum.dp.cpp
@@ -78,7 +77,7 @@ else ()
     target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel)
 endif()
 target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
-target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL)
+target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP)
 if (GINKGO_DPCPP_SINGLE_MODE)
     target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
 endif()

diff --git a/dpcpp/base/onemkl_bindings.dp.cpp b/dpcpp/base/onemkl_bindings.dp.cpp
diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp
@@ -94,34 +94,35 @@ template <>
 struct is_supported<std::complex<double>> : std::true_type {};
 
 
-#define GKO_DECLARE_DOT(ValueType, Name)                                 \
-    void Name(::cl::sycl::queue &exec_queue, std::int64_t n,             \
-              const ValueType *x, std::int64_t incx, const ValueType *y, \
-              std::int64_t incy, ValueType *result)
-
-// Declare the dot for x^T * y
-GKO_DECLARE_DOT(float, dot);
-GKO_DECLARE_DOT(double, dot);
-GKO_DECLARE_DOT(std::complex<float>, dotu);
-GKO_DECLARE_DOT(std::complex<double>, dotu);
+#define GKO_BIND_DOT(ValueType, Name, Func)                                    \
+    inline void Name(::cl::sycl::queue &exec_queue, std::int64_t n,            \
+                     const ValueType *x, std::int64_t incx,                    \
+                     const ValueType *y, std::int64_t incy, ValueType *result) \
+    {                                                                          \
+        Func(exec_queue, n, x, incx, y, incy, result);                         \
+    }                                                                          \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+// Bind the dot for x^T * y
+GKO_BIND_DOT(float, dot, oneapi::mkl::blas::row_major::dot);
+GKO_BIND_DOT(double, dot, oneapi::mkl::blas::row_major::dot);
+GKO_BIND_DOT(std::complex<float>, dot, oneapi::mkl::blas::row_major::dotu);
+GKO_BIND_DOT(std::complex<double>, dot, oneapi::mkl::blas::row_major::dotu);
 template <typename ValueType>
-GKO_DECLARE_DOT(ValueType, dot)
-{
-    detail::not_implemented(exec_queue, n, x, incx, y, incy, result);
-}
-
-// Declare the conj_dot for x' * y
-GKO_DECLARE_DOT(float, conj_dot);
-GKO_DECLARE_DOT(double, conj_dot);
-GKO_DECLARE_DOT(std::complex<float>, conj_dot);
-GKO_DECLARE_DOT(std::complex<double>, conj_dot);
+GKO_BIND_DOT(ValueType, dot, detail::not_implemented);
+
+// Bind the conj_dot for x' * y
+GKO_BIND_DOT(float, conj_dot, oneapi::mkl::blas::row_major::dot);
+GKO_BIND_DOT(double, conj_dot, oneapi::mkl::blas::row_major::dot);
+GKO_BIND_DOT(std::complex<float>, conj_dot, oneapi::mkl::blas::row_major::dotc);
+GKO_BIND_DOT(std::complex<double>, conj_dot,
+             oneapi::mkl::blas::row_major::dotc);
 template <typename ValueType>
-GKO_DECLARE_DOT(ValueType, conj_dot)
-{
-    detail::not_implemented(exec_queue, n, x, incx, y, incy, result);
-}
+GKO_BIND_DOT(ValueType, conj_dot, detail::not_implemented);
 
-#undef GKO_DECLARE_DOT
+#undef GKO_BIND_DOT
 
 }  // namespace onemkl
 }  // namespace dpcpp

diff --git a/dpcpp/components/format_conversion.dp.hpp b/dpcpp/components/format_conversion.dp.hpp
@@ -59,8 +59,8 @@ namespace kernel {
  * It counts the number of explicit nonzeros per row of Ell.
  */
 template <typename ValueType, typename IndexType>
-void count_nnz_per_row(dim3 grid, dim3 block, size_t dynamic_shared_memory,
-                       sycl::queue *stream, size_type num_rows,
+void count_nnz_per_row(dim3 grid, dim3 block, size_type dynamic_shared_memory,
+                       sycl::queue *queue, size_type num_rows,
                        size_type max_nnz_per_row, size_type stride,
                        const ValueType *values, IndexType *result);
 
@@ -80,9 +80,10 @@ namespace kernel {
  */
 template <typename IndexType>
 void convert_row_idxs_to_ptrs(dim3 grid, dim3 block,
-                              size_t dynamic_shared_memory, sycl::queue *stream,
-                              const IndexType *idxs, size_type num_nonzeros,
-                              IndexType *ptrs, size_type length);
+                              size_type dynamic_shared_memory,
+                              sycl::queue *queue, const IndexType *idxs,
+                              size_type num_nonzeros, IndexType *ptrs,
+                              size_type length);
 
 
 }  // namespace kernel

diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp
@@ -179,11 +179,11 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
 }
 
 template <std::uint32_t block_size, typename ValueType>
-void start_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory,
-                      sycl::queue *stream, size_type num_elements,
+void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory,
+                      sycl::queue *queue, size_type num_elements,
                       ValueType *elements, ValueType *block_sum)
 {
-    stream->submit([&](sycl::handler &cgh) {
+    queue->submit([&](sycl::handler &cgh) {
         sycl::accessor<UninitializedArray<ValueType, block_size>, 0,
                        sycl::access::mode::read_write,
                        sycl::access::target::local>
@@ -231,11 +231,11 @@ void finalize_prefix_sum(size_type num_elements,
 }
 
 template <std::uint32_t block_size, typename ValueType>
-void finalize_prefix_sum(dim3 grid, dim3 block, size_t dynamic_shared_memory,
-                         sycl::queue *stream, size_type num_elements,
+void finalize_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory,
+                         sycl::queue *queue, size_type num_elements,
                          ValueType *elements, const ValueType *block_sum)
 {
-    stream->submit([&](sycl::handler &cgh) {
+    queue->submit([&](sycl::handler &cgh) {
         cgh.parallel_for(sycl_nd_range(grid, block),
                          [=](sycl::nd_item<3> item_ct1) {
                              finalize_prefix_sum<block_size>(

diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
@@ -217,11 +217,11 @@ void reduce_add_array(
 }
 
 template <std::uint32_t cfg = KCFG_1D::encode(256, 16), typename ValueType>
-void reduce_add_array(dim3 grid, dim3 block, size_t dynamic_shared_memory,
-                      sycl::queue *stream, size_type size,
+void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
+                      sycl::queue *queue, size_type size,
                       const ValueType *source, ValueType *result)
 {
-    stream->submit([&](sycl::handler &cgh) {
+    queue->submit([&](sycl::handler &cgh) {
         sycl::accessor<UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)>,
                        0, sycl::access::mode::read_write,
                        sycl::access::target::local>