ginkgo-project · upsj · Jun 19, 2020 · Jun 6, 2020 · Jun 6, 2020 · Jun 17, 2020
diff --git a/common/matrix/csr_kernels.hpp.inc b/common/matrix/csr_kernels.hpp.inc
@@ -496,21 +496,23 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_classical_spmv(
 
 template <int subwarp_size, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spgeam_nnz(
-    const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
-    const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
-    IndexType num_rows, IndexType *nnz)
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const IndexType *__restrict__ b_row_ptrs,
+    const IndexType *__restrict__ b_col_idxs, IndexType num_rows,
+    IndexType *__restrict__ nnz)
 {
-    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
     auto subwarp =
         group::tiled_partition<subwarp_size>(group::this_thread_block());
     if (row >= num_rows) {
         return;
     }
 
-    auto a_begin = a_row_ptrs[row];
-    auto b_begin = b_row_ptrs[row];
-    auto a_size = a_row_ptrs[row + 1] - a_begin;
-    auto b_size = b_row_ptrs[row + 1] - b_begin;
+    const auto a_begin = a_row_ptrs[row];
+    const auto b_begin = b_row_ptrs[row];
+    const auto a_size = a_row_ptrs[row + 1] - a_begin;
+    const auto b_size = b_row_ptrs[row + 1] - b_begin;
     IndexType count{};
     group_merge<subwarp_size>(
         a_col_idxs + a_begin, a_size, b_col_idxs + b_begin, b_size, subwarp,
@@ -528,28 +530,35 @@ __global__ __launch_bounds__(default_block_size) void spgeam_nnz(
 
 template <int subwarp_size, typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void spgeam(
-    ValueType alpha, const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
-    const ValueType *a_vals, ValueType beta, const IndexType *b_row_ptrs,
-    const IndexType *b_col_idxs, const ValueType *b_vals, IndexType num_rows,
-    const IndexType *c_row_ptrs, IndexType *c_col_idxs, ValueType *c_vals)
-{
-    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
+    const ValueType *__restrict__ palpha,
+    const IndexType *__restrict__ a_row_ptrs,
+    const IndexType *__restrict__ a_col_idxs,
+    const ValueType *__restrict__ a_vals, const ValueType *__restrict__ pbeta,
+    const IndexType *__restrict__ b_row_ptrs,
+    const IndexType *__restrict__ b_col_idxs,
+    const ValueType *__restrict__ b_vals, IndexType num_rows,
+    const IndexType *__restrict__ c_row_ptrs,
+    IndexType *__restrict__ c_col_idxs, ValueType *__restrict__ c_vals)
+{
+    const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
     auto subwarp =
         group::tiled_partition<subwarp_size>(group::this_thread_block());
     if (row >= num_rows) {
         return;
     }
 
-    auto lane = static_cast<IndexType>(subwarp.thread_rank());
+    const auto alpha = palpha[0];
+    const auto beta = pbeta[0];
+    const auto lane = static_cast<IndexType>(subwarp.thread_rank());
     constexpr auto lanemask_full =
         ~config::lane_mask_type{} >> (config::warp_size - subwarp_size);
-    auto lanemask_eq = config::lane_mask_type{1} << lane;
-    auto lanemask_lt = lanemask_eq - 1;
+    const auto lanemask_eq = config::lane_mask_type{1} << lane;
+    const auto lanemask_lt = lanemask_eq - 1;
 
-    auto a_begin = a_row_ptrs[row];
-    auto b_begin = b_row_ptrs[row];
-    auto a_size = a_row_ptrs[row + 1] - a_begin;
-    auto b_size = b_row_ptrs[row + 1] - b_begin;
+    const auto a_begin = a_row_ptrs[row];
+    const auto b_begin = b_row_ptrs[row];
+    const auto a_size = a_row_ptrs[row + 1] - a_begin;
+    const auto b_size = b_row_ptrs[row + 1] - b_begin;
     auto c_begin = c_row_ptrs[row];
     bool skip_first{};
     group_merge<subwarp_size>(

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -513,6 +513,11 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);

diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
@@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -57,6 +58,7 @@ GKO_REGISTER_OPERATION(spmv, csr::spmv);
 GKO_REGISTER_OPERATION(advanced_spmv, csr::advanced_spmv);
 GKO_REGISTER_OPERATION(spgemm, csr::spgemm);
 GKO_REGISTER_OPERATION(advanced_spgemm, csr::advanced_spgemm);
+GKO_REGISTER_OPERATION(spgeam, csr::spgeam);
 GKO_REGISTER_OPERATION(convert_to_coo, csr::convert_to_coo);
 GKO_REGISTER_OPERATION(convert_to_dense, csr::convert_to_dense);
 GKO_REGISTER_OPERATION(convert_to_sellp, csr::convert_to_sellp);
@@ -111,6 +113,12 @@ void Csr<ValueType, IndexType>::apply_impl(const LinOp *alpha, const LinOp *b,
         this->get_executor()->run(
             csr::make_advanced_spgemm(as<Dense>(alpha), this, b_csr,
                                       as<Dense>(beta), x_copy.get(), x_csr));
+    } else if (dynamic_cast<const Identity<ValueType> *>(b)) {
+        // if b is an identity matrix, we compute an SpGEAM
+        auto x_csr = as<TCsr>(x);
+        auto x_copy = x_csr->clone();
+        this->get_executor()->run(csr::make_spgeam(
+            as<Dense>(alpha), this, as<Dense>(beta), lend(x_copy), x_csr));
     } else {
         // otherwise we assume that b is dense and compute a SpMV/SpMM
         this->get_executor()->run(

diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
@@ -79,6 +79,14 @@ namespace kernels {
                          const matrix::Csr<ValueType, IndexType> *d,  \
                          matrix::Csr<ValueType, IndexType> *c)
 
+#define GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType)  \
+    void spgeam(std::shared_ptr<const DefaultExecutor> exec, \
+                const matrix::Dense<ValueType> *alpha,       \
+                const matrix::Csr<ValueType, IndexType> *a,  \
+                const matrix::Dense<ValueType> *beta,        \
+                const matrix::Csr<ValueType, IndexType> *b,  \
+                matrix::Csr<ValueType, IndexType> *c)
+
 #define GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType)      \
     void convert_to_dense(std::shared_ptr<const DefaultExecutor> exec,     \
                           const matrix::Csr<ValueType, IndexType> *source, \
@@ -176,6 +184,8 @@ namespace kernels {
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType);            \
     template <typename ValueType, typename IndexType>                        \
+    GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType);                     \
+    template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CONVERT_TO_DENSE_KERNEL(ValueType, IndexType);           \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CONVERT_TO_COO_KERNEL(ValueType, IndexType);             \

diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
@@ -93,6 +93,9 @@ using compiled_kernels = syn::value_list<int, 3, 4, 6, 7, 8, 12, 14>;
 using classical_kernels =
     syn::value_list<int, config::warp_size, 32, 16, 8, 4, 2, 1>;
 
+using spgeam_kernels =
+    syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>;
+
 
 #include "common/matrix/csr_kernels.hpp.inc"
 
@@ -580,6 +583,73 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void spgeam(syn::value_list<int, subwarp_size>,
+            std::shared_ptr<const DefaultExecutor> exec, const ValueType *alpha,
+            const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
+            const ValueType *a_vals, const ValueType *beta,
+            const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
+            const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
+{
+    auto m = static_cast<IndexType>(c->get_size()[0]);
+    auto c_row_ptrs = c->get_row_ptrs();
+    // count nnz for alpha * A + beta * B
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(m, subwarps_per_block);
+    kernel::spgeam_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
+        a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
+
+    // build row pointers
+    components::prefix_sum(exec, c_row_ptrs, m + 1);
+
+    // accumulate non-zeros for alpha * A + beta * B
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
+    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
+    c_builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = c->get_col_idxs();
+    auto c_vals = c->get_values();
+    kernel::spgeam<subwarp_size><<<num_blocks, default_block_size>>>(
+        as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals),
+        as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m,
+        c_row_ptrs, c_col_idxs, as_cuda_type(c_vals));
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
+
+
+}  // namespace
+
+
+template <typename ValueType, typename IndexType>
+void spgeam(std::shared_ptr<const DefaultExecutor> exec,
+            const matrix::Dense<ValueType> *alpha,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    auto total_nnz =
+        a->get_num_stored_elements() + b->get_num_stored_elements();
+    auto nnz_per_row = total_nnz / a->get_size()[0];
+    select_spgeam(spgeam_kernels(),
+                  [&](int compiled_subwarp_size) {
+                      return compiled_subwarp_size >= nnz_per_row ||
+                             compiled_subwarp_size == config::warp_size;
+                  },
+                  syn::value_list<int>(), syn::type_list<>(), exec,
+                  alpha->get_const_values(), a->get_const_row_ptrs(),
+                  a->get_const_col_idxs(), a->get_const_values(),
+                  beta->get_const_values(), b->get_const_row_ptrs(),
+                  b->get_const_col_idxs(), b->get_const_values(), c);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+
+
 template <typename IndexType>
 void convert_row_ptrs_to_idxs(std::shared_ptr<const CudaExecutor> exec,
                               const IndexType *ptrs, size_type num_rows,

diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -384,6 +385,28 @@ TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(std::make_shared<Mtx::automatical>());
+    auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
+    auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
+    auto da = Mtx::create(cuda);
+    auto db = Mtx::create(cuda);
+    da->copy_from(a.get());
+    db->copy_from(b.get());
+    auto id = gko::matrix::Identity<Mtx::value_type>::create(ref, mtx_size[1]);
+    auto did =
+        gko::matrix::Identity<Mtx::value_type>::create(cuda, mtx_size[1]);
+
+    a->apply(alpha.get(), id.get(), beta.get(), b.get());
+    da->apply(dalpha.get(), did.get(), dbeta.get(), db.get());
+
+    GKO_ASSERT_MTX_NEAR(b, db, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(b, db);
+    ASSERT_TRUE(db->is_sorted_by_column_index());
+}
+
+
 TEST_F(Csr, TransposeIsEquivalentToRef)
 {
     set_up_apply_data(std::make_shared<Mtx::automatical>(cuda));

diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
@@ -536,9 +536,9 @@ namespace {
 
 template <int subwarp_size, typename ValueType, typename IndexType>
 void spgeam(syn::value_list<int, subwarp_size>,
-            std::shared_ptr<const HipExecutor> exec, ValueType alpha,
+            std::shared_ptr<const HipExecutor> exec, const ValueType *alpha,
             const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
-            const ValueType *a_vals, ValueType beta,
+            const ValueType *a_vals, const ValueType *beta,
             const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
             const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
 {
@@ -570,7 +570,6 @@ void spgeam(syn::value_list<int, subwarp_size>,
                        c_col_idxs, as_hip_type(c_vals));
 }
 
-
 GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
 
 
@@ -650,18 +649,17 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
         hipsparse::destroy(b_descr);
         hipsparse::destroy(a_descr);
 
-        auto valpha = exec->copy_val_to_host(alpha->get_const_values());
-        auto vbeta = exec->copy_val_to_host(beta->get_const_values());
         auto total_nnz = c_nnz + d->get_num_stored_elements();
         auto nnz_per_row = total_nnz / m;
         select_spgeam(spgeam_kernels(),
                       [&](int compiled_subwarp_size) {
                           return compiled_subwarp_size >= nnz_per_row ||
                                  compiled_subwarp_size == config::warp_size;
                       },
-                      syn::value_list<int>(), syn::type_list<>(), exec, valpha,
-                      c_tmp_row_ptrs, c_tmp_col_idxs, c_tmp_vals, vbeta,
-                      d_row_ptrs, d_col_idxs, d_vals, c);
+                      syn::value_list<int>(), syn::type_list<>(), exec,
+                      alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs,
+                      c_tmp_vals, beta->get_const_values(), d_row_ptrs,
+                      d_col_idxs, d_vals, c);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
@@ -671,6 +669,32 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
+template <typename ValueType, typename IndexType>
+void spgeam(std::shared_ptr<const DefaultExecutor> exec,
+            const matrix::Dense<ValueType> *alpha,
+            const matrix::Csr<ValueType, IndexType> *a,
+            const matrix::Dense<ValueType> *beta,
+            const matrix::Csr<ValueType, IndexType> *b,
+            matrix::Csr<ValueType, IndexType> *c)
+{
+    auto total_nnz =
+        a->get_num_stored_elements() + b->get_num_stored_elements();
+    auto nnz_per_row = total_nnz / a->get_size()[0];
+    select_spgeam(spgeam_kernels(),
+                  [&](int compiled_subwarp_size) {
+                      return compiled_subwarp_size >= nnz_per_row ||
+                             compiled_subwarp_size == config::warp_size;
+                  },
+                  syn::value_list<int>(), syn::type_list<>(), exec,
+                  alpha->get_const_values(), a->get_const_row_ptrs(),
+                  a->get_const_col_idxs(), a->get_const_values(),
+                  beta->get_const_values(), b->get_const_row_ptrs(),
+                  b->get_const_col_idxs(), b->get_const_values(), c);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
+
+
 template <typename IndexType>
 void convert_row_ptrs_to_idxs(std::shared_ptr<const HipExecutor> exec,
                               const IndexType *ptrs, size_type num_rows,

diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp
@@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/matrix/ell.hpp>
 #include <ginkgo/core/matrix/hybrid.hpp>
+#include <ginkgo/core/matrix/identity.hpp>
 #include <ginkgo/core/matrix/sellp.hpp>
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
@@ -370,6 +371,27 @@ TEST_F(Csr, SimpleApplyToCsrMatrixIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, AdvancedApplyToIdentityMatrixIsEquivalentToRef)
+{
+    set_up_apply_data(std::make_shared<Mtx::automatical>(hip));
+    auto a = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
+    auto b = gen_mtx<Mtx>(mtx_size[0], mtx_size[1], 0);
+    auto da = Mtx::create(hip);
+    auto db = Mtx::create(hip);
+    da->copy_from(a.get());
+    db->copy_from(b.get());
+    auto id = gko::matrix::Identity<Mtx::value_type>::create(ref, mtx_size[1]);
+    auto did = gko::matrix::Identity<Mtx::value_type>::create(hip, mtx_size[1]);
+
+    a->apply(alpha.get(), id.get(), beta.get(), b.get());
+    da->apply(dalpha.get(), did.get(), dbeta.get(), db.get());
+
+    GKO_ASSERT_MTX_NEAR(b, db, 1e-14);
+    GKO_ASSERT_MTX_EQ_SPARSITY(b, db);
+    ASSERT_TRUE(db->is_sorted_by_column_index());
+}
+
+
 TEST_F(Csr, TransposeIsEquivalentToRef)
 {
     set_up_apply_data(std::make_shared<Mtx::automatical>(hip));