From 33c576aae4c1278580c1eede69bbae9081cb3f9a Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai" <yhmtsai@gmail.com>
Date: Fri, 11 Sep 2020 17:28:35 +0200
Subject: [PATCH 1/5] Add executor operator== for memory compatibility

Co-authored-with: Terry Cojean <terry.cojean@kit.edu>
---
 core/device_hooks/cuda_hooks.cpp              |   3 +
 core/device_hooks/dpcpp_hooks.cpp             |  28 +
 core/device_hooks/hip_hooks.cpp               |   3 +
 core/test/base/array.cpp                      |  20 +-
 core/test/base/executor.cpp                   |  59 ++
 core/test/base/lin_op.cpp                     |  35 +-
 core/test/base/utils.cpp                      |   4 +-
 core/test/utils.hpp                           |   2 +
 cuda/base/executor.cpp                        |  10 +
 cuda/test/base/CMakeLists.txt                 |   1 +
 cuda/test/base/array.cu                       |  92 +++
 cuda/test/base/lin_op.cu                      | 104 ++++
 dpcpp/base/executor.dp.cpp                    |  39 +-
 hip/base/executor.hip.cpp                     |  10 +
 include/ginkgo/core/base/dim.hpp              |   1 -
 .../ginkgo/core/base/exception_helpers.hpp    |   1 +
 include/ginkgo/core/base/executor.hpp         | 119 ++++
 include/ginkgo/core/base/temporary_clone.hpp  | 202 +++++++
 include/ginkgo/core/base/utils.hpp            | 553 +-----------------
 include/ginkgo/core/base/utils_helper.hpp     | 438 ++++++++++++++
 include/ginkgo/core/log/logger.hpp            |   2 +-
 21 files changed, 1143 insertions(+), 583 deletions(-)
 create mode 100644 cuda/test/base/array.cu
 create mode 100644 include/ginkgo/core/base/temporary_clone.hpp
 create mode 100644 include/ginkgo/core/base/utils_helper.hpp

diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index 46e342d9c0a..a5c33c6541d 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -98,6 +98,9 @@ void CudaExecutor::raw_copy_to(const DpcppExecutor *, size_type num_bytes,
     GKO_NOT_COMPILED(cuda);
 
 
+bool CudaExecutor::verify_memory_to(const HipExecutor *) const { return false; }
+
+
 void CudaExecutor::synchronize() const GKO_NOT_COMPILED(cuda);
 
 
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index 9d6c57f11a4..472451c6513 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -64,6 +64,14 @@ void OmpExecutor::raw_copy_to(const DpcppExecutor *, size_type num_bytes,
     GKO_NOT_COMPILED(dpcpp);
 
 
+bool OmpExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
+{
+    // Dummy check
+    auto dev_type = dest_exec->get_device_type();
+    return dev_type == "cpu" || dev_type == "host";
+}
+
+
 void DpcppExecutor::raw_free(void *ptr) const noexcept
 {
     // Free must never fail, as it can be called in destructors.
@@ -112,6 +120,26 @@ int DpcppExecutor::get_num_devices(std::string) { return 0; }
 void DpcppExecutor::set_device_property() {}
 
 
+bool DpcppExecutor::verify_memory_to(const OmpExecutor *dest_exec) const
+{
+    // Dummy check
+    return device_type_ == "cpu" || device_type_ == "host";
+}
+
+bool DpcppExecutor::verify_memory_to(const ReferenceExecutor *dest_exec) const
+{
+    // Dummy check
+    return device_type_ == "cpu" || device_type_ == "host";
+}
+
+bool DpcppExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
+{
+    // Dummy check
+    return dest_exec->get_device_type() == device_type_ &&
+           dest_exec->get_device_id() == device_id_;
+}
+
+
 }  // namespace gko
 
 
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index b98b2fcb8a4..45180980159 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -95,6 +95,9 @@ void HipExecutor::raw_copy_to(const DpcppExecutor *, size_type num_bytes,
     GKO_NOT_COMPILED(hip);
 
 
+bool HipExecutor::verify_memory_to(const CudaExecutor *) const { return false; }
+
+
 void HipExecutor::synchronize() const GKO_NOT_COMPILED(hip);
 
 
diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index 959392e5c71..c20782651f7 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -57,12 +57,13 @@ class Array : public ::testing::Test {
         x.get_data()[1] = 2;
     }
 
-    static void assert_equal_to_original_x(gko::Array<T> &a)
+    static void assert_equal_to_original_x(gko::Array<T> &a,
+                                           bool check_zero = true)
     {
         ASSERT_EQ(a.get_num_elems(), 2);
-        EXPECT_EQ(a.get_data()[0], T{5});
+        if (check_zero) EXPECT_EQ(a.get_data()[0], T{5});
         EXPECT_EQ(a.get_data()[1], T{2});
-        EXPECT_EQ(a.get_const_data()[0], T{5});
+        if (check_zero) EXPECT_EQ(a.get_const_data()[0], T{5});
         EXPECT_EQ(a.get_const_data()[1], T{2});
     }
 
@@ -271,29 +272,30 @@ TYPED_TEST(Array, CanCreateTemporaryCloneOnSameExecutor)
 }
 
 
-TYPED_TEST(Array, CanCreateTemporaryCloneOnDifferentExecutor)
+// For tests between different memory, check cuda/test/base/array.cu
+TYPED_TEST(Array, DoesNotCreateATemporaryCloneBetweenSameMemory)
 {
     auto omp = gko::OmpExecutor::create();
 
     auto tmp_clone = make_temporary_clone(omp, &this->x);
 
     this->assert_equal_to_original_x(*tmp_clone.get());
-    ASSERT_NE(tmp_clone.get(), &this->x);
+    ASSERT_EQ(tmp_clone.get(), &this->x);
 }
 
 
-TYPED_TEST(Array, CanCopyBackTemporaryCloneOnDifferentExecutor)
+TYPED_TEST(Array, CanDoesNotCopyBackTemporaryCloneBetweenSameMemory)
 {
     auto omp = gko::OmpExecutor::create();
 
     {
         auto tmp_clone = make_temporary_clone(omp, &this->x);
-        // change x, so it no longer matches the original x
-        // the copy-back will overwrite it again with the correct value
+        // change x, and check that there is no copy-back to overwrite it again
         this->x.get_data()[0] = 0;
     }
 
-    this->assert_equal_to_original_x(this->x);
+    this->assert_equal_to_original_x(this->x, false);
+    EXPECT_EQ(this->x.get_data()[0], TypeParam{0});
 }
 
 
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index b968bbe210a..73c82a06fa8 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -474,6 +474,65 @@ TEST(DpcppExecutor, KnowsItsDeviceId)
 }
 
 
+TEST(Executor, CanVerifyMemory)
+{
+    auto ref = gko::ReferenceExecutor::create();
+    auto omp = gko::OmpExecutor::create();
+    auto hip = gko::HipExecutor::create(0, omp);
+    auto cuda = gko::CudaExecutor::create(0, omp);
+    auto cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu");
+    auto host_dpcpp = gko::DpcppExecutor::create(0, omp, "host");
+    auto gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
+    auto omp2 = gko::OmpExecutor::create();
+    auto hip2 = gko::HipExecutor::create(0, omp);
+    auto cuda2 = gko::CudaExecutor::create(0, omp);
+    std::shared_ptr<gko::HipExecutor> hip_1 = gko::HipExecutor::create(1, omp);
+    std::shared_ptr<gko::CudaExecutor> cuda_1 =
+        gko::CudaExecutor::create(1, omp);
+
+    ASSERT_EQ(true, *ref == *omp);
+    ASSERT_EQ(true, *omp == *ref);
+    ASSERT_EQ(false, *ref == *hip);
+    ASSERT_EQ(false, *hip == *ref);
+    ASSERT_EQ(false, *omp == *hip);
+    ASSERT_EQ(false, *hip == *omp);
+    ASSERT_EQ(false, *ref == *cuda);
+    ASSERT_EQ(false, *cuda == *ref);
+    ASSERT_EQ(false, *omp == *cuda);
+    ASSERT_EQ(false, *cuda == *omp);
+    ASSERT_EQ(true, *cpu_dpcpp == *ref);
+    ASSERT_EQ(true, *host_dpcpp == *ref);
+    ASSERT_EQ(false, *gpu_dpcpp == *ref);
+    ASSERT_EQ(true, *ref == *cpu_dpcpp);
+    ASSERT_EQ(true, *ref == *host_dpcpp);
+    ASSERT_EQ(false, *ref == *gpu_dpcpp);
+    ASSERT_EQ(true, *cpu_dpcpp == *omp);
+    ASSERT_EQ(true, *host_dpcpp == *omp);
+    ASSERT_EQ(false, *gpu_dpcpp == *omp);
+    ASSERT_EQ(true, *omp == *cpu_dpcpp);
+    ASSERT_EQ(true, *omp == *host_dpcpp);
+    ASSERT_EQ(false, *omp == *gpu_dpcpp);
+#if GINKGO_HIP_PLATFORM_NVCC
+    ASSERT_EQ(true, *hip == *cuda);
+    ASSERT_EQ(true, *cuda == *hip);
+    ASSERT_EQ(true, *hip_1 == *cuda_1);
+    ASSERT_EQ(true, *cuda_1 == *hip_1);
+#else
+    ASSERT_EQ(false, *hip == *cuda);
+    ASSERT_EQ(false, *cuda == *hip);
+    ASSERT_EQ(false, *hip_1 == *cuda_1);
+    ASSERT_EQ(false, *cuda_1 == *hip_1);
+#endif
+    ASSERT_EQ(true, *omp == *omp2);
+    ASSERT_EQ(true, *hip == *hip2);
+    ASSERT_EQ(true, *cuda == *cuda2);
+    ASSERT_EQ(false, *hip == *hip_1);
+    ASSERT_EQ(false, *cuda == *hip_1);
+    ASSERT_EQ(false, *cuda == *cuda_1);
+    ASSERT_EQ(false, *hip == *cuda_1);
+}
+
+
 template <typename T>
 struct mock_free : T {
     /**
diff --git a/core/test/base/lin_op.cpp b/core/test/base/lin_op.cpp
index 303e8bd6e89..3da8eda55f0 100644
--- a/core/test/base/lin_op.cpp
+++ b/core/test/base/lin_op.cpp
@@ -204,43 +204,44 @@ TEST_F(EnableLinOp, ExtendedApplyFailsOnWrongBetaDimension)
 }
 
 
-TEST_F(EnableLinOp, ApplyCopiesDataToCorrectExecutor)
+// For tests between different memory, check cuda/test/base/lin_op.cu
+TEST_F(EnableLinOp, ApplyDoesNotCopyBetweenSameMemory)
 {
     op->apply(gko::lend(b), gko::lend(x));
 
-    ASSERT_EQ(op->last_b_access, omp);
-    ASSERT_EQ(op->last_x_access, omp);
+    ASSERT_EQ(op->last_b_access, ref);
+    ASSERT_EQ(op->last_x_access, ref);
 }
 
 
-TEST_F(EnableLinOp, ApplyCopiesBackOnlyX)
+TEST_F(EnableLinOp, ApplyNoCopyBackBetweenSameMemory)
 {
     op->apply(gko::lend(b), gko::lend(x));
 
-    ASSERT_EQ(b->last_access, nullptr);
-    ASSERT_EQ(x->last_access, omp);
+    ASSERT_EQ(b->last_access, ref);
+    ASSERT_EQ(x->last_access, ref);
 }
 
 
-TEST_F(EnableLinOp, ExtendedApplyCopiesDataToCorrectExecutor)
+TEST_F(EnableLinOp, ExtendedApplyDoesNotCopyBetweenSameMemory)
 {
     op->apply(gko::lend(alpha), gko::lend(b), gko::lend(beta), gko::lend(x));
 
-    ASSERT_EQ(op->last_alpha_access, omp);
-    ASSERT_EQ(op->last_b_access, omp);
-    ASSERT_EQ(op->last_beta_access, omp);
-    ASSERT_EQ(op->last_x_access, omp);
+    ASSERT_EQ(op->last_alpha_access, ref);
+    ASSERT_EQ(op->last_b_access, ref);
+    ASSERT_EQ(op->last_beta_access, ref);
+    ASSERT_EQ(op->last_x_access, ref);
 }
 
 
-TEST_F(EnableLinOp, ExtendedApplyCopiesBackOnlyX)
+TEST_F(EnableLinOp, ExtendedApplyNoCopyBackBetweenSameMemory)
 {
-    op->apply(gko::lend(b), gko::lend(x));
+    op->apply(gko::lend(alpha), gko::lend(b), gko::lend(beta), gko::lend(x));
 
-    ASSERT_EQ(alpha->last_access, nullptr);
-    ASSERT_EQ(b->last_access, nullptr);
-    ASSERT_EQ(beta->last_access, nullptr);
-    ASSERT_EQ(x->last_access, omp);
+    ASSERT_EQ(alpha->last_access, ref);
+    ASSERT_EQ(b->last_access, ref);
+    ASSERT_EQ(beta->last_access, ref);
+    ASSERT_EQ(x->last_access, ref);
 }
 
 
diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp
index df187cfb81f..822449e8f24 100644
--- a/core/test/base/utils.cpp
+++ b/core/test/base/utils.cpp
@@ -400,11 +400,11 @@ class TemporaryClone : public ::testing::Test {
 };
 
 
-TEST_F(TemporaryClone, CopiesToAnotherExecutor)
+TEST_F(TemporaryClone, DoesNotCopyToSameMemory)
 {
     auto clone = make_temporary_clone(omp, gko::lend(obj));
 
-    ASSERT_EQ(clone.get()->get_executor(), omp);
+    ASSERT_NE(clone.get()->get_executor(), omp);
     ASSERT_EQ(obj->get_executor(), ref);
 }
 
diff --git a/core/test/utils.hpp b/core/test/utils.hpp
index d6ae92bb951..5838379c5f0 100644
--- a/core/test/utils.hpp
+++ b/core/test/utils.hpp
@@ -69,6 +69,8 @@ using ValueAndIndexTypes =
     ::testing::Types<float, double, std::complex<float>, std::complex<double>,
                      gko::int32, gko::int64, gko::size_type>;
 
+using RealValueAndIndexTypes =
+    ::testing::Types<float, double, gko::int32, gko::int64, gko::size_type>;
 
 using ValueIndexTypes = ::testing::Types<
     std::tuple<float, gko::int32>, std::tuple<double, gko::int32>,
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 7926ae98a21..5284daa2dd1 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -162,6 +162,16 @@ void CudaExecutor::raw_copy_to(const CudaExecutor *dest, size_type num_bytes,
 }
 
 
+bool CudaExecutor::verify_memory_to(const HipExecutor *dest_exec) const
+{
+#if GINKGO_HIP_PLATFORM_NVCC
+    return device_id_ == dest_exec->get_device_id();
+#else
+    return false;
+#endif
+}
+
+
 void CudaExecutor::synchronize() const
 {
     cuda::device_guard g(this->get_device_id());
diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt
index d9ba808fa21..acbe3244fd4 100644
--- a/cuda/test/base/CMakeLists.txt
+++ b/cuda/test/base/CMakeLists.txt
@@ -1,3 +1,4 @@
+ginkgo_create_cuda_test(array)
 ginkgo_create_cuda_test(cuda_executor)
 ginkgo_create_cuda_test(exception_helpers)
 ginkgo_create_cuda_test(lin_op)
diff --git a/cuda/test/base/array.cu b/cuda/test/base/array.cu
new file mode 100644
index 00000000000..c4757c42b44
--- /dev/null
+++ b/cuda/test/base/array.cu
@@ -0,0 +1,92 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include <ginkgo/core/base/array.hpp>
+
+
+#include <gtest/gtest.h>
+#include <ginkgo/core/base/executor.hpp>
+
+
+#include "core/test/utils.hpp"
+
+
+template <typename T>
+class Array : public ::testing::Test {
+protected:
+    Array() : exec(gko::ReferenceExecutor::create()), x(exec, 2)
+    {
+        x.get_data()[0] = 5;
+        x.get_data()[1] = 2;
+    }
+
+    static void assert_equal_to_original_x(gko::Array<T> &a)
+    {
+        ASSERT_EQ(a.get_num_elems(), 2);
+        EXPECT_EQ(a.get_data()[0], T{5});
+        EXPECT_EQ(a.get_data()[1], T{2});
+        EXPECT_EQ(a.get_const_data()[0], T{5});
+        EXPECT_EQ(a.get_const_data()[1], T{2});
+    }
+
+    std::shared_ptr<gko::Executor> exec;
+    gko::Array<T> x;
+};
+
+TYPED_TEST_SUITE(Array, gko::test::ValueAndIndexTypes);
+
+
+TYPED_TEST(Array, CanCreateTemporaryCloneOnDifferentExecutor)
+{
+    auto cuda = gko::CudaExecutor::create(0, this->exec);
+
+    auto tmp_clone = make_temporary_clone(cuda, &this->x);
+
+    ASSERT_NE(tmp_clone.get(), &this->x);
+    tmp_clone->set_executor(this->exec);
+    this->assert_equal_to_original_x(*tmp_clone.get());
+}
+
+
+TYPED_TEST(Array, CanCopyBackTemporaryCloneOnDifferentExecutor)
+{
+    auto cuda = gko::CudaExecutor::create(0, this->exec);
+
+    {
+        auto tmp_clone = make_temporary_clone(cuda, &this->x);
+        // change x, so it no longer matches the original x
+        // the copy-back will overwrite it again with the correct value
+        this->x.get_data()[0] = 0;
+    }
+
+    this->assert_equal_to_original_x(this->x);
+}
diff --git a/cuda/test/base/lin_op.cu b/cuda/test/base/lin_op.cu
index 82016f87816..668c2df339c 100644
--- a/cuda/test/base/lin_op.cu
+++ b/cuda/test/base/lin_op.cu
@@ -39,6 +39,110 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
+class DummyLinOp : public gko::EnableLinOp<DummyLinOp>,
+                   public gko::EnableCreateMethod<DummyLinOp> {
+public:
+    DummyLinOp(std::shared_ptr<const gko::Executor> exec,
+               gko::dim<2> size = gko::dim<2>{})
+        : EnableLinOp<DummyLinOp>(exec, size)
+    {}
+
+    void access() const { last_access = this->get_executor(); }
+
+    mutable std::shared_ptr<const gko::Executor> last_access;
+    mutable std::shared_ptr<const gko::Executor> last_b_access;
+    mutable std::shared_ptr<const gko::Executor> last_x_access;
+    mutable std::shared_ptr<const gko::Executor> last_alpha_access;
+    mutable std::shared_ptr<const gko::Executor> last_beta_access;
+
+protected:
+    void apply_impl(const gko::LinOp *b, gko::LinOp *x) const override
+    {
+        this->access();
+        static_cast<const DummyLinOp *>(b)->access();
+        static_cast<const DummyLinOp *>(x)->access();
+        last_b_access = b->get_executor();
+        last_x_access = x->get_executor();
+    }
+
+    void apply_impl(const gko::LinOp *alpha, const gko::LinOp *b,
+                    const gko::LinOp *beta, gko::LinOp *x) const override
+    {
+        this->access();
+        static_cast<const DummyLinOp *>(alpha)->access();
+        static_cast<const DummyLinOp *>(b)->access();
+        static_cast<const DummyLinOp *>(beta)->access();
+        static_cast<const DummyLinOp *>(x)->access();
+        last_alpha_access = alpha->get_executor();
+        last_b_access = b->get_executor();
+        last_beta_access = beta->get_executor();
+        last_x_access = x->get_executor();
+    }
+};
+
+
+class EnableLinOp : public ::testing::Test {
+protected:
+    EnableLinOp()
+        : ref{gko::ReferenceExecutor::create()},
+          cuda{gko::CudaExecutor::create(0, ref)},
+          op{DummyLinOp::create(cuda, gko::dim<2>{3, 5})},
+          alpha{DummyLinOp::create(ref, gko::dim<2>{1})},
+          beta{DummyLinOp::create(ref, gko::dim<2>{1})},
+          b{DummyLinOp::create(ref, gko::dim<2>{5, 4})},
+          x{DummyLinOp::create(ref, gko::dim<2>{3, 4})}
+    {}
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<const gko::CudaExecutor> cuda;
+    std::unique_ptr<DummyLinOp> op;
+    std::unique_ptr<DummyLinOp> alpha;
+    std::unique_ptr<DummyLinOp> beta;
+    std::unique_ptr<DummyLinOp> b;
+    std::unique_ptr<DummyLinOp> x;
+};
+
+
+TEST_F(EnableLinOp, ApplyCopiesDataToCorrectExecutor)
+{
+    op->apply(gko::lend(b), gko::lend(x));
+
+    ASSERT_EQ(op->last_b_access, cuda);
+    ASSERT_EQ(op->last_x_access, cuda);
+}
+
+
+TEST_F(EnableLinOp, ApplyCopiesBackOnlyX)
+{
+    op->apply(gko::lend(b), gko::lend(x));
+
+    ASSERT_EQ(b->last_access, nullptr);
+    ASSERT_EQ(x->last_access, cuda);
+}
+
+
+TEST_F(EnableLinOp, ExtendedApplyCopiesDataToCorrectExecutor)
+{
+    op->apply(gko::lend(alpha), gko::lend(b), gko::lend(beta), gko::lend(x));
+
+    ASSERT_EQ(op->last_alpha_access, cuda);
+    ASSERT_EQ(op->last_b_access, cuda);
+    ASSERT_EQ(op->last_beta_access, cuda);
+    ASSERT_EQ(op->last_x_access, cuda);
+}
+
+
+TEST_F(EnableLinOp, ExtendedApplyCopiesBackOnlyX)
+{
+    op->apply(gko::lend(alpha), gko::lend(b), gko::lend(beta), gko::lend(x));
+
+    ASSERT_EQ(alpha->last_access, nullptr);
+    ASSERT_EQ(b->last_access, nullptr);
+    ASSERT_EQ(beta->last_access, nullptr);
+    ASSERT_EQ(x->last_access, cuda);
+}
+
+
 class FactoryParameter : public ::testing::Test {
 protected:
     FactoryParameter() {}
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 7e311d489c7..1381f079945 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -57,6 +57,7 @@ const std::vector<sycl::device> get_devices(std::string device_type)
         {"accelerator", sycl::info::device_type::accelerator},
         {"all", sycl::info::device_type::all},
         {"cpu", sycl::info::device_type::cpu},
+        {"host", sycl::info::device_type::host},
         {"gpu", sycl::info::device_type::gpu}};
     std::for_each(device_type.begin(), device_type.end(),
                   [](char &c) { c = std::tolower(c); });
@@ -76,6 +77,14 @@ void OmpExecutor::raw_copy_to(const DpcppExecutor *dest, size_type num_bytes,
 }
 
 
+bool OmpExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
+{
+    auto device = detail::get_devices(
+        dest_exec->getdevice_type())[dest_exec->get_device_id()];
+    return device.is_host() || device.is_cpu();
+}
+
+
 std::shared_ptr<DpcppExecutor> DpcppExecutor::create(
     int device_id, std::shared_ptr<Executor> master, std::string device_type)
 {
@@ -150,6 +159,30 @@ int DpcppExecutor::get_num_devices(std::string device_type)
 }
 
 
+bool DpcppExecutor::verify_memory_to(const OmpExecutor *dest_exec) const
+{
+    auto device = detail::get_devices(device_type_)[device_id_];
+
+    return device.is_host() || device.is_cpu();
+}
+
+bool DpcppExecutor::verify_memory_to(const ReferenceExecutor *dest_exec) const
+{
+    auto device = detail::get_devices(device_type_)[device_id_];
+    return device.is_host() || device.is_cpu();
+}
+
+bool DpcppExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
+{
+    auto device = detail::get_devices(device_type_)[device_id_];
+    auto other_device = detail::get_devices(
+        dest_exec->get_device_type())[dest_exec->get_device_id()];
+    return device.get_info<cl::sycl::info::device::device_type>() ==
+               other_device.get_info<cl::sycl::info::device::device_type>() &&
+           device.get() == other_device.get();
+}
+
+
 namespace detail {
 
 
@@ -181,8 +214,10 @@ void DpcppExecutor::set_device_property()
     for (std::size_t i = 0; i < 3; i++) {
         max_workitem_sizes_.push_back(max_workitem_sizes[i]);
     }
-    max_workgroup_size_ =
-        device.get_info<sycl::info::device::max_work_group_size>();
+    if (!device.is_host()) {
+        max_workgroup_size_ =
+            device.get_info<sycl::info::device::max_work_group_size>();
+    }
     // Here we declare the queue with the property `in_order` which ensures the
     // kernels are executed in the submission order. Otherwise, calls to
     // `wait()` would be needed after every call to a DPC++ function or kernel.
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 3e400986e2b..8f8bbbf7d9d 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -162,6 +162,16 @@ void HipExecutor::raw_copy_to(const HipExecutor *dest, size_type num_bytes,
 }
 
 
+bool HipExecutor::verify_memory_to(const CudaExecutor *dest_exec) const
+{
+#if GINKGO_HIP_PLATFORM_NVCC
+    return device_id_ == dest_exec->get_device_id();
+#else
+    return false;
+#endif
+}
+
+
 void HipExecutor::synchronize() const
 {
     hip::device_guard g(this->get_device_id());
diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp
index 3030f0d1442..052c50e13b8 100644
--- a/include/ginkgo/core/base/dim.hpp
+++ b/include/ginkgo/core/base/dim.hpp
@@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/base/utils.hpp>
 
 
 namespace gko {
diff --git a/include/ginkgo/core/base/exception_helpers.hpp b/include/ginkgo/core/base/exception_helpers.hpp
index e14bb1c0c61..af3e1efb598 100644
--- a/include/ginkgo/core/base/exception_helpers.hpp
+++ b/include/ginkgo/core/base/exception_helpers.hpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/dim.hpp>
 #include <ginkgo/core/base/exception.hpp>
 #include <ginkgo/core/base/name_demangling.hpp>
+#include <ginkgo/core/base/utils_helper.hpp>
 
 
 namespace gko {
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 2cc18bf3502..dd0293d30ee 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -633,6 +633,17 @@ class Executor : public log::EnableLogging<Executor> {
      */
     virtual void synchronize() const = 0;
 
+    /**
+     * Overload the equal-to operator which verifies whether the executors share
+     * the same memory.
+     *
+     * @param other  the other Executor to compare against
+     */
+    bool operator==(const Executor &other) const
+    {
+        return this->verify_memory_from(other);
+    }
+
 protected:
     /**
      * Allocates raw memory in this Executor.
@@ -684,6 +695,33 @@ class Executor : public log::EnableLogging<Executor> {
 
 #undef GKO_ENABLE_RAW_COPY_TO
 
+    /**
+     * Verify the memory from another Executor.
+     *
+     * @param src_exec  Executor from which to verify the memory.
+     *
+     * @return whether this executor and src_exec share the same memory.
+     */
+    virtual bool verify_memory_from(const Executor &src_exec) const = 0;
+
+/**
+ * @internal
+ * Declares a verify_memory_to() overload for a specified Executor subclass.
+ *
+ * This is the second stage of the double dispatch emulation required to
+ * implement verify_memory_from().
+ *
+ * @param _exec_type  the Executor subclass
+ */
+#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
+    virtual bool verify_memory_to(const _exec_type *dest_exec) const = 0
+
+    GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
+
+    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor);
+
+#undef GKO_ENABLE_VERIFY_MEMORY_TO
+
 private:
     /**
      * The LambdaOperation class wraps three functor objects into an
@@ -832,6 +870,11 @@ class ExecutorBase : public Executor {
         src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
     }
 
+    bool verify_memory_from(const Executor &src_exec) const override
+    {
+        return src_exec.verify_memory_to(self());
+    }
+
 private:
     ConcreteExecutor *self() noexcept
     {
@@ -891,6 +934,13 @@ class EnableDeviceReset {
                      const void *src_ptr, void *dest_ptr) const override
 
 
+#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                       \
+    bool verify_memory_to(const dest_ *other) const override { return bool_; } \
+    static_assert(true,                                                        \
+                  "This assert is used to counter the false positive extra "   \
+                  "semi-colon warnings")
+
+
 /**
  * This is the Executor subclass which represents the OpenMP device
  * (typically CPU).
@@ -925,6 +975,16 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
     void raw_free(void *ptr) const noexcept override;
 
     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
+
+    bool verify_memory_to(const DpcppExecutor *dest_exec) const override;
 };
 
 
@@ -1090,6 +1150,19 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
+
+    bool verify_memory_to(const HipExecutor *dest_exec) const override;
+
+    bool verify_memory_to(const CudaExecutor *dest_exec) const override
+    {
+        return device_id_ == dest_exec->get_device_id();
+    }
+
     static void increase_num_execs(unsigned device_id)
     {
         std::lock_guard<std::mutex> guard(mutex[device_id]);
@@ -1256,6 +1329,19 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
+
+    bool verify_memory_to(const CudaExecutor *dest_exec) const override;
+
+    bool verify_memory_to(const HipExecutor *dest_exec) const override
+    {
+        return device_id_ == dest_exec->get_device_id();
+    }
+
     static void increase_num_execs(int device_id)
     {
         std::lock_guard<std::mutex> guard(mutex[device_id]);
@@ -1335,6 +1421,8 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
 
     /**
      * Get the DPCPP device id of the device associated to this executor.
+     *
+     * @return the DPCPP device id of the device associated to this executor
      */
     int get_device_id() const noexcept { return device_id_; }
 
@@ -1342,11 +1430,17 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
 
     /**
      * Get the number of devices present on the system.
+     *
+     * @param device_type  a string representing the device type
+     *
+     * @return the number of devices present on the system
      */
     static int get_num_devices(std::string device_type);
 
     /**
      * Get the available subgroup sizes for this device.
+     *
+     * @return the available subgroup sizes for this device
      */
     const std::vector<size_type> &get_subgroup_sizes() const noexcept
     {
@@ -1355,6 +1449,8 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
 
     /**
      * Get the number of Computing Units of this executor.
+     *
+     * @return the number of Computing Units of this executor
      */
     size_type get_num_computing_units() const noexcept
     {
@@ -1363,6 +1459,8 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
 
     /**
      * Get the maximum work item sizes.
+     *
+     * @return the maximum work item sizes
      */
     const std::vector<size_type> &get_max_workitem_sizes() const noexcept
     {
@@ -1371,12 +1469,21 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
 
     /**
      * Get the maximum workgroup size.
+     *
+     * @return the maximum workgroup size
      */
     size_type get_max_workgroup_size() const noexcept
     {
         return max_workgroup_size_;
     }
 
+    /**
+     * Get a string representing the device type.
+     *
+     * @return a string representing the device type
+     */
+    std::string get_device_type() const noexcept { return device_type_; }
+
 protected:
     void set_device_property();
 
@@ -1384,6 +1491,8 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
                   std::string device_type = "all")
         : device_id_(device_id), master_(master), device_type_(device_type)
     {
+        std::for_each(device_type_.begin(), device_type_.end(),
+                      [](char &c) { c = std::tolower(c); });
         this->set_device_property();
     }
 
@@ -1393,6 +1502,16 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
 
     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
 
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
+
+    bool verify_memory_to(const OmpExecutor *dest_exec) const override;
+
+    bool verify_memory_to(const ReferenceExecutor *dest_exec) const override;
+
+    bool verify_memory_to(const DpcppExecutor *dest_exec) const override;
+
 private:
     int device_id_;
     std::shared_ptr<Executor> master_;
diff --git a/include/ginkgo/core/base/temporary_clone.hpp b/include/ginkgo/core/base/temporary_clone.hpp
new file mode 100644
index 00000000000..a0f59f73919
--- /dev/null
+++ b/include/ginkgo/core/base/temporary_clone.hpp
@@ -0,0 +1,202 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_TEMPORARY_CLONE_HPP_
+#define GKO_CORE_BASE_TEMPORARY_CLONE_HPP_
+
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/utils_helper.hpp>
+
+
+namespace gko {
+namespace detail {
+
+
+/**
+ * A copy_back_deleter is a type of deleter that copies the data to an
+ * internally referenced object before performing the deletion.
+ *
+ * The deleter will use the `copy_from` method to perform the copy, and then
+ * delete the passed object using the `delete` keyword. This kind of deleter is
+ * useful when temporarily copying an object with the intent of copying it back
+ * once it goes out of scope.
+ *
+ * There is also a specialization for constant objects that does not perform the
+ * copy, since a constant object couldn't have been changed.
+ *
+ * @tparam T  the type of object being deleted
+ */
+template <typename T>
+class copy_back_deleter {
+public:
+    using pointer = T *;
+
+    /**
+     * Creates a new deleter object.
+     *
+     * @param original  the origin object where the data will be copied before
+     *                  deletion
+     */
+    copy_back_deleter(pointer original) : original_{original} {}
+
+    /**
+     * Deletes the object.
+     *
+     * @param ptr  pointer to the object being deleted
+     */
+    void operator()(pointer ptr) const
+    {
+        original_->copy_from(ptr);
+        delete ptr;
+    }
+
+private:
+    pointer original_;
+};
+
+// specialization for constant objects, no need to copy back something that
+// cannot change
+template <typename T>
+class copy_back_deleter<const T> {
+public:
+    using pointer = const T *;
+    copy_back_deleter(pointer original) : original_{original} {}
+
+    void operator()(pointer ptr) const { delete ptr; }
+
+private:
+    pointer original_;
+};
+
+
+template <typename T>
+struct temporary_clone_helper {
+    static std::unique_ptr<T> create(std::shared_ptr<const Executor> exec,
+                                     T *ptr)
+    {
+        return gko::clone(std::move(exec), ptr);
+    }
+};
+
+
+/**
+ * A temporary_clone is a special smart pointer-like object that is designed to
+ * hold an object temporarily copied to another executor.
+ *
+ * After the temporary_clone goes out of scope, the stored object will be copied
+ * back to its original location. This class is optimized to avoid copies if the
+ * object is already on the correct executor, in which case it will just hold a
+ * reference to that object, without performing the copy.
+ *
+ * @tparam T  the type of object held in the temporary_clone
+ */
+template <typename T>
+class temporary_clone {
+public:
+    using value_type = T;
+    using pointer = T *;
+
+    /**
+     * Creates a temporary_clone.
+     *
+     * @param exec  the executor where the clone will be created
+     * @param ptr  a pointer to the object of which the clone will be created
+     */
+    explicit temporary_clone(std::shared_ptr<const Executor> exec, pointer ptr)
+    {
+        if (*ptr->get_executor() == *exec) {
+            // just use the object we already have
+            handle_ = handle_type(ptr, null_deleter<T>());
+        } else {
+            // clone the object to the new executor and make sure it's copied
+            // back before we delete it
+            handle_ = handle_type(
+                temporary_clone_helper<T>::create(std::move(exec), ptr)
+                    .release(),
+                copy_back_deleter<T>(ptr));
+        }
+    }
+
+    /**
+     * Returns the object held by temporary_clone.
+     *
+     * @return the object held by temporary_clone
+     */
+    T *get() const { return handle_.get(); }
+
+    /**
+     * Calls a method on the underlying object.
+     *
+     * @return the underlying object
+     */
+    T *operator->() const { return handle_.get(); }
+
+private:
+    // std::function deleter allows to decide the (type of) deleter at runtime
+    using handle_type = std::unique_ptr<T, std::function<void(T *)>>;
+
+    handle_type handle_;
+};
+
+
+}  // namespace detail
+
+
+/**
+ * Creates a temporary_clone.
+ *
+ * This is a helper function which avoids the need to explicitly specify the
+ * type of the object, as would be the case if using the constructor of
+ * temporary_clone.
+ *
+ * @param exec  the executor where the clone will be created
+ * @param ptr  a pointer to the object of which the clone will be created
+ */
+template <typename T>
+detail::temporary_clone<T> make_temporary_clone(
+    std::shared_ptr<const Executor> exec, T *ptr)
+{
+    return detail::temporary_clone<T>(std::move(exec), ptr);
+}
+
+
+}  // namespace gko
+
+
+#endif  // GKO_CORE_BASE_TEMPORARY_CLONE_HPP_
diff --git a/include/ginkgo/core/base/utils.hpp b/include/ginkgo/core/base/utils.hpp
index 5270d619445..509213e8476 100644
--- a/include/ginkgo/core/base/utils.hpp
+++ b/include/ginkgo/core/base/utils.hpp
@@ -34,557 +34,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_PUBLIC_CORE_BASE_UTILS_HPP_
 
 
-#include <functional>
-#include <memory>
-#include <type_traits>
-
-
-#include <ginkgo/core/base/exception.hpp>
-#include <ginkgo/core/base/name_demangling.hpp>
-#include <ginkgo/core/base/std_extensions.hpp>
-#include <ginkgo/core/base/types.hpp>
-
-
-#ifndef NDEBUG
-#include <cstdio>
-#endif  // NDEBUG
-
-
-namespace gko {
-
-
-class Executor;
-
-
-namespace detail {
-
-
-template <typename T>
-struct pointee_impl {};
-
-template <typename T>
-struct pointee_impl<T *> {
-    using type = T;
-};
-
-template <typename T, typename Deleter>
-struct pointee_impl<std::unique_ptr<T, Deleter>> {
-    using type = T;
-};
-
-template <typename T>
-struct pointee_impl<std::shared_ptr<T>> {
-    using type = T;
-};
-
-template <typename T>
-using pointee = typename pointee_impl<typename std::decay<T>::type>::type;
-
-
-template <typename T, typename = void>
-struct is_clonable_impl : std::false_type {};
-
-template <typename T>
-struct is_clonable_impl<T, xstd::void_t<decltype(std::declval<T>().clone())>>
-    : std::true_type {};
-
-template <typename T>
-constexpr bool is_clonable()
-{
-    return is_clonable_impl<typename std::decay<T>::type>::value;
-}
-
-
-template <typename T, typename = void>
-struct is_clonable_to_impl : std::false_type {};
-
-template <typename T>
-struct is_clonable_to_impl<
-    T, xstd::void_t<decltype(std::declval<T>().clone(
-           std::declval<std::shared_ptr<const Executor>>()))>>
-    : std::true_type {};
-
-template <typename T>
-constexpr bool is_clonable_to()
-{
-    return is_clonable_to_impl<typename std::decay<T>::type>::value;
-}
-
-
-template <typename T>
-struct have_ownership_impl : std::false_type {};
-
-template <typename T, typename Deleter>
-struct have_ownership_impl<std::unique_ptr<T, Deleter>> : std::true_type {};
-
-template <typename T>
-struct have_ownership_impl<std::shared_ptr<T>> : std::true_type {};
-
-template <typename T>
-using have_ownership_s = have_ownership_impl<typename std::decay<T>::type>;
-
-template <typename T>
-constexpr bool have_ownership()
-{
-    return have_ownership_s<T>::value;
-}
-
-
-template <typename Pointer>
-using cloned_type =
-    std::unique_ptr<typename std::remove_cv<pointee<Pointer>>::type>;
-
-
-template <typename Pointer>
-using shared_type = std::shared_ptr<pointee<Pointer>>;
-
-
-}  // namespace detail
-
-
-/**
- * Creates a unique clone of the object pointed to by `p`.
- *
- * The pointee (i.e. `*p`) needs to have a clone method that returns a
- * std::unique_ptr in order for this method to work.
- *
- * @tparam Pointer  type of pointer to the object (plain or smart pointer)
- *
- * @param p  a pointer to the object
- *
- * @note The difference between this function and directly calling
- *       LinOp::clone() is that this one preserves the static type of the
- *       object.
- */
-template <typename Pointer>
-inline detail::cloned_type<Pointer> clone(const Pointer &p)
-{
-    static_assert(detail::is_clonable<detail::pointee<Pointer>>(),
-                  "Object is not clonable");
-    return detail::cloned_type<Pointer>(
-        static_cast<typename std::remove_cv<detail::pointee<Pointer>>::type *>(
-            p->clone().release()));
-}
-
-
-/**
- * Creates a unique clone of the object pointed to by `p` on Executor `exec`.
- *
- * The pointee (i.e. `*p`) needs to have a clone method that takes an
- * executor and returns a std::unique_ptr in order for this method to work.
- *
- * @tparam Pointer  type of pointer to the object (plain or smart pointer)
- *
- * @param exec  the executor where the cloned object should be stored
- * @param p  a pointer to the object
- *
- * @note The difference between this function and directly calling
- *       LinOp::clone() is that this one preserves the static type of the
- *       object.
- */
-template <typename Pointer>
-inline detail::cloned_type<Pointer> clone(std::shared_ptr<const Executor> exec,
-                                          const Pointer &p)
-{
-    static_assert(detail::is_clonable_to<detail::pointee<Pointer>>(),
-                  "Object is not clonable");
-    return detail::cloned_type<Pointer>(
-        static_cast<typename std::remove_cv<detail::pointee<Pointer>>::type *>(
-            p->clone(std::move(exec)).release()));
-}
-
-
-/**
- * Marks the object pointed to by `p` as shared.
- *
- * Effectively converts a pointer with ownership to std::shared_ptr.
- *
- * @tparam OwningPointer  type of pointer with ownership to the object
- *                        (has to be a smart pointer)
- *
- * @param p  a pointer to the object
- *
- * @note The original pointer `p` becomes invalid after this call.
- */
-template <typename OwningPointer>
-inline detail::shared_type<OwningPointer> share(OwningPointer &&p)
-{
-    static_assert(detail::have_ownership<OwningPointer>(),
-                  "OwningPointer does not have ownership of the object");
-    return detail::shared_type<OwningPointer>(std::move(p));
-}
-
-
-/**
- * Marks that the object pointed to by `p` can be given to the callee.
- *
- * Effectively calls `std::move(p)`.
- *
- * @tparam OwningPointer  type of pointer with ownership to the object
- *                        (has to be a smart pointer)
- *
- * @param p  a pointer to the object
- *
- * @note The original pointer `p` becomes invalid after this call.
- */
-template <typename OwningPointer>
-inline typename std::remove_reference<OwningPointer>::type &&give(
-    OwningPointer &&p)
-{
-    static_assert(detail::have_ownership<OwningPointer>(),
-                  "OwningPointer does not have ownership of the object");
-    return std::move(p);
-}
-
-
-/**
- * Returns a non-owning (plain) pointer to the object pointed to by `p`.
- *
- * @tparam Pointer  type of pointer to the object (plain or smart pointer)
- *
- * @param p  a pointer to the object
- *
- * @note This is the overload for owning (smart) pointers, that behaves the
- *       same as calling .get() on the smart pointer.
- */
-template <typename Pointer>
-inline typename std::enable_if<detail::have_ownership_s<Pointer>::value,
-                               detail::pointee<Pointer> *>::type
-lend(const Pointer &p)
-{
-    return p.get();
-}
-
-/**
- * Returns a non-owning (plain) pointer to the object pointed to by `p`.
- *
- * @tparam Pointer  type of pointer to the object (plain or smart pointer)
- *
- * @param p  a pointer to the object
- *
- * @note This is the overload for non-owning (plain) pointers, that just
- *       returns `p`.
- */
-template <typename Pointer>
-inline typename std::enable_if<!detail::have_ownership_s<Pointer>::value,
-                               detail::pointee<Pointer> *>::type
-lend(const Pointer &p)
-{
-    return p;
-}
-
-
-/**
- * Performs polymorphic type conversion.
- *
- * @tparam T  requested result type
- * @tparam U  static type of the passed object
- *
- * @param obj  the object which should be converted
- *
- * @return If successful, returns a pointer to the subtype, otherwise throws
- *         NotSupported.
- */
-template <typename T, typename U>
-inline typename std::decay<T>::type *as(U *obj)
-{
-    if (auto p = dynamic_cast<typename std::decay<T>::type *>(obj)) {
-        return p;
-    } else {
-        throw NotSupported(__FILE__, __LINE__,
-                           std::string{"gko::as<"} +
-                               name_demangling::get_type_name(typeid(T)) + ">",
-                           name_demangling::get_type_name(typeid(*obj)));
-    }
-}
-
-/**
- * Performs polymorphic type conversion.
- *
- * This is the constant version of the function.
- *
- * @tparam T  requested result type
- * @tparam U  static type of the passed object
- *
- * @param obj  the object which should be converted
- *
- * @return If successful, returns a pointer to the subtype, otherwise throws
- *         NotSupported.
- */
-template <typename T, typename U>
-inline const typename std::decay<T>::type *as(const U *obj)
-{
-    if (auto p = dynamic_cast<const typename std::decay<T>::type *>(obj)) {
-        return p;
-    } else {
-        throw NotSupported(__FILE__, __LINE__,
-                           std::string{"gko::as<"} +
-                               name_demangling::get_type_name(typeid(T)) + ">",
-                           name_demangling::get_type_name(typeid(*obj)));
-    }
-}
-
-
-/**
- * Performs polymorphic type conversion of a unique_ptr.
- *
- * @tparam T  requested result type
- * @tparam U  static type of the passed object
- *
- * @param obj  the unique_ptr to the object which should be converted.
- *             If successful, it will be reset to a nullptr.
- *
- * @return If successful, returns a unique_ptr to the subtype, otherwise throws
- *         NotSupported.
- */
-template <typename T, typename U>
-inline std::unique_ptr<typename std::decay<T>::type> as(
-    std::unique_ptr<U> &&obj)
-{
-    if (auto p = dynamic_cast<typename std::decay<T>::type *>(obj.get())) {
-        obj.release();
-        return std::unique_ptr<typename std::decay<T>::type>{p};
-    } else {
-        throw NotSupported(__FILE__, __LINE__, __func__,
-                           name_demangling::get_type_name(typeid(*obj)));
-    }
-}
-
-
-/**
- * Performs polymorphic type conversion of a shared_ptr.
- *
- * @tparam T  requested result type
- * @tparam U  static type of the passed object
- *
- * @param obj  the shared_ptr to the object which should be converted.
- *
- * @return If successful, returns a shared_ptr to the subtype, otherwise throws
- *         NotSupported. This pointer shares ownership with the input pointer.
- */
-template <typename T, typename U>
-inline std::shared_ptr<typename std::decay<T>::type> as(std::shared_ptr<U> obj)
-{
-    auto ptr = std::dynamic_pointer_cast<typename std::decay<T>::type>(obj);
-    if (ptr) {
-        return ptr;
-    } else {
-        throw NotSupported(__FILE__, __LINE__, __func__,
-                           name_demangling::get_type_name(typeid(*obj)));
-    }
-}
-
-
-/**
- * Performs polymorphic type conversion of a shared_ptr.
- *
- * This is the constant version of the function.
- *
- * @tparam T  requested result type
- * @tparam U  static type of the passed object
- *
- * @param obj  the shared_ptr to the object which should be converted.
- *
- * @return If successful, returns a shared_ptr to the subtype, otherwise throws
- *         NotSupported. This pointer shares ownership with the input pointer.
- */
-template <typename T, typename U>
-inline std::shared_ptr<const typename std::decay<T>::type> as(
-    std::shared_ptr<const U> obj)
-{
-    auto ptr =
-        std::dynamic_pointer_cast<const typename std::decay<T>::type>(obj);
-    if (ptr) {
-        return ptr;
-    } else {
-        throw NotSupported(__FILE__, __LINE__, __func__,
-                           name_demangling::get_type_name(typeid(*obj)));
-    }
-}
-
-
-/**
- * This is a deleter that does not delete the object.
- *
- * It is useful where the object has been allocated elsewhere and will be
- * deleted manually.
- */
-template <typename T>
-class null_deleter {
-public:
-    using pointer = T *;
-
-    /**
-     * Deletes the object.
-     *
-     * @param ptr  pointer to the object being deleted
-     */
-    void operator()(pointer) const noexcept {}
-};
-
-// a specialization for arrays
-template <typename T>
-class null_deleter<T[]> {
-public:
-    using pointer = T[];
-
-    void operator()(pointer) const noexcept {}
-};
-
-
-namespace detail {
-
-
-/**
- * A copy_back_deleter is a type of deleter that copies the data to an
- * internally referenced object before performing the deletion.
- *
- * The deleter will use the `copy_from` method to perform the copy, and then
- * delete the passed object using the `delete` keyword. This kind of deleter is
- * useful when temporarily copying an object with the intent of copying it back
- * once it goes out of scope.
- *
- * There is also a specialization for constant objects that does not perform the
- * copy, since a constant object couldn't have been changed.
- *
- * @tparam T  the type of object being deleted
- */
-template <typename T>
-class copy_back_deleter {
-public:
-    using pointer = T *;
-
-    /**
-     * Creates a new deleter object.
-     *
-     * @param original  the origin object where the data will be copied before
-     *                  deletion
-     */
-    copy_back_deleter(pointer original) : original_{original} {}
-
-    /**
-     * Deletes the object.
-     *
-     * @param ptr  pointer to the object being deleted
-     */
-    void operator()(pointer ptr) const
-    {
-        original_->copy_from(ptr);
-        delete ptr;
-    }
-
-private:
-    pointer original_;
-};
-
-
-// specialization for constant objects, no need to copy back something that
-// cannot change
-template <typename T>
-class copy_back_deleter<const T> {
-public:
-    using pointer = const T *;
-    copy_back_deleter(pointer original) : original_{original} {}
-
-    void operator()(pointer ptr) const { delete ptr; }
-
-private:
-    pointer original_;
-};
-
-
-template <typename T>
-struct temporary_clone_helper {
-    static std::unique_ptr<T> create(std::shared_ptr<const Executor> exec,
-                                     T *ptr)
-    {
-        return gko::clone(std::move(exec), ptr);
-    }
-};
-
-
-/**
- * A temporary_clone is a special smart pointer-like object that is designed to
- * hold an object temporarily copied to another executor.
- *
- * After the temporary_clone goes out of scope, the stored object will be copied
- * back to its original location. This class is optimized to avoid copies if the
- * object is already on the correct executor, in which case it will just hold a
- * reference to that object, without performing the copy.
- *
- * @tparam T  the type of object held in the temporary_clone
- */
-template <typename T>
-class temporary_clone {
-public:
-    using value_type = T;
-    using pointer = T *;
-
-    /**
-     * Creates a temporary_clone.
-     *
-     * @param exec  the executor where the clone will be created
-     * @param ptr  a pointer to the object of which the clone will be created
-     */
-    explicit temporary_clone(std::shared_ptr<const Executor> exec, pointer ptr)
-    {
-        if (ptr->get_executor() == exec) {
-            // just use the object we already have
-            handle_ = handle_type(ptr, null_deleter<T>());
-        } else {
-            // clone the object to the new executor and make sure it's copied
-            // back before we delete it
-            handle_ = handle_type(
-                temporary_clone_helper<T>::create(std::move(exec), ptr)
-                    .release(),
-                copy_back_deleter<T>(ptr));
-        }
-    }
-
-    /**
-     * Returns the object held by temporary_clone.
-     *
-     * @return the object held by temporary_clone
-     */
-    T *get() const { return handle_.get(); }
-
-    /**
-     * Calls a method on the underlying object.
-     *
-     * @return the underlying object
-     */
-    T *operator->() const { return handle_.get(); }
-
-private:
-    // std::function deleter allows to decide the (type of) deleter at runtime
-    using handle_type = std::unique_ptr<T, std::function<void(T *)>>;
-
-    handle_type handle_;
-};
-
-
-}  // namespace detail
-
-
-/**
- * Creates a temporary_clone.
- *
- * This is a helper function which avoids the need to explicitly specify the
- * type of the object, as would be the case if using the constructor of
- * temporary_clone.
- *
- * @param exec  the executor where the clone will be created
- * @param ptr  a pointer to the object of which the clone will be created
- */
-template <typename T>
-detail::temporary_clone<T> make_temporary_clone(
-    std::shared_ptr<const Executor> exec, T *ptr)
-{
-    return detail::temporary_clone<T>(std::move(exec), ptr);
-}
-
-
-}  // namespace gko
+#include <ginkgo/core/base/temporary_clone.hpp>
+#include <ginkgo/core/base/utils_helper.hpp>
 
 
 #endif  // GKO_PUBLIC_CORE_BASE_UTILS_HPP_
diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp
new file mode 100644
index 00000000000..01c6b5cf5f5
--- /dev/null
+++ b/include/ginkgo/core/base/utils_helper.hpp
@@ -0,0 +1,438 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_BASE_UTILS_HELPER_HPP_
+#define GKO_CORE_BASE_UTILS_HELPER_HPP_
+
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/name_demangling.hpp>
+#include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+#ifndef NDEBUG
+#include <cstdio>
+#endif  // NDEBUG
+
+
+namespace gko {
+
+
+class Executor;
+
+
+namespace detail {
+
+
+template <typename T>
+struct pointee_impl {};
+
+template <typename T>
+struct pointee_impl<T *> {
+    using type = T;
+};
+
+template <typename T, typename Deleter>
+struct pointee_impl<std::unique_ptr<T, Deleter>> {
+    using type = T;
+};
+
+template <typename T>
+struct pointee_impl<std::shared_ptr<T>> {
+    using type = T;
+};
+
+template <typename T>
+using pointee = typename pointee_impl<typename std::decay<T>::type>::type;
+
+
+template <typename T, typename = void>
+struct is_clonable_impl : std::false_type {};
+
+template <typename T>
+struct is_clonable_impl<T, xstd::void_t<decltype(std::declval<T>().clone())>>
+    : std::true_type {};
+
+template <typename T>
+constexpr bool is_clonable()
+{
+    return is_clonable_impl<typename std::decay<T>::type>::value;
+}
+
+
+template <typename T, typename = void>
+struct is_clonable_to_impl : std::false_type {};
+
+template <typename T>
+struct is_clonable_to_impl<
+    T, xstd::void_t<decltype(std::declval<T>().clone(
+           std::declval<std::shared_ptr<const Executor>>()))>>
+    : std::true_type {};
+
+template <typename T>
+constexpr bool is_clonable_to()
+{
+    return is_clonable_to_impl<typename std::decay<T>::type>::value;
+}
+
+
+template <typename T>
+struct have_ownership_impl : std::false_type {};
+
+template <typename T, typename Deleter>
+struct have_ownership_impl<std::unique_ptr<T, Deleter>> : std::true_type {};
+
+template <typename T>
+struct have_ownership_impl<std::shared_ptr<T>> : std::true_type {};
+
+template <typename T>
+using have_ownership_s = have_ownership_impl<typename std::decay<T>::type>;
+
+template <typename T>
+constexpr bool have_ownership()
+{
+    return have_ownership_s<T>::value;
+}
+
+
+template <typename Pointer>
+using cloned_type =
+    std::unique_ptr<typename std::remove_cv<pointee<Pointer>>::type>;
+
+
+template <typename Pointer>
+using shared_type = std::shared_ptr<pointee<Pointer>>;
+
+
+}  // namespace detail
+
+
+/**
+ * Creates a unique clone of the object pointed to by `p`.
+ *
+ * The pointee (i.e. `*p`) needs to have a clone method that returns a
+ * std::unique_ptr in order for this method to work.
+ *
+ * @tparam Pointer  type of pointer to the object (plain or smart pointer)
+ *
+ * @param p  a pointer to the object
+ *
+ * @note The difference between this function and directly calling
+ *       LinOp::clone() is that this one preserves the static type of the
+ *       object.
+ */
+template <typename Pointer>
+inline detail::cloned_type<Pointer> clone(const Pointer &p)
+{
+    static_assert(detail::is_clonable<detail::pointee<Pointer>>(),
+                  "Object is not clonable");
+    return detail::cloned_type<Pointer>(
+        static_cast<typename std::remove_cv<detail::pointee<Pointer>>::type *>(
+            p->clone().release()));
+}
+
+
+/**
+ * Creates a unique clone of the object pointed to by `p` on Executor `exec`.
+ *
+ * The pointee (i.e. `*p`) needs to have a clone method that takes an
+ * executor and returns a std::unique_ptr in order for this method to work.
+ *
+ * @tparam Pointer  type of pointer to the object (plain or smart pointer)
+ *
+ * @param exec  the executor where the cloned object should be stored
+ * @param p  a pointer to the object
+ *
+ * @note The difference between this function and directly calling
+ *       LinOp::clone() is that this one preserves the static type of the
+ *       object.
+ */
+template <typename Pointer>
+inline detail::cloned_type<Pointer> clone(std::shared_ptr<const Executor> exec,
+                                          const Pointer &p)
+{
+    static_assert(detail::is_clonable_to<detail::pointee<Pointer>>(),
+                  "Object is not clonable");
+    return detail::cloned_type<Pointer>(
+        static_cast<typename std::remove_cv<detail::pointee<Pointer>>::type *>(
+            p->clone(std::move(exec)).release()));
+}
+
+
+/**
+ * Marks the object pointed to by `p` as shared.
+ *
+ * Effectively converts a pointer with ownership to std::shared_ptr.
+ *
+ * @tparam OwningPointer  type of pointer with ownership to the object
+ *                        (has to be a smart pointer)
+ *
+ * @param p  a pointer to the object
+ *
+ * @note The original pointer `p` becomes invalid after this call.
+ */
+template <typename OwningPointer>
+inline detail::shared_type<OwningPointer> share(OwningPointer &&p)
+{
+    static_assert(detail::have_ownership<OwningPointer>(),
+                  "OwningPointer does not have ownership of the object");
+    return detail::shared_type<OwningPointer>(std::move(p));
+}
+
+
+/**
+ * Marks that the object pointed to by `p` can be given to the callee.
+ *
+ * Effectively calls `std::move(p)`.
+ *
+ * @tparam OwningPointer  type of pointer with ownership to the object
+ *                        (has to be a smart pointer)
+ *
+ * @param p  a pointer to the object
+ *
+ * @note The original pointer `p` becomes invalid after this call.
+ */
+template <typename OwningPointer>
+inline typename std::remove_reference<OwningPointer>::type &&give(
+    OwningPointer &&p)
+{
+    static_assert(detail::have_ownership<OwningPointer>(),
+                  "OwningPointer does not have ownership of the object");
+    return std::move(p);
+}
+
+
+/**
+ * Returns a non-owning (plain) pointer to the object pointed to by `p`.
+ *
+ * @tparam Pointer  type of pointer to the object (plain or smart pointer)
+ *
+ * @param p  a pointer to the object
+ *
+ * @note This is the overload for owning (smart) pointers, that behaves the
+ *       same as calling .get() on the smart pointer.
+ */
+template <typename Pointer>
+inline typename std::enable_if<detail::have_ownership_s<Pointer>::value,
+                               detail::pointee<Pointer> *>::type
+lend(const Pointer &p)
+{
+    return p.get();
+}
+
+/**
+ * Returns a non-owning (plain) pointer to the object pointed to by `p`.
+ *
+ * @tparam Pointer  type of pointer to the object (plain or smart pointer)
+ *
+ * @param p  a pointer to the object
+ *
+ * @note This is the overload for non-owning (plain) pointers, that just
+ *       returns `p`.
+ */
+template <typename Pointer>
+inline typename std::enable_if<!detail::have_ownership_s<Pointer>::value,
+                               detail::pointee<Pointer> *>::type
+lend(const Pointer &p)
+{
+    return p;
+}
+
+
+/**
+ * Performs polymorphic type conversion.
+ *
+ * @tparam T  requested result type
+ * @tparam U  static type of the passed object
+ *
+ * @param obj  the object which should be converted
+ *
+ * @return If successful, returns a pointer to the subtype, otherwise throws
+ *         NotSupported.
+ */
+template <typename T, typename U>
+inline typename std::decay<T>::type *as(U *obj)
+{
+    if (auto p = dynamic_cast<typename std::decay<T>::type *>(obj)) {
+        return p;
+    } else {
+        throw NotSupported(__FILE__, __LINE__,
+                           std::string{"gko::as<"} +
+                               name_demangling::get_type_name(typeid(T)) + ">",
+                           name_demangling::get_type_name(typeid(*obj)));
+    }
+}
+
+/**
+ * Performs polymorphic type conversion.
+ *
+ * This is the constant version of the function.
+ *
+ * @tparam T  requested result type
+ * @tparam U  static type of the passed object
+ *
+ * @param obj  the object which should be converted
+ *
+ * @return If successful, returns a pointer to the subtype, otherwise throws
+ *         NotSupported.
+ */
+template <typename T, typename U>
+inline const typename std::decay<T>::type *as(const U *obj)
+{
+    if (auto p = dynamic_cast<const typename std::decay<T>::type *>(obj)) {
+        return p;
+    } else {
+        throw NotSupported(__FILE__, __LINE__,
+                           std::string{"gko::as<"} +
+                               name_demangling::get_type_name(typeid(T)) + ">",
+                           name_demangling::get_type_name(typeid(*obj)));
+    }
+}
+
+
+/**
+ * Performs polymorphic type conversion of a unique_ptr.
+ *
+ * @tparam T  requested result type
+ * @tparam U  static type of the passed object
+ *
+ * @param obj  the unique_ptr to the object which should be converted.
+ *             If successful, it will be reset to a nullptr.
+ *
+ * @return If successful, returns a unique_ptr to the subtype, otherwise throws
+ *         NotSupported.
+ */
+template <typename T, typename U>
+inline std::unique_ptr<typename std::decay<T>::type> as(
+    std::unique_ptr<U> &&obj)
+{
+    if (auto p = dynamic_cast<typename std::decay<T>::type *>(obj.get())) {
+        obj.release();
+        return std::unique_ptr<typename std::decay<T>::type>{p};
+    } else {
+        throw NotSupported(__FILE__, __LINE__, __func__,
+                           name_demangling::get_type_name(typeid(*obj)));
+    }
+}
+
+
+/**
+ * Performs polymorphic type conversion of a shared_ptr.
+ *
+ * @tparam T  requested result type
+ * @tparam U  static type of the passed object
+ *
+ * @param obj  the shared_ptr to the object which should be converted.
+ *
+ * @return If successful, returns a shared_ptr to the subtype, otherwise throws
+ *         NotSupported. This pointer shares ownership with the input pointer.
+ */
+template <typename T, typename U>
+inline std::shared_ptr<typename std::decay<T>::type> as(std::shared_ptr<U> obj)
+{
+    auto ptr = std::dynamic_pointer_cast<typename std::decay<T>::type>(obj);
+    if (ptr) {
+        return ptr;
+    } else {
+        throw NotSupported(__FILE__, __LINE__, __func__,
+                           name_demangling::get_type_name(typeid(*obj)));
+    }
+}
+
+
+/**
+ * Performs polymorphic type conversion of a shared_ptr.
+ *
+ * This is the constant version of the function.
+ *
+ * @tparam T  requested result type
+ * @tparam U  static type of the passed object
+ *
+ * @param obj  the shared_ptr to the object which should be converted.
+ *
+ * @return If successful, returns a shared_ptr to the subtype, otherwise throws
+ *         NotSupported. This pointer shares ownership with the input pointer.
+ */
+template <typename T, typename U>
+inline std::shared_ptr<const typename std::decay<T>::type> as(
+    std::shared_ptr<const U> obj)
+{
+    auto ptr =
+        std::dynamic_pointer_cast<const typename std::decay<T>::type>(obj);
+    if (ptr) {
+        return ptr;
+    } else {
+        throw NotSupported(__FILE__, __LINE__, __func__,
+                           name_demangling::get_type_name(typeid(*obj)));
+    }
+}
+
+
+/**
+ * This is a deleter that does not delete the object.
+ *
+ * It is useful where the object has been allocated elsewhere and will be
+ * deleted manually.
+ */
+template <typename T>
+class null_deleter {
+public:
+    using pointer = T *;
+
+    /**
+     * Deletes the object.
+     *
+     * @param ptr  pointer to the object being deleted
+     */
+    void operator()(pointer) const noexcept {}
+};
+
+// a specialization for arrays
+template <typename T>
+class null_deleter<T[]> {
+public:
+    using pointer = T[];
+
+    void operator()(pointer) const noexcept {}
+};
+
+
+}  // namespace gko
+
+
+#endif  // GKO_CORE_BASE_UTILS_HELPER_HPP_
diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp
index d84e51c8ed6..f9df5660fd2 100644
--- a/include/ginkgo/core/log/logger.hpp
+++ b/include/ginkgo/core/log/logger.hpp
@@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/types.hpp>
-#include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/base/utils_helper.hpp>
 
 
 namespace gko {

From 7d6b209ce813a470d4f982481402fd81db928128 Mon Sep 17 00:00:00 2001
From: Terry Cojean <terry.cojean@kit.edu>
Date: Wed, 25 Nov 2020 18:39:00 +0100
Subject: [PATCH 2/5] Review updates.

+ Do not use `operator==`, but a funciton `memory_accessible` instead.
+ Make DPC++ host and CPU be memory compatible.
+ Use pointers for the interface instead of references.
+ Ensure DPC++ tests always work.
+ Fix some typos.

Co-authored-by: Yuhsiang M. Tsai <yhmtsai@gmail.com>
Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
Co-authored-by: Pratik Nayak <pratikvn@protonmail.com>
Co-authored-by: Aditya Kashi <aditya.kashi@kit.edu>
---
 core/test/base/array.cpp                     |  2 +-
 core/test/base/executor.cpp                  | 99 +++++++++++---------
 dpcpp/base/executor.dp.cpp                   | 28 +++---
 include/ginkgo/core/base/executor.hpp        | 17 ++--
 include/ginkgo/core/base/temporary_clone.hpp |  2 +-
 5 files changed, 81 insertions(+), 67 deletions(-)

diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index c20782651f7..54f7cfea053 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -284,7 +284,7 @@ TYPED_TEST(Array, DoesNotCreateATemporaryCloneBetweenSameMemory)
 }
 
 
-TYPED_TEST(Array, CanDoesNotCopyBackTemporaryCloneBetweenSameMemory)
+TYPED_TEST(Array, DoesNotCopyBackTemporaryCloneBetweenSameMemory)
 {
     auto omp = gko::OmpExecutor::create();
 
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 73c82a06fa8..783e20f1008 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -480,56 +480,67 @@ TEST(Executor, CanVerifyMemory)
     auto omp = gko::OmpExecutor::create();
     auto hip = gko::HipExecutor::create(0, omp);
     auto cuda = gko::CudaExecutor::create(0, omp);
-    auto cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu");
-    auto host_dpcpp = gko::DpcppExecutor::create(0, omp, "host");
-    auto gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
     auto omp2 = gko::OmpExecutor::create();
     auto hip2 = gko::HipExecutor::create(0, omp);
     auto cuda2 = gko::CudaExecutor::create(0, omp);
-    std::shared_ptr<gko::HipExecutor> hip_1 = gko::HipExecutor::create(1, omp);
-    std::shared_ptr<gko::CudaExecutor> cuda_1 =
-        gko::CudaExecutor::create(1, omp);
-
-    ASSERT_EQ(true, *ref == *omp);
-    ASSERT_EQ(true, *omp == *ref);
-    ASSERT_EQ(false, *ref == *hip);
-    ASSERT_EQ(false, *hip == *ref);
-    ASSERT_EQ(false, *omp == *hip);
-    ASSERT_EQ(false, *hip == *omp);
-    ASSERT_EQ(false, *ref == *cuda);
-    ASSERT_EQ(false, *cuda == *ref);
-    ASSERT_EQ(false, *omp == *cuda);
-    ASSERT_EQ(false, *cuda == *omp);
-    ASSERT_EQ(true, *cpu_dpcpp == *ref);
-    ASSERT_EQ(true, *host_dpcpp == *ref);
-    ASSERT_EQ(false, *gpu_dpcpp == *ref);
-    ASSERT_EQ(true, *ref == *cpu_dpcpp);
-    ASSERT_EQ(true, *ref == *host_dpcpp);
-    ASSERT_EQ(false, *ref == *gpu_dpcpp);
-    ASSERT_EQ(true, *cpu_dpcpp == *omp);
-    ASSERT_EQ(true, *host_dpcpp == *omp);
-    ASSERT_EQ(false, *gpu_dpcpp == *omp);
-    ASSERT_EQ(true, *omp == *cpu_dpcpp);
-    ASSERT_EQ(true, *omp == *host_dpcpp);
-    ASSERT_EQ(false, *omp == *gpu_dpcpp);
+    auto hip_1 = gko::HipExecutor::create(1, omp);
+    auto cuda_1 = gko::CudaExecutor::create(1, omp);
+    std::shared_ptr<gko::DpcppExecutor> host_dpcpp;
+    std::shared_ptr<gko::DpcppExecutor> cpu_dpcpp;
+    std::shared_ptr<gko::DpcppExecutor> gpu_dpcpp;
+    if (gko::DpcppExecutor::get_num_devices("host"))
+        host_dpcpp = gko::DpcppExecutor::create(0, omp, "host");
+    if (gko::DpcppExecutor::get_num_devices("cpu"))
+        cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu");
+    if (gko::DpcppExecutor::get_num_devices("gpu"))
+        gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
+
+    ASSERT_EQ(true, ref->memory_accessible(omp));
+    ASSERT_EQ(true, omp->memory_accessible(ref));
+    ASSERT_EQ(false, ref->memory_accessible(hip));
+    ASSERT_EQ(false, hip->memory_accessible(ref));
+    ASSERT_EQ(false, omp->memory_accessible(hip));
+    ASSERT_EQ(false, hip->memory_accessible(omp));
+    ASSERT_EQ(false, ref->memory_accessible(cuda));
+    ASSERT_EQ(false, cuda->memory_accessible(ref));
+    ASSERT_EQ(false, omp->memory_accessible(cuda));
+    ASSERT_EQ(false, cuda->memory_accessible(omp));
+    if (gko::DpcppExecutor::get_num_devices("host")) {
+        ASSERT_EQ(true, host_dpcpp->memory_accessible(ref));
+        ASSERT_EQ(true, ref->memory_accessible(host_dpcpp));
+        ASSERT_EQ(true, host_dpcpp->memory_accessible(omp));
+        ASSERT_EQ(true, omp->memory_accessible(host_dpcpp));
+    }
+    if (gko::DpcppExecutor::get_num_devices("cpu")) {
+        ASSERT_EQ(true, ref->memory_accessible(cpu_dpcpp));
+        ASSERT_EQ(true, cpu_dpcpp->memory_accessible(ref));
+        ASSERT_EQ(true, cpu_dpcpp->memory_accessible(omp));
+        ASSERT_EQ(true, omp->memory_accessible(cpu_dpcpp));
+    }
+    if (gko::DpcppExecutor::get_num_devices("gpu")) {
+        ASSERT_EQ(false, gpu_dpcpp->memory_accessible(ref));
+        ASSERT_EQ(false, ref->memory_accessible(gpu_dpcpp));
+        ASSERT_EQ(false, gpu_dpcpp->memory_accessible(omp));
+        ASSERT_EQ(false, omp->memory_accessible(gpu_dpcpp));
+    }
 #if GINKGO_HIP_PLATFORM_NVCC
-    ASSERT_EQ(true, *hip == *cuda);
-    ASSERT_EQ(true, *cuda == *hip);
-    ASSERT_EQ(true, *hip_1 == *cuda_1);
-    ASSERT_EQ(true, *cuda_1 == *hip_1);
+    ASSERT_EQ(true, hip->memory_accessible(cuda));
+    ASSERT_EQ(true, cuda->memory_accessible(hip));
+    ASSERT_EQ(true, hip_1->memory_accessible(cuda_1));
+    ASSERT_EQ(true, cuda_1->memory_accessible(hip_1));
 #else
-    ASSERT_EQ(false, *hip == *cuda);
-    ASSERT_EQ(false, *cuda == *hip);
-    ASSERT_EQ(false, *hip_1 == *cuda_1);
-    ASSERT_EQ(false, *cuda_1 == *hip_1);
+    ASSERT_EQ(false, hip->memory_accessible(cuda));
+    ASSERT_EQ(false, cuda->memory_accessible(hip));
+    ASSERT_EQ(false, hip_1->memory_accessible(cuda_1));
+    ASSERT_EQ(false, cuda_1->memory_accessible(hip_1));
 #endif
-    ASSERT_EQ(true, *omp == *omp2);
-    ASSERT_EQ(true, *hip == *hip2);
-    ASSERT_EQ(true, *cuda == *cuda2);
-    ASSERT_EQ(false, *hip == *hip_1);
-    ASSERT_EQ(false, *cuda == *hip_1);
-    ASSERT_EQ(false, *cuda == *cuda_1);
-    ASSERT_EQ(false, *hip == *cuda_1);
+    ASSERT_EQ(true, omp->memory_accessible(omp2));
+    ASSERT_EQ(true, hip->memory_accessible(hip2));
+    ASSERT_EQ(true, cuda->memory_accessible(cuda2));
+    ASSERT_EQ(false, hip->memory_accessible(hip_1));
+    ASSERT_EQ(false, cuda->memory_accessible(hip_1));
+    ASSERT_EQ(false, cuda->memory_accessible(cuda_1));
+    ASSERT_EQ(false, hip->memory_accessible(cuda_1));
 }
 
 
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 1381f079945..25e106a2546 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -80,7 +80,7 @@ void OmpExecutor::raw_copy_to(const DpcppExecutor *dest, size_type num_bytes,
 bool OmpExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
 {
     auto device = detail::get_devices(
-        dest_exec->getdevice_type())[dest_exec->get_device_id()];
+        dest_exec->get_device_type())[dest_exec->get_device_id()];
     return device.is_host() || device.is_cpu();
 }
 
@@ -177,9 +177,11 @@ bool DpcppExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
     auto device = detail::get_devices(device_type_)[device_id_];
     auto other_device = detail::get_devices(
         dest_exec->get_device_type())[dest_exec->get_device_id()];
-    return device.get_info<cl::sycl::info::device::device_type>() ==
-               other_device.get_info<cl::sycl::info::device::device_type>() &&
-           device.get() == other_device.get();
+    return ((device.is_host() || device.is_cpu()) &&
+            (other_device.is_host() || other_device.is_cpu())) ||
+           (device.get_info<cl::sycl::info::device::device_type>() ==
+                other_device.get_info<cl::sycl::info::device::device_type>() &&
+            device.get() == other_device.get());
 }
 
 
@@ -200,11 +202,13 @@ void DpcppExecutor::set_device_property()
 {
     assert(device_id_ < DpcppExecutor::get_num_devices(device_type_));
     auto device = detail::get_devices(device_type_)[device_id_];
-    try {
-        subgroup_sizes_ =
-            device.get_info<cl::sycl::info::device::sub_group_sizes>();
-    } catch (cl::sycl::runtime_error &err) {
-        GKO_NOT_SUPPORTED(device);
+    if (!device.is_host()) {
+        try {
+            subgroup_sizes_ =
+                device.get_info<cl::sycl::info::device::sub_group_sizes>();
+        } catch (cl::sycl::runtime_error &err) {
+            GKO_NOT_SUPPORTED(device);
+        }
     }
     num_computing_units_ =
         device.get_info<sycl::info::device::max_compute_units>();
@@ -214,10 +218,8 @@ void DpcppExecutor::set_device_property()
     for (std::size_t i = 0; i < 3; i++) {
         max_workitem_sizes_.push_back(max_workitem_sizes[i]);
     }
-    if (!device.is_host()) {
-        max_workgroup_size_ =
-            device.get_info<sycl::info::device::max_work_group_size>();
-    }
+    max_workgroup_size_ =
+        device.get_info<sycl::info::device::max_work_group_size>();
     // Here we declare the queue with the property `in_order` which ensures the
     // kernels are executed in the submission order. Otherwise, calls to
     // `wait()` would be needed after every call to a DPC++ function or kernel.
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index dd0293d30ee..2ff58f6fe82 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -634,14 +634,15 @@ class Executor : public log::EnableLogging<Executor> {
     virtual void synchronize() const = 0;
 
     /**
-     * Overload the equal-to operator which verifies whether the executors share
-     * the same memory.
+     * Verifies whether the executors share the same memory.
      *
      * @param other  the other Executor to compare against
+     *
+     * @return whether the executors this and other share the same memory.
      */
-    bool operator==(const Executor &other) const
+    bool memory_accessible(const std::shared_ptr<const Executor> &other) const
     {
-        return this->verify_memory_from(other);
+        return this->verify_memory_from(other.get());
     }
 
 protected:
@@ -702,7 +703,7 @@ class Executor : public log::EnableLogging<Executor> {
      *
      * @return whether this executor and src_exec share the same memory.
      */
-    virtual bool verify_memory_from(const Executor &src_exec) const = 0;
+    virtual bool verify_memory_from(const Executor *src_exec) const = 0;
 
 /**
  * @internal
@@ -718,7 +719,7 @@ class Executor : public log::EnableLogging<Executor> {
 
     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
 
-    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor);
+    GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
 
 #undef GKO_ENABLE_VERIFY_MEMORY_TO
 
@@ -870,9 +871,9 @@ class ExecutorBase : public Executor {
         src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
     }
 
-    bool verify_memory_from(const Executor &src_exec) const override
+    bool verify_memory_from(const Executor *src_exec) const override
     {
-        return src_exec.verify_memory_to(self());
+        return src_exec->verify_memory_to(self());
     }
 
 private:
diff --git a/include/ginkgo/core/base/temporary_clone.hpp b/include/ginkgo/core/base/temporary_clone.hpp
index a0f59f73919..33f80848336 100644
--- a/include/ginkgo/core/base/temporary_clone.hpp
+++ b/include/ginkgo/core/base/temporary_clone.hpp
@@ -140,7 +140,7 @@ class temporary_clone {
      */
     explicit temporary_clone(std::shared_ptr<const Executor> exec, pointer ptr)
     {
-        if (*ptr->get_executor() == *exec) {
+        if (ptr->get_executor()->memory_accessible(exec)) {
             // just use the object we already have
             handle_ = handle_type(ptr, null_deleter<T>());
         } else {

From bd4936fe5354fd20282367889b704bfbbfc95974 Mon Sep 17 00:00:00 2001
From: ginkgo-bot <ginkgo.library@gmail.com>
Date: Wed, 2 Dec 2020 11:48:39 +0000
Subject: [PATCH 3/5] Format files

Co-authored-by: tcojean <tcojean@users.noreply.github.com>
---
 cuda/test/base/array.cu                      | 2 ++
 include/ginkgo/core/base/temporary_clone.hpp | 6 +++---
 include/ginkgo/core/base/utils_helper.hpp    | 6 +++---
 include/ginkgo/ginkgo.hpp                    | 2 ++
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/cuda/test/base/array.cu b/cuda/test/base/array.cu
index c4757c42b44..73770b36bb5 100644
--- a/cuda/test/base/array.cu
+++ b/cuda/test/base/array.cu
@@ -34,6 +34,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <gtest/gtest.h>
+
+
 #include <ginkgo/core/base/executor.hpp>
 
 
diff --git a/include/ginkgo/core/base/temporary_clone.hpp b/include/ginkgo/core/base/temporary_clone.hpp
index 33f80848336..fa0c2e32b13 100644
--- a/include/ginkgo/core/base/temporary_clone.hpp
+++ b/include/ginkgo/core/base/temporary_clone.hpp
@@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_BASE_TEMPORARY_CLONE_HPP_
-#define GKO_CORE_BASE_TEMPORARY_CLONE_HPP_
+#ifndef GKO_PUBLIC_CORE_BASE_TEMPORARY_CLONE_HPP_
+#define GKO_PUBLIC_CORE_BASE_TEMPORARY_CLONE_HPP_
 
 
 #include <functional>
@@ -199,4 +199,4 @@ detail::temporary_clone<T> make_temporary_clone(
 }  // namespace gko
 
 
-#endif  // GKO_CORE_BASE_TEMPORARY_CLONE_HPP_
+#endif  // GKO_PUBLIC_CORE_BASE_TEMPORARY_CLONE_HPP_
diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp
index 01c6b5cf5f5..d277d20cb4c 100644
--- a/include/ginkgo/core/base/utils_helper.hpp
+++ b/include/ginkgo/core/base/utils_helper.hpp
@@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_BASE_UTILS_HELPER_HPP_
-#define GKO_CORE_BASE_UTILS_HELPER_HPP_
+#ifndef GKO_PUBLIC_CORE_BASE_UTILS_HELPER_HPP_
+#define GKO_PUBLIC_CORE_BASE_UTILS_HELPER_HPP_
 
 
 #include <functional>
@@ -435,4 +435,4 @@ class null_deleter<T[]> {
 }  // namespace gko
 
 
-#endif  // GKO_CORE_BASE_UTILS_HELPER_HPP_
+#endif  // GKO_PUBLIC_CORE_BASE_UTILS_HELPER_HPP_
diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp
index 40f415b2156..0602875c4dd 100644
--- a/include/ginkgo/ginkgo.hpp
+++ b/include/ginkgo/ginkgo.hpp
@@ -56,8 +56,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/range.hpp>
 #include <ginkgo/core/base/range_accessors.hpp>
 #include <ginkgo/core/base/std_extensions.hpp>
+#include <ginkgo/core/base/temporary_clone.hpp>
 #include <ginkgo/core/base/types.hpp>
 #include <ginkgo/core/base/utils.hpp>
+#include <ginkgo/core/base/utils_helper.hpp>
 #include <ginkgo/core/base/version.hpp>
 
 #include <ginkgo/core/factorization/ilu.hpp>

From 8608400e1a2eb4214a4941465b1b1eef3812c48a Mon Sep 17 00:00:00 2001
From: Terry Cojean <terry.cojean@kit.edu>
Date: Tue, 8 Dec 2020 09:15:34 +0100
Subject: [PATCH 4/5] Review updates.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some code style issues.

Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@kit.edu>
---
 core/test/base/executor.cpp | 9 ++++++---
 dpcpp/base/executor.dp.cpp  | 1 -
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 783e20f1008..7e2f4ac23b4 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -488,12 +488,15 @@ TEST(Executor, CanVerifyMemory)
     std::shared_ptr<gko::DpcppExecutor> host_dpcpp;
     std::shared_ptr<gko::DpcppExecutor> cpu_dpcpp;
     std::shared_ptr<gko::DpcppExecutor> gpu_dpcpp;
-    if (gko::DpcppExecutor::get_num_devices("host"))
+    if (gko::DpcppExecutor::get_num_devices("host")) {
         host_dpcpp = gko::DpcppExecutor::create(0, omp, "host");
-    if (gko::DpcppExecutor::get_num_devices("cpu"))
+    }
+    if (gko::DpcppExecutor::get_num_devices("cpu")) {
         cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu");
-    if (gko::DpcppExecutor::get_num_devices("gpu"))
+    }
+    if (gko::DpcppExecutor::get_num_devices("gpu")) {
         gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
+    }
 
     ASSERT_EQ(true, ref->memory_accessible(omp));
     ASSERT_EQ(true, omp->memory_accessible(ref));
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 25e106a2546..39f9120701a 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -162,7 +162,6 @@ int DpcppExecutor::get_num_devices(std::string device_type)
 bool DpcppExecutor::verify_memory_to(const OmpExecutor *dest_exec) const
 {
     auto device = detail::get_devices(device_type_)[device_id_];
-
     return device.is_host() || device.is_cpu();
 }
 

From 018244f0191765749588edf2c6d34d9330781e69 Mon Sep 17 00:00:00 2001
From: Terry Cojean <terry.cojean@kit.edu>
Date: Tue, 8 Dec 2020 19:11:41 +0100
Subject: [PATCH 5/5] Isolate Reference from other executors in memory.

---
 core/device_hooks/cuda_hooks.cpp      |  3 --
 core/device_hooks/dpcpp_hooks.cpp     |  6 ----
 core/device_hooks/hip_hooks.cpp       |  3 --
 core/devices/cuda/executor.cpp        | 16 ++++++++++
 core/devices/hip/executor.cpp         | 16 ++++++++++
 core/test/base/array.cpp              |  8 ++---
 core/test/base/executor.cpp           | 12 +++----
 core/test/base/lin_op.cpp             | 10 +++---
 core/test/base/utils.cpp              |  5 +--
 cuda/base/executor.cpp                | 10 ------
 dpcpp/base/executor.dp.cpp            |  6 ----
 hip/base/executor.hip.cpp             | 10 ------
 include/ginkgo/core/base/executor.hpp | 46 +++++++++++++++++----------
 13 files changed, 80 insertions(+), 71 deletions(-)

diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp
index a5c33c6541d..46e342d9c0a 100644
--- a/core/device_hooks/cuda_hooks.cpp
+++ b/core/device_hooks/cuda_hooks.cpp
@@ -98,9 +98,6 @@ void CudaExecutor::raw_copy_to(const DpcppExecutor *, size_type num_bytes,
     GKO_NOT_COMPILED(cuda);
 
 
-bool CudaExecutor::verify_memory_to(const HipExecutor *) const { return false; }
-
-
 void CudaExecutor::synchronize() const GKO_NOT_COMPILED(cuda);
 
 
diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp
index 472451c6513..88caac1ccd3 100644
--- a/core/device_hooks/dpcpp_hooks.cpp
+++ b/core/device_hooks/dpcpp_hooks.cpp
@@ -126,12 +126,6 @@ bool DpcppExecutor::verify_memory_to(const OmpExecutor *dest_exec) const
     return device_type_ == "cpu" || device_type_ == "host";
 }
 
-bool DpcppExecutor::verify_memory_to(const ReferenceExecutor *dest_exec) const
-{
-    // Dummy check
-    return device_type_ == "cpu" || device_type_ == "host";
-}
-
 bool DpcppExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
 {
     // Dummy check
diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp
index 45180980159..b98b2fcb8a4 100644
--- a/core/device_hooks/hip_hooks.cpp
+++ b/core/device_hooks/hip_hooks.cpp
@@ -95,9 +95,6 @@ void HipExecutor::raw_copy_to(const DpcppExecutor *, size_type num_bytes,
     GKO_NOT_COMPILED(hip);
 
 
-bool HipExecutor::verify_memory_to(const CudaExecutor *) const { return false; }
-
-
 void HipExecutor::synchronize() const GKO_NOT_COMPILED(hip);
 
 
diff --git a/core/devices/cuda/executor.cpp b/core/devices/cuda/executor.cpp
index 3566578a681..2d6af2b91ef 100644
--- a/core/devices/cuda/executor.cpp
+++ b/core/devices/cuda/executor.cpp
@@ -48,6 +48,22 @@ std::shared_ptr<const Executor> CudaExecutor::get_master() const noexcept
 }
 
 
+bool CudaExecutor::verify_memory_to(const CudaExecutor *dest_exec) const
+{
+    return device_id_ == dest_exec->get_device_id();
+}
+
+
+bool CudaExecutor::verify_memory_to(const HipExecutor *dest_exec) const
+{
+#if GINKGO_HIP_PLATFORM_NVCC
+    return device_id_ == dest_exec->get_device_id();
+#else
+    return false;
+#endif
+}
+
+
 unsigned CudaExecutor::num_execs[max_devices];
 
 
diff --git a/core/devices/hip/executor.cpp b/core/devices/hip/executor.cpp
index f4787523290..b9a643eb7e2 100644
--- a/core/devices/hip/executor.cpp
+++ b/core/devices/hip/executor.cpp
@@ -45,6 +45,22 @@ std::shared_ptr<const Executor> HipExecutor::get_master() const noexcept
 }
 
 
+bool HipExecutor::verify_memory_to(const HipExecutor *dest_exec) const
+{
+    return device_id_ == dest_exec->get_device_id();
+}
+
+
+bool HipExecutor::verify_memory_to(const CudaExecutor *dest_exec) const
+{
+#if GINKGO_HIP_PLATFORM_NVCC
+    return device_id_ == dest_exec->get_device_id();
+#else
+    return false;
+#endif
+}
+
+
 int HipExecutor::num_execs[max_devices];
 
 
diff --git a/core/test/base/array.cpp b/core/test/base/array.cpp
index 54f7cfea053..c20f5ca72a9 100644
--- a/core/test/base/array.cpp
+++ b/core/test/base/array.cpp
@@ -275,9 +275,9 @@ TYPED_TEST(Array, CanCreateTemporaryCloneOnSameExecutor)
 // For tests between different memory, check cuda/test/base/array.cu
 TYPED_TEST(Array, DoesNotCreateATemporaryCloneBetweenSameMemory)
 {
-    auto omp = gko::OmpExecutor::create();
+    auto other = gko::ReferenceExecutor::create();
 
-    auto tmp_clone = make_temporary_clone(omp, &this->x);
+    auto tmp_clone = make_temporary_clone(other, &this->x);
 
     this->assert_equal_to_original_x(*tmp_clone.get());
     ASSERT_EQ(tmp_clone.get(), &this->x);
@@ -286,10 +286,10 @@ TYPED_TEST(Array, DoesNotCreateATemporaryCloneBetweenSameMemory)
 
 TYPED_TEST(Array, DoesNotCopyBackTemporaryCloneBetweenSameMemory)
 {
-    auto omp = gko::OmpExecutor::create();
+    auto other = gko::ReferenceExecutor::create();
 
     {
-        auto tmp_clone = make_temporary_clone(omp, &this->x);
+        auto tmp_clone = make_temporary_clone(other, &this->x);
         // change x, and check that there is no copy-back to overwrite it again
         this->x.get_data()[0] = 0;
     }
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 7e2f4ac23b4..488334c9ab9 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -498,8 +498,8 @@ TEST(Executor, CanVerifyMemory)
         gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
     }
 
-    ASSERT_EQ(true, ref->memory_accessible(omp));
-    ASSERT_EQ(true, omp->memory_accessible(ref));
+    ASSERT_EQ(false, ref->memory_accessible(omp));
+    ASSERT_EQ(false, omp->memory_accessible(ref));
     ASSERT_EQ(false, ref->memory_accessible(hip));
     ASSERT_EQ(false, hip->memory_accessible(ref));
     ASSERT_EQ(false, omp->memory_accessible(hip));
@@ -509,14 +509,14 @@ TEST(Executor, CanVerifyMemory)
     ASSERT_EQ(false, omp->memory_accessible(cuda));
     ASSERT_EQ(false, cuda->memory_accessible(omp));
     if (gko::DpcppExecutor::get_num_devices("host")) {
-        ASSERT_EQ(true, host_dpcpp->memory_accessible(ref));
-        ASSERT_EQ(true, ref->memory_accessible(host_dpcpp));
+        ASSERT_EQ(false, host_dpcpp->memory_accessible(ref));
+        ASSERT_EQ(false, ref->memory_accessible(host_dpcpp));
         ASSERT_EQ(true, host_dpcpp->memory_accessible(omp));
         ASSERT_EQ(true, omp->memory_accessible(host_dpcpp));
     }
     if (gko::DpcppExecutor::get_num_devices("cpu")) {
-        ASSERT_EQ(true, ref->memory_accessible(cpu_dpcpp));
-        ASSERT_EQ(true, cpu_dpcpp->memory_accessible(ref));
+        ASSERT_EQ(false, ref->memory_accessible(cpu_dpcpp));
+        ASSERT_EQ(false, cpu_dpcpp->memory_accessible(ref));
         ASSERT_EQ(true, cpu_dpcpp->memory_accessible(omp));
         ASSERT_EQ(true, omp->memory_accessible(cpu_dpcpp));
     }
diff --git a/core/test/base/lin_op.cpp b/core/test/base/lin_op.cpp
index 3da8eda55f0..e608654ca1f 100644
--- a/core/test/base/lin_op.cpp
+++ b/core/test/base/lin_op.cpp
@@ -93,8 +93,8 @@ class EnableLinOp : public ::testing::Test {
 protected:
     EnableLinOp()
         : ref{gko::ReferenceExecutor::create()},
-          omp{gko::OmpExecutor::create()},
-          op{DummyLinOp::create(omp, gko::dim<2>{3, 5})},
+          ref2{gko::ReferenceExecutor::create()},
+          op{DummyLinOp::create(ref2, gko::dim<2>{3, 5})},
           alpha{DummyLinOp::create(ref, gko::dim<2>{1})},
           beta{DummyLinOp::create(ref, gko::dim<2>{1})},
           b{DummyLinOp::create(ref, gko::dim<2>{5, 4})},
@@ -102,7 +102,7 @@ class EnableLinOp : public ::testing::Test {
     {}
 
     std::shared_ptr<const gko::ReferenceExecutor> ref;
-    std::shared_ptr<const gko::OmpExecutor> omp;
+    std::shared_ptr<const gko::ReferenceExecutor> ref2;
     std::unique_ptr<DummyLinOp> op;
     std::unique_ptr<DummyLinOp> alpha;
     std::unique_ptr<DummyLinOp> beta;
@@ -115,7 +115,7 @@ TEST_F(EnableLinOp, CallsApplyImpl)
 {
     op->apply(gko::lend(b), gko::lend(x));
 
-    ASSERT_EQ(op->last_access, omp);
+    ASSERT_EQ(op->last_access, ref2);
 }
 
 
@@ -123,7 +123,7 @@ TEST_F(EnableLinOp, CallsExtendedApplyImpl)
 {
     op->apply(gko::lend(alpha), gko::lend(b), gko::lend(beta), gko::lend(x));
 
-    ASSERT_EQ(op->last_access, omp);
+    ASSERT_EQ(op->last_access, ref2);
 }
 
 
diff --git a/core/test/base/utils.cpp b/core/test/base/utils.cpp
index 822449e8f24..a3bafcb5b69 100644
--- a/core/test/base/utils.cpp
+++ b/core/test/base/utils.cpp
@@ -402,9 +402,10 @@ class TemporaryClone : public ::testing::Test {
 
 TEST_F(TemporaryClone, DoesNotCopyToSameMemory)
 {
-    auto clone = make_temporary_clone(omp, gko::lend(obj));
+    auto other = gko::ReferenceExecutor::create();
+    auto clone = make_temporary_clone(other, gko::lend(obj));
 
-    ASSERT_NE(clone.get()->get_executor(), omp);
+    ASSERT_NE(clone.get()->get_executor(), other);
     ASSERT_EQ(obj->get_executor(), ref);
 }
 
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 5284daa2dd1..7926ae98a21 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -162,16 +162,6 @@ void CudaExecutor::raw_copy_to(const CudaExecutor *dest, size_type num_bytes,
 }
 
 
-bool CudaExecutor::verify_memory_to(const HipExecutor *dest_exec) const
-{
-#if GINKGO_HIP_PLATFORM_NVCC
-    return device_id_ == dest_exec->get_device_id();
-#else
-    return false;
-#endif
-}
-
-
 void CudaExecutor::synchronize() const
 {
     cuda::device_guard g(this->get_device_id());
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 39f9120701a..92089616c02 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -165,12 +165,6 @@ bool DpcppExecutor::verify_memory_to(const OmpExecutor *dest_exec) const
     return device.is_host() || device.is_cpu();
 }
 
-bool DpcppExecutor::verify_memory_to(const ReferenceExecutor *dest_exec) const
-{
-    auto device = detail::get_devices(device_type_)[device_id_];
-    return device.is_host() || device.is_cpu();
-}
-
 bool DpcppExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
 {
     auto device = detail::get_devices(device_type_)[device_id_];
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 8f8bbbf7d9d..3e400986e2b 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -162,16 +162,6 @@ void HipExecutor::raw_copy_to(const HipExecutor *dest, size_type num_bytes,
 }
 
 
-bool HipExecutor::verify_memory_to(const CudaExecutor *dest_exec) const
-{
-#if GINKGO_HIP_PLATFORM_NVCC
-    return device_id_ == dest_exec->get_device_id();
-#else
-    return false;
-#endif
-}
-
-
 void HipExecutor::synchronize() const
 {
     hip::device_guard g(this->get_device_id());
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index 2ff58f6fe82..f02c5834c73 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -468,6 +468,7 @@ class Executor : public log::EnableLogging<Executor> {
     friend class detail::ExecutorBase;
 
     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
+    friend class ReferenceExecutor;
 
 public:
     virtual ~Executor() = default;
@@ -855,6 +856,7 @@ namespace detail {
 template <typename ConcreteExecutor>
 class ExecutorBase : public Executor {
     GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
+    friend class ReferenceExecutor;
 
 public:
     void run(const Operation &op) const override
@@ -871,7 +873,7 @@ class ExecutorBase : public Executor {
         src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
     }
 
-    bool verify_memory_from(const Executor *src_exec) const override
+    virtual bool verify_memory_from(const Executor *src_exec) const override
     {
         return src_exec->verify_memory_to(self());
     }
@@ -935,10 +937,13 @@ class EnableDeviceReset {
                      const void *src_ptr, void *dest_ptr) const override
 
 
-#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                       \
-    bool verify_memory_to(const dest_ *other) const override { return bool_; } \
-    static_assert(true,                                                        \
-                  "This assert is used to counter the false positive extra "   \
+#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_)                     \
+    virtual bool verify_memory_to(const dest_ *other) const override         \
+    {                                                                        \
+        return bool_;                                                        \
+    }                                                                        \
+    static_assert(true,                                                      \
+                  "This assert is used to counter the false positive extra " \
                   "semi-colon warnings")
 
 
@@ -979,7 +984,7 @@ class OmpExecutor : public detail::ExecutorBase<OmpExecutor>,
 
     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, true);
 
-    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
 
     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
 
@@ -1020,6 +1025,21 @@ class ReferenceExecutor : public OmpExecutor {
 
 protected:
     ReferenceExecutor() = default;
+
+    bool verify_memory_from(const Executor *src_exec) const override
+    {
+        return src_exec->verify_memory_to(this);
+    }
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, true);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor, false);
+
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
 };
 
 
@@ -1159,10 +1179,7 @@ class CudaExecutor : public detail::ExecutorBase<CudaExecutor>,
 
     bool verify_memory_to(const HipExecutor *dest_exec) const override;
 
-    bool verify_memory_to(const CudaExecutor *dest_exec) const override
-    {
-        return device_id_ == dest_exec->get_device_id();
-    }
+    bool verify_memory_to(const CudaExecutor *dest_exec) const override;
 
     static void increase_num_execs(unsigned device_id)
     {
@@ -1338,10 +1355,7 @@ class HipExecutor : public detail::ExecutorBase<HipExecutor>,
 
     bool verify_memory_to(const CudaExecutor *dest_exec) const override;
 
-    bool verify_memory_to(const HipExecutor *dest_exec) const override
-    {
-        return device_id_ == dest_exec->get_device_id();
-    }
+    bool verify_memory_to(const HipExecutor *dest_exec) const override;
 
     static void increase_num_execs(int device_id)
     {
@@ -1507,9 +1521,9 @@ class DpcppExecutor : public detail::ExecutorBase<DpcppExecutor>,
 
     GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor, false);
 
-    bool verify_memory_to(const OmpExecutor *dest_exec) const override;
+    GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor, false);
 
-    bool verify_memory_to(const ReferenceExecutor *dest_exec) const override;
+    bool verify_memory_to(const OmpExecutor *dest_exec) const override;
 
     bool verify_memory_to(const DpcppExecutor *dest_exec) const override;