Prevent benchmark logger crashes with OpenMP #565

Merged · 1 commit · Jun 16, 2020
11 changes: 10 additions & 1 deletion benchmark/utils/loggers.hpp
@@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <chrono>
+#include <mutex>
 #include <regex>
 #include <unordered_map>
 
@@ -104,6 +105,7 @@ struct OperationLogger : gko::log::Logger {
                     rapidjson::MemoryPoolAllocator<> &alloc,
                     gko::uint32 repetitions)
     {
+        const std::lock_guard<std::mutex> lock(mutex);
         for (const auto &entry : total) {
             add_or_set_member(
                 object, entry.first.c_str(),
@@ -124,6 +126,7 @@ struct OperationLogger : gko::log::Logger {
                                 const std::string &name) const
     {
         exec->synchronize();
+        const std::lock_guard<std::mutex> lock(mutex);
         auto nested_name = nested.empty() || !use_nested_name
                                ? name
                                : nested.back().first + "::" + name;
@@ -133,9 +136,10 @@
 
     void end_operation(const gko::Executor *exec, const std::string &name) const
     {
+        exec->synchronize();
+        const std::lock_guard<std::mutex> lock(mutex);
         // if operations are properly nested, nested_name now ends with name
         auto nested_name = nested.back().first;
-        exec->synchronize();
         const auto end = std::chrono::steady_clock::now();
         const auto diff = end - start[nested_name];
         // make sure timings for nested operations are not counted twice
@@ -147,6 +151,7 @@
     }
 
     bool use_nested_name;
+    mutable std::mutex mutex;
     mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
     mutable std::map<std::string, std::chrono::steady_clock::duration> total;
     // the position i of this vector holds the total time spend on child
@@ -162,18 +167,21 @@ struct StorageLogger : gko::log::Logger {
                                  const gko::size_type &num_bytes,
                                  const gko::uintptr &location) const override
     {
+        const std::lock_guard<std::mutex> lock(mutex);
         storage[location] = num_bytes;
     }
 
     void on_free_completed(const gko::Executor *,
                            const gko::uintptr &location) const override
     {
+        const std::lock_guard<std::mutex> lock(mutex);
         storage[location] = 0;
     }
 
     void write_data(rapidjson::Value &output,
                     rapidjson::MemoryPoolAllocator<> &allocator)
     {
+        const std::lock_guard<std::mutex> lock(mutex);
         gko::size_type total{};
         for (const auto &e : storage) {
             total += e.second;
@@ -186,6 +194,7 @@
     {}
 
 private:
+    mutable std::mutex mutex;
     mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
 };
 
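The changes to loggers.hpp apply one pattern throughout: a mutable std::mutex member, locked at the top of every const logging callback, so the logger can be invoked from concurrent OpenMP threads without racing on its shared std::map/std::unordered_map state. Note also that end_operation now calls exec->synchronize() before taking the lock, so device synchronization completes without holding the mutex. Below is a minimal self-contained sketch of the same pattern; the CountingLogger type and its on_event/write_data callbacks are illustrative stand-ins, not Ginkgo's gko::log::Logger interface.

#include <cstddef>
#include <iostream>
#include <map>
#include <mutex>
#include <string>

struct CountingLogger {
    // const callbacks mutate shared state, so both the mutex and the map are
    // mutable, mirroring the members added to OperationLogger/StorageLogger
    void on_event(const std::string &name) const
    {
        const std::lock_guard<std::mutex> lock(mutex);
        ++counts[name];  // std::map insertion/update is not thread-safe
    }

    void write_data(std::ostream &out) const
    {
        // the reader takes the same lock as the writers
        const std::lock_guard<std::mutex> lock(mutex);
        for (const auto &entry : counts) {
            out << entry.first << ": " << entry.second << '\n';
        }
    }

private:
    mutable std::mutex mutex;
    mutable std::map<std::string, std::size_t> counts;
};

int main()
{
    CountingLogger logger;
#pragma omp parallel for
    for (int i = 0; i < 1000; ++i) {
        logger.on_event("apply");  // serialized by the mutex
    }
    logger.write_data(std::cout);  // prints "apply: 1000"
}
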
20 changes: 12 additions & 8 deletions omp/preconditioner/isai_kernels.cpp
@@ -109,16 +109,20 @@ void generic_generate(std::shared_ptr<const DefaultExecutor> exec,
     const auto i_cols = inverse_mtx->get_const_col_idxs();
     auto i_vals = inverse_mtx->get_values();
 
+    auto num_threads = static_cast<size_type>(omp_get_max_threads());
+    // RHS for local trisystem
+    gko::Array<ValueType> rhs_array{exec, row_size_limit * num_threads};
+    // memory for dense trisystem
+    gko::Array<ValueType> trisystem_array{
+        exec, row_size_limit * row_size_limit * num_threads};
+
 #pragma omp parallel
     {
-        // OpenMP seems to have issues copying the arrays, so we do it manually
-        // RHS for local trisystem
-        gko::Array<ValueType> rhs_array{exec, row_size_limit};
-        auto rhs = rhs_array.get_data();
-        // memory for dense trisystem
-        gko::Array<ValueType> trisystem_array{exec,
-                                              row_size_limit * row_size_limit};
-        auto trisystem_ptr = trisystem_array.get_data();
+        auto thread_num = static_cast<size_type>(omp_get_thread_num());
+
+        auto rhs = rhs_array.get_data() + thread_num * row_size_limit;
+        auto trisystem_ptr = trisystem_array.get_data() +
+                             thread_num * row_size_limit * row_size_limit;
 
 #pragma omp for
         for (size_type row = 0; row < num_rows; ++row) {
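The kernel fix uses a standard OpenMP workspace-partitioning pattern: allocate one buffer sized per-thread-workspace times omp_get_max_threads() before the parallel region, then have each thread index its own disjoint slice via omp_get_thread_num(), instead of constructing arrays inside the parallel region. A minimal standalone sketch of the same pattern follows; the buffer sizes, vector types, and per-row work are placeholders, not the actual ISAI generation kernel.

#include <omp.h>

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::size_t num_rows = 1000;
    const std::size_t workspace_size = 32;  // stand-in for row_size_limit

    // one allocation for all threads, made before the parallel region
    const auto num_threads = static_cast<std::size_t>(omp_get_max_threads());
    std::vector<double> workspace(workspace_size * num_threads);
    std::vector<double> result(num_rows);

#pragma omp parallel
    {
        // each thread works on its own disjoint slice of the shared buffer
        const auto thread_num = static_cast<std::size_t>(omp_get_thread_num());
        double *local = workspace.data() + thread_num * workspace_size;

#pragma omp for
        for (std::size_t row = 0; row < num_rows; ++row) {
            double sum = 0.0;
            for (std::size_t i = 0; i < workspace_size; ++i) {
                local[i] = static_cast<double>(row + i);  // placeholder work
                sum += local[i];
            }
            result[row] = sum;
        }
    }

    std::cout << result.front() << ' ' << result.back() << '\n';
}

This also sidesteps the issue noted in the removed comment ("OpenMP seems to have issues copying the arrays"): no per-thread objects need to be constructed or copied inside the parallel region at all.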