diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp index 66c71c21ae8..8b97a42fdd3 100644 --- a/benchmark/blas/blas.cpp +++ b/benchmark/blas/blas.cpp @@ -472,6 +472,12 @@ void apply_blas(const char *operation_name, std::shared_ptr exec, } catch (const std::exception &e) { add_or_set_member(test_case["blas"][operation_name], "completed", false, allocator); + if (FLAGS_keep_errors) { + rapidjson::Value msg_value; + msg_value.SetString(e.what(), allocator); + add_or_set_member(test_case["blas"][operation_name], "error", + msg_value, allocator); + } std::cerr << "Error when processing test case " << test_case << "\n" << "what(): " << e.what() << std::endl; } diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp index fd9481189b0..18313bf26db 100644 --- a/benchmark/conversions/conversions.cpp +++ b/benchmark/conversions/conversions.cpp @@ -88,9 +88,9 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to, matrix_to->copy_from(matrix_from); } add_or_set_member(conversion_case[conversion_name], "time", - timer->compute_average_time(), allocator); + ic.compute_average_time(), allocator); add_or_set_member(conversion_case[conversion_name], "repetitions", - timer->get_num_repetitions(), allocator); + ic.get_num_repetitions(), allocator); // compute and write benchmark data add_or_set_member(conversion_case[conversion_name], "completed", true, @@ -98,6 +98,12 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to, } catch (const std::exception &e) { add_or_set_member(test_case["conversions"][conversion_name], "completed", false, allocator); + if (FLAGS_keep_errors) { + rapidjson::Value msg_value; + msg_value.SetString(e.what(), allocator); + add_or_set_member(test_case["conversions"][conversion_name], + "error", msg_value, allocator); + } std::cerr << "Error when processing test case " << test_case << "\n" << "what(): " << e.what() << std::endl; } @@ -156,8 +162,7 @@ int main(int argc, char *argv[]) try { auto matrix_from = share(formats::matrix_factory.at(format_from)(exec, data)); - for (const auto &format : formats::matrix_factory) { - const auto format_to = std::get<0>(format); + for (const auto &format_to : formats) { if (format_from == format_to) { continue; } diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index 695c1e78314..002faeeddff 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -229,6 +229,12 @@ void run_preconditioner(const char *precond_name, rapidjson::Value(rapidjson::kObjectType), allocator); add_or_set_member(test_case["preconditioner"][encoded_name.c_str()], "completed", false, allocator); + if (FLAGS_keep_errors) { + rapidjson::Value msg_value; + msg_value.SetString(e.what(), allocator); + add_or_set_member(test_case["preconditioner"][encoded_name.c_str()], + "error", msg_value, allocator); + } std::cerr << "Error when processing test case " << test_case << "\n" << "what(): " << e.what() << std::endl; } diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh index bd481169057..db7fa5bb5ca 100644 --- a/benchmark/run_all_benchmarks.sh +++ b/benchmark/run_all_benchmarks.sh @@ -16,6 +16,16 @@ if [ ! "${EXECUTOR}" ]; then echo "EXECUTOR environment variable not set - assuming \"${EXECUTOR}\"" 1>&2 fi +if [ ! "${REPETITIONS}" ]; then + REPETITIONS=10 + echo "REPETITIONS environment variable not set - assuming ${REPETITIONS}" 1>&2 +fi + +if [ ! "${SOLVER_REPETITIONS}" ]; then + SOLVER_REPETITIONS=1 + echo "SOLVER_REPETITIONS environment variable not set - assuming ${SOLVER_REPETITIONS}" 1>&2 +fi + if [ ! "${SEGMENTS}" ]; then echo "SEGMENTS environment variable not set - running entire suite" 1>&2 SEGMENTS=1 @@ -35,6 +45,11 @@ if [ ! "${FORMATS}" ]; then FORMATS="csr,coo,ell,hybrid,sellp" fi +if [ ! "${ELL_IMBALANCE_LIMIT}" ]; then + echo "ELL_IMBALANCE_LIMIT environment variable not set - assuming 100" 1>&2 + ELL_IMBALANCE_LIMIT=100 +fi + if [ ! "${SOLVERS}" ]; then SOLVERS="bicgstab,cg,cgs,fcg,gmres,cb_gmres_reduce1,idr" echo "SOLVERS environment variable not set - assuming \"${SOLVERS}\"" 1>&2 @@ -67,7 +82,7 @@ fi if [ ! "${SOLVERS_JACOBI_MAX_BS}" ]; then SOLVERS_JACOBI_MAX_BS="32" - "SOLVERS_JACOBI_MAX_BS environment variable not set - assuming \"${SOLVERS_JACOBI_MAX_BS}\"" 1>&2 + echo "SOLVERS_JACOBI_MAX_BS environment variable not set - assuming \"${SOLVERS_JACOBI_MAX_BS}\"" 1>&2 fi if [ ! "${BENCHMARK_PRECISION}" ]; then @@ -202,6 +217,8 @@ run_conversion_benchmarks() { ./conversions/conversions${BENCH_SUFFIX} --backup="$1.bkp" --double_buffer="$1.bkp2" \ --executor="${EXECUTOR}" --formats="${FORMATS}" \ --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \ + --repetitions="${REPETITIONS}" \ + --ell_imbalance_limit="${ELL_IMBALANCE_LIMIT}" \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } @@ -218,6 +235,8 @@ run_spmv_benchmarks() { ./spmv/spmv${BENCH_SUFFIX} --backup="$1.bkp" --double_buffer="$1.bkp2" \ --executor="${EXECUTOR}" --formats="${FORMATS}" \ --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \ + --repetitions="${REPETITIONS}" \ + --ell_imbalance_limit="${ELL_IMBALANCE_LIMIT}" \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } @@ -239,6 +258,7 @@ run_solver_benchmarks() { --gpu_timer=${GPU_TIMER} \ --jacobi_max_block_size=${SOLVERS_JACOBI_MAX_BS} --device_id="${DEVICE_ID}" \ --gmres_restart="${SOLVERS_GMRES_RESTART}" \ + --repetitions="${SOLVER_REPETITIONS}" \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } @@ -265,6 +285,7 @@ run_preconditioner_benchmarks() { --jacobi_max_block_size="${bsize}" \ --jacobi_storage="${prec}" \ --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \ + --repetitions="${REPETITIONS}" \ <"$1.imd" 2>&1 >"$1" keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" done diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp index d709cf442c5..e4df2948cb9 100644 --- a/benchmark/solver/solver.cpp +++ b/benchmark/solver/solver.cpp @@ -509,6 +509,12 @@ void solve_system(const std::string &solver_name, } catch (const std::exception &e) { add_or_set_member(test_case["solver"][precond_solver_name], "completed", false, allocator); + if (FLAGS_keep_errors) { + rapidjson::Value msg_value; + msg_value.SetString(e.what(), allocator); + add_or_set_member(test_case["solver"][precond_solver_name], "error", + msg_value, allocator); + } std::cerr << "Error when processing test case " << test_case << "\n" << "what(): " << e.what() << std::endl; } diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp index 2bf07e8ad45..6d21ddd88e6 100644 --- a/benchmark/spmv/spmv.cpp +++ b/benchmark/spmv/spmv.cpp @@ -130,7 +130,7 @@ void apply_spmv(const char *format_name, std::shared_ptr exec, for (auto _ : ic_tuning.run()) { system_matrix->apply(lend(b), lend(x_clone)); } - tuning_case["time"].PushBack(tuning_timer->compute_average_time(), + tuning_case["time"].PushBack(ic_tuning.compute_average_time(), allocator); tuning_case["values"].PushBack(val, allocator); } @@ -154,6 +154,12 @@ void apply_spmv(const char *format_name, std::shared_ptr exec, } catch (const std::exception &e) { add_or_set_member(test_case["spmv"][format_name], "completed", false, allocator); + if (FLAGS_keep_errors) { + rapidjson::Value msg_value; + msg_value.SetString(e.what(), allocator); + add_or_set_member(test_case["spmv"][format_name], "error", + msg_value, allocator); + } std::cerr << "Error when processing test case " << test_case << "\n" << "what(): " << e.what() << std::endl; } diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index 64de4e10f90..bbce4f25cf2 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include @@ -152,6 +153,10 @@ std::string format_command = // the formats command-line argument DEFINE_string(formats, "coo", formats::format_command.c_str()); +DEFINE_int64(ell_imbalance_limit, 100, + "Maximal storage overhead above which ELL benchmarks will be " + "skipped. Negative values mean no limit."); + namespace formats { @@ -181,6 +186,51 @@ std::unique_ptr read_matrix_from_data( return mat; } + +/** + * Creates a CSR strategy of the given type for the given executor if possible, + * falls back to csr::classical for executors without support for this strategy. + * + * @tparam Strategy one of csr::automatical or csr::load_balance + */ +template +std::shared_ptr create_gpu_strategy( + std::shared_ptr exec) +{ + if (auto cuda = dynamic_cast(exec.get())) { + return std::make_shared(cuda->shared_from_this()); + } else if (auto hip = dynamic_cast(exec.get())) { + return std::make_shared(hip->shared_from_this()); + } else { + return std::make_shared(); + } +} + + +/** + * Checks whether the given matrix data exceeds the ELL imbalance limit set by + * the --ell_imbalance_limit flag + * + * @throws gko::Error if the imbalance limit is exceeded + */ +void check_ell_admissibility(const gko::matrix_data &data) +{ + if (data.size[0] == 0 || FLAGS_ell_imbalance_limit < 0) { + return; + } + std::vector row_lengths(data.size[0]); + for (auto nz : data.nonzeros) { + row_lengths[nz.row]++; + } + auto max_len = *std::max_element(row_lengths.begin(), row_lengths.end()); + auto avg_len = data.nonzeros.size() / std::max(data.size[0], 1); + if (max_len / avg_len > FLAGS_ell_imbalance_limit) { + throw gko::Error(__FILE__, __LINE__, + "Matrix exceeds ELL imbalance limit"); + } +} + + /** * Creates a Ginkgo matrix from the intermediate data representation format * gko::matrix_data with support for variable arguments. @@ -201,15 +251,36 @@ const std::map( std::shared_ptr, const gko::matrix_data &)>> matrix_factory{ - {"csr", READ_MATRIX(csr, std::make_shared())}, - {"csri", READ_MATRIX(csr, std::make_shared())}, + {"csr", + [](std::shared_ptr exec, + const gko::matrix_data &data) -> std::unique_ptr { + auto mat = + csr::create(exec, create_gpu_strategy(exec)); + mat->read(data); + return mat; + }}, + {"csri", + [](std::shared_ptr exec, + const gko::matrix_data &data) -> std::unique_ptr { + auto mat = csr::create( + exec, create_gpu_strategy(exec)); + mat->read(data); + return mat; + }}, {"csrm", READ_MATRIX(csr, std::make_shared())}, {"csrc", READ_MATRIX(csr, std::make_shared())}, {"coo", read_matrix_from_data>}, - {"ell", read_matrix_from_data>}, + {"ell", [](std::shared_ptr exec, + const gko::matrix_data &data) { + check_ell_admissibility(data); + auto mat = gko::matrix::Ell::create(exec); + mat->read(data); + return mat; + }}, {"ell-mixed", [](std::shared_ptr exec, const gko::matrix_data &data) { + check_ell_admissibility(data); gko::matrix_data> conv_data; conv_data.size = data.size; conv_data.nonzeros.resize(data.nonzeros.size()); @@ -220,7 +291,8 @@ const std::map( it->value = el.value; ++it; } - auto mat = gko::matrix::Ell>::create(std::move(exec)); + auto mat = gko::matrix::Ell>::create( + std::move(exec)); mat->read(conv_data); return mat; }}, diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 84fb5926af4..dca22e7fe27 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -84,6 +84,10 @@ DEFINE_string(double_buffer, "", DEFINE_bool(detailed, true, "If set, performs several runs to obtain more detailed results"); +DEFINE_bool(keep_errors, false, + "If set, writes exception messages during the execution into the " + "JSON output"); + DEFINE_bool(nested_names, false, "If set, separately logs nested operations"); DEFINE_uint32(seed, 42, "Seed used for the random number generator"); diff --git a/omp/components/format_conversion.hpp b/omp/components/format_conversion.hpp index 706c686ac83..d634541c6f3 100644 --- a/omp/components/format_conversion.hpp +++ b/omp/components/format_conversion.hpp @@ -40,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/components/prefix_sum.hpp" + + namespace gko { namespace kernels { namespace omp { @@ -84,16 +87,25 @@ inline void convert_unsorted_idxs_to_ptrs(const IndexType *idxs, template inline void convert_sorted_idxs_to_ptrs(const IndexType *idxs, size_type num_nonzeros, IndexType *ptrs, - size_type length) + size_type num_rows) { ptrs[0] = 0; - ptrs[length - 1] = num_nonzeros; -#pragma omp parallel for schedule( \ - static, ceildiv(num_nonzeros, omp_get_max_threads())) - for (size_type i = 0; i < num_nonzeros - 1; i++) { - for (size_type j = idxs[i] + 1; j <= idxs[i + 1]; j++) { - ptrs[j] = i + 1; + if (num_nonzeros == 0) { +#pragma omp parallel for + for (size_type row = 0; row < num_rows; row++) { + ptrs[row + 1] = 0; + } + } else { + // add virtual sentinel values 0 and num_rows to handle empty first and + // last rows +#pragma omp parallel for + for (size_type i = 0; i <= num_nonzeros; i++) { + auto begin_row = i == 0 ? size_type{} : idxs[i - 1]; + auto end_row = i == num_nonzeros ? num_rows : idxs[i]; + for (auto row = begin_row; row < end_row; row++) { + ptrs[row + 1] = i; + } } } } diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp index 71f5283104c..1f9055dc129 100644 --- a/omp/matrix/coo_kernels.cpp +++ b/omp/matrix/coo_kernels.cpp @@ -204,9 +204,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void convert_row_idxs_to_ptrs(std::shared_ptr exec, const IndexType *idxs, size_type num_nonzeros, - IndexType *ptrs, size_type length) + IndexType *ptrs, size_type num_rows) { - convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, length); + convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, num_rows); } @@ -222,8 +222,7 @@ void convert_to_csr(std::shared_ptr exec, const auto source_row_idxs = source->get_const_row_idxs(); - convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, - num_rows + 1); + convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, num_rows); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp index 204da570121..f97bee4a640 100644 --- a/omp/matrix/hybrid_kernels.cpp +++ b/omp/matrix/hybrid_kernels.cpp @@ -112,58 +112,30 @@ void convert_to_csr(std::shared_ptr exec, const auto coo_row = source->get_const_coo_row_idxs(); const auto coo_nnz = source->get_coo_num_stored_elements(); const auto num_rows = source->get_size()[0]; - auto coo_row_ptrs = Array(exec, num_rows + 1); - auto coo_row_ptrs_val = coo_row_ptrs.get_data(); - convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs_val, - num_rows + 1); - - // Compute the row offset of Coo without zeros - auto coo_offset = Array(exec, num_rows); - auto coo_offset_val = coo_offset.get_data(); - for (size_type i = 0; i < num_rows; i++) { - IndexType nonzeros = 0; -#pragma omp parallel for reduction(+ : nonzeros) - for (size_type j = coo_row_ptrs_val[i]; j < coo_row_ptrs_val[i + 1]; - j++) { - nonzeros += coo_val[j] != zero(); - } - coo_offset_val[i] = nonzeros; - } - - // Compute row pointer of Csr - csr_row_ptrs[0] = 0; -#pragma omp parallel for - for (size_type i = 0; i < num_rows; i++) { - csr_row_ptrs[i + 1] = coo_offset_val[i]; - } - - for (size_type col = 0; col < max_nnz_per_row; col++) { -#pragma omp parallel for - for (size_type row = 0; row < num_rows; row++) { - csr_row_ptrs[row + 1] += - (ell->val_at(row, col) != zero()); - } - } + auto coo_row_ptrs_array = Array(exec, num_rows + 1); + auto coo_row_ptrs = coo_row_ptrs_array.get_data(); + convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs, num_rows); - auto workspace = Array(exec, num_rows + 1); - auto workspace_val = workspace.get_data(); - for (size_type i = 1; i < num_rows + 1; i <<= 1) { + // Compute the row sizes of Coo without zeros #pragma omp parallel for - for (size_type j = i; j < num_rows + 1; j++) { - workspace_val[j] = csr_row_ptrs[j] + csr_row_ptrs[j - i]; + for (size_type row = 0; row < num_rows; row++) { + IndexType nonzeros{}; + for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) { + nonzeros += coo_val[j] != zero(); } -#pragma omp parallel for - for (size_type j = i; j < num_rows + 1; j++) { - csr_row_ptrs[j] = workspace_val[j]; + for (size_type col = 0; col < max_nnz_per_row; col++) { + nonzeros += (ell->val_at(row, col) != zero()); } + csr_row_ptrs[row] = nonzeros; } -// Fill in Csr + components::prefix_sum(exec, csr_row_ptrs, num_rows + 1); + + // Fill in Csr #pragma omp parallel for for (IndexType row = 0; row < num_rows; row++) { // Ell part - size_type csr_idx = csr_row_ptrs[row]; - size_type coo_idx = coo_offset_val[row]; + auto csr_idx = csr_row_ptrs[row]; for (IndexType col = 0; col < max_nnz_per_row; col++) { const auto val = ell->val_at(row, col); if (val != zero()) { @@ -173,11 +145,10 @@ void convert_to_csr(std::shared_ptr exec, } } // Coo part - for (auto coo_idx = coo_row_ptrs_val[row]; - coo_idx < coo_row_ptrs_val[row + 1]; coo_idx++) { - if (coo_val[coo_idx] != zero()) { - csr_val[csr_idx] = coo_val[coo_idx]; - csr_col_idxs[csr_idx] = coo_col[coo_idx]; + for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) { + if (coo_val[j] != zero()) { + csr_val[csr_idx] = coo_val[j]; + csr_col_idxs[csr_idx] = coo_col[j]; csr_idx++; } } diff --git a/omp/test/matrix/hybrid_kernels.cpp b/omp/test/matrix/hybrid_kernels.cpp index 39f56d8442c..2ebe179d15b 100644 --- a/omp/test/matrix/hybrid_kernels.cpp +++ b/omp/test/matrix/hybrid_kernels.cpp @@ -55,6 +55,7 @@ namespace { class Hybrid : public ::testing::Test { protected: + using value_type = double; using Mtx = gko::matrix::Hybrid<>; using Vec = gko::matrix::Dense<>; using ComplexVec = gko::matrix::Dense>; @@ -77,10 +78,17 @@ class Hybrid : public ::testing::Test { template std::unique_ptr gen_mtx(int num_rows, int num_cols, int min_nnz_row) + { + return gen_mtx(num_rows, num_cols, min_nnz_row, num_cols); + } + + template + std::unique_ptr gen_mtx(int num_rows, int num_cols, + int min_nnz_row, int max_nnz_row) { return gko::test::generate_random_matrix( num_rows, num_cols, - std::uniform_int_distribution<>(min_nnz_row, num_cols), + std::uniform_int_distribution<>(min_nnz_row, max_nnz_row), std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); } @@ -230,6 +238,49 @@ TEST_F(Hybrid, CountNonzerosIsEquivalentToRef) } +TEST_F(Hybrid, ConvertEmptyCooToCsrIsEquivalentToRef) +{ + auto balanced_mtx = + Mtx::create(ref, std::make_shared(4)); + balanced_mtx->copy_from(gen_mtx(400, 200, 4, 4).get()); + auto dbalanced_mtx = + Mtx::create(omp, std::make_shared(4)); + dbalanced_mtx->copy_from(balanced_mtx.get()); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(omp); + + balanced_mtx->convert_to(csr_mtx.get()); + dbalanced_mtx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); +} + + +TEST_F(Hybrid, ConvertWithEmptyFirstAndLastRowToCsrIsEquivalentToRef) +{ + // create a dense matrix for easier manipulation + auto dense_mtx = gen_mtx(400, 200, 0, 4); + // set first and last row to zero + for (gko::size_type col = 0; col < dense_mtx->get_size()[1]; col++) { + dense_mtx->at(0, col) = gko::zero(); + dense_mtx->at(dense_mtx->get_size()[0] - 1, col) = + gko::zero(); + } + // now convert them to hybrid matrices + auto balanced_mtx = Mtx::create(ref); + balanced_mtx->copy_from(dense_mtx.get()); + auto dbalanced_mtx = Mtx::create(omp); + dbalanced_mtx->copy_from(balanced_mtx.get()); + auto csr_mtx = gko::matrix::Csr<>::create(ref); + auto dcsr_mtx = gko::matrix::Csr<>::create(omp); + + balanced_mtx->convert_to(csr_mtx.get()); + dbalanced_mtx->convert_to(dcsr_mtx.get()); + + GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14); +} + + TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef) { set_up_apply_data(1, std::make_shared(2));