diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
index 66c71c21ae8..8b97a42fdd3 100644
--- a/benchmark/blas/blas.cpp
+++ b/benchmark/blas/blas.cpp
@@ -472,6 +472,12 @@ void apply_blas(const char *operation_name, std::shared_ptr<gko::Executor> exec,
     } catch (const std::exception &e) {
         add_or_set_member(test_case["blas"][operation_name], "completed", false,
                           allocator);
+        if (FLAGS_keep_errors) {
+            rapidjson::Value msg_value;
+            msg_value.SetString(e.what(), allocator);
+            add_or_set_member(test_case["blas"][operation_name], "error",
+                              msg_value, allocator);
+        }
         std::cerr << "Error when processing test case " << test_case << "\n"
                   << "what(): " << e.what() << std::endl;
     }
diff --git a/benchmark/conversions/conversions.cpp b/benchmark/conversions/conversions.cpp
index fd9481189b0..18313bf26db 100644
--- a/benchmark/conversions/conversions.cpp
+++ b/benchmark/conversions/conversions.cpp
@@ -88,9 +88,9 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to,
             matrix_to->copy_from(matrix_from);
         }
         add_or_set_member(conversion_case[conversion_name], "time",
-                          timer->compute_average_time(), allocator);
+                          ic.compute_average_time(), allocator);
         add_or_set_member(conversion_case[conversion_name], "repetitions",
-                          timer->get_num_repetitions(), allocator);
+                          ic.get_num_repetitions(), allocator);
 
         // compute and write benchmark data
         add_or_set_member(conversion_case[conversion_name], "completed", true,
@@ -98,6 +98,12 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to,
     } catch (const std::exception &e) {
         add_or_set_member(test_case["conversions"][conversion_name],
                           "completed", false, allocator);
+        if (FLAGS_keep_errors) {
+            rapidjson::Value msg_value;
+            msg_value.SetString(e.what(), allocator);
+            add_or_set_member(test_case["conversions"][conversion_name],
+                              "error", msg_value, allocator);
+        }
         std::cerr << "Error when processing test case " << test_case << "\n"
                   << "what(): " << e.what() << std::endl;
     }
@@ -156,8 +162,7 @@ int main(int argc, char *argv[])
             try {
                 auto matrix_from =
                     share(formats::matrix_factory.at(format_from)(exec, data));
-                for (const auto &format : formats::matrix_factory) {
-                    const auto format_to = std::get<0>(format);
+                for (const auto &format_to : formats) {
                     if (format_from == format_to) {
                         continue;
                     }
diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp
index 695c1e78314..002faeeddff 100644
--- a/benchmark/preconditioner/preconditioner.cpp
+++ b/benchmark/preconditioner/preconditioner.cpp
@@ -229,6 +229,12 @@ void run_preconditioner(const char *precond_name,
                           rapidjson::Value(rapidjson::kObjectType), allocator);
         add_or_set_member(test_case["preconditioner"][encoded_name.c_str()],
                           "completed", false, allocator);
+        if (FLAGS_keep_errors) {
+            rapidjson::Value msg_value;
+            msg_value.SetString(e.what(), allocator);
+            add_or_set_member(test_case["preconditioner"][encoded_name.c_str()],
+                              "error", msg_value, allocator);
+        }
         std::cerr << "Error when processing test case " << test_case << "\n"
                   << "what(): " << e.what() << std::endl;
     }
diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh
index bd481169057..db7fa5bb5ca 100644
--- a/benchmark/run_all_benchmarks.sh
+++ b/benchmark/run_all_benchmarks.sh
@@ -16,6 +16,16 @@ if [ ! "${EXECUTOR}" ]; then
     echo "EXECUTOR    environment variable not set - assuming \"${EXECUTOR}\"" 1>&2
 fi
 
+if [ ! "${REPETITIONS}" ]; then
+    REPETITIONS=10
+    echo "REPETITIONS    environment variable not set - assuming ${REPETITIONS}" 1>&2
+fi
+
+if [ ! "${SOLVER_REPETITIONS}" ]; then
+    SOLVER_REPETITIONS=1
+    echo "SOLVER_REPETITIONS    environment variable not set - assuming ${SOLVER_REPETITIONS}" 1>&2
+fi
+
 if [ ! "${SEGMENTS}" ]; then
     echo "SEGMENTS    environment variable not set - running entire suite" 1>&2
     SEGMENTS=1
@@ -35,6 +45,11 @@ if [ ! "${FORMATS}" ]; then
     FORMATS="csr,coo,ell,hybrid,sellp"
 fi
 
+if [ ! "${ELL_IMBALANCE_LIMIT}" ]; then
+    ELL_IMBALANCE_LIMIT=100  # default, forwarded as --ell_imbalance_limit
+    echo "ELL_IMBALANCE_LIMIT    environment variable not set - assuming ${ELL_IMBALANCE_LIMIT}" 1>&2
+fi
+
 if [ ! "${SOLVERS}" ]; then
     SOLVERS="bicgstab,cg,cgs,fcg,gmres,cb_gmres_reduce1,idr"
     echo "SOLVERS    environment variable not set - assuming \"${SOLVERS}\"" 1>&2
@@ -67,7 +82,7 @@ fi
 
 if [ ! "${SOLVERS_JACOBI_MAX_BS}" ]; then
     SOLVERS_JACOBI_MAX_BS="32"
-    "SOLVERS_JACOBI_MAX_BS environment variable not set - assuming \"${SOLVERS_JACOBI_MAX_BS}\"" 1>&2
+    echo "SOLVERS_JACOBI_MAX_BS environment variable not set - assuming \"${SOLVERS_JACOBI_MAX_BS}\"" 1>&2
 fi
 
 if [ ! "${BENCHMARK_PRECISION}" ]; then
@@ -202,6 +217,8 @@ run_conversion_benchmarks() {
     ./conversions/conversions${BENCH_SUFFIX} --backup="$1.bkp" --double_buffer="$1.bkp2" \
                 --executor="${EXECUTOR}" --formats="${FORMATS}" \
                 --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
+                --repetitions="${REPETITIONS}" \
+                --ell_imbalance_limit="${ELL_IMBALANCE_LIMIT}" \
                 <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
 }
@@ -218,6 +235,8 @@ run_spmv_benchmarks() {
     ./spmv/spmv${BENCH_SUFFIX} --backup="$1.bkp" --double_buffer="$1.bkp2" \
                 --executor="${EXECUTOR}" --formats="${FORMATS}" \
                 --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
+                --repetitions="${REPETITIONS}" \
+                --ell_imbalance_limit="${ELL_IMBALANCE_LIMIT}" \
                 <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
 }
@@ -239,6 +258,7 @@ run_solver_benchmarks() {
                     --gpu_timer=${GPU_TIMER} \
                     --jacobi_max_block_size=${SOLVERS_JACOBI_MAX_BS} --device_id="${DEVICE_ID}" \
                     --gmres_restart="${SOLVERS_GMRES_RESTART}" \
+                    --repetitions="${SOLVER_REPETITIONS}" \
                     <"$1.imd" 2>&1 >"$1"
     keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
 }
@@ -265,6 +285,7 @@ run_preconditioner_benchmarks() {
                 --jacobi_max_block_size="${bsize}" \
                 --jacobi_storage="${prec}" \
                 --device_id="${DEVICE_ID}" --gpu_timer=${GPU_TIMER} \
+                --repetitions="${REPETITIONS}" \
                 <"$1.imd" 2>&1 >"$1"
             keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd"
         done
diff --git a/benchmark/solver/solver.cpp b/benchmark/solver/solver.cpp
index d709cf442c5..e4df2948cb9 100644
--- a/benchmark/solver/solver.cpp
+++ b/benchmark/solver/solver.cpp
@@ -509,6 +509,12 @@ void solve_system(const std::string &solver_name,
     } catch (const std::exception &e) {
         add_or_set_member(test_case["solver"][precond_solver_name], "completed",
                           false, allocator);
+        if (FLAGS_keep_errors) {
+            rapidjson::Value msg_value;
+            msg_value.SetString(e.what(), allocator);
+            add_or_set_member(test_case["solver"][precond_solver_name], "error",
+                              msg_value, allocator);
+        }
         std::cerr << "Error when processing test case " << test_case << "\n"
                   << "what(): " << e.what() << std::endl;
     }
diff --git a/benchmark/spmv/spmv.cpp b/benchmark/spmv/spmv.cpp
index 2bf07e8ad45..6d21ddd88e6 100644
--- a/benchmark/spmv/spmv.cpp
+++ b/benchmark/spmv/spmv.cpp
@@ -130,7 +130,7 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
             for (auto _ : ic_tuning.run()) {
                 system_matrix->apply(lend(b), lend(x_clone));
             }
-            tuning_case["time"].PushBack(tuning_timer->compute_average_time(),
+            tuning_case["time"].PushBack(ic_tuning.compute_average_time(),
                                          allocator);
             tuning_case["values"].PushBack(val, allocator);
         }
@@ -154,6 +154,12 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
     } catch (const std::exception &e) {
         add_or_set_member(test_case["spmv"][format_name], "completed", false,
                           allocator);
+        if (FLAGS_keep_errors) {
+            rapidjson::Value msg_value;
+            msg_value.SetString(e.what(), allocator);
+            add_or_set_member(test_case["spmv"][format_name], "error",
+                              msg_value, allocator);
+        }
         std::cerr << "Error when processing test case " << test_case << "\n"
                   << "what(): " << e.what() << std::endl;
     }
diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp
index 64de4e10f90..bbce4f25cf2 100644
--- a/benchmark/utils/formats.hpp
+++ b/benchmark/utils/formats.hpp
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/ginkgo.hpp>
 
 
+#include <algorithm>
 #include <map>
 #include <string>
 
@@ -152,6 +153,10 @@ std::string format_command =
 // the formats command-line argument
 DEFINE_string(formats, "coo", formats::format_command.c_str());
 
+DEFINE_int64(ell_imbalance_limit, 100,
+             "Maximal storage overhead above which ELL benchmarks will be "
+             "skipped. Negative values mean no limit.");
+
 
 namespace formats {
 
@@ -181,6 +186,51 @@ std::unique_ptr<MatrixType> read_matrix_from_data(
     return mat;
 }
 
+
+/**
+ * Instantiates the CSR strategy Strategy for the given executor if that is a
+ * CUDA or HIP executor; every other executor type receives csr::classical.
+ *
+ * @tparam Strategy  one of csr::automatical or csr::load_balance
+ */
+template <typename Strategy>
+std::shared_ptr<csr::strategy_type> create_gpu_strategy(
+    std::shared_ptr<const gko::Executor> exec)
+{
+    if (auto cuda_exec = dynamic_cast<const gko::CudaExecutor *>(exec.get())) {
+        return std::make_shared<Strategy>(cuda_exec->shared_from_this());
+    }
+    if (auto hip_exec = dynamic_cast<const gko::HipExecutor *>(exec.get())) {
+        return std::make_shared<Strategy>(hip_exec->shared_from_this());
+    }
+    return std::make_shared<csr::classical>();
+}
+
+
+/**
+ * Rejects matrix data whose longest row is more than --ell_imbalance_limit
+ * times the average row length; empty data or a negative limit always pass.
+ *
+ * @throws gko::Error if the imbalance limit is exceeded
+ */
+void check_ell_admissibility(const gko::matrix_data<etype> &data)
+{
+    if (data.size[0] == 0 || FLAGS_ell_imbalance_limit < 0) {
+        return;
+    }
+    std::vector<gko::size_type> row_lengths(data.size[0]);
+    for (const auto &nz : data.nonzeros) {
+        row_lengths[nz.row]++;
+    }
+    auto max_len = *std::max_element(row_lengths.begin(), row_lengths.end());
+    auto avg_len = data.nonzeros.size() / static_cast<double>(data.size[0]);
+    if (avg_len > 0 && max_len / avg_len > FLAGS_ell_imbalance_limit) {
+        throw gko::Error(__FILE__, __LINE__,
+                         "Matrix exceeds ELL imbalance limit");
+    }
+}
+
+
 /**
  * Creates a Ginkgo matrix from the intermediate data representation format
  * gko::matrix_data with support for variable arguments.
@@ -201,15 +251,36 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
                                 std::shared_ptr<const gko::Executor>,
                                 const gko::matrix_data<etype> &)>>
     matrix_factory{
-        {"csr", READ_MATRIX(csr, std::make_shared<csr::automatical>())},
-        {"csri", READ_MATRIX(csr, std::make_shared<csr::load_balance>())},
+        {"csr",
+         [](std::shared_ptr<const gko::Executor> exec,
+            const gko::matrix_data<etype> &data) -> std::unique_ptr<csr> {
+            auto mat =
+                csr::create(exec, create_gpu_strategy<csr::automatical>(exec));
+            mat->read(data);
+            return mat;
+         }},
+        {"csri",
+         [](std::shared_ptr<const gko::Executor> exec,
+            const gko::matrix_data<etype> &data) -> std::unique_ptr<csr> {
+             auto mat = csr::create(
+                 exec, create_gpu_strategy<csr::load_balance>(exec));
+             mat->read(data);
+             return mat;
+         }},
         {"csrm", READ_MATRIX(csr, std::make_shared<csr::merge_path>())},
         {"csrc", READ_MATRIX(csr, std::make_shared<csr::classical>())},
         {"coo", read_matrix_from_data<gko::matrix::Coo<etype>>},
-        {"ell", read_matrix_from_data<gko::matrix::Ell<etype>>},
+        {"ell", [](std::shared_ptr<const gko::Executor> exec,
+            const gko::matrix_data<etype> &data) {
+             check_ell_admissibility(data);
+             auto mat = gko::matrix::Ell<etype>::create(exec);
+             mat->read(data);
+             return mat;
+         }},
         {"ell-mixed",
          [](std::shared_ptr<const gko::Executor> exec,
             const gko::matrix_data<etype> &data) {
+             check_ell_admissibility(data);
              gko::matrix_data<gko::next_precision<etype>> conv_data;
              conv_data.size = data.size;
              conv_data.nonzeros.resize(data.nonzeros.size());
@@ -220,7 +291,8 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOp>(
                  it->value = el.value;
                  ++it;
              }
-             auto mat = gko::matrix::Ell<gko::next_precision<etype>>::create(std::move(exec));
+             auto mat = gko::matrix::Ell<gko::next_precision<etype>>::create(
+                 std::move(exec));
              mat->read(conv_data);
              return mat;
          }},
diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp
index 84fb5926af4..dca22e7fe27 100644
--- a/benchmark/utils/general.hpp
+++ b/benchmark/utils/general.hpp
@@ -84,6 +84,10 @@ DEFINE_string(double_buffer, "",
 DEFINE_bool(detailed, true,
             "If set, performs several runs to obtain more detailed results");
 
+DEFINE_bool(keep_errors, false,
+            "If set, writes exception messages during the execution into the "
+            "JSON output");
+
 DEFINE_bool(nested_names, false, "If set, separately logs nested operations");
 
 DEFINE_uint32(seed, 42, "Seed used for the random number generator");
diff --git a/omp/components/format_conversion.hpp b/omp/components/format_conversion.hpp
index 706c686ac83..d634541c6f3 100644
--- a/omp/components/format_conversion.hpp
+++ b/omp/components/format_conversion.hpp
@@ -40,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/types.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
@@ -84,16 +87,25 @@ inline void convert_unsorted_idxs_to_ptrs(const IndexType *idxs,
 template <typename IndexType>
 inline void convert_sorted_idxs_to_ptrs(const IndexType *idxs,
                                         size_type num_nonzeros, IndexType *ptrs,
-                                        size_type length)
+                                        size_type num_rows)
 {
     ptrs[0] = 0;
-    ptrs[length - 1] = num_nonzeros;
 
-#pragma omp parallel for schedule( \
-    static, ceildiv(num_nonzeros, omp_get_max_threads()))
-    for (size_type i = 0; i < num_nonzeros - 1; i++) {
-        for (size_type j = idxs[i] + 1; j <= idxs[i + 1]; j++) {
-            ptrs[j] = i + 1;
+    if (num_nonzeros == 0) {
+#pragma omp parallel for
+        for (size_type row = 0; row < num_rows; row++) {
+            ptrs[row + 1] = 0;
+        }
+    } else {
+        // pad idxs with virtual sentinels 0 (front) and num_rows (back), so
+        // empty rows at the start and end of the matrix get their pointers too
+#pragma omp parallel for
+        for (size_type i = 0; i <= num_nonzeros; i++) {
+            auto begin_row = i == 0 ? size_type{} : idxs[i - 1];
+            auto end_row = i == num_nonzeros ? num_rows : idxs[i];
+            for (auto row = begin_row; row < end_row; row++) {
+                ptrs[row + 1] = i;  // sorted idxs: entries 0..i-1 end by row
+            }
         }
     }
 }
diff --git a/omp/matrix/coo_kernels.cpp b/omp/matrix/coo_kernels.cpp
index 71f5283104c..1f9055dc129 100644
--- a/omp/matrix/coo_kernels.cpp
+++ b/omp/matrix/coo_kernels.cpp
@@ -204,9 +204,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 template <typename IndexType>
 void convert_row_idxs_to_ptrs(std::shared_ptr<const OmpExecutor> exec,
                               const IndexType *idxs, size_type num_nonzeros,
-                              IndexType *ptrs, size_type length)
+                              IndexType *ptrs, size_type num_rows)
 {
-    convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, length);
+    convert_sorted_idxs_to_ptrs(idxs, num_nonzeros, ptrs, num_rows);
 }
 
 
@@ -222,8 +222,7 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
 
     const auto source_row_idxs = source->get_const_row_idxs();
 
-    convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs,
-                             num_rows + 1);
+    convert_row_idxs_to_ptrs(exec, source_row_idxs, nnz, row_ptrs, num_rows);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp
index 204da570121..f97bee4a640 100644
--- a/omp/matrix/hybrid_kernels.cpp
+++ b/omp/matrix/hybrid_kernels.cpp
@@ -112,58 +112,30 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
     const auto coo_row = source->get_const_coo_row_idxs();
     const auto coo_nnz = source->get_coo_num_stored_elements();
     const auto num_rows = source->get_size()[0];
-    auto coo_row_ptrs = Array<IndexType>(exec, num_rows + 1);
-    auto coo_row_ptrs_val = coo_row_ptrs.get_data();
-    convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs_val,
-                                num_rows + 1);
-
-    // Compute the row offset of Coo without zeros
-    auto coo_offset = Array<IndexType>(exec, num_rows);
-    auto coo_offset_val = coo_offset.get_data();
-    for (size_type i = 0; i < num_rows; i++) {
-        IndexType nonzeros = 0;
-#pragma omp parallel for reduction(+ : nonzeros)
-        for (size_type j = coo_row_ptrs_val[i]; j < coo_row_ptrs_val[i + 1];
-             j++) {
-            nonzeros += coo_val[j] != zero<ValueType>();
-        }
-        coo_offset_val[i] = nonzeros;
-    }
-
-    // Compute row pointer of Csr
-    csr_row_ptrs[0] = 0;
-#pragma omp parallel for
-    for (size_type i = 0; i < num_rows; i++) {
-        csr_row_ptrs[i + 1] = coo_offset_val[i];
-    }
-
-    for (size_type col = 0; col < max_nnz_per_row; col++) {
-#pragma omp parallel for
-        for (size_type row = 0; row < num_rows; row++) {
-            csr_row_ptrs[row + 1] +=
-                (ell->val_at(row, col) != zero<ValueType>());
-        }
-    }
+    auto coo_row_ptrs_array = Array<IndexType>(exec, num_rows + 1);
+    auto coo_row_ptrs = coo_row_ptrs_array.get_data();
+    convert_sorted_idxs_to_ptrs(coo_row, coo_nnz, coo_row_ptrs, num_rows);
 
-    auto workspace = Array<IndexType>(exec, num_rows + 1);
-    auto workspace_val = workspace.get_data();
-    for (size_type i = 1; i < num_rows + 1; i <<= 1) {
+    // Compute the row sizes of Coo without zeros
 #pragma omp parallel for
-        for (size_type j = i; j < num_rows + 1; j++) {
-            workspace_val[j] = csr_row_ptrs[j] + csr_row_ptrs[j - i];
+    for (size_type row = 0; row < num_rows; row++) {
+        IndexType nonzeros{};
+        for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) {
+            nonzeros += coo_val[j] != zero<ValueType>();
         }
-#pragma omp parallel for
-        for (size_type j = i; j < num_rows + 1; j++) {
-            csr_row_ptrs[j] = workspace_val[j];
+        for (size_type col = 0; col < max_nnz_per_row; col++) {
+            nonzeros += (ell->val_at(row, col) != zero<ValueType>());
         }
+        csr_row_ptrs[row] = nonzeros;
     }
 
-// Fill in Csr
+    components::prefix_sum(exec, csr_row_ptrs, num_rows + 1);
+
+    // Fill in Csr
 #pragma omp parallel for
     for (IndexType row = 0; row < num_rows; row++) {
         // Ell part
-        size_type csr_idx = csr_row_ptrs[row];
-        size_type coo_idx = coo_offset_val[row];
+        auto csr_idx = csr_row_ptrs[row];
         for (IndexType col = 0; col < max_nnz_per_row; col++) {
             const auto val = ell->val_at(row, col);
             if (val != zero<ValueType>()) {
@@ -173,11 +145,10 @@ void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
             }
         }
         // Coo part
-        for (auto coo_idx = coo_row_ptrs_val[row];
-             coo_idx < coo_row_ptrs_val[row + 1]; coo_idx++) {
-            if (coo_val[coo_idx] != zero<ValueType>()) {
-                csr_val[csr_idx] = coo_val[coo_idx];
-                csr_col_idxs[csr_idx] = coo_col[coo_idx];
+        for (auto j = coo_row_ptrs[row]; j < coo_row_ptrs[row + 1]; j++) {
+            if (coo_val[j] != zero<ValueType>()) {
+                csr_val[csr_idx] = coo_val[j];
+                csr_col_idxs[csr_idx] = coo_col[j];
                 csr_idx++;
             }
         }
diff --git a/omp/test/matrix/hybrid_kernels.cpp b/omp/test/matrix/hybrid_kernels.cpp
index 39f56d8442c..2ebe179d15b 100644
--- a/omp/test/matrix/hybrid_kernels.cpp
+++ b/omp/test/matrix/hybrid_kernels.cpp
@@ -55,6 +55,7 @@ namespace {
 
 class Hybrid : public ::testing::Test {
 protected:
+    using value_type = double;
     using Mtx = gko::matrix::Hybrid<>;
     using Vec = gko::matrix::Dense<>;
     using ComplexVec = gko::matrix::Dense<std::complex<double>>;
@@ -77,10 +78,17 @@ class Hybrid : public ::testing::Test {
     template <typename MtxType = Vec>
     std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
                                      int min_nnz_row)
+    {
+        return gen_mtx<MtxType>(num_rows, num_cols, min_nnz_row, num_cols);
+    }
+
+    template <typename MtxType = Vec>
+    std::unique_ptr<MtxType> gen_mtx(int num_rows, int num_cols,
+                                     int min_nnz_row, int max_nnz_row)
     {
         return gko::test::generate_random_matrix<MtxType>(
             num_rows, num_cols,
-            std::uniform_int_distribution<>(min_nnz_row, num_cols),
+            std::uniform_int_distribution<>(min_nnz_row, max_nnz_row),
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
@@ -230,6 +238,49 @@ TEST_F(Hybrid, CountNonzerosIsEquivalentToRef)
 }
 
 
+TEST_F(Hybrid, ConvertEmptyCooToCsrIsEquivalentToRef)
+{
+    // min == max == 4 nonzeros per row requested, matching the column limit
+    // of 4; per the test name this should leave the COO part empty
+    auto ref_hybrid = Mtx::create(ref, std::make_shared<Mtx::column_limit>(4));
+    ref_hybrid->copy_from(gen_mtx(400, 200, 4, 4).get());
+    auto omp_hybrid = Mtx::create(omp, std::make_shared<Mtx::column_limit>(4));
+    omp_hybrid->copy_from(ref_hybrid.get());
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(omp);
+
+    ref_hybrid->convert_to(csr_mtx.get());
+    omp_hybrid->convert_to(dcsr_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
+}
+
+
+TEST_F(Hybrid, ConvertWithEmptyFirstAndLastRowToCsrIsEquivalentToRef)
+{
+    // start from a dense matrix, whose entries can be overwritten directly
+    auto dense_input = gen_mtx(400, 200, 0, 4);
+    const auto last_row = dense_input->get_size()[0] - 1;
+    // clear the first and the last row
+    for (gko::size_type col = 0; col < dense_input->get_size()[1]; col++) {
+        dense_input->at(0, col) = gko::zero<value_type>();
+        dense_input->at(last_row, col) = gko::zero<value_type>();
+    }
+    // build the hybrid matrices on the ref and omp executors from it
+    auto ref_hybrid = Mtx::create(ref);
+    ref_hybrid->copy_from(dense_input.get());
+    auto omp_hybrid = Mtx::create(omp);
+    omp_hybrid->copy_from(ref_hybrid.get());
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(omp);
+
+    ref_hybrid->convert_to(csr_mtx.get());
+    omp_hybrid->convert_to(dcsr_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 1e-14);
+}
+
+
 TEST_F(Hybrid, ConvertToCsrIsEquivalentToRef)
 {
     set_up_apply_data(1, std::make_shared<Mtx::column_limit>(2));