diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index bbc4eab0e03..7cb51f2f404 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -184,6 +184,15 @@ void Bicg::apply_impl(const LinOp *b, LinOp *x) const int iter = -1; + /* Memory movement summary: + * 28n * values + matrix/preconditioner storage + conj storage + * 2x SpMV: 4n * values + storage + conj storage + * 2x Preconditioner: 4n * values + storage + conj storage + * 2x dot 4n + * 1x step 1 (axpys) 6n + * 1x step 2 (axpys) 9n + * 1x norm2 residual n + */ while (true) { get_preconditioner()->apply(r.get(), z.get()); conj_trans_preconditioner->apply(r2.get(), z2.get()); @@ -201,21 +210,21 @@ void Bicg::apply_impl(const LinOp *b, LinOp *x) const break; } - exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(), - rho.get(), prev_rho.get(), &stop_status)); // tmp = rho / prev_rho // p = z + tmp * p // p2 = z2 + tmp * p2 + exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(), + rho.get(), prev_rho.get(), &stop_status)); system_matrix_->apply(p.get(), q.get()); conj_trans_A->apply(p2.get(), q2.get()); p2->compute_dot(q.get(), beta.get()); - exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(), - q.get(), q2.get(), beta.get(), rho.get(), - &stop_status)); // tmp = rho / beta // x = x + tmp * p // r = r - tmp * q // r2 = r2 - tmp * q2 + exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(), + q.get(), q2.get(), beta.get(), rho.get(), + &stop_status)); swap(prev_rho, rho); } } diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index 0ee788387db..f3828797fdb 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -137,6 +137,18 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const rr->copy_from(r.get()); int iter = -1; + + /* Memory movement summary: + * 31n * values + 2 * matrix/preconditioner storage + * 2x SpMV: 4n * values + 2 * storage + * 2x Preconditioner: 4n * values + 2 * storage + * 3x dot 6n + * 1x norm2 n + * 1x step 1 (fused axpys) 4n + * 1x step 2 (axpy) 3n + * 1x step 3 (fused axpys) 7n + * 2x norm2 residual 2n + */ while (true) { ++iter; this->template log(this, iter, r.get(), @@ -152,21 +164,20 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const break; } + // tmp = rho / prev_rho * alpha / omega + // p = r + tmp * (p - omega * v) exec->run(bicgstab::make_step_1(r.get(), p.get(), v.get(), rho.get(), prev_rho.get(), alpha.get(), omega.get(), &stop_status)); - // tmp = rho / prev_rho * alpha / omega - // p = r + tmp * (p - omega * v) get_preconditioner()->apply(p.get(), y.get()); system_matrix_->apply(y.get(), v.get()); rr->compute_dot(v.get(), beta.get()); - exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(), - alpha.get(), beta.get(), &stop_status)); // alpha = rho / beta // s = r - alpha * v + exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(), + alpha.get(), beta.get(), &stop_status)); - ++iter; auto all_converged = stop_criterion->update() .num_iterations(iter) @@ -178,8 +189,6 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const exec->run(bicgstab::make_finalize(dense_x, y.get(), alpha.get(), &stop_status)); } - this->template log(this, iter, - r.get()); if (all_converged) { break; } @@ -188,12 +197,12 @@ void Bicgstab::apply_impl(const LinOp *b, LinOp *x) const system_matrix_->apply(z.get(), t.get()); s->compute_dot(t.get(), gamma.get()); t->compute_dot(t.get(), beta.get()); - exec->run(bicgstab::make_step_3( - dense_x, r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(), - beta.get(), gamma.get(), omega.get(), &stop_status)); // omega = gamma / beta // x = x + alpha * y + omega * z // r = s - omega * t + exec->run(bicgstab::make_step_3( + dense_x, r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(), + beta.get(), gamma.get(), omega.get(), &stop_status)); swap(prev_rho, rho); } } // namespace solver diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index dd0558acf75..5a76392abff 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -128,6 +128,15 @@ void Cg::apply_impl(const LinOp *b, LinOp *x) const x, r.get()); int iter = -1; + /* Memory movement summary: + * 18n * values + matrix/preconditioner storage + * 1x SpMV: 2n * values + storage + * 1x Preconditioner: 2n * values + storage + * 2x dot 4n + * 1x step 1 (axpy) 3n + * 1x step 2 (axpys) 6n + * 1x norm2 residual n + */ while (true) { get_preconditioner()->apply(r.get(), z.get()); r->compute_dot(z.get(), rho.get()); @@ -144,17 +153,17 @@ void Cg::apply_impl(const LinOp *b, LinOp *x) const break; } - exec->run(cg::make_step_1(p.get(), z.get(), rho.get(), prev_rho.get(), - &stop_status)); // tmp = rho / prev_rho // p = z + tmp * p + exec->run(cg::make_step_1(p.get(), z.get(), rho.get(), prev_rho.get(), + &stop_status)); system_matrix_->apply(p.get(), q.get()); p->compute_dot(q.get(), beta.get()); - exec->run(cg::make_step_2(dense_x, r.get(), p.get(), q.get(), - beta.get(), rho.get(), &stop_status)); // tmp = rho / beta // x = x + tmp * p // r = r - tmp * q + exec->run(cg::make_step_2(dense_x, r.get(), p.get(), q.get(), + beta.get(), rho.get(), &stop_status)); swap(prev_rho, rho); } } diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index febacbb03bf..99438ba4a1f 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -138,34 +138,40 @@ void Cgs::apply_impl(const LinOp *b, LinOp *x) const r_tld->copy_from(r.get()); int iter = 0; + /* Memory movement summary: + * 28n * values + 2 * matrix/preconditioner storage + * 2x SpMV: 4n * values + 2 * storage + * 2x Preconditioner: 4n * values + 2 * storage + * 2x dot 4n + * 1x step 1 (fused axpys) 5n + * 1x step 2 (fused axpys) 4n + * 1x step 3 (axpys) 6n + * 1x norm2 residual n + */ while (true) { r->compute_dot(r_tld.get(), rho.get()); + // beta = rho / rho_prev + // u = r + beta * q + // p = u + beta * ( q + beta * p ) exec->run(cgs::make_step_1(r.get(), u.get(), p.get(), q.get(), beta.get(), rho.get(), rho_prev.get(), &stop_status)); - // beta = rho / rho_prev - // u = r + beta * q; - // p = u + beta * ( q + beta * p ); get_preconditioner()->apply(p.get(), t.get()); system_matrix_->apply(t.get(), v_hat.get()); r_tld->compute_dot(v_hat.get(), gamma.get()); + // alpha = rho / gamma + // q = u - alpha * v_hat + // t = u + q exec->run(cgs::make_step_2(u.get(), v_hat.get(), q.get(), t.get(), alpha.get(), rho.get(), gamma.get(), &stop_status)); - ++iter; - this->template log(this, iter, r.get(), - dense_x); - - // alpha = rho / gamma - // q = u - alpha * v_hat - // t = u + q get_preconditioner()->apply(t.get(), u_hat.get()); system_matrix_->apply(u_hat.get(), t.get()); + // r = r - alpha * t + // x = x + alpha * u_hat exec->run(cgs::make_step_3(t.get(), u_hat.get(), r.get(), dense_x, alpha.get(), &stop_status)); - // r = r -alpha * t - // x = x + alpha * u_hat ++iter; this->template log(this, iter, r.get(), diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index dea9aeff494..1e38cdae02d 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -130,6 +130,15 @@ void Fcg::apply_impl(const LinOp *b, LinOp *x) const x, r.get()); int iter = -1; + /* Memory movement summary: + * 21n * values + matrix/preconditioner storage + * 1x SpMV: 2n * values + storage + * 1x Preconditioner: 2n * values + storage + * 3x dot 6n + * 1x step 1 (axpy) 3n + * 1x step 2 (fused axpys) 7n + * 1x norm2 residual n + */ while (true) { get_preconditioner()->apply(r.get(), z.get()); r->compute_dot(z.get(), rho.get()); @@ -147,19 +156,19 @@ void Fcg::apply_impl(const LinOp *b, LinOp *x) const break; } - exec->run(fcg::make_step_1(p.get(), z.get(), rho_t.get(), - prev_rho.get(), &stop_status)); // tmp = rho_t / prev_rho // p = z + tmp * p + exec->run(fcg::make_step_1(p.get(), z.get(), rho_t.get(), + prev_rho.get(), &stop_status)); system_matrix_->apply(p.get(), q.get()); p->compute_dot(q.get(), beta.get()); - exec->run(fcg::make_step_2(dense_x, r.get(), t.get(), p.get(), q.get(), - beta.get(), rho.get(), &stop_status)); // tmp = rho / beta // [prev_r = r] in registers // x = x + tmp * p // r = r - tmp * q // t = r - [prev_r] + exec->run(fcg::make_step_2(dense_x, r.get(), t.get(), p.get(), q.get(), + beta.get(), rho.get(), &stop_status)); swap(prev_rho, rho); } } diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp index 3ee99cd3ed3..a5727cdb151 100644 --- a/core/solver/gmres.cpp +++ b/core/solver/gmres.cpp @@ -159,6 +159,24 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const auto after_preconditioner = matrix::Dense::create_with_config_of(dense_x); + /* Memory movement summary for average iteration with krylov_dim d: + * (5/2d+21/2+14/d)n * values + (1+1/d) * matrix/preconditioner storage + * 1x SpMV: 2n * values + storage + * 1x Preconditioner: 2n * values + storage + * MGS: (5/2d+11/2)n = sum k=0 to d-1 of (5k+8)n/d + * 1x dots 2(k+1)n in iteration k (0-based) + * 1x axpys 3(k+1)n in iteration k (0-based) + * 1x norm2 n + * 1x scal 2n + * Restart: (1+14/d)n (every dth iteration) + * 1x gemv (d+1)n + * 1x Preconditioner 2n * values + storage + * 1x axpy 3n + * 1x copy 2n + * 1x Advanced SpMV 3n * values + storage + * 1x norm2 n + * 1x scal 2n + */ while (true) { ++total_iter; this->template log( @@ -175,32 +193,31 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const if (restart_iter == krylov_dim_) { // Restart + // Solve upper triangular. + // y = hessenberg \ residual_norm_collection + // before_preconditioner = krylov_bases * y exec->run(gmres::make_step_2(residual_norm_collection.get(), krylov_bases.get(), hessenberg.get(), y.get(), before_preconditioner.get(), &final_iter_nums)); - // Solve upper triangular. - // y = hessenberg \ residual_norm_collection - // before_preconditioner = krylov_bases * y + // x = x + get_preconditioner() * before_preconditioner get_preconditioner()->apply(before_preconditioner.get(), after_preconditioner.get()); dense_x->add_scaled(one_op.get(), after_preconditioner.get()); - // Solve x - // x = x + get_preconditioner() * before_preconditioner - residual->copy_from(dense_b); // residual = dense_b + residual->copy_from(dense_b); + // residual = residual - Ax system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(), residual.get()); - // residual = residual - Ax - exec->run(gmres::make_initialize_2( - residual.get(), residual_norm.get(), - residual_norm_collection.get(), krylov_bases.get(), - &final_iter_nums, krylov_dim_)); // residual_norm = norm(residual) // residual_norm_collection = {residual_norm, unchanged} // krylov_bases(:, 1) = residual / residual_norm // final_iter_nums = {0, ..., 0} + exec->run(gmres::make_initialize_2( + residual.get(), residual_norm.get(), + residual_norm_collection.get(), krylov_bases.get(), + &final_iter_nums, krylov_dim_)); restart_iter = 0; } auto this_krylov = krylov_bases->create_submatrix( @@ -212,9 +229,9 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const span{system_matrix_->get_size()[0] * (restart_iter + 1), system_matrix_->get_size()[0] * (restart_iter + 2)}, span{0, dense_b->get_size()[1]}); + // preconditioned_vector = get_preconditioner() * this_krylov get_preconditioner()->apply(this_krylov.get(), preconditioned_vector.get()); - // preconditioned_vector = get_preconditioner() * this_krylov // Do Arnoldi and givens rotation auto hessenberg_iter = hessenberg->create_submatrix( @@ -223,14 +240,9 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const dense_b->get_size()[1] * (restart_iter + 1)}); // Start of arnoldi - system_matrix_->apply(preconditioned_vector.get(), next_krylov.get()); // next_krylov = A * preconditioned_vector + system_matrix_->apply(preconditioned_vector.get(), next_krylov.get()); - exec->run(gmres::make_step_1( - dense_b->get_size()[0], givens_sin.get(), givens_cos.get(), - residual_norm.get(), residual_norm_collection.get(), - krylov_bases.get(), hessenberg_iter.get(), restart_iter, - &final_iter_nums, &stop_status)); // final_iter_nums += 1 (unconverged) // next_krylov_basis is alias for (restart_iter + 1)-th krylov_bases // for i in 0:restart_iter(include) @@ -267,6 +279,11 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const // residual_norm_collection(restart_iter) = cos(restart_iter) * this_rnc // residual_norm = abs(next_rnc) // residual_norm_collection(restart_iter + 1) = next_rnc + exec->run(gmres::make_step_1( + dense_b->get_size()[0], givens_sin.get(), givens_cos.get(), + residual_norm.get(), residual_norm_collection.get(), + krylov_bases.get(), hessenberg_iter.get(), restart_iter, + &final_iter_nums, &stop_status)); restart_iter++; } @@ -279,18 +296,17 @@ void Gmres::apply_impl(const LinOp *b, LinOp *x) const span{0, restart_iter}, span{0, dense_b->get_size()[1] * (restart_iter)}); + // Solve upper triangular. + // y = hessenberg \ residual_norm_collection + // before_preconditioner = krylov_bases * y exec->run(gmres::make_step_2( residual_norm_collection.get(), krylov_bases_small.get(), hessenberg_small.get(), y.get(), before_preconditioner.get(), &final_iter_nums)); - // Solve upper triangular. - // y = hessenberg \ residual_norm_collection - // before_preconditioner = krylov_bases * y + // x = x + get_preconditioner() * before_preconditioner get_preconditioner()->apply(before_preconditioner.get(), after_preconditioner.get()); dense_x->add_scaled(one_op.get(), after_preconditioner.get()); - // Solve x - // x = x + get_preconditioner() * before_preconditioner } diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 88683136a00..8dcde2e00d9 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -171,6 +171,24 @@ void Idr::iterate(const LinOp *b, LinOp *x) const int total_iter = -1; + /* Memory movement summary for iteration with subspace dimension s + * Per iteration: + * (11/2s^2+31/2s+18)n * values + (s+1) * matrix/preconditioner storage + * (s+1)x SpMV: 2(s+1)n * values + (s+1) * storage + * (s+1)x Preconditioner: 2(s+1)n * values + (s+1) * storage + * 1x multidot (gemv) (s+1)n + * sx step 1 (fused axpys) s(s/2+5/2)n = sum k=[0,s) of (s-k+2)n + * sx step 2 (fused axpys) s(s/2+5/2)n = sum k=[0,s) of (s-k+2)n + * sx step 3: s(9/2s+11/2)n = sum k=[0,s) of (8k+2+s-k+1+6)n + * 1x orthogonalize g+u (8k+2)n in iteration k (0-based) + * 1x multidot (gemv) (s-k+1)n in iteration k (0-based) + * 2x axpy 6n + * 1x dot 2n + * 2x norm2 2n + * 1x scale 2n + * 2x axpy 6n + * 1x norm2 residual n + */ while (true) { ++total_iter; this->template log( @@ -185,44 +203,45 @@ void Idr::iterate(const LinOp *b, LinOp *x) const break; } - subspace_vectors->apply(residual.get(), f.get()); // f = P^H * residual + subspace_vectors->apply(residual.get(), f.get()); for (size_type k = 0; k < subspace_dim_; k++) { + // c = M \ f = (c_1, ..., c_s)^T + // v = residual - sum i=[k,s) of (c_i * g_i) exec->run(idr::make_step_1(nrhs, k, m.get(), f.get(), residual.get(), g.get(), c.get(), v.get(), &stop_status)); - // c = M \ f = (c_1, ..., c_s)^T - // v = residual - c_k * g_k - ... - c_s * g_s get_preconditioner()->apply(v.get(), helper.get()); + // u_k = omega * precond_vector + sum i=[k,s) of (c_i * u_i) exec->run(idr::make_step_2(nrhs, k, omega.get(), helper.get(), c.get(), u.get(), &stop_status)); - // u_k = omega * preconditioned_vector + c_k * u_k + ... + c_s * u_s auto u_k = u->create_submatrix(span{0, problem_size}, span{k * nrhs, (k + 1) * nrhs}); - system_matrix_->apply(u_k.get(), helper.get()); // g_k = Au_k + system_matrix_->apply(u_k.get(), helper.get()); - exec->run(idr::make_step_3(nrhs, k, subspace_vectors.get(), g.get(), - helper.get(), u.get(), m.get(), f.get(), - alpha.get(), residual.get(), dense_x, - &stop_status)); - // for i = 1 to k - 1 do + // for i = [0,k) // alpha = p^H_i * g_k / m_i,i // g_k -= alpha * g_i // u_k -= alpha * u_i // end for - // for i = k to s do + // store g_k to g + // for i = [k,s) // m_i,k = p^H_i * g_k // end for // beta = f_k / m_k,k // residual -= beta * g_k // dense_x += beta * u_k - // f = (0,...,0,f_k+1 - beta * m_k+1,k,...,f_s - beta * m_s,k) + // f = (0,...,0,f_k+1 - beta * m_k+1,k,...,f_s-1 - beta * m_s-1,k) + exec->run(idr::make_step_3(nrhs, k, subspace_vectors.get(), g.get(), + helper.get(), u.get(), m.get(), f.get(), + alpha.get(), residual.get(), dense_x, + &stop_status)); } get_preconditioner()->apply(residual.get(), helper.get()); @@ -232,14 +251,6 @@ void Idr::iterate(const LinOp *b, LinOp *x) const t->compute_dot(t.get(), tht.get()); residual->compute_norm2(residual_norm.get()); - exec->run(idr::make_compute_omega(nrhs, kappa_, tht.get(), - residual_norm.get(), omega.get(), - &stop_status)); - - t->scale(subspace_neg_one_op.get()); - residual->add_scaled(omega.get(), t.get()); - dense_x->add_scaled(omega.get(), helper.get()); - // omega = (t^H * residual) / (t^H * t) // rho = (t^H * residual) / (norm(t) * norm(residual)) // if abs(rho) < kappa then @@ -247,6 +258,13 @@ void Idr::iterate(const LinOp *b, LinOp *x) const // end if // residual -= omega * t // dense_x += omega * v + exec->run(idr::make_compute_omega(nrhs, kappa_, tht.get(), + residual_norm.get(), omega.get(), + &stop_status)); + + t->scale(subspace_neg_one_op.get()); + residual->add_scaled(omega.get(), t.get()); + dense_x->add_scaled(omega.get(), helper.get()); } } diff --git a/cuda/solver/idr_kernels.cu b/cuda/solver/idr_kernels.cu index a493b2c8105..13dd40cd762 100644 --- a/cuda/solver/idr_kernels.cu +++ b/cuda/solver/idr_kernels.cu @@ -33,8 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/solver/idr_kernels.hpp" +#include #include -#include #include diff --git a/hip/solver/idr_kernels.hip.cpp b/hip/solver/idr_kernels.hip.cpp index 5c6b3bd6e92..a467504e95c 100644 --- a/hip/solver/idr_kernels.hip.cpp +++ b/hip/solver/idr_kernels.hip.cpp @@ -33,8 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/solver/idr_kernels.hpp" +#include #include -#include #include diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index debfaf86eb8..44dc5d870be 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index c1e40c1786c..18a152cbf91 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include