Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix solver kernel documentation and add memory estimates #691

Merged
merged 16 commits into from
Mar 2, 2021
Merged
19 changes: 14 additions & 5 deletions core/solver/bicg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,15 @@ void Bicg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const

int iter = -1;

/* Memory movement summary:
* 28n * values + matrix/preconditioner storage + conj storage
* 2x SpMV: 4n * values + storage + conj storage
* 2x Preconditioner: 4n * values + storage + conj storage
* 2x dot 4n
* 1x step 1 (axpys) 6n
* 1x step 2 (axpys) 9n
* 1x norm2 residual n
*/
while (true) {
get_preconditioner()->apply(r.get(), z.get());
conj_trans_preconditioner->apply(r2.get(), z2.get());
Expand All @@ -201,21 +210,21 @@ void Bicg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
break;
}

exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(),
rho.get(), prev_rho.get(), &stop_status));
// tmp = rho / prev_rho
// p = z + tmp * p
// p2 = z2 + tmp * p2
exec->run(bicg::make_step_1(p.get(), z.get(), p2.get(), z2.get(),
rho.get(), prev_rho.get(), &stop_status));
system_matrix_->apply(p.get(), q.get());
conj_trans_A->apply(p2.get(), q2.get());
p2->compute_dot(q.get(), beta.get());
exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(),
q.get(), q2.get(), beta.get(), rho.get(),
&stop_status));
// tmp = rho / beta
// x = x + tmp * p
// r = r - tmp * q
// r2 = r2 - tmp * q2
exec->run(bicg::make_step_2(dense_x, r.get(), r2.get(), p.get(),
q.get(), q2.get(), beta.get(), rho.get(),
&stop_status));
swap(prev_rho, rho);
}
}
Expand Down
29 changes: 19 additions & 10 deletions core/solver/bicgstab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,18 @@ void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
rr->copy_from(r.get());

int iter = -1;

/* Memory movement summary:
* 31n * values + 2 * matrix/preconditioner storage
* 2x SpMV: 4n * values + 2 * storage
* 2x Preconditioner: 4n * values + 2 * storage
* 3x dot 6n
* 1x norm2 n
* 1x step 1 (fused axpys) 4n
* 1x step 2 (axpy) 3n
* 1x step 3 (fused axpys) 7n
* 2x norm2 residual 2n
*/
while (true) {
++iter;
this->template log<log::Logger::iteration_complete>(this, iter, r.get(),
Expand All @@ -152,21 +164,20 @@ void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
break;
}

// tmp = rho / prev_rho * alpha / omega
// p = r + tmp * (p - omega * v)
exec->run(bicgstab::make_step_1(r.get(), p.get(), v.get(), rho.get(),
prev_rho.get(), alpha.get(),
omega.get(), &stop_status));
// tmp = rho / prev_rho * alpha / omega
// p = r + tmp * (p - omega * v)

get_preconditioner()->apply(p.get(), y.get());
system_matrix_->apply(y.get(), v.get());
rr->compute_dot(v.get(), beta.get());
exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(),
alpha.get(), beta.get(), &stop_status));
// alpha = rho / beta
// s = r - alpha * v
exec->run(bicgstab::make_step_2(r.get(), s.get(), v.get(), rho.get(),
alpha.get(), beta.get(), &stop_status));

++iter;
auto all_converged =
stop_criterion->update()
.num_iterations(iter)
upsj marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -178,8 +189,6 @@ void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
exec->run(bicgstab::make_finalize(dense_x, y.get(), alpha.get(),
&stop_status));
}
this->template log<log::Logger::iteration_complete>(this, iter,
r.get());
if (all_converged) {
break;
}
Expand All @@ -188,12 +197,12 @@ void Bicgstab<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
system_matrix_->apply(z.get(), t.get());
s->compute_dot(t.get(), gamma.get());
t->compute_dot(t.get(), beta.get());
exec->run(bicgstab::make_step_3(
dense_x, r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(),
beta.get(), gamma.get(), omega.get(), &stop_status));
// omega = gamma / beta
// x = x + alpha * y + omega * z
// r = s - omega * t
exec->run(bicgstab::make_step_3(
dense_x, r.get(), s.get(), t.get(), y.get(), z.get(), alpha.get(),
beta.get(), gamma.get(), omega.get(), &stop_status));
swap(prev_rho, rho);
}
} // namespace solver
Expand Down
17 changes: 13 additions & 4 deletions core/solver/cg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,15 @@ void Cg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
x, r.get());

int iter = -1;
/* Memory movement summary:
* 18n * values + matrix/preconditioner storage
* 1x SpMV: 2n * values + storage
* 1x Preconditioner: 2n * values + storage
* 2x dot 4n
* 1x step 1 (axpy) 3n
* 1x step 2 (axpys) 6n
* 1x norm2 residual n
*/
while (true) {
get_preconditioner()->apply(r.get(), z.get());
r->compute_dot(z.get(), rho.get());
Expand All @@ -144,17 +153,17 @@ void Cg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
break;
}

exec->run(cg::make_step_1(p.get(), z.get(), rho.get(), prev_rho.get(),
&stop_status));
// tmp = rho / prev_rho
// p = z + tmp * p
exec->run(cg::make_step_1(p.get(), z.get(), rho.get(), prev_rho.get(),
&stop_status));
system_matrix_->apply(p.get(), q.get());
p->compute_dot(q.get(), beta.get());
exec->run(cg::make_step_2(dense_x, r.get(), p.get(), q.get(),
beta.get(), rho.get(), &stop_status));
// tmp = rho / beta
// x = x + tmp * p
// r = r - tmp * q
exec->run(cg::make_step_2(dense_x, r.get(), p.get(), q.get(),
beta.get(), rho.get(), &stop_status));
swap(prev_rho, rho);
}
}
Expand Down
30 changes: 18 additions & 12 deletions core/solver/cgs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,34 +138,40 @@ void Cgs<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
r_tld->copy_from(r.get());

int iter = 0;
/* Memory movement summary:
* 28n * values + 2 * matrix/preconditioner storage
* 2x SpMV: 4n * values + 2 * storage
* 2x Preconditioner: 4n * values + 2 * storage
* 2x dot 4n
* 1x step 1 (fused axpys) 5n
* 1x step 2 (fused axpys) 4n
* 1x step 3 (axpys) 6n
* 1x norm2 residual n
*/
while (true) {
r->compute_dot(r_tld.get(), rho.get());
// beta = rho / rho_prev
// u = r + beta * q
// p = u + beta * ( q + beta * p )
exec->run(cgs::make_step_1(r.get(), u.get(), p.get(), q.get(),
beta.get(), rho.get(), rho_prev.get(),
&stop_status));
// beta = rho / rho_prev
// u = r + beta * q;
// p = u + beta * ( q + beta * p );
get_preconditioner()->apply(p.get(), t.get());
system_matrix_->apply(t.get(), v_hat.get());
r_tld->compute_dot(v_hat.get(), gamma.get());
// alpha = rho / gamma
// q = u - alpha * v_hat
// t = u + q
exec->run(cgs::make_step_2(u.get(), v_hat.get(), q.get(), t.get(),
alpha.get(), rho.get(), gamma.get(),
&stop_status));

++iter;
this->template log<log::Logger::iteration_complete>(this, iter, r.get(),
dense_x);

// alpha = rho / gamma
// q = u - alpha * v_hat
// t = u + q
get_preconditioner()->apply(t.get(), u_hat.get());
system_matrix_->apply(u_hat.get(), t.get());
// r = r - alpha * t
// x = x + alpha * u_hat
exec->run(cgs::make_step_3(t.get(), u_hat.get(), r.get(), dense_x,
alpha.get(), &stop_status));
// r = r -alpha * t
// x = x + alpha * u_hat

++iter;
this->template log<log::Logger::iteration_complete>(this, iter, r.get(),
Expand Down
17 changes: 13 additions & 4 deletions core/solver/fcg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,15 @@ void Fcg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
x, r.get());

int iter = -1;
/* Memory movement summary:
* 21n * values + matrix/preconditioner storage
* 1x SpMV: 2n * values + storage
* 1x Preconditioner: 2n * values + storage
* 3x dot 6n
* 1x step 1 (axpy) 3n
* 1x step 2 (fused axpys) 7n
* 1x norm2 residual n
*/
while (true) {
get_preconditioner()->apply(r.get(), z.get());
r->compute_dot(z.get(), rho.get());
Expand All @@ -147,19 +156,19 @@ void Fcg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
break;
}

exec->run(fcg::make_step_1(p.get(), z.get(), rho_t.get(),
prev_rho.get(), &stop_status));
// tmp = rho_t / prev_rho
// p = z + tmp * p
exec->run(fcg::make_step_1(p.get(), z.get(), rho_t.get(),
prev_rho.get(), &stop_status));
system_matrix_->apply(p.get(), q.get());
p->compute_dot(q.get(), beta.get());
exec->run(fcg::make_step_2(dense_x, r.get(), t.get(), p.get(), q.get(),
beta.get(), rho.get(), &stop_status));
// tmp = rho / beta
// [prev_r = r] in registers
// x = x + tmp * p
// r = r - tmp * q
// t = r - [prev_r]
exec->run(fcg::make_step_2(dense_x, r.get(), t.get(), p.get(), q.get(),
beta.get(), rho.get(), &stop_status));
swap(prev_rho, rho);
}
}
Expand Down
62 changes: 39 additions & 23 deletions core/solver/gmres.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,24 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
auto after_preconditioner =
matrix::Dense<ValueType>::create_with_config_of(dense_x);

/* Memory movement summary for average iteration with krylov_dim d:
* (5/2d+21/2+14/d)n * values + (1+1/d) * matrix/preconditioner storage
* 1x SpMV: 2n * values + storage
* 1x Preconditioner: 2n * values + storage
* MGS: (5/2d+11/2)n = sum k=0 to d-1 of (5k+8)n/d
upsj marked this conversation as resolved.
Show resolved Hide resolved
yhmtsai marked this conversation as resolved.
Show resolved Hide resolved
* 1x dots 2(k+1)n in iteration k (0-based)
* 1x axpys 3(k+1)n in iteration k (0-based)
* 1x norm2 n
* 1x scal 2n
* Restart: (1+14/d)n (every dth iteration)
* 1x gemv (d+1)n
* 1x Preconditioner 2n * values + storage
* 1x axpy 3n
* 1x copy 2n
* 1x Advanced SpMV 3n * values + storage
* 1x norm2 n
* 1x scal 2n
*/
while (true) {
++total_iter;
this->template log<log::Logger::iteration_complete>(
Expand All @@ -175,32 +193,31 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const

if (restart_iter == krylov_dim_) {
// Restart
// Solve upper triangular.
// y = hessenberg \ residual_norm_collection
// before_preconditioner = krylov_bases * y
exec->run(gmres::make_step_2(residual_norm_collection.get(),
krylov_bases.get(), hessenberg.get(),
y.get(), before_preconditioner.get(),
&final_iter_nums));
// Solve upper triangular.
// y = hessenberg \ residual_norm_collection
// before_preconditioner = krylov_bases * y

// x = x + get_preconditioner() * before_preconditioner
get_preconditioner()->apply(before_preconditioner.get(),
after_preconditioner.get());
dense_x->add_scaled(one_op.get(), after_preconditioner.get());
// Solve x
// x = x + get_preconditioner() * before_preconditioner
residual->copy_from(dense_b);
// residual = dense_b
residual->copy_from(dense_b);
// residual = residual - Ax
system_matrix_->apply(neg_one_op.get(), dense_x, one_op.get(),
residual.get());
// residual = residual - Ax
exec->run(gmres::make_initialize_2(
residual.get(), residual_norm.get(),
residual_norm_collection.get(), krylov_bases.get(),
&final_iter_nums, krylov_dim_));
// residual_norm = norm(residual)
// residual_norm_collection = {residual_norm, unchanged}
// krylov_bases(:, 1) = residual / residual_norm
// final_iter_nums = {0, ..., 0}
exec->run(gmres::make_initialize_2(
residual.get(), residual_norm.get(),
residual_norm_collection.get(), krylov_bases.get(),
&final_iter_nums, krylov_dim_));
restart_iter = 0;
}
auto this_krylov = krylov_bases->create_submatrix(
Expand All @@ -212,9 +229,9 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
span{system_matrix_->get_size()[0] * (restart_iter + 1),
system_matrix_->get_size()[0] * (restart_iter + 2)},
span{0, dense_b->get_size()[1]});
// preconditioned_vector = get_preconditioner() * this_krylov
get_preconditioner()->apply(this_krylov.get(),
preconditioned_vector.get());
// preconditioned_vector = get_preconditioner() * this_krylov

// Do Arnoldi and givens rotation
auto hessenberg_iter = hessenberg->create_submatrix(
Expand All @@ -223,14 +240,9 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
dense_b->get_size()[1] * (restart_iter + 1)});

// Start of arnoldi
system_matrix_->apply(preconditioned_vector.get(), next_krylov.get());
// next_krylov = A * preconditioned_vector
system_matrix_->apply(preconditioned_vector.get(), next_krylov.get());

exec->run(gmres::make_step_1(
dense_b->get_size()[0], givens_sin.get(), givens_cos.get(),
residual_norm.get(), residual_norm_collection.get(),
krylov_bases.get(), hessenberg_iter.get(), restart_iter,
&final_iter_nums, &stop_status));
// final_iter_nums += 1 (unconverged)
// next_krylov_basis is alias for (restart_iter + 1)-th krylov_bases
// for i in 0:restart_iter(include)
Expand Down Expand Up @@ -267,6 +279,11 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
// residual_norm_collection(restart_iter) = cos(restart_iter) * this_rnc
// residual_norm = abs(next_rnc)
// residual_norm_collection(restart_iter + 1) = next_rnc
exec->run(gmres::make_step_1(
dense_b->get_size()[0], givens_sin.get(), givens_cos.get(),
residual_norm.get(), residual_norm_collection.get(),
krylov_bases.get(), hessenberg_iter.get(), restart_iter,
&final_iter_nums, &stop_status));

restart_iter++;
}
Expand All @@ -279,18 +296,17 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
span{0, restart_iter},
span{0, dense_b->get_size()[1] * (restart_iter)});

// Solve upper triangular.
// y = hessenberg \ residual_norm_collection
// before_preconditioner = krylov_bases * y
exec->run(gmres::make_step_2(
residual_norm_collection.get(), krylov_bases_small.get(),
hessenberg_small.get(), y.get(), before_preconditioner.get(),
&final_iter_nums));
// Solve upper triangular.
// y = hessenberg \ residual_norm_collection
// before_preconditioner = krylov_bases * y
// x = x + get_preconditioner() * before_preconditioner
get_preconditioner()->apply(before_preconditioner.get(),
after_preconditioner.get());
dense_x->add_scaled(one_op.get(), after_preconditioner.get());
// Solve x
// x = x + get_preconditioner() * before_preconditioner
}


Expand Down
Loading