AffineConstraints::condense does not work properly in certain cases with MPI #11195

yor-dev commented Nov 17, 2020

The following code tests AffineConstraints::condense(const VectorType &vec_ghosted, VectorType &output) for a Trilinos MPI vector. The test compares two solution vectors u_1 and u_2 obtained from the following simple equations:
M * u_1 = v_1 ... eq.1,
M * u_2 = v_2 ... eq.2,
where M is the mass matrix, v_1 is an r.h.s. vector assembled from the cell contributions with distribute_local_to_global, and v_2 is another r.h.s. vector defined as v_2 = M * u_1, computed after solving eq.1.
To solve eq.2, v_2 needs to be condensed, and this is done with AffineConstraints::condense(const VectorType &vec_ghosted, VectorType &output).
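
As a sanity check (a sketch that ignores the details of how the hanging-node constraints are condensed and distributed): since the same nonsingular mass matrix M appears in both systems and v_2 is built from u_1, eq.2 should reproduce u_1 up to the solver tolerance,

M u_1 = v_1, \quad v_2 := M u_1, \quad M u_2 = v_2 \implies u_2 = M^{-1} M u_1 = u_1.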

#include <iostream>
#include <mpi.h>

#include <deal.II/base/conditional_ostream.h>
#include <deal.II/base/utilities.h>
#include <deal.II/distributed/grid_refinement.h>
#include <deal.II/distributed/tria.h>
#include <deal.II/dofs/dof_accessor.h>
#include <deal.II/dofs/dof_handler.h>
#include <deal.II/grid/grid_generator.h>

#include <deal.II/dofs/dof_tools.h>
#include <deal.II/fe/fe_q.h>
#include <deal.II/fe/fe_values.h>

#include <deal.II/lac/dynamic_sparsity_pattern.h>
#include <deal.II/lac/sparsity_tools.h>

#include <deal.II/base/quadrature_lib.h>
#include <deal.II/lac/generic_linear_algebra.h>
#include <deal.II/lac/trilinos_sparse_matrix.h>
#include <deal.II/lac/vector.h>

#include <deal.II/lac/precondition.h>
#include <deal.II/lac/solver_cg.h>

#define USE_TRILINOS_LA

void condense(dealii::AffineConstraints<double> &constraints,
              dealii::IndexSet &locally_owned_dofs,
              dealii::IndexSet &locally_relevant_dofs,
              dealii::TrilinosWrappers::MPI::Vector &rhs,
              dealii::TrilinosWrappers::MPI::Vector &condensed_rhs)

{
  dealii::TrilinosWrappers::MPI::Vector nonghost_output, ghosted_vec;
  nonghost_output.reinit(locally_owned_dofs, MPI_COMM_WORLD);
  bool vector_writable = false; // why does this change the result
  ghosted_vec.reinit(locally_owned_dofs, locally_relevant_dofs, MPI_COMM_WORLD,
                     vector_writable);

  ghosted_vec = rhs;
  constraints.condense(ghosted_vec, nonghost_output);
  condensed_rhs = nonghost_output;
}

void solve(dealii::AffineConstraints<double> &constraints,
           dealii::TrilinosWrappers::SparseMatrix &mat,
           dealii::TrilinosWrappers::MPI::Vector &completely_distributed_solution,
           dealii::TrilinosWrappers::MPI::Vector &condensed_rhs)
{
  int this_mpi_process = dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
  dealii::ConditionalOStream pout(std::cout, this_mpi_process == 0);

  dealii::SolverControl solver_control(1000, 1.0E-15);
  dealii::SolverCG<dealii::TrilinosWrappers::MPI::Vector> solver(solver_control);
  dealii::TrilinosWrappers::PreconditionIdentity preconditioner;
  solver.solve(mat,
               completely_distributed_solution,
               condensed_rhs,
               preconditioner);
  constraints.distribute(completely_distributed_solution);

  pout << "condensed_rhs norm at solve : " << condensed_rhs.l2_norm() << std::endl;
}

int main(int argc, char **argv)
{
  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  //input
  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  int fe_degree = 2;

  dealii::Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1);
  int this_mpi_process = dealii::Utilities::MPI::this_mpi_process(MPI_COMM_WORLD);
  dealii::ConditionalOStream pout(std::cout, this_mpi_process == 0);

  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  //mesh initialization
  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  dealii::parallel::distributed::Triangulation<3> tria(
      MPI_COMM_WORLD,
      typename dealii::Triangulation<3>::MeshSmoothing(dealii::Triangulation<3>::limit_level_difference_at_vertices),
      dealii::parallel::distributed::Triangulation<3>::Settings(dealii::parallel::distributed::Triangulation<3>::mesh_reconstruction_after_repartitioning));

  std::vector<unsigned int> subdivisions({3, 3, 3});
  dealii::Point<3> p0(-3, -3, -3);
  dealii::Point<3> p1(3, 3, 3);
  dealii::GridGenerator::subdivided_hyper_rectangle(tria, subdivisions, p0, p1);
  dealii::DoFHandler<3> dof_handler(tria);

  dealii::FE_Q<3> fe(fe_degree);
  dof_handler.initialize(tria, fe);

  double rval;
  for (const auto &cell : dof_handler.active_cell_iterators()) {
    if (cell->is_locally_owned()) {
      rval = cell->center().distance(dealii::Point<3>(0, 0, 0));
      if (rval < 1.0) cell->set_refine_flag();
    }
  }
  tria.prepare_coarsening_and_refinement();
  tria.execute_coarsening_and_refinement();

  dealii::AffineConstraints<double> constraints;

  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  //setup dofs
  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  dof_handler.distribute_dofs(fe);
  dealii::IndexSet locally_owned_dofs = dof_handler.locally_owned_dofs();
  dealii::IndexSet locally_relevant_dofs;
  dealii::DoFTools::extract_locally_relevant_dofs(dof_handler,
                                                  locally_relevant_dofs);

  constraints.clear();
  constraints.reinit(locally_relevant_dofs);
  dealii::DoFTools::make_hanging_node_constraints(dof_handler, constraints);
  constraints.close();

  // matrix sparsity pattern
  dealii::DynamicSparsityPattern dsp(locally_relevant_dofs);
  dealii::DoFTools::make_sparsity_pattern(dof_handler, dsp, constraints, true);

  dealii::SparsityTools::distribute_sparsity_pattern(
      dsp, dof_handler.n_locally_owned_dofs_per_processor(),
      MPI_COMM_WORLD, locally_relevant_dofs);

  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  //matrix and rhs assembling
  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  dealii::LinearAlgebraTrilinos::MPI::SparseMatrix mass_matrix, condensed_mass_matrix;
  condensed_mass_matrix.reinit(locally_owned_dofs, locally_owned_dofs, dsp, MPI_COMM_WORLD);
  mass_matrix.reinit(locally_owned_dofs, locally_owned_dofs, dsp, MPI_COMM_WORLD);
  mass_matrix *= 0.0;

  dealii::TrilinosWrappers::MPI::Vector rhs;
  rhs.reinit(locally_owned_dofs, locally_relevant_dofs, MPI_COMM_WORLD, true);
  rhs = 0.0;

  dealii::Quadrature<3> quadrature_formula;
  quadrature_formula = dealii::QGaussLobatto<3>(fe.degree + 1);
  //quadrature_formula = dealii::QGauss<3>(fe.degree + 1);
  dealii::FEValues<3> fe_values(
      fe, quadrature_formula,
      dealii::update_values | dealii::update_gradients |
          dealii::update_quadrature_points | dealii::update_JxW_values);
  const unsigned int dofs_per_cell = fe.dofs_per_cell;
  const unsigned int n_q_points = quadrature_formula.size();

  dealii::FullMatrix<double> cell_mass(dofs_per_cell, dofs_per_cell);
  dealii::Vector<double> cell_rhs(dofs_per_cell);
  std::vector<dealii::types::global_dof_index> local_dof_indices(dofs_per_cell);
  dealii::Point<3> qp;
  for (const auto &cell : dof_handler.active_cell_iterators()) {
    if (cell->is_locally_owned()) {
      fe_values.reinit(cell);
      cell_mass = 0.0;
      cell_rhs = 0.0;
      for (unsigned int q_index = 0; q_index < n_q_points; ++q_index) {
        qp = fe_values.quadrature_point(q_index);

        //mass matrix
        for (unsigned int i = 0; i < dofs_per_cell; ++i) {
          for (unsigned int j = 0; j < dofs_per_cell; ++j) {
            cell_mass(i, j) +=
                (fe_values.shape_value(i, q_index) * // phi_i(x_q)
                 fe_values.shape_value(j, q_index) * // phi_j(x_q)
                 fe_values.JxW(q_index));            // dx
          }
        }
        //rhs (constant function : all the values are 1.0)
        for (unsigned int i = 0; i < dofs_per_cell; ++i) {
          cell_rhs[i] += 1.0 * fe_values.shape_value(i, q_index) * fe_values.JxW(q_index);
        }
      }

      cell->get_dof_indices(local_dof_indices);
      constraints.distribute_local_to_global(cell_mass, local_dof_indices,
                                             condensed_mass_matrix);
      constraints.distribute_local_to_global(cell_rhs, local_dof_indices,
                                             rhs);
      for (unsigned int i = 0; i < dofs_per_cell; ++i) {
        for (unsigned int j = 0; j < dofs_per_cell; ++j) {
          mass_matrix.add(local_dof_indices[i], local_dof_indices[j],
                          cell_mass(i, j));
        }
      }
    }
  }
  condensed_mass_matrix.compress(dealii::VectorOperation::add);
  mass_matrix.compress(dealii::VectorOperation::add);
  rhs.compress(dealii::VectorOperation::add);

  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  // solve linear equations
  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  dealii::TrilinosWrappers::MPI::Vector solution1, solution2;
  dealii::TrilinosWrappers::MPI::Vector condensed_rhs;
  solution1.reinit(locally_owned_dofs, locally_owned_dofs, MPI_COMM_WORLD);
  solution2.reinit(locally_owned_dofs, locally_owned_dofs, MPI_COMM_WORLD);
  condensed_rhs.reinit(locally_owned_dofs, locally_relevant_dofs, MPI_COMM_WORLD, true);

  condense(constraints, locally_owned_dofs, locally_relevant_dofs, rhs, condensed_rhs);
  solve(constraints, condensed_mass_matrix, solution1, condensed_rhs);

  dealii::TrilinosWrappers::MPI::Vector new_rhs;
  new_rhs.reinit(locally_owned_dofs, MPI_COMM_WORLD);
  mass_matrix.vmult(new_rhs, solution1);

  condense(constraints, locally_owned_dofs, locally_relevant_dofs, new_rhs, condensed_rhs);
  solve(constraints, condensed_mass_matrix, solution2, condensed_rhs);

  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  // check the results
  ////////////////////////////////////////////////
  ////////////////////////////////////////////////
  pout << "v_1norm : " << rhs.l2_norm() << std::endl;
  pout << "v_2 norm : " << new_rhs.l2_norm() << std::endl;
  pout << "u_1 norm: " << solution1.l2_norm() << std::endl;
  pout << "u_2 norm: " << solution2.l2_norm() << std::endl;
  solution1 -= solution2;
  pout << "error norm between u_1 and u_2 : "
       << solution1.l2_norm() << std::endl;
}

In principle, u_1 and u_2 must be identical to within machine accuracy (about 15 digits). However, the sample code produces wrong results for some numbers of MPI processes.

With 1 and 2 MPI processes, the results are correct, as shown below.

(1 process result)
v_1 norm (before condense) : 16.0874
v_2 norm (before condense) : 15.9653
v_1 norm (after condense) : 16.0874
v_2 norm (after condense) : 16.0874
u_1 norm: 21.4243
u_2 norm: 21.4243
error norm between u_1 and u_2 : 5.44689e-15
(2 process result)
v_1 norm (before condense) : 16.0874
v_2 norm (before condense) : 15.9653
v_1 norm (after condense) : 16.0874
v_2 norm (after condense) : 16.0874
u_1 norm: 21.4243
u_2 norm: 21.4243
error norm between u_1 and u_2 : 4.01833e-15

With 3 MPI processes, the results differ from the previous ones: v_2 norm (after condense) is now 16.0877 and u_2 norm is 21.4271, which leads to an error norm between u_1 and u_2 of 0.0262412.

(3 process result)
v_1 norm (before condense) : 16.0874
v_2 norm (before condense) : 15.9653
v_1 norm (after condense) : 16.0874
v_2 norm (after condense) : 16.0877
u_1 norm: 21.4243
u_2 norm: 21.4271
error norm between u_1 and u_2 : 0.0262412

With 4 and 5 MPI processes, the same differences remain.
Since v_1 norm (before condense), v_2 norm (before condense), v_1 norm (after condense), and u_1 norm take the same values regardless of the number of MPI processes, these results indicate that constraints.condense(ghosted_vec, nonghost_output);, which condenses v_2 inside the condense(...) function defined at the top of the sample code, does something strange.

Moreover, with 6 MPI processes, the code hangs in that same call, constraints.condense(ghosted_vec, nonghost_output);.

When I change the degree of the finite element basis from 2 to 1 (defined as fe_degree at the top of the main function), the code hangs with 2 or more MPI processes. For fe_degree = 1, the code works correctly only with a single MPI process.

I cannot find the reason why AffineConstraints::condense(const VectorType &vec_ghosted, VectorType &output) does not condense properly.

Regarding the hang in AffineConstraints::condense(const VectorType &vec_ghosted, VectorType &output), I have taken a look at the implementation of this function in include/deal.II/lac/affine_constraints.templates.h.
I paste the implementation below.

template <typename number>
template <class VectorType>
void
AffineConstraints<number>::condense(const VectorType &vec_ghosted,
                                    VectorType &      vec) const
{
  Assert(sorted == true, ExcMatrixNotClosed());

  // if this is called with different arguments, we need to copy the data
  // over:
  if (&vec != &vec_ghosted)
    vec = vec_ghosted;

  // distribute all entries, and set them to zero. do so in two loops
  // because in the first one we need to add to elements and in the second
  // one we need to set elements to zero. for parallel vectors, this can
  // only work if we can put a compress() in between, but we don't want to
  // call compress() twice per entry
  for (const ConstraintLine &line : lines)
    {
      // in case the constraint is inhomogeneous, this function is not
      // appropriate. Throw an exception.
      Assert(line.inhomogeneity == number(0.),
             ExcMessage("Inhomogeneous constraint cannot be condensed "
                        "without any matrix specified."));

      const typename VectorType::value_type old_value = vec_ghosted(line.index);
      for (const std::pair<size_type, number> &entry : line.entries)
        if (vec.in_local_range(entry.first) == true)
          vec(entry.first) +=
            (static_cast<typename VectorType::value_type>(old_value) *
             entry.second);
    }

  vec.compress(VectorOperation::add);

  for (const ConstraintLine &line : lines)
    if (vec.in_local_range(line.index) == true)
      vec(line.index) = 0.;

  vec.compress(VectorOperation::insert);
}

I am not sure about the exact reason, but I found that the deadlock can be avoided by inserting vec.compress(dealii::VectorOperation::add) or vec.compress(dealii::VectorOperation::insert) right after vec = vec_ghosted;. I hope this information helps.
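
For clarity, here is a minimal sketch of that workaround (just the change I tested, not a proposed fix), applied to the snippet quoted above:

  // if this is called with different arguments, we need to copy the data
  // over:
  if (&vec != &vec_ghosted)
    vec = vec_ghosted;

  // workaround I tested: an extra compress() here avoids the deadlock
  // for me (VectorOperation::insert instead of VectorOperation::add also works)
  vec.compress(VectorOperation::add);

  // ... the two loops over `lines` and the existing compress() calls
  // follow unchanged ...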

Finally, I would like to ask a question (though it may be unrelated to this issue).
For AffineConstraints::condense(const VectorType &vec_ghosted, VectorType &output) with a Trilinos MPI vector, the input vector vec_ghosted must be a ghosted vector, which can be initialized as below.

  dealii::TrilinosWrappers::MPI::Vector vec_ghosted;
  bool vector_writable = false; 
  vec_ghosted.reinit(locally_owned_dofs, locally_relevant_dofs, MPI_COMM_WORLD,
                     vector_writable);

In this initialization, it seems that vector_writable must be set to false for the vector to be usable in AffineConstraints::condense(const VectorType &vec_ghosted, VectorType &output).
I have read the deal.II documentation carefully, but I cannot find the reason, because in my understanding vector_writable only controls whether multiple threads can write into the vector at the same time.
I would appreciate it if you could briefly explain this.
