// ---------------------------------------------------------------------
//
// Copyright (C) 2011 - 2022 by the deal.II authors
//
// This file is part of the deal.II library.
//
// The deal.II library is free software; you can use it, redistribute
// it, and/or modify it under the terms of the GNU Lesser General
// Public License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// The full text of the license can be found in the file LICENSE.md at
// the top level directory of deal.II.
//
// ---------------------------------------------------------------------
#ifndef dealii_matrix_free_h
#define dealii_matrix_free_h
#include <deal.II/base/config.h>
#include <deal.II/base/aligned_vector.h>
#include <deal.II/base/exceptions.h>
#include <deal.II/base/quadrature.h>
#include <deal.II/base/template_constraints.h>
#include <deal.II/base/thread_local_storage.h>
#include <deal.II/base/vectorization.h>
#include <deal.II/dofs/dof_handler.h>
#include <deal.II/fe/fe.h>
#include <deal.II/fe/mapping.h>
#include <deal.II/hp/mapping_collection.h>
#include <deal.II/hp/q_collection.h>
#include <deal.II/lac/affine_constraints.h>
#include <deal.II/lac/block_vector_base.h>
#include <deal.II/lac/la_parallel_vector.h>
#include <deal.II/lac/vector_operation.h>
#include <deal.II/matrix_free/dof_info.h>
#include <deal.II/matrix_free/mapping_info.h>
#include <deal.II/matrix_free/shape_info.h>
#include <deal.II/matrix_free/task_info.h>
#include <deal.II/matrix_free/type_traits.h>
#include <cstdlib>
#include <limits>
#include <list>
#include <memory>
DEAL_II_NAMESPACE_OPEN
/**
* This class collects all the data that is stored for the matrix free
* implementation. The storage scheme is tailored towards several loops
* performed with the same data, i.e., typically doing many matrix-vector
* products or residual computations on the same mesh. The class is used in
* step-37 and step-48.
*
* This class does not implement any operations involving finite element basis
* functions, i.e., regarding the operation performed on the cells. For these
* operations, the class FEEvaluation is designed to use the data collected in
* this class.
*
* The stored data can be subdivided into three main components:
*
* - DoFInfo: It stores how local degrees of freedom relate to global degrees
* of freedom. It includes a description of the constraints that are evaluated when
* going through all local degrees of freedom on a cell.
*
* - MappingInfo: It stores the transformations from real to unit cells that
* are necessary in order to build derivatives of finite element functions and
* find the locations of quadrature points in physical space.
*
* - ShapeInfo: It contains the shape functions of the finite element,
* evaluated on the unit cell.
*
* Besides the initialization routines, this class implements only a single
* operation, namely a loop over all cells (cell_loop()). This loop is
* scheduled in such a way that cells that share degrees of freedom are not
* worked on simultaneously, which implies that it is possible to write to
* vectors (or matrices) in parallel without having to explicitly synchronize
* access to these vectors and matrices. This class does not implement any
* shape values; all it does is cache the respective data. To implement
* finite element operations, use the class FEEvaluation (or some of the
* related classes).
*
* This class traverses the cells in a different order than the usual
* Triangulation class in deal.II, in order to be flexible with respect to
* parallelization in shared memory and vectorization.
*
* Vectorization is implemented by merging several topological cells into one
* so-called `cell batch`. This enables the application of all cell-related
* operations for several cells with one CPU instruction and is one of the
* main features of this framework.
*
* For details on usage of this class, see the description of FEEvaluation or
* the
* @ref matrixfree "matrix-free module".
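*
* As a rough orientation, a minimal usage sketch could look as follows. The
* mapping, DoFHandler, constraints, and a `LocalOperator` class with an
* `apply_on_range()` member of the signature expected by cell_loop() are
* assumed to exist; all names and the chosen quadrature are purely
* illustrative:
* @code
* MatrixFree<dim, double> matrix_free;
* typename MatrixFree<dim, double>::AdditionalData additional_data;
* additional_data.mapping_update_flags = update_gradients | update_JxW_values;
* matrix_free.reinit(
*   mapping, dof_handler, constraints, QGauss<1>(fe_degree + 1), additional_data);
*
* LinearAlgebra::distributed::Vector<double> src, dst;
* matrix_free.initialize_dof_vector(src);
* matrix_free.initialize_dof_vector(dst);
*
* // run the cell loop with a member function of LocalOperator
* matrix_free.cell_loop(&LocalOperator::apply_on_range, &local_operator, dst, src);
* @endcode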
*
* @ingroup matrixfree
*/
template <int dim,
typename Number = double,
typename VectorizedArrayType = VectorizedArray<Number>>
class MatrixFree : public Subscriptor
{
static_assert(
std::is_same<Number, typename VectorizedArrayType::value_type>::value,
"Type of Number and of VectorizedArrayType do not match.");
public:
/**
* An alias for the underlying number type specified by the template
* argument.
*/
using value_type = Number;
using vectorized_value_type = VectorizedArrayType;
/**
* The dimension set by the template argument `dim`.
*/
static constexpr unsigned int dimension = dim;
/**
* Collects the options for initialization of the MatrixFree class. The
* parameter @p tasks_parallel_scheme specifies the
* parallelization options in shared memory (task-based parallelism, where
* one can choose between no parallelism and three schemes that prevent
* cells with access to the same vector entries from being worked on
* simultaneously), and the parameter @p tasks_block_size specifies the block size for
* task parallel scheduling. The parameters @p mapping_update_flags,
* @p mapping_update_flags_boundary_faces, @p mapping_update_flags_inner_faces,
* and @p mapping_update_flags_faces_by_cells specify the update flags that
* should be stored by this class.
*
* The parameter @p mg_level specifies the level in the triangulation from which
* the indices are to be used. If the level is set to
* `numbers::invalid_unsigned_int`, the active cells are traversed, and
* otherwise the cells in the given level. This option has no effect in case
* a DoFHandler is given.
*
* The parameter @p store_plain_indices indicates whether the DoFInfo
* class should also allow for access to vectors without resolving
* constraints.
*
* The two parameters @p initialize_indices and @p initialize_mapping allow
* the user to disable some of the initialization processes. For example, if
* only the scheduling that avoids touching the same vector/matrix indices
* simultaneously is to be found, the mapping need not be
* initialized. Likewise, if the mapping has changed from one iteration to
* the next but the topology has not (like when using a deforming mesh with
* MappingQEulerian), it suffices to initialize the mapping only.
*
* The two parameters @p cell_vectorization_category and
* @p cell_vectorization_categories_strict control the formation of batches
* for vectorization over several cells. It is used implicitly when working
* with hp-adaptivity but can also be useful in other contexts, such as in
* local time stepping where one would like to control which elements
* together form a batch of cells. The array @p cell_vectorization_category
* is accessed by the number given by cell->active_cell_index() when working
* on the active cells with @p mg_level set to `numbers::invalid_unsigned_int`
* and by cell->index() for the level cells. By default, the different
* categories in @p cell_vectorization_category can be mixed and the algorithm
* is allowed to merge lower category numbers with the next higher categories
* if it is necessary inside the algorithm, in order to avoid partially
* filled SIMD lanes as much as possible. This gives a better utilization of
* the vectorization but might need special treatment, in particular for
* face integrals. If set to `true`, the algorithm will instead keep
* different categories separate and not mix them in a single vectorized
* array.
*
* Finally, @p allow_ghosted_vectors_in_loops allows one to enable or disable
* the check that vectors passed to the loops are not ghosted, and
* @p communicator_sm gives the MPI communicator to be used
* if MPI-3.0 shared-memory features should be used.
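*
* A short configuration sketch (the concrete flag values are only
* illustrative and depend on what the cell and face kernels evaluate):
* @code
* typename MatrixFree<dim, double>::AdditionalData additional_data;
* additional_data.tasks_parallel_scheme =
*   MatrixFree<dim, double>::AdditionalData::none;
* additional_data.mapping_update_flags =
*   update_gradients | update_JxW_values | update_quadrature_points;
* additional_data.mapping_update_flags_inner_faces =
*   update_gradients | update_JxW_values;
* additional_data.mapping_update_flags_boundary_faces =
*   update_gradients | update_JxW_values;
* @endcode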
*/
struct AdditionalData
{
/**
* Provide the type of the surrounding MatrixFree class.
*/
using MatrixFreeType = MatrixFree<dim, Number, VectorizedArrayType>;
/**
* Collects options for task parallelism. See the documentation of the
* member variable MatrixFree::AdditionalData::tasks_parallel_scheme for a
* thorough description.
*/
enum TasksParallelScheme
{
/**
* Perform application in serial.
*/
none = internal::MatrixFreeFunctions::TaskInfo::none,
/**
* Partition the cells into two levels and afterwards form chunks.
*/
partition_partition =
internal::MatrixFreeFunctions::TaskInfo::partition_partition,
/**
* Partition on the global level and color cells within the partitions.
*/
partition_color =
internal::MatrixFreeFunctions::TaskInfo::partition_color,
/**
* Use the traditional coloring algorithm: this is like
* TasksParallelScheme::partition_color, but only uses one partition.
*/
color = internal::MatrixFreeFunctions::TaskInfo::color
};
/**
* Constructor for AdditionalData.
*/
AdditionalData(
const TasksParallelScheme tasks_parallel_scheme = partition_partition,
const unsigned int tasks_block_size = 0,
const UpdateFlags mapping_update_flags = update_gradients |
update_JxW_values,
const UpdateFlags mapping_update_flags_boundary_faces = update_default,
const UpdateFlags mapping_update_flags_inner_faces = update_default,
const UpdateFlags mapping_update_flags_faces_by_cells = update_default,
const unsigned int mg_level = numbers::invalid_unsigned_int,
const bool store_plain_indices = true,
const bool initialize_indices = true,
const bool initialize_mapping = true,
const bool overlap_communication_computation = true,
const bool hold_all_faces_to_owned_cells = false,
const bool cell_vectorization_categories_strict = false,
const bool allow_ghosted_vectors_in_loops = true)
: tasks_parallel_scheme(tasks_parallel_scheme)
, tasks_block_size(tasks_block_size)
, mapping_update_flags(mapping_update_flags)
, mapping_update_flags_boundary_faces(mapping_update_flags_boundary_faces)
, mapping_update_flags_inner_faces(mapping_update_flags_inner_faces)
, mapping_update_flags_faces_by_cells(mapping_update_flags_faces_by_cells)
, mg_level(mg_level)
, store_plain_indices(store_plain_indices)
, initialize_indices(initialize_indices)
, initialize_mapping(initialize_mapping)
, overlap_communication_computation(overlap_communication_computation)
, hold_all_faces_to_owned_cells(hold_all_faces_to_owned_cells)
, cell_vectorization_categories_strict(
cell_vectorization_categories_strict)
, allow_ghosted_vectors_in_loops(allow_ghosted_vectors_in_loops)
, communicator_sm(MPI_COMM_SELF)
{}
/**
* Copy constructor.
*/
AdditionalData(const AdditionalData &other)
: tasks_parallel_scheme(other.tasks_parallel_scheme)
, tasks_block_size(other.tasks_block_size)
, mapping_update_flags(other.mapping_update_flags)
, mapping_update_flags_boundary_faces(
other.mapping_update_flags_boundary_faces)
, mapping_update_flags_inner_faces(other.mapping_update_flags_inner_faces)
, mapping_update_flags_faces_by_cells(
other.mapping_update_flags_faces_by_cells)
, mg_level(other.mg_level)
, store_plain_indices(other.store_plain_indices)
, initialize_indices(other.initialize_indices)
, initialize_mapping(other.initialize_mapping)
, overlap_communication_computation(
other.overlap_communication_computation)
, hold_all_faces_to_owned_cells(other.hold_all_faces_to_owned_cells)
, cell_vectorization_category(other.cell_vectorization_category)
, cell_vectorization_categories_strict(
other.cell_vectorization_categories_strict)
, allow_ghosted_vectors_in_loops(other.allow_ghosted_vectors_in_loops)
, communicator_sm(other.communicator_sm)
{}
/**
* Copy assignment.
*/
AdditionalData &
operator=(const AdditionalData &other)
{
tasks_parallel_scheme = other.tasks_parallel_scheme;
tasks_block_size = other.tasks_block_size;
mapping_update_flags = other.mapping_update_flags;
mapping_update_flags_boundary_faces =
other.mapping_update_flags_boundary_faces;
mapping_update_flags_inner_faces = other.mapping_update_flags_inner_faces;
mapping_update_flags_faces_by_cells =
other.mapping_update_flags_faces_by_cells;
mg_level = other.mg_level;
store_plain_indices = other.store_plain_indices;
initialize_indices = other.initialize_indices;
initialize_mapping = other.initialize_mapping;
overlap_communication_computation =
other.overlap_communication_computation;
hold_all_faces_to_owned_cells = other.hold_all_faces_to_owned_cells;
cell_vectorization_category = other.cell_vectorization_category;
cell_vectorization_categories_strict =
other.cell_vectorization_categories_strict;
allow_ghosted_vectors_in_loops = other.allow_ghosted_vectors_in_loops;
communicator_sm = other.communicator_sm;
return *this;
}
/**
* Set the scheme for task parallelism. There are four options available.
* If set to @p none, the operator application is done in serial without
* shared memory parallelism. If this class is used together with MPI and
* MPI is also used for parallelism within the nodes, this flag should be
* set to @p none. The default value is @p partition_partition, i.e. we
* actually use multithreading with the first option described below.
*
* The first option @p partition_partition is to partition the cells on
* two levels in onion-skin-like partitions and to form chunks of
* tasks_block_size after the partitioning. The partitioning finds sets of
* independent cells that enable working in parallel without accessing the
* same vector entries at the same time.
*
* The second option @p partition_color is to use a partition on the
* global level and color cells within the partitions (where all chunks
* within a color are independent). Here, the subdivision into chunks of
* cells is done before the partitioning, which might give worse
* partitions but better cache performance if degrees of freedom are not
* renumbered.
*
* The third option @p color is to use a traditional algorithm of coloring
* on the global level. This scheme is a special case of the second option
* where only one partition is present. Note that for problems with
* hanging nodes, there are quite a few colors (50 or more in 3d), which
* might degrade parallel performance (bad cache behavior, many
* synchronization points).
*
* @note Threading support is currently experimental for the case where inner
* face integrals are performed, and it is recommended to use MPI
* parallelism if possible. While the scheme has been verified to work
* with the `partition_partition` option in case of usual DG elements, no
* comprehensive tests have been performed for systems of more general
* elements, like combinations of continuous and discontinuous elements
* that add face integrals to all terms.
*/
TasksParallelScheme tasks_parallel_scheme;
/**
* Set the number of so-called cell batches that should form one
* partition. If a size of zero is given, the class tries to find a good size
* for the blocks based on MultithreadInfo::n_threads() and the number of
* cells present. Otherwise, the given number is used. If the given number
* is larger than one third of the total number of cells, effectively no
* parallelism results. Note that when vectorization is used, a cell batch
* consists of more than one physical cell.
*/
unsigned int tasks_block_size;
/**
* This flag determines what data needs to be computed and cached on cells.
*
* If your computations require operations like quadrature point locations
* or Hessians, these need to be specified here (update_quadrature_points
* or update_hessians, respectively). Note that values, gradients, and
* Jacobian determinants (JxW values) are always computed regardless of the
* flags specified here.
*
* Note that some additional flags might be set automatically (for example
* second derivatives might be evaluated on Cartesian cells since there
* the Jacobian describes the mapping completely).
*/
UpdateFlags mapping_update_flags;
/**
* This flag determines the mapping data on boundary faces to be
* cached. Note that MatrixFree uses a separate loop layout for face
* integrals in order to effectively vectorize also in the case of hanging
* nodes (which require different subface settings on the two sides) or
* some cells in the batch of a VectorizedArray of cells that are adjacent
* to the boundary and others that are not.
*
* If set to a value different from update_default (the default), the face
* information is explicitly built. Currently, MatrixFree supports caching
* the following data on faces: inverse Jacobians, Jacobian
* determinants (JxW), quadrature points, data for Hessians (derivative of
* Jacobians), and normal vectors.
*
* @note In order to be able to perform a `boundary_operation` in the
* MatrixFree::loop(), this field must be set to a value different from
* UpdateFlags::update_default.
*/
UpdateFlags mapping_update_flags_boundary_faces;
/**
* This flag determines the mapping data on interior faces to be
* cached. Note that MatrixFree uses a separate loop layout for face
* integrals in order to effectively vectorize also in the case of hanging
* nodes (which require different subface settings on the two sides) or
* some cells in the batch of a VectorizedArray of cells that are adjacent
* to the boundary and others that are not.
*
* If set to a value different from update_default (the default), the face
* information is explicitly built. Currently, MatrixFree supports caching
* the following data on faces: inverse Jacobians, Jacobian
* determinants (JxW), quadrature points, data for Hessians (derivative of
* Jacobians), and normal vectors.
*
* @note In order to be able to perform a `face_operation`
* in the MatrixFree::loop(), this field must be set to a value different
* from UpdateFlags::update_default.
*/
UpdateFlags mapping_update_flags_inner_faces;
/**
* This flag determines the mapping data for faces in a different layout
* with respect to vectorization. Whereas
* `mapping_update_flags_inner_faces` and
* `mapping_update_flags_boundary_faces` trigger building the data in a
* face-centric way with proper vectorization, the current data field
* attaches the face information to the cells and their way of
* vectorization. This is only needed in special situations, as for
* example for block-Jacobi methods where the full operator of a cell,
* including its faces, is evaluated. This data is accessed by
* <code>FEFaceEvaluation::reinit(cell_batch_index,
* face_number)</code>. However, currently no coupling terms to neighbors
* can be computed with this approach because the neighbors are not laid
* out by the VectorizedArray data layout with an
* array-of-struct-of-array-type data structure.
*
* Note that you should only compute this data field in case you really
* need it as it more than doubles the memory required by the mapping data
* on faces.
*
* If set to a value different from update_default (the default), the face
* information is explicitly built. Currently, MatrixFree supports caching
* the following data on faces: inverse Jacobians, Jacobian
* determinants (JxW), quadrature points, data for Hessians (derivative of
* Jacobians), and normal vectors.
*/
UpdateFlags mapping_update_flags_faces_by_cells;
/**
* This option can be used to define whether we work on a certain level of
* the mesh, and not the active cells. If set to invalid_unsigned_int
* (which is the default value), the active cells are traversed; otherwise,
* the cells on the level given by this parameter are used. Note that if you
* specify to work on a level, its dofs must be distributed by using
* <code>dof_handler.distribute_mg_dofs(fe);</code>.
*/
unsigned int mg_level;
/**
* Controls whether to enable reading from vectors without resolving
* constraints, i.e., just read the local values of the vector. By
* default, this option is enabled. In case you want to use
* FEEvaluationBase::read_dof_values_plain, this flag needs to be set.
*/
bool store_plain_indices;
/**
* Option to control whether the indices stored in the DoFHandler
* should be read and the pattern for task parallelism should be
* set up in the initialize method of MatrixFree. The default
* value is true. Can be disabled in case the mapping should be
* recomputed (e.g. when using a deforming mesh described through
* MappingQEulerian) but the topology of cells has remained the
* same.
*/
bool initialize_indices;
/**
* Option to control whether the mapping information should be
* computed in the initialize method of MatrixFree. The default
* value is true. Can be disabled when only some indices should be
* set up (e.g. when only a set of independent cells should be
* computed).
*/
bool initialize_mapping;
/**
* Option to control whether the loops should overlap communications and
* computations as far as possible in case the vectors passed to the loops
* support non-blocking data exchange. In most situations, overlapping is
* faster in case the amount of data to be sent is more than a few
* kilobytes. If less data is sent, the communication is latency bound on
* most clusters (point-to-point latency is around 1 microsecond on good
* clusters by 2016 standards). Depending on the MPI implementation and
* the fabric, it may be faster to not overlap and wait for the data to
* arrive. The default is true, i.e., communication and computation are
* overlapped.
*/
bool overlap_communication_computation;
/**
* By default, the face part will only hold those faces (and ghost
* elements behind faces) that are going to be processed locally. In case
* MatrixFree should have access to all neighbors on locally owned cells,
* this option enables adding the respective faces at the end of the face
* range.
*/
bool hold_all_faces_to_owned_cells;
/**
* This data structure allows one to assign cells to different
* categories when building the information for vectorization. It is used
* implicitly when working with hp-adaptivity (with each active index
* being a category) but can also be useful in other contexts where one
* would like to control which cells together can form a batch of cells.
* Such an example is "local time stepping", where cells of different
* categories progress with different time-step sizes and, as a
* consequence, can only be processed together with cells of the same
* category.
*
* This array is accessed by the number given by cell->active_cell_index()
* when working on the active cells (with
* @p mg_level set to numbers::invalid_unsigned_int) and by cell->index()
* for the level cells.
*
* @note This field is empty upon construction of AdditionalData. It is
* the responsibility of the user to resize this field to
* `triangulation.n_active_cells()` or `triangulation.n_cells(level)` when
* filling data.
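*
* A sketch for filling this field on the active cells, assuming an
* AdditionalData object `additional_data` and the underlying
* `triangulation`; grouping by material id is purely illustrative, any
* integer criterion works:
* @code
* additional_data.cell_vectorization_category.resize(
*   triangulation.n_active_cells());
* for (const auto &cell : triangulation.active_cell_iterators())
*   if (cell->is_locally_owned())
*     additional_data.cell_vectorization_category[cell->active_cell_index()] =
*       cell->material_id();
* @endcode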
*/
std::vector<unsigned int> cell_vectorization_category;
/**
* By default, the different categories in @p cell_vectorization_category
* can be mixed and the algorithm is allowed to merge lower categories with
* the next higher categories if it is necessary inside the algorithm. This
* gives a better utilization of the vectorization but might need special
* treatment, in particular for face integrals. If set to @p true, the
* algorithm will instead keep different categories separate and not mix
* them in a single vectorized array.
*/
bool cell_vectorization_categories_strict;
/**
* Assert that vectors passed to the MatrixFree loops are not ghosted.
* This variable is primarily intended to reveal bugs or performance
* problems caused by vectors that are involuntarily in ghosted mode,
* by adding a check that this is not the case. In terms of correctness,
* the MatrixFree::loop() and MatrixFree::cell_loop() methods support
* both cases and perform similar operations. In particular, ghost values
* are always updated on the source vector within the loop, and the
* difference is only in whether the initial non-ghosted state is restored.
*/
bool allow_ghosted_vectors_in_loops;
/**
* Shared-memory MPI communicator. Default: MPI_COMM_SELF.
*/
MPI_Comm communicator_sm;
};
/**
* @name 1: Construction and initialization
*/
/** @{ */
/**
* Default empty constructor. Does nothing.
*/
MatrixFree();
/**
* Copy constructor; calls copy_from().
*/
MatrixFree(const MatrixFree<dim, Number, VectorizedArrayType> &other);
/**
* Destructor.
*/
~MatrixFree() override = default;
/**
* Extracts the information needed to perform loops over cells. The
* DoFHandler and AffineConstraints objects describe the layout of degrees
* of freedom, the DoFHandler and the mapping describe the
* transformations from unit to real cell, and the finite element
* underlying the DoFHandler together with the quadrature formula
* describe the local operations. Note that the finite element underlying
* the DoFHandler must either be scalar or contain several copies of the
* same element. Mixing several different elements into one FESystem is
* not allowed. In that case, use the initialization function with
* several DoFHandler arguments.
*/
template <typename QuadratureType, typename number2, typename MappingType>
void
reinit(const MappingType & mapping,
const DoFHandler<dim> & dof_handler,
const AffineConstraints<number2> &constraint,
const QuadratureType & quad,
const AdditionalData & additional_data = AdditionalData());
/**
* Extracts the information needed to perform loops over cells. The
* DoFHandler and AffineConstraints objects describe the layout of degrees of
* freedom, the DoFHandler and the mapping describe the transformations from
* unit to real cell, and the finite element underlying the DoFHandler
* together with the quadrature formula describe the local operations. As
* opposed to the scalar case treated with the other initialization
* functions, this function allows for problems with two or more different
* finite elements. The DoFHandlers to each element must be passed as
* pointers to the initialization function. Alternatively, a system of
* several components may also be represented by a single DoFHandler with an
* FESystem element. The prerequisite for this case is that each base
* element of the FESystem must be compatible with the present class, such
* as the FE_Q or FE_DGQ classes.
*
* This function also allows for using several quadrature formulas, e.g.
* when the description contains independent integrations of elements of
* different degrees. However, the number of different quadrature formulas
* can be set independently from the number of DoFHandlers, when several
* elements are always integrated with the same quadrature formula.
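*
* A brief sketch for a two-field problem (the DoFHandler and
* AffineConstraints objects are assumed to be set up elsewhere; all names
* are illustrative):
* @code
* std::vector<const DoFHandler<dim> *> dof_handlers = {&dof_handler_u, &dof_handler_p};
* std::vector<const AffineConstraints<double> *> constraints = {&constraints_u, &constraints_p};
* std::vector<QGauss<1>> quadratures = {QGauss<1>(degree_u + 1), QGauss<1>(degree_p + 1)};
* matrix_free.reinit(mapping, dof_handlers, constraints, quadratures, additional_data);
* @endcode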
*/
template <typename QuadratureType, typename number2, typename MappingType>
void
reinit(const MappingType & mapping,
const std::vector<const DoFHandler<dim> *> & dof_handler,
const std::vector<const AffineConstraints<number2> *> &constraint,
const std::vector<QuadratureType> & quad,
const AdditionalData &additional_data = AdditionalData());
/**
* Initializes the data structures. Same as before, but now the index set
* description of the locally owned range of degrees of freedom is taken
* from the DoFHandler. Moreover, only a single quadrature formula is used,
* as might be necessary when several components in a vector-valued problem
* are integrated together based on the same quadrature formula.
*/
template <typename QuadratureType, typename number2, typename MappingType>
void
reinit(const MappingType & mapping,
const std::vector<const DoFHandler<dim> *> & dof_handler,
const std::vector<const AffineConstraints<number2> *> &constraint,
const QuadratureType & quad,
const AdditionalData &additional_data = AdditionalData());
/**
* Copy function. Creates a deep copy of all data structures. It is usually
* enough to keep the data for different operations once, so this function
* should not be needed very often.
*/
void
copy_from(
const MatrixFree<dim, Number, VectorizedArrayType> &matrix_free_base);
/**
* Refreshes the geometry data stored in the MappingInfo fields when the
* underlying geometry has changed (e.g. by a mapping that can deform
* through a change in the spatial configuration like MappingFEField)
* whereas the topology of the mesh and unknowns have remained the
* same. Compared to reinit(), this operation only has to re-generate the
* geometry arrays and can thus be significantly cheaper (depending on the
* cost to evaluate the geometry).
*/
void
update_mapping(const Mapping<dim> &mapping);
/**
* Same as above but with hp::MappingCollection.
*/
void
update_mapping(const std::shared_ptr<hp::MappingCollection<dim>> &mapping);
/**
* Clears all data fields and brings the class into a condition similar to
* after having called the default constructor.
*/
void
clear();
/** @} */
/**
* This class defines the type of data access for face integrals in loop()
* that is passed on to the `update_ghost_values` and `compress` functions
* of the parallel vectors, with the purpose of being able to reduce the
* amount of data that must be exchanged. The data exchange is a real
* bottleneck in particular for high-degree DG methods; therefore, a more
* restrictive way of exchange is clearly beneficial. Note that this
* selection applies to FEFaceEvaluation objects assigned to the exterior
* side of cells accessing `FaceToCellTopology::exterior_cells` only; all
* <i>interior</i> objects are available in any case.
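*
* A sketch of how these values are typically passed to MatrixFree::loop()
* (declared further down in this class); the `Operator` class and its
* member functions are assumptions made for illustration:
* @code
* matrix_free.loop(&Operator::local_apply_cell,
*                  &Operator::local_apply_face,
*                  &Operator::local_apply_boundary,
*                  this,
*                  dst,
*                  src,
*                  /*zero_dst_vector=*/true,
*                  MatrixFree<dim, double>::DataAccessOnFaces::gradients,
*                  MatrixFree<dim, double>::DataAccessOnFaces::gradients);
* @endcode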
*/
enum class DataAccessOnFaces
{
/**
* The loop does not involve any FEFaceEvaluation access into neighbors,
* as is the case with only boundary integrals (but no interior face
* integrals) or when doing mass matrices in a MatrixFree::cell_loop()
* like setup.
*/
none,
/**
* The loop only involves FEFaceEvaluation access into neighbors by
* function values, such as FEFaceEvaluation::gather_evaluate() with
* argument EvaluationFlags::values, but no access to shape function
* derivatives (which typically need to access more data). For FiniteElement
* types where only some of the shape functions have support on a face, such
* as an FE_DGQ element with Lagrange polynomials with nodes on the element
* surface, the data exchange is reduced from `(k+1)^dim` to
* `(k+1)^(dim-1)`.
*/
values,
/**
* Same as above. To be used if data has to be accessed from exterior faces
* when FEFaceEvaluation is reinitialized by providing the cell batch number
* and a face number. This configuration is useful in the context of
* cell-centric loops.
*
* @pre AdditionalData::hold_all_faces_to_owned_cells has to be enabled.
*/
values_all_faces,
/**
* The loop does involve FEFaceEvaluation access into neighbors by
* function values and gradients, but no second derivatives, such as
* FEFaceEvaluation::gather_evaluate() with EvaluationFlags::values and
* EvaluationFlags::gradients set. For FiniteElement types where only some
* of the shape functions have non-zero value and first derivative on a
* face, such as an FE_DGQHermite element, the data exchange is reduced,
* e.g. from `(k+1)^dim` to `2(k+1)^(dim-1)`. Note that for bases that do
* not have this special property, the full neighboring data is sent anyway.
*/
gradients,
/**
* Same as above. To be used if data has to be accessed from exterior faces
* when FEFaceEvaluation is reinitialized by providing the cell batch number
* and a face number. This configuration is useful in the context of
* cell-centric loops.
*
* @pre AdditionalData::hold_all_faces_to_owned_cells has to be enabled.
*/
gradients_all_faces,
/**
* General setup where the user does not want to make a restriction. This
* is typically more expensive than the other options, but also the most
* conservative one because the full data of elements behind the faces to
* be computed locally will be exchanged.
*/
unspecified
};
/**
* @name 2: Matrix-free loops
*/
/** @{ */
/**
* This method runs the loop over all cells (in parallel) and performs the
* MPI data exchange on the source vector and destination vector.
*
* @param cell_operation `std::function` with the signature <tt>cell_operation
* (const MatrixFree<dim,Number> &, OutVector &, const InVector &,
* const std::pair<unsigned int,unsigned int> &)</tt> where the first argument
* passes the data of the calling class and the last argument defines the
* range of cells which should be worked on (typically more than one cell
* should be worked on in order to reduce overheads). One can also pass an
* object (e.g., a lambda) in this place if it has an `operator()` with the
* correct set of arguments, since such an object can be converted to the
* function object.
*
* @param dst Destination vector holding the result. If the vector is of
* type LinearAlgebra::distributed::Vector (or composite objects thereof
* such as LinearAlgebra::distributed::BlockVector), the loop calls
* LinearAlgebra::distributed::Vector::compress() at the end of the call
* internally. For other vectors, including parallel Trilinos or PETSc
* vectors, no such call is issued. Note that Trilinos/Epetra or PETSc
* vectors do currently not work in parallel because the present class uses
* MPI-local index addressing, as opposed to the global addressing implied
* by those external libraries.
*
* @param src Input vector. If the vector is of type
* LinearAlgebra::distributed::Vector (or composite objects thereof such as
* LinearAlgebra::distributed::BlockVector), the loop calls
* LinearAlgebra::distributed::Vector::update_ghost_values() at the start of
* the call internally to make sure all necessary data is locally
* available. Note, however, that the vector is reset to its original state
* at the end of the loop, i.e., if the vector was not ghosted upon entry of
* the loop, it will not be ghosted upon finishing the loop.
*
* @param zero_dst_vector If this flag is set to `true`, the vector `dst`
* will be set to zero inside the loop. Use this option in case you perform a
* typical `vmult()` operation on a matrix object, as it will typically be
* faster than calling `dst = 0;` before the loop separately. This is
* because the vector entries are set to zero only on subranges of the
* vector, making sure that the vector entries stay in caches as much as
* possible.
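*
* A sketch of a call with a lambda; a Laplace-like cell kernel is shown for
* illustration only, `dim` and `fe_degree` are assumed to be compile-time
* constants, and the explicit template arguments let the lambda be
* converted to the expected `std::function`:
* @code
* using VectorType = LinearAlgebra::distributed::Vector<double>;
* matrix_free.cell_loop<VectorType, VectorType>(
*   [](const MatrixFree<dim, double> &data,
*      VectorType &dst,
*      const VectorType &src,
*      const std::pair<unsigned int, unsigned int> &cell_range) {
*     FEEvaluation<dim, fe_degree> phi(data);
*     for (unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
*       {
*         phi.reinit(cell);
*         phi.gather_evaluate(src, EvaluationFlags::gradients);
*         for (unsigned int q = 0; q < phi.n_q_points; ++q)
*           phi.submit_gradient(phi.get_gradient(q), q);
*         phi.integrate_scatter(EvaluationFlags::gradients, dst);
*       }
*   },
*   dst,
*   src,
*   true);
* @endcode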
*/
template <typename OutVector, typename InVector>
void
cell_loop(const std::function<void(
const MatrixFree<dim, Number, VectorizedArrayType> &,
OutVector &,
const InVector &,
const std::pair<unsigned int, unsigned int> &)> &cell_operation,
OutVector & dst,
const InVector & src,
const bool zero_dst_vector = false) const;
/**
* This is the second variant to run the loop over all cells, now providing
* a function pointer to a member function of class `CLASS`. This method
* obviates the need to define a lambda function or to call std::bind to bind
* the class into the given
* function in case the local function needs to access data in the class
* (i.e., it is a non-static member function).
*
* @param cell_operation Pointer to member function of `CLASS` with the
* signature <tt>cell_operation (const MatrixFree<dim,Number> &, OutVector &,
* const InVector &, const std::pair<unsigned int,unsigned int> &)</tt> where the first
* argument passes the data of the calling class and the last argument
* defines the range of cells which should be worked on (typically more than
* one cell should be worked on in order to reduce overheads).
*
* @param owning_class The object which provides the `cell_operation`
* call. To be compatible with this interface, the class must allow calling
* `owning_class->cell_operation(...)`.
*
* @param dst Destination vector holding the result. If the vector is of
* type LinearAlgebra::distributed::Vector (or composite objects thereof
* such as LinearAlgebra::distributed::BlockVector), the loop calls
* LinearAlgebra::distributed::Vector::compress() at the end of the call
* internally. For other vectors, including parallel Trilinos or PETSc
* vectors, no such call is issued. Note that Trilinos/Epetra or PETSc
* vectors do currently not work in parallel because the present class uses
* MPI-local index addressing, as opposed to the global addressing implied
* by those external libraries.
*
* @param src Input vector. If the vector is of type
* LinearAlgebra::distributed::Vector (or composite objects thereof such as
* LinearAlgebra::distributed::BlockVector), the loop calls
* LinearAlgebra::distributed::Vector::update_ghost_values() at the start of
* the call internally to make sure all necessary data is locally
* available. Note, however, that the vector is reset to its original state
* at the end of the loop, i.e., if the vector was not ghosted upon entry of
* the loop, it will not be ghosted upon finishing the loop.
*
* @param zero_dst_vector If this flag is set to `true`, the vector `dst`
* will be set to zero inside the loop. Use this option in case you perform a
* typical `vmult()` operation on a matrix object, as it will typically be
* faster than calling `dst = 0;` before the loop separately. This is
* because the vector entries are set to zero only on subranges of the
* vector, making sure that the vector entries stay in caches as much as
* possible.
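*
* A typical call from within an operator class (a sketch only; the
* `LaplaceOperator` class and its `local_apply()` member with the signature
* described above are assumptions):
* @code
* matrix_free.cell_loop(&LaplaceOperator<dim, fe_degree>::local_apply,
*                       this, dst, src, true);
* @endcode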
*/
template <typename CLASS, typename OutVector, typename InVector>
void
cell_loop(void (CLASS::*cell_operation)(
const MatrixFree &,
OutVector &,
const InVector &,
const std::pair<unsigned int, unsigned int> &) const,
const CLASS * owning_class,
OutVector & dst,
const InVector &src,
const bool zero_dst_vector = false) const;
/**
* Same as above, but for class member functions which are non-const.
*/
template <typename CLASS, typename OutVector, typename InVector>
void
cell_loop(void (CLASS::*cell_operation)(
const MatrixFree &,
OutVector &,
const InVector &,
const std::pair<unsigned int, unsigned int> &),
CLASS * owning_class,
OutVector & dst,
const InVector &src,
const bool zero_dst_vector = false) const;
/**
* This function is similar to the cell_loop with a std::function object to
* specify the operation to be performed on cells, but adds two additional
* functors to execute some additional work before and after the cell
* integrals are computed.
*
* The two additional functors work on a range of degrees of freedom,
* expressed in terms of the degree-of-freedom numbering of the selected
* DoFHandler `dof_handler_index_pre_post` in MPI-local indices. The
* arguments to the functors represent a range of degrees of freedom at a
* granularity of
* internal::MatrixFreeFunctions::DoFInfo::chunk_size_zero_vector entries
* (except for the last chunk which is set to the number of locally owned
* entries) in the form `[first, last)`. The idea of these functors is to
* bring operations on vectors closer to the point where they are accessed in a
* matrix-free loop, with the goal of increasing cache hits by temporal
* locality. This loop guarantees that the `operation_before_loop` hits all
* relevant unknowns before they are first touched by the `cell_operation`
* (including the MPI data exchange), allowing one to execute some vector update
* that the `src` vector depends upon. The `operation_after_loop` is similar:
* it starts to execute on a range of DoFs once all DoFs in that range
* have been touched for the last time by the `cell_operation`
* (including the MPI data exchange), allowing, e.g., to compute some vector
* operations that depend on the result of the current cell loop in `dst` or
* modify `src`. The efficiency of caching depends on the numbering
* of the degrees of freedom because of the granularity of the ranges.
*
* @param cell_operation Pointer to member function of `CLASS` with the
* signature <tt>cell_operation (const MatrixFree<dim,Number> &, OutVector &,
* const InVector &, const std::pair<unsigned int,unsigned int> &)</tt> where the first
* argument passes the data of the calling class and the last argument
* defines the range of cells which should be worked on (typically more than
* one cell should be worked on in order to reduce overheads).
*
* @param owning_class The object which provides the `cell_operation`
* call. To be compatible with this interface, the class must allow calling
* `owning_class->cell_operation(...)`.
*
* @param dst Destination vector holding the result. If the vector is of
* type LinearAlgebra::distributed::Vector (or composite objects thereof
* such as LinearAlgebra::distributed::BlockVector), the loop calls
* LinearAlgebra::distributed::Vector::compress() at the end of the call
* internally. For other vectors, including parallel Trilinos or PETSc
* vectors, no such call is issued. Note that Trilinos/Epetra or PETSc
* vectors do currently not work in parallel because the present class uses
* MPI-local index addressing, as opposed to the global addressing implied
* by those external libraries.
*
* @param src Input vector. If the vector is of type
* LinearAlgebra::distributed::Vector (or composite objects thereof such as
* LinearAlgebra::distributed::BlockVector), the loop calls
* LinearAlgebra::distributed::Vector::update_ghost_values() at the start of
* the call internally to make sure all necessary data is locally
* available. Note, however, that the vector is reset to its original state
* at the end of the loop, i.e., if the vector was not ghosted upon entry of
* the loop, it will not be ghosted upon finishing the loop.
*
* @param operation_before_loop This functor can be used to perform an
* operation on entries of the `src` and `dst` vectors (or other vectors)
* before the operation on cells first touches a particular DoF according to
* the general description in the text above. This function is passed a
* range of the locally owned degrees of freedom on the selected
* `dof_handler_index_pre_post` (in MPI-local numbering).
*
* @param operation_after_loop This functor can be used to perform an
* operation on entries of the `src` and `dst` vectors (or other vectors)
* after the operation on cells last touches a particular DoF according to
* the general description in the text above. This function is passed a
* range of the locally owned degrees of freedom on the selected
* `dof_handler_index_pre_post` (in MPI-local numbering).
*
* @param dof_handler_index_pre_post Since MatrixFree can be initialized
* with a vector of DoFHandler objects, each of them will in general have
* different vector sizes and thus different ranges returned to
* `operation_before_loop` and `operation_after_loop`. Use this variable to
* specify which one of the DoFHandler objects the index range should be
* associated to. Defaults to the `dof_handler_index` 0.
*
* @note The close locality of the `operation_before_loop` and
* `operation_after_loop` is currently only implemented for the MPI-only
* case. In case threading is enabled, the complete `operation_before_loop`
* is scheduled before the parallel loop, and `operation_after_loop` is
* scheduled strictly afterwards, due to the complicated dependencies.
*/
template <typename CLASS, typename OutVector, typename InVector>
void
cell_loop(void (CLASS::*cell_operation)(
const MatrixFree &,
OutVector &,
const InVector &,
const std::pair<unsigned int, unsigned int> &) const,
const CLASS * owning_class,
OutVector & dst,
const InVector &src,
const std::function<void(const unsigned int, const unsigned int)>
&operation_before_loop,
const std::function<void(const unsigned int, const unsigned int)>
& operation_after_loop,
const unsigned int dof_handler_index_pre_post = 0) const;
/**
* Same as above, but for class member functions which are non-const.
*/
template <typename CLASS, typename OutVector, typename InVector>
void
cell_loop(void (CLASS::*cell_operation)(
const MatrixFree &,
OutVector &,
const InVector &,
const std::pair<unsigned int, unsigned int> &),
CLASS * owning_class,
OutVector & dst,
const InVector &src,
const std::function<void(const unsigned int, const unsigned int)>
&operation_before_loop,
const std::function<void(const unsigned int, const unsigned int)>
& operation_after_loop,
const unsigned int dof_handler_index_pre_post = 0) const;
/**
* Same as above, but taking an `std::function` as the `cell_operation`
* rather than a class member function.
*/
template <typename OutVector, typename InVector>