-
Notifications
You must be signed in to change notification settings - Fork 707
/
la_parallel_vector.h
1999 lines (1751 loc) · 72.6 KB
/
la_parallel_vector.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// ---------------------------------------------------------------------
//
// Copyright (C) 2011 - 2022 by the deal.II authors
//
// This file is part of the deal.II library.
//
// The deal.II library is free software; you can use it, redistribute
// it, and/or modify it under the terms of the GNU Lesser General
// Public License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// The full text of the license can be found in the file LICENSE.md at
// the top level directory of deal.II.
//
// ---------------------------------------------------------------------
#ifndef dealii_la_parallel_vector_h
#define dealii_la_parallel_vector_h
#include <deal.II/base/config.h>
#include <deal.II/base/communication_pattern_base.h>
#include <deal.II/base/memory_space.h>
#include <deal.II/base/memory_space_data.h>
#include <deal.II/base/mpi.h>
#include <deal.II/base/numbers.h>
#include <deal.II/base/parallel.h>
#include <deal.II/base/partitioner.h>
#include <deal.II/base/subscriptor.h>
#include <deal.II/base/thread_management.h>
#include <deal.II/lac/vector_operation.h>
#include <deal.II/lac/vector_space_vector.h>
#include <deal.II/lac/vector_type_traits.h>
#include <iomanip>
#include <memory>
DEAL_II_NAMESPACE_OPEN
// Forward declarations
//
// These declarations avoid pulling in the full headers of the block-vector,
// read/write-vector, and (optionally) PETSc/Trilinos wrapper classes, which
// this header only needs by name (e.g. in friend declarations and
// copy/conversion interfaces declared further below).
#ifndef DOXYGEN
namespace LinearAlgebra
{
/**
* A namespace for parallel implementations of vectors.
*/
namespace distributed
{
template <typename>
class BlockVector;
}
template <typename>
class ReadWriteVector;
} // namespace LinearAlgebra
// PETSc vector wrapper, only available when deal.II is configured with PETSc.
# ifdef DEAL_II_WITH_PETSC
namespace PETScWrappers
{
namespace MPI
{
class Vector;
}
} // namespace PETScWrappers
# endif
// Trilinos vector wrapper, only available when deal.II is configured with
// Trilinos.
# ifdef DEAL_II_WITH_TRILINOS
namespace TrilinosWrappers
{
namespace MPI
{
class Vector;
}
} // namespace TrilinosWrappers
# endif
#endif
namespace LinearAlgebra
{
namespace distributed
{
/*! @addtogroup Vectors
*@{
*/
/**
* Implementation of a parallel vector class. The design of this class is
* similar to the standard ::dealii::Vector class in deal.II, with the
* exception that storage is distributed with MPI.
*
* The vector is designed for the following scheme of parallel
* partitioning:
* <ul>
* <li> The indices held by individual processes (locally owned part) in
* the MPI parallelization form a contiguous range
* <code>[my_first_index,my_last_index)</code>.
* <li> Ghost indices residing on arbitrary positions of other processors
* are allowed. It is in general more efficient if ghost indices are
* clustered, since they are stored as a set of intervals. The
* communication pattern of the ghost indices is determined when calling
* the function <code>reinit (locally_owned, ghost_indices,
* communicator)</code>, and retained until the partitioning is changed.
* This allows for efficient parallel communication of indices. In
* particular, it stores the communication pattern, rather than having to
* compute it again for every communication. For more information on ghost
* vectors, see also the
* @ref GlossGhostedVector "glossary entry on vectors with ghost elements".
* <li> Besides the usual global access operator() it is also possible to
* access vector entries in the local index space with the function @p
* local_element(). Locally owned indices are placed first, [0,
* locally_owned_size()), and then all ghost indices follow after them
* contiguously, [locally_owned_size(),
* locally_owned_size()+n_ghost_entries()).
* </ul>
*
* Functions related to parallel functionality:
* <ul>
* <li> The function <code>compress()</code> goes through the data
* associated with ghost indices and communicates it to the owner process,
* which can then add it to the correct position. This can be used e.g.
* after having run an assembly routine involving ghosts that fill this
* vector. Note that the @p insert mode of @p compress() does not set the
* elements included in ghost entries but simply discards them, assuming
* that the owning processor has set them to the desired value already
* (See also the
* @ref GlossCompress "glossary entry on compress").
* <li> The <code>update_ghost_values()</code> function imports the data
* from the owning processor to the ghost indices in order to provide read
* access to the data associated with ghosts.
* <li> It is possible to split the above functions into two phases, where
* the first initiates the communication and the second one finishes it.
* These functions can be used to overlap communication with computations
* in other parts of the code.
* <li> Of course, reduction operations (like norms) make use of
* collective all-to-all MPI communications.
* </ul>
*
* This vector can take two different states with respect to ghost
* elements:
* <ul>
* <li> After creation and whenever zero_out_ghost_values() is called (or
* <code>operator= (0.)</code>), the vector does only allow writing into
* ghost elements but not reading from ghost elements.
* <li> After a call to update_ghost_values(), the vector does not allow
* writing into ghost elements but only reading from them. This is to
* avoid undesired ghost data artifacts when calling compress() after
* modifying some vector entries. The current status of the ghost entries
* (read mode or write mode) can be queried by the method
* has_ghost_elements(), which returns <code>true</code> exactly when
* ghost elements have been updated and <code>false</code> otherwise,
* irrespective of the actual number of ghost entries in the vector layout
* (for that information, use n_ghost_entries() instead).
* </ul>
*
* This vector uses the facilities of the class dealii::Vector<Number> for
* implementing the operations on the local range of the vector. In
* particular, it also inherits thread parallelism that splits most
* vector-vector operations into smaller chunks if the program uses
* multiple threads. This may or may not be desired when working also with
* MPI.
*
* <h4>Limitations regarding the vector size</h4>
*
* This vector class is based on two different number types for indexing.
* The so-called global index type encodes the overall size of the vector.
* Its type is types::global_dof_index. The largest possible value is
* <code>2^32-1</code> or approximately 4 billion in case 64 bit integers
* are disabled at configuration of deal.II (default case) or
* <code>2^64-1</code> or approximately <code>10^19</code> if 64 bit
* integers are enabled (see the glossary entry on
* @ref GlobalDoFIndex
* for further information).
*
* The second relevant index type is the local index used within one MPI
* rank. As opposed to the global index, the implementation assumes 32-bit
* unsigned integers unconditionally. In other words, to actually use a
* vector with more than four billion entries, you need to use MPI with
* more than one rank (which in general is a safe assumption since four
* billion entries consume at least 16 GB of memory for floats or 32 GB of
* memory for doubles) and enable 64-bit indices. If more than 4 billion
* local elements are present, the implementation tries to detect that,
* which triggers an exception and aborts the code. Note, however, that
* the detection of overflow is tricky and the detection mechanism might
* fail in some circumstances. Therefore, it is strongly recommended to
* not rely on this class to automatically detect the unsupported case.
*
* <h4>CUDA support</h4>
*
* This vector class supports two different memory spaces: Host and CUDA. By
* default, the memory space is Host and all the data are allocated on the
* CPU. When the memory space is CUDA, all the data is allocated on the GPU.
* The operations on the vector are performed on the chosen memory space.
*
* From the host, there are two methods to access the elements of the Vector
* when using the CUDA memory space:
* <ul>
* <li> use get_values():
* @code
* Vector<double, MemorySpace::CUDA> vector(local_range, comm);
* double* vector_dev = vector.get_values();
* std::vector<double> vector_host(local_range.n_elements(), 1.);
* Utilities::CUDA::copy_to_dev(vector_host, vector_dev);
* @endcode
* <li> use import():
* @code
* Vector<double, MemorySpace::CUDA> vector(local_range, comm);
* ReadWriteVector<double> rw_vector(local_range);
* for (auto & val : rw_vector)
* val = 1.;
* vector.import(rw_vector, VectorOperations::insert);
* @endcode
* </ul>
* The import method is a lot safer and will perform an MPI communication if
* necessary. Since an MPI communication may be performed, import needs to
* be called on all the processors.
*
* @note By default, all the ranks will try to access the device 0. This is
* fine if you have one rank per node and one GPU per node. If you
* have multiple GPUs on one node, you need each process to access a
* different GPU. If each node has the same number of GPUs, this can be done
* as follows:
* <code> int n_devices = 0; cudaGetDeviceCount(&n_devices); int
* device_id = my_rank % n_devices;
* cudaSetDevice(device_id);
* </code>
*
* <h4>MPI-3 shared-memory support</h4>
*
* In Host mode, this class allows to use MPI-3 shared-memory features
* by providing a separate MPI communicator that consists of processes on
* the same shared-memory domain. By calling
* `vector.shared_vector_data();`,
* users have read-only access to both locally-owned and ghost values of
* processes combined in the shared-memory communicator (@p comm_sm in
* reinit()).
*
* For this to work, you have to call the constructor or one of the reinit()
* functions of this class with a non-default value for the `comm_sm`
* argument, where the argument corresponds to a communicator consisting of
* all processes on the same shared-memory domain. This kind of communicator
* can be created using the following code snippet:
* @code
* MPI_Comm comm_sm;
* MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL,
* &comm_sm);
* @endcode
*
* @see CUDAWrappers
*/
template <typename Number, typename MemorySpace = MemorySpace::Host>
class Vector : public ::dealii::LinearAlgebra::VectorSpaceVector<Number>,
public Subscriptor
{
public:
using memory_space = MemorySpace;
using value_type = Number;
using pointer = value_type *;
using const_pointer = const value_type *;
using iterator = value_type *;
using const_iterator = const value_type *;
using reference = value_type &;
using const_reference = const value_type &;
using size_type = types::global_dof_index;
using real_type = typename numbers::NumberTraits<Number>::real_type;
static_assert(
std::is_same<MemorySpace, ::dealii::MemorySpace::Host>::value ||
std::is_same<MemorySpace, ::dealii::MemorySpace::CUDA>::value,
"MemorySpace should be Host or CUDA");
/**
* @name 1: Basic Object-handling
*/
//@{
/**
* Empty constructor.
*/
Vector();
/**
* Copy constructor. Uses the parallel partitioning of @p in_vector.
* It should be noted that this constructor automatically sets ghost
* values to zero. Call @p update_ghost_values() directly following
* construction if a ghosted vector is required.
*/
Vector(const Vector<Number, MemorySpace> &in_vector);
/**
* Move constructor. Uses the swap method.
*
* @note In order for this constructor to leave the moved-from object in a
* valid state it must allocate memory (in this case, an empty
* partitioner) - hence it cannot be marked as noexcept.
*/
Vector(Vector<Number, MemorySpace> &&in_vector); // NOLINT
/**
* Construct a parallel vector of the given global size without any
* actual parallel distribution.
*/
Vector(const size_type size);
/**
* Construct a parallel vector. The local range is specified by @p
* locally_owned_set (note that this must be a contiguous interval,
* multiple intervals are not possible). The IndexSet @p ghost_indices
* specifies ghost indices, i.e., indices which one might need to read
* data from or accumulate data from. It is allowed that the set of
* ghost indices also contains the local range, but it does not need to.
*
* This function involves global communication, so it should only be
* called once for a given layout. Use the constructor with
* Vector<Number> argument to create additional vectors with the same
* parallel layout.
*
* @see
* @ref GlossGhostedVector "vectors with ghost elements"
*/
Vector(const IndexSet &local_range,
const IndexSet &ghost_indices,
const MPI_Comm &communicator);
/**
* Same constructor as above but without any ghost indices.
*/
Vector(const IndexSet &local_range, const MPI_Comm &communicator);
/**
* Create the vector based on the parallel partitioning described in @p
* partitioner. The input argument is a shared pointer, which stores the
* partitioner data only once and share it between several vectors with
* the same layout.
*/
Vector(
const std::shared_ptr<const Utilities::MPI::Partitioner> &partitioner);
/**
* Destructor.
*/
virtual ~Vector() override;
/**
* Set the global size of the vector to @p size without any actual
* parallel distribution.
*/
void
reinit(const size_type size, const bool omit_zeroing_entries = false);
/**
* Uses the parallel layout of the input vector @p in_vector and
* allocates memory for this vector. Recommended initialization function
* when several vectors with the same layout should be created.
*
* If the flag @p omit_zeroing_entries is set to false, the memory will
* be initialized with zero, otherwise the memory will be untouched (and
* the user must make sure to fill it with reasonable data before using
* it).
*/
template <typename Number2>
void
reinit(const Vector<Number2, MemorySpace> &in_vector,
const bool omit_zeroing_entries = false);
/**
* Initialize the vector. The local range is specified by @p
* locally_owned_set (note that this must be a contiguous interval,
* multiple intervals are not possible). The IndexSet @p ghost_indices
* specifies ghost indices, i.e., indices which one might need to read
* data from or accumulate data from. It is allowed that the set of
* ghost indices also contains the local range, but it does not need to.
*
* This function involves global communication, so it should only be
* called once for a given layout. Use the @p reinit function with
* Vector<Number> argument to create additional vectors with the same
* parallel layout.
*
* @see
* @ref GlossGhostedVector "vectors with ghost elements"
*/
void
reinit(const IndexSet &local_range,
const IndexSet &ghost_indices,
const MPI_Comm &communicator);
/**
* Same as above, but without ghost entries.
*/
void
reinit(const IndexSet &local_range, const MPI_Comm &communicator);
/**
* Initialize the vector given to the parallel partitioning described in
* @p partitioner. The input argument is a shared pointer, which stores
* the partitioner data only once and share it between several vectors
* with the same layout.
*
* The optional argument @p comm_sm, which consists of processes on
* the same shared-memory domain, allows users to have read-only access to
* both locally-owned and ghost values of processes combined in the
* shared-memory communicator. See the general documentation of this class
* for more information about this argument.
*/
void
reinit(
const std::shared_ptr<const Utilities::MPI::Partitioner> &partitioner,
const MPI_Comm &comm_sm = MPI_COMM_SELF);
/**
* Initialize vector with @p local_size locally-owned and @p ghost_size
* ghost degrees of freedoms.
*
* The optional argument @p comm_sm, which consists of processes on
* the same shared-memory domain, allows users to have read-only access to
* both locally-owned and ghost values of processes combined in the
* shared-memory communicator. See the general documentation of this class
* for more information about this argument.
*
* @note In the created underlying partitioner, the local index range is
* translated to global indices in an ascending and one-to-one fashion,
* i.e., the indices of process $p$ sit exactly between the indices of
* the processes $p-1$ and $p+1$, respectively. Setting the
* @p ghost_size variable to an appropriate value provides memory space
* for the ghost data in a vector's memory allocation and allows
* access to it via local_element(). However, the associated global
* indices must be handled externally in this case.
*/
void
reinit(const types::global_dof_index local_size,
const types::global_dof_index ghost_size,
const MPI_Comm & comm,
const MPI_Comm & comm_sm = MPI_COMM_SELF);
/**
* Swap the contents of this vector and the other vector @p v. One could
* do this operation with a temporary variable and copying over the data
* elements, but this function is significantly more efficient since it
* only swaps the pointers to the data of the two vectors and therefore
* does not need to allocate temporary storage and move data around.
*
* This function is analogous to the @p swap function of all C++
* standard containers. Also, there is a global function
* <tt>swap(u,v)</tt> that simply calls <tt>u.swap(v)</tt>, again in
* analogy to standard functions.
*/
void
swap(Vector<Number, MemorySpace> &v);
/**
* Assigns the vector to the parallel partitioning of the input vector
* @p in_vector, and copies all the data.
*
* If one of the input vector or the calling vector (to the left of the
* assignment operator) had ghost elements set before this operation,
* the calling vector will have ghost values set. Otherwise, it will be
* in write mode. If the input vector does not have any ghost elements
* at all, the vector will also update its ghost values in analogy to
* the respective setting of the Trilinos and PETSc vectors.
*/
Vector<Number, MemorySpace> &
operator=(const Vector<Number, MemorySpace> &in_vector);
/**
* Assigns the vector to the parallel partitioning of the input vector
* @p in_vector, and copies all the data.
*
* If one of the input vector or the calling vector (to the left of the
* assignment operator) had ghost elements set before this operation,
* the calling vector will have ghost values set. Otherwise, it will be
* in write mode. If the input vector does not have any ghost elements
* at all, the vector will also update its ghost values in analogy to
* the respective setting of the Trilinos and PETSc vectors.
*/
template <typename Number2>
Vector<Number, MemorySpace> &
operator=(const Vector<Number2, MemorySpace> &in_vector);
//@}
/**
* @name 2: Parallel data exchange
*/
//@{
/**
* This function copies the data that has accumulated in the data buffer
* for ghost indices to the owning processor. For the meaning of the
* argument @p operation, see the entry on
* @ref GlossCompress "Compressing distributed vectors and matrices"
* in the glossary.
*
* There are four variants for this function. If called with argument @p
* VectorOperation::add adds all the data accumulated in ghost elements
* to the respective elements on the owning processor and clears the
* ghost array afterwards. If called with argument @p
* VectorOperation::insert, a set operation is performed. Since setting
* elements in a vector with ghost elements is ambiguous (as one can set
* both the element on the ghost site as well as the owning site), this
* operation makes the assumption that all data is set correctly on the
* owning processor. Upon call of compress(VectorOperation::insert), all
* ghost entries are thus simply zeroed out (using zero_out_ghost_values()).
* In debug mode, a check is performed for whether the data set is
* actually consistent between processors, i.e., whenever a non-zero
* ghost element is found, it is compared to the value on the owning
* processor and an exception is thrown if these elements do not agree.
* If called with VectorOperation::min or VectorOperation::max, the
* minimum or maximum on all elements across the processors is set.
* @note This vector class has a fixed set of ghost entries attached to
* the local representation. As a consequence, all ghost entries are
* assumed to be valid and will be exchanged unconditionally according
* to the given VectorOperation. Make sure to initialize all ghost
* entries with the neutral element of the given VectorOperation or
* touch all ghost entries. The neutral element is zero for
* VectorOperation::add and VectorOperation::insert, `+inf` for
* VectorOperation::min, and `-inf` for VectorOperation::max. If all
* values are initialized with values below zero and compress is called
* with VectorOperation::max two times subsequently, the maximal value
* after the second calculation will be zero.
*/
virtual void
compress(::dealii::VectorOperation::values operation) override;
/**
* Fills the data field for ghost indices with the values stored in the
* respective positions of the owning processor. This function is needed
* before reading from ghosts. The function is @p const even though
* ghost data is changed. This is needed to allow functions with a @p
* const vector to perform the data exchange without creating
* temporaries.
*
* After calling this method, write access to ghost elements of the
* vector is forbidden and an exception is thrown. Only read access to
* ghost elements is allowed in this state. Note that all subsequent
* operations on this vector, like global vector addition, etc., will
* also update the ghost values by a call to this method after the
* operation. However, global reduction operations like norms or the
* inner product will always ignore ghost elements in order to avoid
* counting the ghost data more than once. To allow writing to ghost
* elements again, call zero_out_ghost_values().
*
* @see
* @ref GlossGhostedVector "vectors with ghost elements"
*/
void
update_ghost_values() const;
/**
* Initiates communication for the @p compress() function with non-
* blocking communication. This function does not wait for the transfer
* to finish, in order to allow for other computations during the time
* it takes until all data arrives.
*
* Before the data is actually exchanged, the function must be followed
* by a call to @p compress_finish().
*
* In case this function is called for more than one vector before @p
* compress_finish() is invoked, it is mandatory to specify a unique
* communication channel to each such call, in order to avoid several
* messages with the same ID that will corrupt this operation. Any
* communication channel less than 100 is a valid value (in particular,
* the range $[100, 200)$ is reserved for
* LinearAlgebra::distributed::BlockVector).
*/
void
compress_start(
const unsigned int communication_channel = 0,
::dealii::VectorOperation::values operation = VectorOperation::add);
/**
* For all requests that have been initiated in compress_start, wait for
* the communication to finish. Once it is finished, add or set the data
* (depending on the flag operation) to the respective positions in the
* owning processor, and clear the contents in the ghost data fields.
* The meaning of this argument is the same as in compress().
*
* This function should be called exactly once per vector after calling
* compress_start, otherwise the result is undefined. In particular, it
* is not well-defined to call compress_start on the same vector again
* before compress_finish() has been called. However, there is no
* warning to prevent this situation.
*
* Must follow a call to the @p compress_start function.
*
* When the MemorySpace is CUDA and MPI is not CUDA-aware, data changed on
* the device after the call to compress_start will be lost.
*/
void
compress_finish(::dealii::VectorOperation::values operation);
/**
* Initiates communication for the @p update_ghost_values() function
* with non-blocking communication. This function does not wait for the
* transfer to finish, in order to allow for other computations during
* the time it takes until all data arrives.
*
* Before the data is actually exchanged, the function must be followed
* by a call to @p update_ghost_values_finish().
*
* In case this function is called for more than one vector before @p
* update_ghost_values_finish() is invoked, it is mandatory to specify a
* unique communication channel to each such call, in order to avoid
* several messages with the same ID that will corrupt this operation.
* Any communication channel less than 100 is a valid value (in
* particular, the range $[100, 200)$ is reserved for
* LinearAlgebra::distributed::BlockVector).
*/
void
update_ghost_values_start(
const unsigned int communication_channel = 0) const;
/**
* For all requests that have been started in update_ghost_values_start,
* wait for the communication to finish.
*
* Must follow a call to the @p update_ghost_values_start function
* before reading data from ghost indices.
*/
void
update_ghost_values_finish() const;
/**
* This method zeros the entries on ghost dofs, but does not touch
* locally owned DoFs.
*
* After calling this method, read access to ghost elements of the
* vector is forbidden and an exception is thrown. Only write access to
* ghost elements is allowed in this state.
*
* @deprecated Use zero_out_ghost_values() instead.
*/
DEAL_II_DEPRECATED void
zero_out_ghosts() const;
/**
* This method zeros the entries on ghost dofs, but does not touch
* locally owned DoFs.
*
* After calling this method, read access to ghost elements of the
* vector is forbidden and an exception is thrown. Only write access to
* ghost elements is allowed in this state.
*/
void
zero_out_ghost_values() const;
/**
* Return whether the vector currently is in a state where ghost values
* can be read or not. This is the same functionality as other parallel
* vectors have. If this method returns false, this only means that
* read-access to ghost elements is prohibited whereas write access is
* still possible (to those entries specified as ghosts during
* initialization), not that there are no ghost elements at all.
*
* @see
* @ref GlossGhostedVector "vectors with ghost elements"
*/
bool
has_ghost_elements() const;
/**
* This method copies the data in the locally owned range from another
* distributed vector @p src into the calling vector. As opposed to
* operator= that also includes ghost entries, this operation ignores
* the ghost range. The only prerequisite is that the local range on the
* calling vector and the given vector @p src are the same on all
* processors. It is explicitly allowed that the two vectors have
* different ghost elements that might or might not be related to each
* other.
*
* Since no data exchange is performed, make sure that neither @p src
* nor the calling vector have pending communications in order to obtain
* correct results.
*/
template <typename Number2>
void
copy_locally_owned_data_from(const Vector<Number2, MemorySpace> &src);
/**
* Import all the elements present in the distributed vector @p src.
* VectorOperation::values @p operation is used to decide if the elements
* in @p V should be added to the current vector or replace the current
* elements. The main purpose of this function is to get data from one
* memory space, e.g. CUDA, to the other, e.g. the Host.
*
* @note The partitioners of the two distributed vectors need to be the
* same as no MPI communication is performed.
*/
template <typename MemorySpace2>
void
import(const Vector<Number, MemorySpace2> &src,
VectorOperation::values operation);
//@}
/**
 * @name 3: Implementation of VectorSpaceVector
 */
//@{
/**
 * Change the dimension to that of the vector @p V. The elements of @p V
 * are not copied.
 *
 * @param omit_zeroing_entries If false (the default), the resized
 * vector's entries are presumably zeroed out, while if true they are
 * left in an unspecified state — NOTE(review): inferred from the
 * parameter name, confirm against the implementation.
 */
virtual void
reinit(const VectorSpaceVector<Number> &V,
const bool omit_zeroing_entries = false) override;
/**
 * Multiply the entire vector by a fixed factor.
 *
 * @return A reference to the calling vector.
 */
virtual Vector<Number, MemorySpace> &
operator*=(const Number factor) override;
/**
 * Divide the entire vector by a fixed factor.
 *
 * @return A reference to the calling vector.
 */
virtual Vector<Number, MemorySpace> &
operator/=(const Number factor) override;
/**
 * Add the vector @p V to the present one.
 *
 * @return A reference to the calling vector.
 */
virtual Vector<Number, MemorySpace> &
operator+=(const VectorSpaceVector<Number> &V) override;
/**
 * Subtract the vector @p V from the present one.
 *
 * @return A reference to the calling vector.
 */
virtual Vector<Number, MemorySpace> &
operator-=(const VectorSpaceVector<Number> &V) override;
/**
 * Import all the elements present in the vector's IndexSet from the input
 * vector @p V. VectorOperation::values @p operation is used to decide if
 * the elements in @p V should be added to the current vector or replace the
 * current elements. The last parameter @p communication_pattern can be
 * supplied if the same communication pattern is used multiple times;
 * reusing it avoids recomputing the pattern and can improve performance.
 *
 * @note If the MemorySpace is CUDA, the data in the ReadWriteVector will
 * be moved to the device.
 */
virtual void
import(const LinearAlgebra::ReadWriteVector<Number> &V,
VectorOperation::values operation,
std::shared_ptr<const Utilities::MPI::CommunicationPatternBase>
communication_pattern = {}) override;
/**
 * Return the scalar product of two vectors.
 */
virtual Number
operator*(const VectorSpaceVector<Number> &V) const override;
/**
 * Add @p a to all components. Note that @p a is a scalar not a vector.
 */
virtual void
add(const Number a) override;
/**
 * Simple addition of a multiple of a vector, i.e. <tt>*this += a*V</tt>.
 */
virtual void
add(const Number a, const VectorSpaceVector<Number> &V) override;
/**
 * Multiple addition of scaled vectors, i.e. <tt>*this += a*V+b*W</tt>.
 */
virtual void
add(const Number a,
const VectorSpaceVector<Number> &V,
const Number b,
const VectorSpaceVector<Number> &W) override;
/**
 * A collective add operation: This function adds a whole set of values
 * stored in @p values to the vector components specified by @p indices,
 * i.e. <tt>(*this)(indices[i]) += values[i]</tt> for each entry i. The
 * two arrays are presumably expected to have the same size — confirm
 * against the implementation.
 *
 * @note NOTE(review): unlike the overloads above, this function is
 * declared <tt>virtual</tt> but not marked <tt>override</tt>, so it does
 * not appear to be part of the VectorSpaceVector interface — confirm.
 */
virtual void
add(const std::vector<size_type> &indices,
const std::vector<Number> & values);
/**
 * Scaling and simple addition of a multiple of a vector, i.e. <tt>*this =
 * s*(*this)+a*V</tt>.
 */
virtual void
sadd(const Number s,
const Number a,
const VectorSpaceVector<Number> &V) override;
/**
 * Scale each element of this vector by the corresponding element in the
 * argument, i.e. <tt>(*this)(i) *= scaling_factors(i)</tt>. This
 * function is mostly meant to simulate multiplication (and immediate
 * re-assignment) by a diagonal scaling matrix.
 */
virtual void
scale(const VectorSpaceVector<Number> &scaling_factors) override;
/**
 * Assignment <tt>*this = a*V</tt>.
 */
virtual void
equ(const Number a, const VectorSpaceVector<Number> &V) override;
/**
 * Return the l<sub>1</sub> norm of the vector (i.e., the sum of the
 * absolute values of all entries among all processors).
 */
virtual real_type
l1_norm() const override;
/**
 * Return the $l_2$ norm of the vector (i.e., the square root of
 * the sum of the square of all entries among all processors).
 */
virtual real_type
l2_norm() const override;
/**
 * Return the square of the $l_2$ norm of the vector, i.e., the sum of
 * the squares of all entries among all processors. Equivalent to the
 * square of l2_norm(), but without taking the square root.
 */
real_type
norm_sqr() const;
/**
 * Return the maximum norm of the vector (i.e., the maximum absolute value
 * among all entries and among all processors).
 */
virtual real_type
linfty_norm() const override;
/**
 * Perform a combined operation of a vector addition and a subsequent
 * inner product, returning the value of the inner product. In other
 * words, the result of this function is the same as if the user called
 * @code
 * this->add(a, V);
 * return_value = *this * W;
 * @endcode
 *
 * The reason this function exists is that this operation involves less
 * memory transfer than calling the two functions separately. This method
 * only needs to load three vectors, @p this, @p V, @p W, whereas calling
 * separate methods means to load the calling vector @p this twice. Since
 * most vector operations are memory transfer limited, this reduces the
 * time by 25\% (or 50\% if @p W equals @p this).
 *
 * For complex-valued vectors, the scalar product in the second step is
 * implemented as
 * $\left<v,w\right>=\sum_i v_i \bar{w_i}$.
 */
virtual Number
add_and_dot(const Number a,
const VectorSpaceVector<Number> &V,
const VectorSpaceVector<Number> &W) override;
/**
 * Return the global size of the vector, equal to the sum of the number of
 * locally owned indices among all processors.
 */
virtual size_type
size() const override;
/**
 * Return an index set that describes which elements of this vector are
 * owned by the current processor. As a consequence, if this is a
 * distributed vector, the index sets returned on different processors
 * will form disjoint sets that add up to the complete index set.
 * Obviously, if a vector is created on only one processor, then the
 * result would satisfy
 * @code
 * vec.locally_owned_elements() == complete_index_set(vec.size())
 * @endcode
 */
virtual dealii::IndexSet
locally_owned_elements() const override;
/**
 * Print the vector to the output stream @p out.
 *
 * @param out The stream to print to.
 * @param precision Output precision for the entries; presumably passed
 * on to the stream — confirm against the implementation.
 * @param scientific Whether to use scientific notation for the entries.
 * @param across NOTE(review): presumably selects printing all entries on
 * one line rather than one per line — confirm against the
 * implementation.
 */
virtual void
print(std::ostream & out,
const unsigned int precision = 3,
const bool scientific = true,
const bool across = true) const override;
/**
 * Return the memory consumption of this class in bytes.
 */
virtual std::size_t
memory_consumption() const override;
//@}
/**
 * @name 4: Other vector operations not included in VectorSpaceVector
 */
//@{
/**
 * Sets all elements of the vector to the scalar @p s. If the scalar is
 * zero, also ghost elements are set to zero, otherwise they remain
 * unchanged.
 *
 * @return A reference to the calling vector.
 */
virtual Vector<Number, MemorySpace> &
operator=(const Number s) override;
/**
 * This is a collective add operation that adds a whole set of values
 * stored in @p values to the vector components specified by @p indices,
 * accepting a deal.II Vector whose scalar type may differ from the one
 * of the calling vector.
 */
template <typename OtherNumber>
void
add(const std::vector<size_type> & indices,
const ::dealii::Vector<OtherNumber> &values);
/**
 * Take an address where @p n_elements are stored contiguously and add
 * them into the vector: @p indices and @p values are parallel arrays, so
 * that values[i] is added to the vector entry indices[i] for each
 * i < n_elements.
 */
template <typename OtherNumber>
void
add(const size_type n_elements,
const size_type * indices,
const OtherNumber *values);
/**
 * Scaling and simple vector addition, i.e. <tt>*this =
 * s*(*this)+V</tt>.
 */
void
sadd(const Number s, const Vector<Number, MemorySpace> &V);
//@}
/**
 * @name 5: Entry access and local data representation
 */
//@{
/**
 * Return the local size of the vector, i.e., the number of indices
 * owned locally.
 *
 * @deprecated Use locally_owned_size() instead.
 */
DEAL_II_DEPRECATED
size_type
local_size() const;
/**
 * Return the local size of the vector, i.e., the number of indices
 * owned locally. This is the non-deprecated replacement for
 * local_size().
 */
size_type
locally_owned_size() const;
/**
 * Return true if the given global index is in the local range of this
 * processor, i.e., if it is locally owned.
 */
bool
in_local_range(const size_type global_index) const;
/**
 * Make the @p Vector class a bit like the <tt>vector<></tt> class of
 * the C++ standard library by returning iterators to the start and end
 * of the <i>locally owned</i> elements of this vector.
 *
 * It holds that end() - begin() == locally_owned_size().
 *
 * @note For the CUDA memory space, the iterator points to memory on the
 * device.
 */
iterator
begin();
/**
 * Return a constant iterator to the start of the locally owned elements
 * of the vector.
 *
 * @note For the CUDA memory space, the iterator points to memory on the
 * device.
 */
const_iterator
begin() const;
/**
 * Return an iterator pointing to the element past the end of the array
 * of locally owned entries.
 *
 * @note For the CUDA memory space, the iterator points to memory on the
 * device.
 */
iterator
end();
/**
* Return a constant iterator pointing to the element past the end of
* the array of the locally owned entries.
*