-
Notifications
You must be signed in to change notification settings - Fork 58
/
relocation.h
165 lines (141 loc) · 6.84 KB
/
relocation.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#ifndef PARLAY_RELOCATION_H_
#define PARLAY_RELOCATION_H_
#include <cstring>
#include <algorithm>
#include <iterator>
#include <memory>
#include <new> // IWYU pragma: keep
#include <type_traits>
#include <utility>
#include "parallel.h"
#include "range.h" // IWYU pragma: keep
#include "type_traits.h" // IWYU pragma: keep
#include "utilities.h" // IWYU pragma: keep
#include "internal/debug_uninitialized.h"
namespace parlay {
/*
Relocation (a.k.a. "destructive move")
The relocation of object a into memory b is equivalent to a move
construction of a into b, followed by the destruction of what
remains at a. In other words, it is
new (b) T(std::move(*a));
a->~T();
For many types, however, this can be optimized by replacing it with just
std::memcpy(b, a, sizeof(T));
We call any such types trivially relocatable. This is clearly
true for any trivial type, but it also turns out to be true
for most other standard types, such as vectors, unique_ptrs,
and more. The key observation is that the only reason that
the move operations of these types is non-trivial is because
they must clear out the source object after moving from it.
If, however, the source object is treated as uninitialized
memory after relocation, then it does not have to be cleared
out, since its destructor will not run.
A strong motivating use case for relocation is in dynamically
sized containers (e.g. vector, or parlay::sequence). When
performing a resize operation, one has to move all of the
contents of the old buffer into the new one, and destroy
the contents of the old buffer, like so (ignoring allocator
details)
parallel_for(0, n, [&](size_t i) {
new (&new_buffer[i]) std::move(current_buffer[i]));
current_buffer[i].~value_type();
});
This can be replaced with
parallel_for(0, n, [&](size_t i) {
uninitialized_relocate(&new_buffer[i], ¤t_buffer[i]);
});
or even, for better performance yet
uninitialized_relocate_n(new_buffer, current_buffer, n);
The uninitialized_relocate functions will use the optimized
memcpy-based approach for any types for which it is suitable,
and otherwise, will fall back to the generic approach.
*/
// Relocate the given range of n elements [from, from + n) into uninitialized
// memory at [to, to + n), such that both the source and destination memory
// were allocated by the given allocator.
template<typename It1, typename It2, typename Alloc>
inline void uninitialized_relocate_n_a(It1 to, It2 from, size_t n, Alloc& alloc) {
using T = typename std::iterator_traits<It1>::value_type;
static_assert(std::is_same_v<typename std::iterator_traits<It2>::value_type, T>);
static_assert(std::is_same_v<typename std::allocator_traits<Alloc>::value_type, T>);
constexpr bool trivially_relocatable = is_trivially_relocatable_v<T>;
constexpr bool trivial_alloc = is_trivial_allocator_v<Alloc, T>;
constexpr bool contiguous = is_contiguous_iterator_v<It1> && is_contiguous_iterator_v<It2>;
constexpr bool random_access = is_random_access_iterator_v<It1> && is_random_access_iterator_v<It2>;
// The most efficient scenario -- The objects are trivially relocatable, the allocator
// has no special behaviour, and the iterators point to contiguous memory so we can
// memcpy chunks of more than one T object at a time.
if constexpr (trivially_relocatable && contiguous && trivial_alloc) {
constexpr size_t chunk_size = 1024 * sizeof(size_t) / sizeof(T);
const size_t n_chunks = (n + chunk_size - 1) / chunk_size;
parallel_for(0, n_chunks, [&](size_t i) {
size_t n_objects = (std::min)(chunk_size, n - i * chunk_size);
size_t n_bytes = sizeof(T) * n_objects;
void* src = static_cast<void*>(std::addressof(*(from + i * chunk_size)));
void* dest = static_cast<void*>(std::addressof(*(to + i * chunk_size)));
std::memcpy(dest, src, n_bytes);
}, 1);
// The next best thing -- If the objects are trivially relocatable and the allocator
// has no special behaviour, so long as the iterators are random access, we can still
// relocate everything in parallel, just not by memcpying multiple objects at a time
} else if constexpr (trivially_relocatable && random_access && trivial_alloc) {
constexpr size_t chunk_size = 1024 * sizeof(size_t) / sizeof(T);
const size_t n_chunks = (n + chunk_size - 1) / chunk_size;
parallel_for(0, n_chunks, [&](size_t i) {
for (size_t j = 0; j < chunk_size && (j + i *chunk_size < n); j++) {
void* src = static_cast<void*>(std::addressof(from[j + i * chunk_size]));
void* dest = static_cast<void*>(std::addressof(to[j + i * chunk_size]));
std::memcpy(dest, src, sizeof(T));
}
}, 1);
}
// The iterators are not random access, but we can still relocate, just not in parallel
else if constexpr (trivially_relocatable && trivial_alloc) {
for (size_t i = 0; i < n; i++) {
std::memcpy(std::addressof(*to), std::addressof(*from), sizeof(T));
to++;
from++;
}
}
// After this point, the object can not be trivially relocated, either because it is not
// trivially relocatable, or because the allocator has specialized behaviour. We now fall
// back to just doing a "destructive move" manually.
else if constexpr (random_access) {
static_assert(std::is_move_constructible<T>::value);
static_assert(std::is_destructible<T>::value);
parallel_for(0, n, [&](size_t i) {
PARLAY_ASSERT_UNINITIALIZED(to[i]);
std::allocator_traits<Alloc>::construct(alloc, std::addressof(to[i]), std::move(from[i]));
std::allocator_traits<Alloc>::destroy(alloc, std::addressof(from[i]));
PARLAY_ASSERT_UNINITIALIZED(from[i]);
});
}
// The worst case. No parallelism and no fast relocation.
else {
static_assert(std::is_move_constructible<T>::value);
static_assert(std::is_destructible<T>::value);
for (size_t i = 0; i < n; i++) {
PARLAY_ASSERT_UNINITIALIZED(*to);
std::allocator_traits<Alloc>::construct(alloc, std::addressof(*to), std::move(*from));
std::allocator_traits<Alloc>::destroy(alloc, std::addressof(*from));
PARLAY_ASSERT_UNINITIALIZED(*from);
to++;
from++;
}
}
}
// Relocate the given range of n elements [from, from + n) into uninitialized
// memory at [to, to + n). Relocation is done as if the memory was allocated
// by a standard allocator (e.g. std::allocator, parlay::allocator, malloc)
//
// For an allocator-aware version that respects the construction and destruction
// behaviour of the allocator, use uninitialized_relocate_n_a.
template<typename Iterator1, typename Iterator2>
inline void uninitialized_relocate_n(Iterator1 to, Iterator2 from, size_t n) {
using T = typename std::iterator_traits<Iterator1>::value_type;
std::allocator<T> a;
uninitialized_relocate_n_a(to, from, n, a);
}
}
#endif // PARLAY_RELOCATION_H_