264 changes: 184 additions & 80 deletions benchmark/bench_standard.cpp

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions cmake/dtest.cmake
function(add_dtests)

# Parse arguments
set(options OPTIONAL "")
set(options NO_SANITIZE)
set(oneValueArgs NAME)
set(multiValueArgs FILES LIBS FLAGS)
cmake_parse_arguments(CONFIGURE_ADD_DTESTS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(TESTFILES ${CONFIGURE_ADD_DTESTS_FILES})
set(TESTLIBS ${CONFIGURE_ADD_DTESTS_LIBS})
set(TESTFLAGS ${CONFIGURE_ADD_DTESTS_FLAGS})
set(NO_SANITIZE ${CONFIGURE_ADD_DTESTS_NO_SANITIZE})

# Vanilla test -- no instrumentation
add_dtest("${TESTNAME}" "${TESTFILES}" "" "nosan" gtest_main "${TESTLIBS}" "${TESTFLAGS}")

# Test with ASAN (AddressSanitizer)
if(BUILD_ASAN_TESTS)
if(BUILD_ASAN_TESTS AND NOT NO_SANITIZE)
add_dtest("${TESTNAME}" "${TESTFILES}" "${DTEST_ASAN_FLAGS}" "asan" gtest_main "${TESTLIBS}" "${TESTFLAGS}")
endif()

# Test with UBSAN (UndefinedBehaviourSanitizer)
if(BUILD_UBSAN_TESTS)
if(BUILD_UBSAN_TESTS AND NOT NO_SANITIZE)
add_dtest("${TESTNAME}" "${TESTFILES}" "${DTEST_UBSAN_FLAGS}" "ubsan" gtest_main "${TESTLIBS}" "${TESTFLAGS}")
endif()

# Test with TSAN (ThreadSanitizer)
if(BUILD_TSAN_TESTS)
if(BUILD_TSAN_TESTS AND NOT NO_SANITIZE)
add_dtest("${TESTNAME}" "${TESTFILES}" "${DTEST_TSAN_FLAGS}" "tsan" gtest_main "${TESTLIBS}" "${TESTFLAGS}")
endif()

# Test with MSAN (MemorySanitizer)
if (BUILD_MSAN_TESTS)
if (BUILD_MSAN_TESTS AND NOT NO_SANITIZE)
add_dtest("${TESTNAME}" "${TESTFILES}" "${MSAN_FLAGS}" "msan" gtest_main-msan "${TESTLIBS}" "${TESTFLAGS}")
add_msan_instrumentation(${TESTNAME}-msan)
endif()
15 changes: 9 additions & 6 deletions include/parlay/delayed_sequence.h
#include <iterator>
#include <stdexcept>
#include <string>
#include <type_traits>

namespace parlay {

// Note that, counterintuitively, reference is not necessarily
// a reference type, but rather, it refers to the type returned
// by dereferencing the iterator. For a delayed sequence, this
// must be a value type since elements are generated on demand.
// could be a value type since elements are generated on demand.
using value_type = T;
using reference = T;
using const_reference = T;
using iterator_category = std::random_access_iterator_tag;
using value_type = T;
using difference_type = std::ptrdiff_t;
using pointer = const T*;
using pointer = typename std::add_pointer<T>::type;
using reference = T;

friend class delayed_sequence<T, F>;

// ---- Requirements for input iterator ----

iterator operator++(int) const {
iterator operator++(int) {
assert(index < parent->last);
return iterator(parent, index+1);
auto tmp = *this;
index++;
return tmp;
}

bool operator==(const iterator& other) const {
return *this;
}

iterator operator--(int) const {
iterator operator--(int) {
assert(index > parent->first);
return iterator(parent, index-1);
auto tmp = *this;
index--;
return tmp;
}

// ---- Requirements for random access iterator ----
56 changes: 36 additions & 20 deletions include/parlay/internal/bucket_sort.h

for (std::ptrdiff_t j = n - 1; j >= 0; j--) {
auto x = --counts[keys[j]];
B[x] = std::move(A[j]);
uninitialized_relocate(&B[x], &A[j]);
}
}

template <typename InIterator, typename OutIterator>
void to_heap_order(InIterator In,
OutIterator Out,
size_t root,
size_t l,
size_t r) {
// Given the sequence In[l, r), write to Out the values of
// In arranged in a balanced tree. Values are indexed using
// the standard implicit tree ordering (a la binary heaps)
//
// root (i)
// / \ |
// left (2i+1) right(2i+2)
//
// i.e. the root of the tree is at index 0, and the left
// and right children of the node at index i are at indices
// 2i+1 and 2i+2 respectively.
//
// Template parameters: assignment_tag: one of copy_assign_tag,
// move_assign_tag, uninitialized_copy_tag, uninitialized_move_tag,
// or uninitialized_relocate_tag. Specifies the mode by which
// values are relocated from In to Out.
template <typename assignment_tag, typename InIterator, typename OutIterator>
void to_balanced_tree(InIterator In,
OutIterator Out,
size_t root,
size_t l,
size_t r) {
size_t n = r - l;
size_t m = l + n / 2;
Out[root] = std::move(In[m]);
assign_dispatch(Out[root], In[m], assignment_tag());
if (n == 1) return;
to_heap_order(In, Out, 2 * root + 1, l, m);
to_heap_order(In, Out, 2 * root + 2, m + 1, r);
to_balanced_tree<assignment_tag>(In, Out, 2 * root + 1, l, m);
to_balanced_tree<assignment_tag>(In, Out, 2 * root + 2, m + 1, r);
}

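// For intuition, a hedged sketch (hypothetical pivot values, not from the
// patch) of the layout to_balanced_tree produces, and of the branchless
// descent that get_buckets below runs over it:
//
//   sorted pivots [10, 20, 30, 40, 50, 60, 70] in implicit tree order:
//     index:  0   1   2   3   4   5   6
//     value: 40  20  60  10  30  50  70
//
//   size_t j = 0;
//   for (size_t k = 0; k < rounds; k++) j = 1 + 2*j + !f(x, pivot_tree[j]);
//   // after descending all levels, j - num_pivots is the bucket index of x
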
// returns true if all equal
// TODO: Can we avoid the indirection of having to refer to
// the pivots via their indices? Copying the elements is
bad because it doesn't work for non-copyable types, but
// there should be some middle ground.
template <typename Iterator, typename BinaryOp>
bool get_buckets(slice<Iterator, Iterator> A,
unsigned char* buckets,

if (!f(A[pivots[0]], A[pivots[num_pivots - 1]])) return true;

auto heap = sample_set.begin();
to_heap_order(pivots.begin(), heap, 0, 0, num_pivots);
auto pivot_tree = sample_set.begin();
to_balanced_tree<copy_assign_tag>(pivots.begin(), pivot_tree, 0, 0, num_pivots);

for (size_t i = 0; i < n; i++) {
size_t j = 0;
for (size_t k = 0; k < rounds; k++) j = 1 + 2 * j + !f(A[i], A[heap[j]]);
for (size_t k = 0; k < rounds; k++) j = 1 + 2 * j + !f(A[i], A[pivot_tree[j]]);
assert(j - num_pivots <= (std::numeric_limits<unsigned char>::max)());
buckets[i] = static_cast<unsigned char>(j - num_pivots);
}
bool stable,
bool inplace) {
if (stable) {
auto n = in.size();
size_t levels = n > MERGE_SORT_BASE ? log2_up(n / MERGE_SORT_BASE) : 0;
merge_sort_(in, out, f, levels, inplace);
merge_sort_(in, out, f, inplace);
}
else {
quicksort(in.begin(), in.size(), f);
if (!inplace) {
std::copy(std::make_move_iterator(in.begin()),
std::make_move_iterator(in.end()),
out.begin());
uninitialized_relocate_n(out.begin(), in.begin(), in.size());
}
}
}
void bucket_sort(slice<Iterator, Iterator> in, BinaryOp f, bool stable = false) {
using T = typename slice<Iterator, Iterator>::value_type;
size_t n = in.size();
auto tmp = sequence<T>::uninitialized(n);
auto tmp = uninitialized_sequence<T>(n);
bucket_sort_r(make_slice(in), make_slice(tmp), f, stable, true);
}

5 changes: 3 additions & 2 deletions include/parlay/internal/collect_reduce.h
#include "integer_sort.h"
#include "sequence_ops.h"
#include "transpose.h"
#include "uninitialized_sequence.h"

#include "../utilities.h"

using hasheq = hasheq_mask_low<key_type>;
get_bucket<T, key_type, hasheq, Key> gb(A, hasheq(), get_key, bits);
sequence<T> B = sequence<T>::uninitialized(n);
sequence<T> Tmp = sequence<T>::uninitialized(n);
auto Tmp = uninitialized_sequence<T>(n);

// first partition into blocks based on hash using a counting sort
sequence<size_t> block_offsets;
block_offsets = integer_sort_<std::false_type, std::true_type, std::true_type, std::true_type>(
block_offsets = integer_sort_<std::false_type, uninitialized_copy_tag>(
make_slice(A), make_slice(B), make_slice(Tmp) , gb, bits, num_blocks);

// note that this is cache line alligned
71 changes: 44 additions & 27 deletions include/parlay/internal/counting_sort.h

#include "sequence_ops.h"
#include "transpose.h"
#include "uninitialized_sequence.h"

#include "../utilities.h"

// TODO
// Make sure works for inplace or not with regards to move_uninitialized

namespace parlay {
namespace internal {
}

// write to destination, where offsets give start of each bucket
template <typename copy_tag, typename out_uninitialized_tag,
typename InSeq, typename OutIterator, typename CountIterator, typename KeySeq>
template <typename assignment_tag, typename InSeq, typename OutIterator, typename CountIterator, typename KeySeq>
void seq_write_(InSeq In, OutIterator Out, KeySeq Keys,
CountIterator offsets, size_t num_buckets) {
// copy to local offsets to avoid false sharing
auto local_offsets = sequence<size_t>::uninitialized(num_buckets);
for (size_t i = 0; i < num_buckets; i++) local_offsets[i] = offsets[i];
for (size_t j = 0; j < In.size(); j++) {
size_t k = local_offsets[Keys[j]]++;
assign_dispatch(Out[k], In[j], copy_tag(), out_uninitialized_tag());
assign_dispatch(Out[k], In[j], assignment_tag());
}
}

// write to destination, where offsets give end of each bucket
template <typename copy_tag, typename out_uninitialized_tag,
typename InSeq, typename OutIterator, typename OffsetIterator, typename KeySeq>
template <typename assignment_tag, typename InSeq, typename OutIterator, typename OffsetIterator, typename KeySeq>
void seq_write_down_(InSeq In, OutIterator Out, KeySeq Keys,
OffsetIterator offsets, size_t) { // num_buckets) {
for (std::ptrdiff_t j = In.size() - 1; j >= 0; j--) {
auto k = --offsets[Keys[j]];
assign_dispatch(Out[k], In[j], copy_tag(), out_uninitialized_tag());
assign_dispatch(Out[k], In[j], assignment_tag());
}
}

// Sequential counting sort internal
template <typename copy_tag, typename out_uninitialized_tag,
typename InS, typename OutS, typename CountIterator, typename KeyS>
template <typename assignment_tag, typename InS, typename OutS, typename CountIterator, typename KeyS>
void seq_count_sort_(InS In, OutS Out, KeyS Keys, CountIterator counts,
size_t num_buckets) {
// count size of each bucket
}

// send to destination
seq_write_down_<copy_tag, out_uninitialized_tag>(In, Out.begin(), Keys, counts, num_buckets);
seq_write_down_<assignment_tag>(In, Out.begin(), Keys, counts, num_buckets);
}

// Sequential counting sort
template <typename copy_tag, typename out_uninitialized_tag,
typename InIterator, typename OutIterator, typename KeyIterator>
template <typename assignment_tag, typename InIterator, typename OutIterator, typename KeyIterator>
sequence<size_t> seq_count_sort(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
slice<KeyIterator, KeyIterator> Keys,
size_t num_buckets) {
auto counts = sequence<size_t>::uninitialized(num_buckets + 1);
seq_count_sort_<copy_tag, out_uninitialized_tag>(In, Out, Keys, counts.begin(), num_buckets);
seq_count_sort_<assignment_tag>(In, Out, Keys, counts.begin(), num_buckets);
counts[num_buckets] = In.size();
return counts;
}
// returns counts, and a flag
// If skip_if_in_one and the returned flag is true, then the input was already
// sorted, and it has not been moved to the output.
template <typename copy_tag, typename out_uninitialized_tag,
typename s_size_t, typename InIterator, typename OutIterator, typename KeyIterator>
//
// Values are transferred from In to Out as per the type of assignment_tag.
// E.g., if assignment_tag is parlay::copy_assign_tag, values are copied;
// if it is parlay::uninitialized_move_tag, they are moved, assuming that
// Out is uninitialized; etc.
template <typename assignment_tag, typename s_size_t, typename InIterator, typename OutIterator, typename KeyIterator>
std::pair<sequence<size_t>, bool> count_sort_(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
slice<KeyIterator, KeyIterator> Keys,
// if insufficient parallelism, sort sequentially
if (n < SEQ_THRESHOLD || num_blocks == 1 || num_threads == 1) {
return std::make_pair(
seq_count_sort<copy_tag, out_uninitialized_tag>(In, Out, Keys, num_buckets),
seq_count_sort<assignment_tag>(In, Out, Keys, num_buckets),
false);
}

size_t num_non_zero = 0;
for (size_t i = 0; i < num_buckets; i++)
num_non_zero += (bucket_offsets[i] > 0);
size_t total = scan_inplace(make_slice(bucket_offsets), addm<size_t>());
[[maybe_unused]] size_t total = scan_inplace(make_slice(bucket_offsets), addm<size_t>());
if (skip_if_in_one && num_non_zero == 1) {
return std::make_pair(std::move(bucket_offsets), true);
}
[&](size_t i) {
size_t start = (std::min)(i * block_size, n);
size_t end = (std::min)(start + block_size, n);
seq_write_<copy_tag, out_uninitialized_tag>(In.cut(start, end), Out.begin(),
seq_write_<assignment_tag>(In.cut(start, end), Out.begin(),
make_slice(Keys).cut(start, end),
counts2.begin() + i * num_buckets, num_buckets);
},

// If skip_if_in_one and the returned flag is true, then the input was already
// sorted, and it has not been moved to the output.
template <typename copy_tag, typename out_uninitialized_tag,
typename InIterator, typename OutIterator, typename KeyIterator>
//
// Values are transferred from In to Out as per the type of assignment_tag.
// E.g., if assignment_tag is parlay::copy_assign_tag, values are copied;
// if it is parlay::uninitialized_move_tag, they are moved, assuming that
// Out is uninitialized. assignment_tag can also be uninitialized_relocate_tag,
// in which case the inputs are destructively moved, leaving the input
// as uninitialized memory that must not be destroyed.
template <typename assignment_tag, typename InIterator, typename OutIterator, typename KeyIterator>
auto count_sort(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
slice<KeyIterator, KeyIterator> Keys,
size_t max32 = static_cast<size_t>((std::numeric_limits<uint32_t>::max)());
if (n < max32 && num_buckets < max32)
// use 4-byte counters when larger ones not needed
return count_sort_<copy_tag, out_uninitialized_tag, uint32_t>(In, Out, Keys, num_buckets, parallelism,
return count_sort_<assignment_tag, uint32_t>(In, Out, Keys, num_buckets, parallelism,
skip_if_in_one);
return count_sort_<copy_tag, out_uninitialized_tag, size_t>(In, Out, Keys, num_buckets, parallelism,
return count_sort_<assignment_tag, size_t>(In, Out, Keys, num_buckets, parallelism,
skip_if_in_one);
}

template <typename InIterator, typename KeyS>
auto count_sort(slice<InIterator, InIterator> In, KeyS const& Keys, size_t num_buckets) {

template <typename InIterator, typename GetKey>
auto count_sort(slice<InIterator, InIterator> In, GetKey get_key, size_t num_buckets) {
using value_type = typename slice<InIterator, InIterator>::value_type;
sequence<value_type> Out(In.size());
auto a = count_sort<std::true_type, std::false_type>(In, Out.slice(), Keys, num_buckets);
auto Keys = delayed_seq<decltype(get_key(*In.begin()))>(In.size(), [&](size_t i) { return get_key(In[i]); });
auto Out = sequence<value_type>::uninitialized(In.size());
auto a = count_sort<copy_assign_tag>(In, make_slice(Out), make_slice(Keys), num_buckets);
return std::make_pair(std::move(Out), std::move(a.first));
}

template <typename InIterator, typename GetKey>
auto count_sort_inplace(slice<InIterator, InIterator> In, GetKey get_key, size_t num_buckets) {
using value_type = typename slice<InIterator, InIterator>::value_type;
auto Keys = delayed_seq<decltype(get_key(*In.begin()))>(In.size(), [&](size_t i) { return get_key(In[i]); });
auto Tmp = uninitialized_sequence<value_type>(In.size());
auto a = count_sort<uninitialized_relocate_tag>(In, make_slice(Tmp), make_slice(Keys), num_buckets);
uninitialized_relocate_n(In.begin(), Tmp.begin(), In.size());
return a.first;
}
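
// A hedged usage sketch of the wrappers above (hypothetical key function;
// assumes the parlay headers already included in this file):
//
//   auto A = parlay::sequence<unsigned int>::from_function(n,
//       [](size_t i) { return static_cast<unsigned int>(parlay::hash64(i)); });
//   // sort A by the low 8 bits of each key, in place, using 256 buckets
//   auto offsets = count_sort_inplace(parlay::make_slice(A),
//       [](unsigned int x) { return x & 255u; }, 256);
//   // offsets describes the resulting bucket boundaries in the now-sorted A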

} // namespace internal
} // namespace parlay

144 changes: 120 additions & 24 deletions include/parlay/internal/debug_uninitialized.h
@@ -1,49 +1,145 @@
#ifndef PARLAY_INTERNAL_DEBUG_UNINITIALIZED_H_
#define PARLAY_INTERNAL_DEBUG_UNINITIALIZED_H_

#include <type_traits>

namespace parlay {
namespace internal {

// A simple type to help look for uninitialized memory bugs.
// By default, debug_uninitialized contains x = -1, which is
// a placeholder that pretends to be uninitialized memory.
// UninitializedTracker is essentially an integer type, but
// also tracks whether it is currently in an initialized
// or uninitialized state.
//
// Attempting to assign to an uninitialized state will
// trigger an assertion failure. Attempting to copy or
// move an uninitialized object will also cause failure.
//
// Attempting to copy or assign to an "uninitialized" object
// will result in an assertion failure.
// This code will technically invoke undefined behaviour
// at some point, since we set the initialized flag to
// false in the destructor for the sole purpose of
// checking whether it is indeed false the next time
// someone comes along and wants to do an uninitialized
// in place construction at that memory location.
//
// The functions assign_uninitialized and move_uninitialized
// are also augmented to check whether their destinations are
// "uninitialized" when using this type.
// Some compilers or debugging tools might therefore not
// work correctly with this class. Defining the initialized
// flag as volatile seems to prevent compilers from optimizing
// it out, and therefore leaves its value in memory after
// the object is destroyed, but this cannot be guaranteed.
//
// Any object with x != -1 is considered to be initialized
// memory.
struct debug_uninitialized {
// For correctness, this type should only ever be used
// if its memory is allocated inside a parlay::sequence
// or parlay::uninitialized_sequence since they know how
// to set the initialized/uninitialized flag
struct UninitializedTracker {

UninitializedTracker() : x(0), initialized(true) {}

debug_uninitialized() : x(-1) { }
// cppcheck-suppress noExplicitConstructor
/* implicit */ UninitializedTracker(int _x) : x(_x), initialized(true) {}

debug_uninitialized(const debug_uninitialized& other) {
// Ensure that the other object is initialized
assert(other.x != -1 && "Copying an uninitialized variable");
UninitializedTracker(const UninitializedTracker &other) {
assert(other.initialized && "Attempting to copy an uninitialized object!");
x = other.x;
initialized = true;
}

// Copy
UninitializedTracker &operator=(const UninitializedTracker &other) {
assert(initialized && "Attempting to assign to an uninitialized object!");
assert(other.initialized && "Copy assigning an uninitialized object!");
x = other.x;
return *this;
}

debug_uninitialized& operator=(const debug_uninitialized& other) {
// Ensure that this object is initialized
assert(x != -1 && "Assigning to an uninitialized variable");
~UninitializedTracker() {
assert(initialized && "Destructor called on uninitialized object!");
initialized = false;
}

// Ensure that the other object is initialized
assert(other.x != -1 && "Copy assigning an uninitialized variable");
bool operator==(const UninitializedTracker& other) const {
assert(initialized && "Trying to compare an uninitialized object!");
assert(other.initialized && "Trying to compare against an uninitialized object!");
return x == other.x;
}

// Copy
x = other.x;
bool operator<(const UninitializedTracker& other) const {
assert(initialized && "Trying to compare an uninitialized object!");
assert(other.initialized && "Trying to compare against an uninitialized object!");
return x < other.x;
}

return *this;
void swap(UninitializedTracker& other) {
assert(initialized && "Trying to swap uninitialized object!");
assert(other.initialized && "Trying to swap with uninitialize object!");
std::swap(x, other.x);
}

int x;
volatile bool initialized; // Volatile required to prevent optimizing out the store in the destructor
};


} // namespace internal
} // namespace parlay

namespace std {
// Specialize the swap function for UninitializedTracker
inline void swap(parlay::internal::UninitializedTracker& a, parlay::internal::UninitializedTracker& b) {
a.swap(b);
}
}

// Macros for testing initialized/uninitializedness

#ifdef PARLAY_DEBUG_UNINITIALIZED

// Checks that the given UninitializedTracker object is uninitialized
//
// Usage:
// PARLAY_ASSERT_UNINITIALIZED(x)
// Result:
// Terminates the program if the expression (x) refers to an object of
// type UninitializedTracker such that (x).initialized is true, i.e.
// the object is in an initialized state
//
// Note: This macro should only ever be used on memory that was allocated
// by a parlay::sequence or a parlay::uninitialized_sequence, since they
// are required to appropriately flag all newly allocated memory as being
// uninitialized. False positives will occur if this macro is invoked on
// memory that is not managed by one of these containers.
#define PARLAY_ASSERT_UNINITIALIZED(x) \
do { \
using PARLAY_AU_T = std::remove_cv_t<std::remove_reference_t<decltype(x)>>; \
if constexpr (std::is_same_v<PARLAY_AU_T, ::parlay::internal::UninitializedTracker>) { \
assert(!((x).initialized) && "Memory required to be uninitialized is initialized!"); \
} \
} while (0)

// Checks that the given UninitializedTracker object is initialized
//
// Usage:
// PARLAY_ASSERT_INITIALIZED(x)
// Result:
// Terminates the program if the expression (x) refers to an object of
// type UninitializedTracker such that (x).initialized is false, i.e.
// the object is in an uninitialized state
//
// Note: This macro should only ever be used on memory that was allocated
// by a parlay::sequence or a parlay::uninitialized_sequence, since they
// are required to appropriately flag all newly allocated memory as being
// uninitialized. False positives will occur if this macro is invoked on
// memory that is not managed by one of these containers.
#define PARLAY_ASSERT_INITIALIZED(x) \
do { \
using PARLAY_AI_T = std::remove_cv_t<std::remove_reference_t<decltype(x)>>; \
if constexpr (std::is_same_v<PARLAY_AI_T, ::parlay::internal::UninitializedTracker>) { \
assert((x).initialized && "Memory required to be initialized is uninitialized!"); \
} \
} while (0)

#else
#define PARLAY_ASSERT_UNINITIALIZED(x)
#define PARLAY_ASSERT_INITIALIZED(x)
#endif // PARLAY_DEBUG_UNINITIALIZED

#endif // PARLAY_INTERNAL_DEBUG_UNINITIALIZED_H_
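
A hedged sketch of the lifecycle these macros check, assuming PARLAY_DEBUG_UNINITIALIZED is defined and the storage comes from a parlay::internal::uninitialized_sequence (both assumptions, not part of the patch):

    using parlay::internal::UninitializedTracker;
    parlay::internal::uninitialized_sequence<UninitializedTracker> buf(1);
    PARLAY_ASSERT_UNINITIALIZED(buf[0]);     // fresh slots are flagged uninitialized
    new (&buf[0]) UninitializedTracker(42);  // placement-new constructs in place
    PARLAY_ASSERT_INITIALIZED(buf[0]);       // construction flips the flag
    buf[0].~UninitializedTracker();          // destruction flags the slot uninitialized again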
2 changes: 1 addition & 1 deletion include/parlay/internal/file_map.h
#endif // PARLAY_NO_FILE_MAP

namespace std {
void swap(parlay::file_map& first, parlay::file_map& second) {
inline void swap(parlay::file_map& first, parlay::file_map& second) {
first.swap(second);
}
}
201 changes: 121 additions & 80 deletions include/parlay/internal/integer_sort.h

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions include/parlay/internal/merge.h
while (true) {
if (i == nA) {
while (j < nB) {
assign_dispatch(B[j], R[i + j], assignment_tag{});
assign_dispatch(R[i + j], B[j], assignment_tag{});
j++;
}
break;
}
if (j == nB) {
while (i < nA) {
assign_dispatch(A[i], R[i + j], assignment_tag{});
assign_dispatch(R[i + j], A[i], assignment_tag{});
i++;
}
break;
}
if (f(B[j], A[i])) {
assign_dispatch(B[j], R[i + j], assignment_tag{});
assign_dispatch(R[i + j], B[j], assignment_tag{});
j++;
} else {
assign_dispatch(A[i], R[i + j], assignment_tag{});
assign_dispatch(R[i + j], A[i], assignment_tag{});
i++;
}
}
}
else if (nA == 0) {
parallel_for(0, nB, [&](size_t i) {
assign_dispatch(B[i], R[i], assignment_tag{});
assign_dispatch(R[i], B[i], assignment_tag{});
});
}
else if (nB == 0) {
parallel_for(0, nA, [&](size_t i) {
assign_dispatch(A[i], R[i], assignment_tag{});
assign_dispatch(R[i], A[i], assignment_tag{});
});
}
else {
28 changes: 10 additions & 18 deletions include/parlay/internal/merge_sort.h
void merge_sort_(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
const BinaryOp& f,
size_t levels,
bool inplace) {

size_t n = In.size();
// Base case
if (levels == 0) {
if (n < MERGE_SORT_BASE) {
insertion_sort(In.begin(), In.size(), f);
if (!inplace) {
for (size_t i = 0; i < In.size(); i++) {
assign_uninitialized(Out[i], std::move(In[i]));
uninitialized_relocate(&Out[i], &In[i]);
}
}
}
else {
size_t n = In.size();
size_t m = n / 2;
par_do_if(
n > 64,
[&]() { merge_sort_(In.cut(0, m), Out.cut(0, m), f, levels-1, !inplace); },
[&]() { merge_sort_(In.cut(m, n), Out.cut(m, n), f, levels-1, !inplace); },
[&]() { merge_sort_(In.cut(0, m), Out.cut(0, m), f, !inplace); },
[&]() { merge_sort_(In.cut(m, n), Out.cut(m, n), f, !inplace); },
true);

if (inplace) {
merge_into<move_tag>(Out.cut(0, m), Out.cut(m, n), In, f);
merge_into<uninitialized_relocate_tag>(Out.cut(0, m), Out.cut(m, n), In, f);
}
else {
if (levels == 1) {
merge_into<uninitialized_move_tag>(In.cut(0, m), In.cut(m, n), Out, f);
}
else {
merge_into<move_tag>(In.cut(0, m), In.cut(m, n), Out, f);
}
merge_into<uninitialized_relocate_tag>(In.cut(0, m), In.cut(m, n), Out, f);
}
}
}
insertion_sort(In.begin(), In.size(), f);
}
else {
auto B = sequence<value_type>::uninitialized(n);
size_t levels = n >= MERGE_SORT_BASE ? log2_up(n / MERGE_SORT_BASE) : 0;
merge_sort_(In, make_slice(B), f, levels, true);
auto B = uninitialized_sequence<value_type>(n);
merge_sort_(In, make_slice(B), f, true);
}
}

// not the most efficient way to do this, due to the extra copy
template <typename Iterator, typename BinaryOp>
auto merge_sort(slice<Iterator, Iterator> In, const BinaryOp& f) {
[[nodiscard]] auto merge_sort(slice<Iterator, Iterator> In, const BinaryOp& f) {
using value_type = typename slice<Iterator, Iterator>::value_type;
auto A = parlay::sequence<value_type>(In.begin(), In.end());
merge_sort_inplace(make_slice(A), f);
72 changes: 52 additions & 20 deletions include/parlay/internal/quicksort.h
#define PARLAY_QUICKSORT_H_

#include <algorithm>
#include <new>
#include <utility>

#include "uninitialized_storage.h"
#include "sequence_ops.h"

#include "../utilities.h"

namespace parlay {
namespace internal {

template <class Iterator>
template <typename Iterator>
bool base_case(Iterator x, size_t n) {
using value_type = typename std::iterator_traits<Iterator>::value_type;
bool large = std::is_pointer<value_type>::value || (sizeof(x) > 8);
return large ? (n < 16) : (n < 24);
}

template <class Iterator, class BinPred>
void insertion_sort(Iterator A, size_t n, const BinPred& f) {
// Optimized insertion sort using uninitialized relocate
//
// Note that the strange looking loop body is really just a
// slightly more optimized version of:
//
// T tmp = std::move(A[i])
// std::ptrdiff_t j = i - 1;
// for (; j >= 0 && f(tmp, A[j]); j--) {
// A[j+1] = std::move(A[j])
// }
// A[j+1] = std::move(tmp);
//
// but instead of using move, we use relocation to potentially
// save on wasteful destructor invocations.
//
template <class Iterator, typename BinPred>
void insertion_sort(Iterator A, size_t n, const BinPred& f) {
using T = typename std::iterator_traits<Iterator>::value_type;
for (size_t i = 1; i < n; i++) {
std::ptrdiff_t j = i;
while (--j >= 0 && f(A[j + 1], A[j])) {
std::swap(A[j + 1], A[j]);
// We want to store the value being moved (A[i]) in a temporary
// variable. In order to take advantage of uninitialized
// relocate, we need uninitialized storage to put it in.
uninitialized_storage<T> temp_storage;
T* tmp = temp_storage.get();
uninitialized_relocate(tmp, &A[i]);

std::ptrdiff_t j = i - 1;
for (; j >= 0 && f(*tmp, A[j]); j--) {
uninitialized_relocate(&A[j+1], &A[j]);
}
uninitialized_relocate(&A[j+1], tmp);
}
}

insertion_sort(A, size, f);
}

// splits based on two pivots (p1 and p2) into 3 parts:
// less than p1, greater than p2, and the rest in the middle
// If the pivots are the same, returns a true flag to indicate middle need not
// be sorted
// Dual-pivot partition. Picks two pivots from the input A
// and then divides it into three parts:
//
// [x < p1), [p1 <= x <= p2], (p2 < x]
//
// Returns a triple consisting of iterators to the start
// of the second and third part, and a boolean flag, which
// is true if the pivots were equal, and hence the middle
// part contains all equal elements.
//
// Requires that the size of A is at least 5.
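//
// A small worked illustration (hypothetical values): partitioning
// A = [5, 1, 9, 3, 7, 2, 8] with chosen pivots p1 = 3 and p2 = 7 could
// yield [1, 2 | 3, 5, 7 | 9, 8], returning an iterator to the 3, an
// iterator to the 9, and false, since the pivots differ.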
template <class Iterator, class BinPred>
std::tuple<Iterator, Iterator, bool> split3(Iterator A, size_t n, const BinPred& f) {
assert(n >= 5);
template <class Iterator, class BinPred>
void quicksort_serial(Iterator A, size_t n, const BinPred& f) {
while (!base_case(A, n)) {
Iterator L, M;
bool mid_eq;
std::tie(L, M, mid_eq) = split3(A, n, f);
auto [L, M, mid_eq] = split3(A, n, f);
if (!mid_eq) quicksort_serial(L+1, M - L-1, f);
quicksort_serial(M, A + n - M, f);
n = L - A;
if (n < (1 << 10))
quicksort_serial(A, n, f);
else {
Iterator L;
Iterator M;
bool mid_eq;
std::tie(L, M, mid_eq) = split3(A, n, f);
auto left = [&]() { quicksort(A, L - A, f); };
auto mid = [&]() { quicksort(L + 1, M - L - 1, f); };
auto right = [&]() { quicksort(M, A + n - M, f); };
auto [L, M, mid_eq] = split3(A, n, f);
// Note: generic lambda capture for L and M (i.e. writing L = L, etc.)
// in the capture is required due to a defect in the C++ standard which
// prohibits capturing names resulting from a structured binding!
auto left = [&, L = L]() { quicksort(A, L - A, f); };
auto mid = [&, L = L, M = M]() { quicksort(L + 1, M - L - 1, f); };
auto right = [&, M = M]() { quicksort(M, A + n - M, f); };

if (!mid_eq)
par_do3(left, mid, right);
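The init-capture workaround noted above is worth a one-line illustration (a standalone C++17 sketch, not from the patch):

    auto [lo, hi] = std::pair{1, 2};
    // auto f = [lo] { return lo; };    // ill-formed in C++17: lo names a structured binding
    auto g = [lo = lo] { return lo; };  // OK: init-capture copies it into the lambda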
256 changes: 162 additions & 94 deletions include/parlay/internal/sample_sort.h
constexpr const size_t OVER_SAMPLE = 8;

// generates counts in Sc for the number of keys in Sa between consecutive
// values of Sb
// Sa and Sb must be sorted
template <typename Iterator, typename PivotIterator, typename CountIterator, typename Compare>
void merge_seq(Iterator sA,
PivotIterator sB,
CountIterator sC,
size_t lA,
size_t lB,
Compare f) {
// values of Sb. Sa and Sb must be sorted
template <typename InIterator, typename PivotIterator, typename CountIterator, typename Compare>
void get_bucket_counts(slice<InIterator, InIterator> sA,
slice<PivotIterator, PivotIterator> sB,
slice<CountIterator, CountIterator> sC,
Compare f) {
using s_size_t = typename std::iterator_traits<CountIterator>::value_type;

if (lA == 0 || lB == 0) return;
auto eA = sA + lA;
auto eB = sB + lB;
for (size_t i = 0; i <= lB; i++) sC[i] = 0;
while (1) {
while (f(*sA, *sB)) {
(*sC)++;
if (++sA == eA) return;
if (sA.size() == 0 || sB.size() == 0) return;
for (auto& c : sC) c = 0;
auto itA = sA.begin();
auto itB = sB.begin();
auto itC = sC.begin();
while (true) {
while (f(*itA, *itB)) {
assert(itC != sC.end());
(*itC)++;
if (++itA == sA.end()) return;
}
sB++;
sC++;
if (sB == eB) break;
if (!(f(*(sB - 1), *sB))) {
while (!f(*sB, *sA)) {
(*sC)++;
if (++sA == eA) return;
itB++;
itC++;
if (itB == sB.end()) break;
if (!(f(*(itB - 1), *itB))) {
while (!f(*itB, *itA)) {
assert(itC != sC.end());
(*itC)++;
if (++itA == sA.end()) return;
}
sB++;
sC++;
if (sB == eB) break;
itB++;
itC++;
if (itB == sB.end()) break;
}
}
*sC = static_cast<s_size_t>(eA - sA);
assert(itC != sC.end());
*itC = static_cast<s_size_t>(sA.end() - itA);
}

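// A small worked example of get_bucket_counts (hypothetical values): with
// sorted keys sA = [1, 2, 2, 5, 7] and sorted pivots sB = [2, 6], the loop
// above writes sC = [1, 3, 1]: one key below 2, three keys in [2, 6], and
// one key above 6.
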
template <typename Iterator, typename Compare>
void seq_sort_inplace(slice<Iterator, Iterator> A, const Compare& less, bool stable) {
using value_type = typename slice<Iterator, Iterator>::value_type;
#if defined(OPENMP)
quicksort_serial(A.begin(), A.size(), less);
#else
if (((sizeof(value_type) > 8) || std::is_pointer<value_type>::value) && !stable)
quicksort(A.begin(), A.size(), less);
else
bucket_sort(A, less, stable);
#endif
}

template <typename InIterator, typename OutIterator, typename Compare>
template <typename assignment_tag, typename InIterator, typename OutIterator, typename Compare>
void seq_sort_(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
const Compare& less,
/* inplace */ std::false_type,
bool stable = false) {
size_t l = In.size();
for (size_t j = 0; j < l; j++) assign_uninitialized(Out[j], In[j]);
for (size_t j = 0; j < l; j++) assign_dispatch(Out[j], In[j], assignment_tag());
seq_sort_inplace(Out, less, stable);
}

template <typename InIterator, typename OutIterator, typename Compare>
void seq_sort_(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
const Compare& less,
/* inplace */ std::true_type,
bool stable = false) {
size_t l = In.size();
for (size_t j = 0; j < l; j++) assign_uninitialized(Out[j], std::move(In[j]));
seq_sort_inplace(Out, less, stable);
}

template <class InIterator, class OutIterator, typename Compare>
void small_sort_dispatch(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
const Compare& less,
/* inplace */ std::false_type,
bool stable = false) {
seq_sort_(In, Out, less, std::false_type{}, stable);
}
// Fully in place version of sample sort. Makes no copies of any elements
// in the input array. This version can not be stable, unfortunately.
template <typename s_size_t = size_t,
typename InIterator, typename OutIterator,
typename Compare>
void sample_sort_inplace_(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
const Compare& less) {
using value_type = typename slice<InIterator, InIterator>::value_type;
size_t n = In.size();

if (n < QUICKSORT_THRESHOLD) {
seq_sort_inplace(In, less, false);
} else {
// The larger these are, the more comparisons are done but less
// overhead for the transpose.
size_t bucket_quotient = 4;
size_t block_quotient = 4;
if (std::is_pointer<value_type>::value) {
bucket_quotient = 2;
block_quotient = 3;
} else if (sizeof(value_type) > 8) {
bucket_quotient = 3;
block_quotient = 3;
}

// How many samples to take in terms of blocks, i.e.,
// we take sample_blocks * block_size samples
size_t sample_blocks = 4;

size_t sqrt = static_cast<size_t>(std::sqrt(n));
size_t num_blocks = size_t{1} << log2_up((sqrt / block_quotient) + 1);
size_t block_size = ((n - 1) / num_blocks) + 1;
size_t num_buckets = (sqrt / bucket_quotient) + 1;
size_t sample_set_size = sample_blocks * block_size;
size_t m = num_blocks * num_buckets;

template <class InIterator, class OutIterator, typename Compare>
void small_sort_dispatch(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator>,
const Compare& less,
/* inplace */ std::true_type,
bool stable = false) {
seq_sort_inplace(In, less, stable);
// We want to select evenly spaced pivots from the sorted samples.
// Since we sampled sample_set_size many elements, and we need
// exactly num_buckets - 1, we sample at stride:
assert(sample_set_size >= num_buckets - 1);
size_t stride = sample_set_size / (num_buckets - 1);
assert(stride >= 1);

// In-place sampling! Just swap elements to the front of
// the sequence to be used as the samples. This way, no
// copying is required. This is essentially just the first
// sample_set_size iterations of a Knuth shuffle.
for (size_t i = 0; i < sample_set_size; i++) {
size_t j = i + hash64(i) % (n - i);
std::swap(In[i], In[j]);
}

// sort the samples
auto sample_set = make_slice(In.begin(), In.begin() + sample_set_size);
quicksort(sample_set.begin(), sample_set_size, less);

// Pivots are returned by reference to avoid making copies
auto pivots = delayed_seq<const value_type&>(num_buckets - 1, [&](size_t i) -> const value_type& {
assert(stride * i < sample_set_size);
return sample_set[stride * i];
});

// sort each block and merge with samples to get counts for each bucket
auto Tmp = uninitialized_sequence<value_type>(n);
auto counts = sequence<s_size_t>::uninitialized(m + 1);
counts[m] = 0;

sliced_for(n, block_size, [&](size_t i, size_t start, size_t end) {
// The sample blocks are already sorted, so we don't need to sort them again.
if (i >= sample_blocks) {
seq_sort_<uninitialized_relocate_tag>(In.cut(start, end), make_slice(Tmp).cut(start, end), less, false);
get_bucket_counts(make_slice(Tmp).cut(start, end), make_slice(pivots),
make_slice(counts).cut(i * num_buckets, (i + 1) * num_buckets), less);
}
else {
get_bucket_counts(make_slice(In).cut(start, end), make_slice(pivots),
make_slice(counts).cut(i * num_buckets, (i + 1) * num_buckets), less);
}
});

// Sample block is already sorted, so we don't need to sort it again.
// We can just move it straight over into the other sorted blocks
uninitialized_relocate_n(Tmp.begin(), sample_set.begin(), sample_set_size);

// move data from blocks to buckets
auto bucket_offsets =
transpose_buckets<uninitialized_relocate_tag>(Tmp.begin(), Out.begin(), counts, n, block_size,
num_blocks, num_buckets);

// sort within each bucket
parallel_for(0, num_buckets, [&](size_t i) {
size_t start = bucket_offsets[i];
size_t end = bucket_offsets[i + 1];
// This could be optimized by not sorting a bucket if its two adjacent
// pivots were equal, since this means that all of the contents of the
// bucket are equal. But we don't know where the pivots are anymore
// since we just merged them in. We could choose to precompute flags
// indicating which adjacent pivots are equal, but is it worth it?
seq_sort_inplace(Out.cut(start, end), less, false);
}, 1);
}
}

// if inplace, then In and Out must be the same, i.e. it copies back to itsefl
// if inplace the copy constructor or assignment is never called on the elements
// if not inplace, then the copy constructor is called once per element
template <typename s_size_t = size_t, typename inplace_tag,
typename InIterator, typename OutIterator,
typename Compare>
// Copying version of sample sort. This one makes copies of the input
// elements when sorting them into the output. Roughly sqrt(n)
// additional copies are also made to copy the pivots. This one can
// be stable.
template <typename s_size_t = size_t,
typename InIterator, typename OutIterator,
typename Compare>
void sample_sort_(slice<InIterator, InIterator> In,
slice<OutIterator, OutIterator> Out,
const Compare& less,
bool stable = false) {
slice<OutIterator, OutIterator> Out,
const Compare& less,
bool stable = false) {
using value_type = typename slice<InIterator, InIterator>::value_type;
size_t n = In.size();

if (n < QUICKSORT_THRESHOLD) {
small_sort_dispatch(In, Out, less, inplace_tag{}, stable);
seq_sort_<uninitialized_copy_tag>(In, Out, less, stable);
} else {
// The larger these are, the more comparisons are done but less
// overhead for the transpose.
bucket_quotient = 3;
block_quotient = 3;
}
size_t sqrt = (size_t)ceil(pow(n, 0.5));
size_t sqrt = static_cast<size_t>(std::sqrt(n));
size_t num_blocks = size_t{1} << log2_up((sqrt / block_quotient) + 1);
size_t block_size = ((n - 1) / num_blocks) + 1;
size_t num_buckets = (sqrt / bucket_quotient) + 1;

// generate "random" samples with oversampling
auto sample_set = sequence<value_type>::from_function(sample_set_size,
[&](size_t i) { return In[hash64(i) % n]; });

[&](size_t i) { return In[hash64(i) % n]; });
// sort the samples
quicksort(sample_set.begin(), sample_set_size, less);

// subselect samples at even stride
auto pivots = sequence<value_type>::from_function(num_buckets - 1,
[&](size_t i) { return sample_set[OVER_SAMPLE * i]; });
[&](size_t i) { return sample_set[OVER_SAMPLE * i]; });

sequence<value_type> Tmp = sequence<value_type>::uninitialized(n);
auto Tmp = uninitialized_sequence<value_type>(n);

// sort each block and merge with samples to get counts for each bucket
auto counts = sequence<s_size_t>::uninitialized(m + 1);
counts[m] = 0;
sliced_for(n, block_size, [&](size_t i, size_t start, size_t end) {
seq_sort_(In.cut(start, end), make_slice(Tmp).cut(start, end), less, inplace_tag{},
stable);
merge_seq(Tmp.begin() + start, pivots.begin(),
counts.begin() + i * num_buckets, end - start, num_buckets - 1,
less);
seq_sort_<uninitialized_copy_tag>(In.cut(start, end), make_slice(Tmp).cut(start, end), less, stable);
get_bucket_counts(make_slice(Tmp).cut(start, end), make_slice(pivots),
make_slice(counts).cut(i * num_buckets, (i + 1) * num_buckets), less);
});

// move data from blocks to buckets
auto bucket_offsets =
transpose_buckets(Tmp.begin(), Out.begin(), counts, n, block_size,
num_blocks, num_buckets);
Tmp.clear();
transpose_buckets<uninitialized_relocate_tag>(Tmp.begin(), Out.begin(), counts, n, block_size,
num_blocks, num_buckets);

// sort within each bucket
parallel_for(0, num_buckets, [&](size_t i) {
size_t start = bucket_offsets[i];
size_t end = bucket_offsets[i + 1];

// buckets need not be sorted if two consecutive pivots are equal
if (i == 0 || i == num_buckets - 1 || less(pivots[i - 1], pivots[i])) {
seq_sort_inplace(Out.cut(start, end), less, stable);
}
}, 1);
// buckets need not be sorted if two consecutive pivots are equal
if (i == 0 || i == num_buckets - 1 || less(pivots[i - 1], pivots[i])) {
seq_sort_inplace(Out.cut(start, end), less, stable);
}
}, 1);
}
}

using value_type = typename slice<Iterator, Iterator>::value_type;
sequence<value_type> R = sequence<value_type>::uninitialized(A.size());
if (A.size() < (std::numeric_limits<unsigned int>::max)()) {
sample_sort_<unsigned int, std::false_type>(A, make_slice(R), less, stable);
sample_sort_<unsigned int>(A, make_slice(R), less, stable);
}
else {
sample_sort_<size_t, std::false_type>(A, make_slice(R), less, stable);
sample_sort_<size_t>(A, make_slice(R), less, stable);
}
return R;
}

template <class Iterator, typename Compare>
void sample_sort_inplace(slice<Iterator, Iterator> A,
const Compare& less,
bool stable = false) {
const Compare& less) {
if (A.size() < (std::numeric_limits<unsigned int>::max)()) {
sample_sort_<unsigned int, std::true_type>(A, A, less, stable);
sample_sort_inplace_<unsigned int>(A, A, less);
}
else {
sample_sort_<size_t, std::true_type>(A, A, less, stable);
sample_sort_inplace_<size_t>(A, A, less);
}
}

33 changes: 19 additions & 14 deletions include/parlay/internal/transpose.h
constexpr const size_t TRANS_THRESHHOLD = 500;
#endif

#ifdef DEBUG
constexpr const size_t NON_CACHE_OBLIVIOUS_THRESHOLD = 10000;
#else
constexpr const size_t NON_CACHE_OBLIVIOUS_THRESHOLD = 1 << 22;
#endif

inline size_t split(size_t n) { return n / 2; }

template <class Iterator>
template <typename assignment_tag, typename Iterator>
struct transpose {
Iterator A, B;
transpose(Iterator AA, Iterator BB) : A(AA), B(BB) {}
if (cCount * rCount < TRANS_THRESHHOLD) {
for (size_t i = rStart; i < rStart + rCount; i++)
for (size_t j = cStart; j < cStart + cCount; j++)
B[j * cLength + i] = A[i * rLength + j];
assign_dispatch(B[j * cLength + i], A[i * rLength + j], assignment_tag());
} else if (cCount > rCount) {
size_t l1 = split(cCount);
size_t l2 = cCount - l1;
}
};

template <typename InIterator, typename OutIterator, typename CountIterator, typename DestIterator>
template <typename assignment_tag, typename InIterator, typename OutIterator, typename CountIterator, typename DestIterator>
struct blockTrans {
InIterator A;
OutIterator B;
size_t sa = OA[i * rLength + j];
size_t sb = OB[j * cLength + i];
size_t l = OA[i * rLength + j + 1] - sa;
for (size_t k = 0; k < l; k++) B[k + sb] = A[k + sa];
for (size_t k = 0; k < l; k++) assign_dispatch(B[k + sb], A[k + sa], assignment_tag());
}

});
// From and To are of length n
// counts is of length num_blocks * num_buckets
// Data is transferred from From into To as per assignment_tag
template <typename InIterator, typename OutIterator, typename s_size_t>
template <typename assignment_tag, typename InIterator, typename OutIterator, typename s_size_t>
sequence<size_t> transpose_buckets(InIterator From, OutIterator To,
sequence<s_size_t>& counts, size_t n,
size_t block_size, size_t num_blocks,
auto add = addm<s_size_t>();

// for smaller input do non-cache oblivious version
if (n < (1 << 22) || num_buckets <= 512 || num_blocks <= 512) {
if (n < NON_CACHE_OBLIVIOUS_THRESHOLD || num_buckets <= 512 || num_blocks <= 512) {
size_t block_bits = log2_up(num_blocks);
size_t block_mask = num_blocks - 1;
if (size_t{1} << block_bits != num_blocks)
throw std::invalid_argument(
"in transpose_buckets: num_blocks must be a power or 2");
assert(size_t{1} << block_bits == num_blocks);

// determine the destination offsets
auto get = [&](size_t i) {
size_t d_offset = dest_offsets[i + num_blocks * j];
size_t len = counts[i * num_buckets + j];
for (size_t k = 0; k < len; k++)
To[d_offset++] = From[s_offset++];
assign_dispatch(To[d_offset++], From[s_offset++], assignment_tag());
}
};
parallel_for(0, num_blocks, f, 1);
} else { // for larger input do cache efficient transpose
// sequence<s_size_t> source_offsets(counts,m+1);
dest_offsets = sequence<s_size_t>(m);
transpose<typename sequence<s_size_t>::iterator>(counts.begin(), dest_offsets.begin())
transpose<assignment_tag, typename sequence<s_size_t>::iterator>(counts.begin(), dest_offsets.begin())
.trans(num_blocks, num_buckets);

// do both scans inplace
size_t total = scan_inplace(make_slice(dest_offsets), add);
size_t total2 = scan_inplace(make_slice(counts), add);
[[maybe_unused]] size_t total = scan_inplace(make_slice(dest_offsets), add);
[[maybe_unused]] size_t total2 = scan_inplace(make_slice(counts), add);
assert(total == n && total2 == n);

counts[m] = static_cast<s_size_t>(n);

blockTrans<InIterator, OutIterator,
blockTrans<assignment_tag, InIterator, OutIterator,
typename sequence<s_size_t>::iterator,
typename sequence<s_size_t>::iterator>(
From, To, counts.begin(), dest_offsets.begin())
154 changes: 154 additions & 0 deletions include/parlay/internal/uninitialized_sequence.h
@@ -0,0 +1,154 @@

#ifndef PARLAY_INTERNAL_UNINITIALIZED_SEQUENCE_H_
#define PARLAY_INTERNAL_UNINITIALIZED_SEQUENCE_H_

#include <iterator>
#include <memory>

#include "../alloc.h"

#include "debug_uninitialized.h"

namespace parlay {
namespace internal {

#ifndef PARLAY_USE_STD_ALLOC
template<typename T>
using _uninitialized_sequence_default_allocator = parlay::allocator<T>;
#else
template<typename T>
using _uninitialized_sequence_default_allocator = std::allocator<T>;
#endif

// An uninitialized fixed-size sequence container.
//
// By uninitialized, we mean two things:
// - The constructor uninitialized_sequence<T>(n) does not initialize its elements
// - The destructor ~uninitialized_sequence() also DOES NOT destroy its elements
//
// In other words, the elements of the sequence are uninitialized upon construction,
// and are also required to be uninitialized when the sequence is destroyed.
//
// Q: What on earth is the purpose of such a container??
// A: Its purpose is to be used as temporary storage for out of place algorithms
// that use uninitialized_relocate. Since the container begins in an uninitialized
// state, it is valid to uninitialized_relocate objects into it, and then
// uninitialized_relocate them back out. This leaves the elements uninitialized,
// so that no destructors will accidentally be triggered for moved-out-of objects.
//
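// A hedged sketch of the intended round trip, using the
// uninitialized_relocate_n(dest, src, n) helper that appears elsewhere
// in this patch:
//
//   auto tmp = parlay::internal::uninitialized_sequence<T>(n);
//   uninitialized_relocate_n(tmp.begin(), src.begin(), n);  // src slots become raw memory
//   /* ... permute within tmp ... */
//   uninitialized_relocate_n(src.begin(), tmp.begin(), n);  // tmp is raw again at destruction
//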
template<typename T, typename Alloc = _uninitialized_sequence_default_allocator<T>>
class uninitialized_sequence {
public:
using value_type = T;
using reference = T&;
using const_reference = const T&;
using difference_type = std::ptrdiff_t;
using size_type = size_t;
using pointer = T*;
using const_pointer = const T*;

using iterator = T*;
using const_iterator = const T*;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;

using allocator_type = Alloc;

private:
struct uninitialized_sequence_impl : public allocator_type {
size_t n;
value_type* data;
explicit uninitialized_sequence_impl(size_t _n, const allocator_type& alloc)
: allocator_type(alloc),
n(_n),
data(std::allocator_traits<allocator_type>::allocate(*this, n)) { }
~uninitialized_sequence_impl() {
std::allocator_traits<allocator_type>::deallocate(*this, data, n);
}
} impl;

public:
explicit uninitialized_sequence(size_t n, const allocator_type& alloc = {})
: impl(n, alloc) {
#ifdef PARLAY_DEBUG_UNINITIALIZED
// If uninitialized memory debugging is turned on, make sure that
// each object of type UninitializedTracker is appropriately set
// to its uninitialized state.
if constexpr (std::is_same_v<value_type, UninitializedTracker>) {
auto buffer = impl.data;
parallel_for(0, n, [&](size_t i) {
buffer[i].initialized = false;
});
}
#endif
}

#ifdef PARLAY_DEBUG_UNINITIALIZED
// If uninitialized memory debugging is turned on, make sure that
// each object of type UninitializedTracker is destroyed or still
// uninitialized by the time this sequence is destroyed
~uninitialized_sequence() {
auto buffer = impl.data;
parallel_for(0, impl.n, [&](size_t i) {
PARLAY_ASSERT_UNINITIALIZED(buffer[i]);
});
}
#endif

iterator begin() { return impl.data; }
iterator end() { return impl.data + impl.n; }

const_iterator begin() const { return impl.data; }
const_iterator end() const { return impl.data + impl.n; }

const_iterator cbegin() const { return impl.data; }
const_iterator cend() const { return impl.data + impl.n; }

reverse_iterator rbegin() { return std::make_reverse_iterator(end()); }
reverse_iterator rend() { return std::make_reverse_iterator(begin()); }

const_reverse_iterator rbegin() const { return std::make_reverse_iterator(end()); }
const_reverse_iterator rend() const { return std::make_reverse_iterator(begin()); }

const_reverse_iterator crbegin() const { return std::make_reverse_iterator(cend()); }
const_reverse_iterator crend() const { return std::make_reverse_iterator(cbegin()); }

void swap(uninitialized_sequence<T, Alloc>& other) {
std::swap(impl.n, other.impl.n);
std::swap(impl.data, other.impl.data);
}

size_type size() const { return impl.n; }

value_type* data() { return impl.data; }

const value_type* data() const { return impl.data; }

value_type& operator[](size_t i) { return impl.data[i]; }
const value_type& operator[](size_t i) const { return impl.data[i]; }

value_type& at(size_t i) {
if (i >= size()) {
throw std::out_of_range("uninitialized_sequence access out of bounds: length = " +
std::to_string(size()) + ", index = " + std::to_string(i));
}
else {
return impl.data[i];
}
}

const value_type& at(size_t i) const {
if (i >= size()) {
throw std::out_of_range("uninitialized_sequence access out of bounds: length = " +
std::to_string(size()) + ", index = " + std::to_string(i));
}
else {
return impl.data[i];
}
}
};

} // namespace internal
} // namespace parlay

#endif // PARLAY_INTERNAL_UNINITIALIZED_SEQUENCE_H_
53 changes: 53 additions & 0 deletions include/parlay/internal/uninitialized_storage.h
@@ -0,0 +1,53 @@

#ifndef PARLAY_INTERNAL_UNINITIALIZED_STORAGE_H_
#define PARLAY_INTERNAL_UNINITIALIZED_STORAGE_H_

#include <memory>
#include <new>

#include "debug_uninitialized.h"

namespace parlay {
namespace internal {

// Contains uninitialized correctly aligned storage large enough to
// hold a single object of type T. The object is not initialized
// and its destructor is not run when the storage goes out of
// scope. The purpose of such an object is to act as temp space
// when using uninitialized_relocate.
template<typename T>
class uninitialized_storage {
using value_type = T;
typename std::aligned_storage<sizeof(T), alignof(T)>::type storage;

public:
uninitialized_storage() {
#ifdef PARLAY_DEBUG_UNINITIALIZED
// If uninitialized memory debugging is turned on, make sure that
// each object of type UninitializedTracker is appropriately set
// to its uninitialized state.
if constexpr (std::is_same_v<value_type, UninitializedTracker>) {
get()->initialized = false;
}
#endif
}

#ifdef PARLAY_DEBUG_UNINITIALIZED
~uninitialized_storage() {
PARLAY_ASSERT_UNINITIALIZED(*get());
}
#endif

value_type* get() {
return std::launder(reinterpret_cast<value_type*>(std::addressof(storage)));
}

const value_type* get() const {
return std::launder(reinterpret_cast<const value_type*>(std::addressof(storage)));
}
};

} // namespace internal
} // namespace parlay

#endif // PARLAY_INTERNAL_UNINITIALIZED_STORAGE_H_
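
A hedged sketch of the temp-slot pattern this class enables, mirroring the insertion sort in quicksort.h (hypothetical objects a and b of type T):

    parlay::internal::uninitialized_storage<T> slot;
    uninitialized_relocate(slot.get(), &a);  // a's slot is now raw memory
    uninitialized_relocate(&a, &b);
    uninitialized_relocate(&b, slot.get());  // a and b swapped, with no destructor calls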
56 changes: 17 additions & 39 deletions include/parlay/primitives.h
#include "internal/merge_sort.h"
#include "internal/sequence_ops.h" // IWYU pragma: export
#include "internal/sample_sort.h"
#include "internal/quicksort.h"

#include "delayed_sequence.h"
#include "monoid.h"
return f(it[i]); });
}

// Return a delayed sequence consisting of the elements
// f(0), f(1), ..., f(n-1)
template<typename F>
auto delayed_tabulate(size_t n, F&& f) {
using T = decltype(f(n));
return delayed_seq<T, F>(n, std::forward<F>(f));
}

// Return a delayed sequence consisting of the elements
// f(r[0]), f(r[1]), ..., f(r[n-1])
//
// r must remain alive as long as the delayed sequence.
template<PARLAY_RANGE_TYPE R, typename UnaryOp>
auto dmap(R&& r, UnaryOp&& f) {
using T = decltype(f(*std::begin(r)));
size_t n = parlay::size(r);
return delayed_seq<typename std::remove_reference<
typename std::remove_cv<
decltype(f(std::declval<range_value_type_t<R>&>()))
>::type>::type>
(n, [ r = std::forward<R>(r), f = std::forward<UnaryOp>(f) ]
return delayed_seq<T>(n, [ r = std::forward<R>(r), f = std::forward<UnaryOp>(f) ]
(size_t i) { return f(std::begin(r)[i]); });
}

template<PARLAY_RANGE_TYPE R, typename UnaryOp>
auto delayed_map(R&& r, UnaryOp&& f) {
return dmap(std::forward<R>(r), std::forward<UnaryOp>(f));
}
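
// A hedged usage sketch of the two delayed helpers above:
//
//   auto squares = parlay::delayed_tabulate(5, [](size_t i) { return i * i; });    // 0,1,4,9,16
//   auto plus_one = parlay::delayed_map(squares, [](size_t x) { return x + 1; });  // 1,2,5,10,17
//   // nothing is materialized; each element is computed on access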

/* -------------------- Copying -------------------- */

// Copy the elements from the range in into the range out

/* -------------------- General Sorting -------------------- */

namespace internal {

// We are happy to copy objects that are trivially copyable
// and at most 16 bytes large. This is used to choose between
// using sample sort, which makes copies, or quicksort and
// merge sort, which do not.
template<typename T>
struct okay_to_copy : public std::integral_constant<bool,
std::is_trivially_copy_constructible_v<T> &&
std::is_trivially_copy_assignable_v<T> &&
std::is_trivially_destructible_v<T>
> {};

}

// Sort the given sequence and return the sorted sequence
template<PARLAY_RANGE_TYPE R>
auto sort(const R& in) {
@@ -260,15 +254,7 @@ auto stable_sort(const R& in, Compare&& comp) {

template<PARLAY_RANGE_TYPE R, typename Compare>
void sort_inplace(R&& in, Compare&& comp) {
using value_type = range_value_type_t<R>;
// Could use tag dispatch instead of constexpr
// if to make it compatible with C++14
if constexpr (internal::okay_to_copy<value_type>::value) {
internal::sample_sort_inplace(make_slice(in), std::forward<Compare>(comp), false);
}
else {
internal::quicksort(make_slice(in), std::forward<Compare>(comp));
}
internal::sample_sort_inplace(make_slice(in), std::forward<Compare>(comp));
}

template<PARLAY_RANGE_TYPE R>
@@ -279,15 +265,7 @@ void sort_inplace(R&& in) {

template<PARLAY_RANGE_TYPE R, typename Compare>
void stable_sort_inplace(R&& in, Compare&& comp) {
using value_type = range_value_type_t<R>;
// Could use tag dispatch instead of constexpr
// if to make it compatible with C++14
if constexpr (internal::okay_to_copy<value_type>::value) {
internal::sample_sort_inplace(make_slice(in), std::forward<Compare>(comp), true);
}
else {
internal::merge_sort_inplace(make_slice(in), std::forward<Compare>(comp));
}
internal::merge_sort_inplace(make_slice(in), std::forward<Compare>(comp));
}

template<PARLAY_RANGE_TYPE R>
10 changes: 2 additions & 8 deletions include/parlay/random.h
@@ -6,7 +6,6 @@
#include <cstdint>

#include <tuple>
#include <type_traits>

#include "delayed_sequence.h"
#include "parallel.h"
@@ -80,8 +79,7 @@ void random_shuffle_(slice<InIterator, InIterator> In,
// first randomly sorts based on random values [0,num_buckets)
sequence<size_t> bucket_offsets;
bool single;
std::tie(bucket_offsets, single)
= count_sort<std::true_type, std::true_type>(
std::tie(bucket_offsets, single) = count_sort<uninitialized_copy_tag>(
make_slice(In), make_slice(Out), make_slice(get_pos), num_buckets);

// now sequentially randomly shuffle within each bucket
@@ -102,12 +100,8 @@
template <PARLAY_RANGE_TYPE R>
auto random_shuffle(const R& In, random r = random()) {
using T = range_value_type_t<R>;
// We have to make a redundant copy here due to a const
// correctness bug inside counting sort.
// TODO: Fix this
auto InCopy = to_sequence(In);
auto Out = sequence<T>::uninitialized(In.size());
internal::random_shuffle_(make_slice(InCopy), make_slice(Out), r);
internal::random_shuffle_(make_slice(In), make_slice(Out), r);
return Out;
}
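
// Example (illustration only, not part of this diff): shuffling a freshly
// built sequence with the default-seeded parlay::random. shuffle_example
// is a hypothetical name.
inline sequence<int> shuffle_example() {
  auto xs = sequence<int>::from_function(100, [](size_t i) { return static_cast<int>(i); });
  return random_shuffle(xs);
}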

62 changes: 46 additions & 16 deletions include/parlay/sequence.h
@@ -42,10 +42,12 @@
#include "parallel.h"
#include "range.h"
#include "slice.h"
#include "type_traits.h" // IWYU pragma: keep // for is_trivially_relocatable
#include "utilities.h"

#ifdef PARLAY_DEBUG_UNINITIALIZED
#include "internal/debug_uninitialized.h"
#endif // PARLAY_DEBUG_UNINITIALIZED
#endif

namespace parlay {

@@ -467,10 +469,7 @@ struct _sequence_base {
auto n = size();
auto dest_buffer = new_buffer.data();
auto current_buffer = data();
parallel_for(0, n, [&](size_t i) {
initialize_explicit(dest_buffer + i, std::move(current_buffer[i]));
current_buffer[i].~value_type();
});
uninitialized_relocate_n_a(dest_buffer, current_buffer, n, *this);

// Destroy the old stuff
if (!is_small()) {
@@ -545,8 +544,17 @@ class sequence : protected _sequence_base<T, Allocator> {
return *this;
}

// destroys all elements and frees all memory
~sequence() { };
#ifdef PARLAY_DEBUG_UNINITIALIZED
// If uninitialized memory debugging is turned on, make sure that
// each object of type UninitializedTracker has been initialized
// since we are about to run their destructors!
~sequence() {
auto buffer = impl.data();
parallel_for(0, size(), [&](size_t i) {
PARLAY_ASSERT_INITIALIZED(buffer[i]);
});
}
#endif

iterator begin() { return impl.data(); }
iterator end() { return impl.data() + impl.size(); }
@@ -920,7 +928,7 @@ class sequence : protected _sequence_base<T, Allocator> {
//
// Initializing non-trivial elements must be done using an
// uninitialized assignment (see e.g., std::uninitialized_copy
// or std::uninitialized_move). Ordinary assingment will trigger
// or std::uninitialized_move). Ordinary assignment will trigger
// the destructor of the uninitialized contents which is bad.
static sequence<value_type> uninitialized(size_t n) {
return sequence<value_type>(n, _uninitialized_tag{});
@@ -935,12 +943,7 @@ class sequence : protected _sequence_base<T, Allocator> {
// generated by f(0), f(1), ..., f(n-1)
template<typename F>
static sequence<value_type> from_function(size_t n, F&& f) {
auto s = sequence<value_type>::uninitialized(n);
auto buffer = s.data();
parallel_for(0, n, [&](size_t i) {
s.impl.initialize(&buffer[i], f(i));
});
return s;
return sequence<value_type>(n, std::forward<F>(f), _from_function_tag());
}

#ifdef _MSC_VER
@@ -950,20 +953,36 @@
private:

struct _uninitialized_tag { };
struct _from_function_tag { };

sequence(size_t n, _uninitialized_tag) : _sequence_base<T, Allocator>() {
impl.initialize_capacity(n);
impl.set_size(n);

// If uninitialized memory debugging is turned on, make sure that
// a buffer of UninitializedTracker is appropriately set to its
// uninitialized state by creating and immediately destroying.
#ifdef PARLAY_DEBUG_UNINITIALIZED
if constexpr (std::is_same_v<value_type, debug_uninitialized>) {
if constexpr (std::is_same_v<value_type, internal::UninitializedTracker>) {
auto buffer = impl.data();
parallel_for(0, n, [&](size_t i) {
buffer[i].x = -1;
buffer[i].initialized = false;
PARLAY_ASSERT_UNINITIALIZED(buffer[i]);
});
}
#endif
}

template<typename F>
sequence(size_t n, F&& f, _from_function_tag) : _sequence_base<T, Allocator>() {
impl.initialize_capacity(n);
impl.set_size(n);
auto buffer = impl.data();
parallel_for(0, n, [&](size_t i) {
impl.initialize(&buffer[i], f(i));
});
}

// Implement initialize_default manually rather than
// calling initialize_fill(n, value_type()) because
// this allows us to store a sequence of uncopyable
@@ -1164,6 +1183,17 @@ inline auto to_sequence(R&& r) -> sequence<range_value_type_t<R>> {
return {std::begin(r), std::end(r)};
}

// Mark sequences as trivially relocatable. A sequence is always
// trivially relocatable as long as the allocator is, because:
// 1) Sequences only use small-size optimization when the element
// type is trivial, so the buffer of trivial elements is
// trivially relocatable.
// 2) Sequences that are not small-size optimized are just a
// pointer/length pair, which is trivially relocatable.
template<typename T, typename Alloc>
struct is_trivially_relocatable<parlay::sequence<T, Alloc>>
: std::bool_constant<is_trivially_relocatable_v<Alloc>> {};

} // namespace parlay

namespace std {
175 changes: 175 additions & 0 deletions include/parlay/type_traits.h
@@ -0,0 +1,175 @@
// Useful type traits used mostly internally by Parlay
//
// Many inspired by this video, and the following standards
// proposals:
// - https://www.youtube.com/watch?v=MWBfmmg8-Yo
// - http://open-std.org/JTC1/SC22/WG21/docs/papers/2014/n4034.pdf
// - https://quuxplusone.github.io/blog/code/object-relocation-in-terms-of-move-plus-destroy-draft-7.html
//
// Includes:
// - priority_tag
// - is_contiguous_iterator / is_random_access_iterator
// - is_trivial_allocator
// - is_trivially_relocatable / is_nothrow_relocatable
//

#ifndef PARLAY_TYPE_TRAITS_H
#define PARLAY_TYPE_TRAITS_H

#include <cstddef>

#include <iterator>
#include <memory>
#include <type_traits>

namespace parlay {

/* --------------------- Priority tags. -------------------------
Priority tags are an easy way to force template resolution to
pick the "best" option in the presence of multiple valid
choices. They work because priority_tag<K> is a subtype of
priority_tag<K-1>, and template resolution will always pick
the most specialised option when faced with a choice, so it
will prefer priority_tag<K> over priority_tag<K-1>.
*/

template<size_t K>
struct priority_tag : priority_tag<K-1> {};

template<>
struct priority_tag<0> {};
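
// Example (illustration only, not part of this diff): tag-based dispatch
// that prefers a .size() member when it exists and falls back to
// std::distance otherwise. range_size_example is a hypothetical helper.
template<typename R>
auto range_size_example(const R& r, priority_tag<1>) -> decltype(r.size()) {
  return r.size();  // chosen whenever r.size() is well-formed
}

template<typename R>
auto range_size_example(const R& r, priority_tag<0>) {
  return std::distance(std::begin(r), std::end(r));  // generic fallback
}

template<typename R>
auto range_size_example(const R& r) {
  return range_size_example(r, priority_tag<1>());  // start at the highest priority
}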

/* --------------------- Contiguous iterators -----------------------
An iterator is a contiguous iterator if it points to memory that
is contiguous. More specifically, it means that given an iterator
a, and an index n such that a+n is a valid, dereferenceable iterator,
then *(a + n) is equivalent to *(std::addressof(*a) + n).
C++20 will introduce a concept for detecting contiguous iterators.
Until then, we just do the conservative thing and deduce that any
iterators represented as pointers are contiguous.
We also supply a convenient trait for checking whether an iterator
is a random access iterator. This can be done using current C++,
but is too verbose to type frequently, so we shorten it here.
*/

template<typename It>
struct is_contiguous_iterator : std::is_pointer<It> {};

template<typename It>
inline constexpr bool is_contiguous_iterator_v = is_contiguous_iterator<It>::value;

template<typename It>
struct is_random_access_iterator : std::bool_constant<
std::is_base_of_v<std::random_access_iterator_tag, typename std::iterator_traits<It>::iterator_category>> {};

template<typename It>
inline constexpr bool is_random_access_iterator_v = is_random_access_iterator<It>::value;

/* ----------------- Trivial allocators. ---------------------
Allocator-aware containers and algorithms need to know whether
they can construct/destruct objects directly inside memory given
to them by an allocator, or whether the allocator has custom
behaviour. Since some optimizations require us to circumvent
custom allocator behaviour, we need to detect when an allocator
has no such custom behaviour.
Specifically, an allocator-aware algorithm must construct objects
inside memory returned by an allocator by writing
std::allocator_traits<allocator_type>::construct(allocator, p, args);
if the allocator type defines a method .construct, then this results
in forwarding the construction to that method. Otherwise, this just
results in a call to
new (p) T(std::forward<Args>(args)...)
If we wish to circumvent calling the constructor, for example,
for a trivially relocatable type in which we would prefer to
copy directly via memcpy, we must ensure that the allocator
does not have a custom .construct method. Otherwise, we
cannot optimize, and must continue to use the allocator's own
construct method.
The same discussion is true for destruction as well.
See https://www.youtube.com/watch?v=MWBfmmg8-Yo for more info.
*/

namespace internal {

// Detect the existence of the .destroy method of the type Alloc
template<typename Alloc, typename T>
auto trivial_allocator(Alloc& a, T *p, priority_tag<2>)
-> decltype(void(a.destroy(p)), std::false_type());

// Detect the existence of the .construct method of the type Alloc
template<typename Alloc, typename T>
auto trivial_allocator(Alloc& a, T *p, priority_tag<1>)
-> decltype(void(a.construct(p, std::declval<T&&>())), std::false_type());

// By default, if no .construct or .destroy methods are found, assume
// that the allocator is trivial
template<typename Alloc, typename T>
auto trivial_allocator(Alloc& a, T* p, priority_tag<0>)
-> std::true_type;

} // namespace internal

template<typename Alloc, typename T>
struct is_trivial_allocator
: decltype(internal::trivial_allocator<Alloc, T>(std::declval<Alloc&>(), nullptr, priority_tag<2>())) {};

template<typename Alloc, typename T>
inline constexpr bool is_trivial_allocator_v = is_trivial_allocator<Alloc, T>::value;

// Manually specialize std::allocator since it is trivial, but
// some (maybe all?) implementations still provide a .construct
// and .destroy method anyway.
template<typename T>
struct is_trivial_allocator<std::allocator<T>, T> : std::true_type {};

/* ----------------- Trivially relocatable. ---------------------
A type T is called trivially relocatable if, given a pointer
p to an object of type T, and a pointer q to uninitialized
memory large enough for an object of type T, then
new (q) T(std::move(*p));
p->~T();
is equivalent to
std::memcpy(q, p, sizeof(T));
Any type that is trivially move constructible and trivially
destructible is therefore trivially relocatable. User-defined
types that are not obviously trivially relocatable can be
annotated as such by specializing the is_trivially_relocatable
type.
See proposal D1144R0 for copious details:
https://quuxplusone.github.io/blog/code/object-relocation-in-terms-of-move-plus-destroy-draft-7.html
*/

template <typename T>
struct is_trivially_relocatable :
std::bool_constant<std::is_trivially_move_constructible<T>::value &&
std::is_trivially_destructible<T>::value> { };

template <typename T> struct is_nothrow_relocatable :
std::bool_constant<is_trivially_relocatable<T>::value ||
(std::is_nothrow_move_constructible<T>::value &&
std::is_nothrow_destructible<T>::value)> { };

template<typename T>
inline constexpr bool is_trivially_relocatable_v = is_trivially_relocatable<T>::value;

template<typename T>
inline constexpr bool is_nothrow_relocatable_v = is_nothrow_relocatable<T>::value;
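
// Example (illustration only, not part of this diff): a type whose move
// constructor is non-trivial (it nulls out the source) but which is still
// safe to relocate bytewise, so it opts in explicitly. example_handle is
// hypothetical and exists only to illustrate the annotation.
struct example_handle {
  int* p = nullptr;
  example_handle() = default;
  example_handle(example_handle&& other) noexcept : p(other.p) { other.p = nullptr; }
  ~example_handle() { delete p; }
};

template<>
struct is_trivially_relocatable<example_handle> : std::true_type {};

static_assert(is_trivially_relocatable_v<example_handle>);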

} // namespace parlay

#endif //PARLAY_TYPE_TRAITS_H
247 changes: 194 additions & 53 deletions include/parlay/utilities.h
@@ -6,17 +6,20 @@
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>

#include <algorithm>
#include <atomic>
#include <iterator>
#include <memory>
#include <new>
#include <type_traits>
#include <utility>

#include "parallel.h"
#include "type_traits.h"

#ifdef PARLAY_DEBUG_UNINITIALIZED
#include "internal/debug_uninitialized.h"
#endif // PARLAY_DEBUG_UNINITIALIZED

namespace parlay {

@@ -62,37 +65,23 @@ const flags fl_inplace = 16;

template <typename T>
inline void assign_uninitialized(T& a, const T& b) {
#ifdef PARLAY_DEBUG_UNINITIALIZED
if constexpr (std::is_same_v<T, debug_uninitialized>) {
// Ensure that a is uninitialized
assert(a.x == -1 && "Assigning to initialized memory with assign_uninitialized");
}
#endif // PARLAY_DEBUG_UNINITIALIZED
PARLAY_ASSERT_UNINITIALIZED(a);
new (static_cast<T*>(std::addressof(a))) T(b);
}

template <typename T>
inline auto assign_uninitialized(T& a, T&& b) -> typename std::enable_if_t<std::is_rvalue_reference_v<T&&>> {
#ifdef PARLAY_DEBUG_UNINITIALIZED
if constexpr (std::is_same_v<T, debug_uninitialized>) {
// Ensure that a is uninitialized
assert(a.x == -1 && "Assigning to initialized memory with assign_uninitialized");
}
#endif // PARLAY_DEBUG_UNINITIALIZED
PARLAY_ASSERT_UNINITIALIZED(a);
new (static_cast<T*>(std::addressof(a))) T(std::move(b)); // NOLINT: b is guaranteed to be an rvalue reference
}

template <typename T>
inline void move_uninitialized(T& a, T& b) {
#ifdef PARLAY_DEBUG_UNINITIALIZED
if constexpr (std::is_same_v<T, debug_uninitialized>) {
// Ensure that a is uninitialized
assert(a.x == -1 && "Assigning to initialized memory with move_uninitialized");
}
#endif // PARLAY_DEBUG_UNINITIALIZED
PARLAY_ASSERT_UNINITIALIZED(a);
new (static_cast<T*>(std::addressof(a))) T(std::move(b));
}

/* Hashing functions for various integer types */

// a 32-bit hash function
inline uint32_t hash32(uint32_t a) {
@@ -143,6 +132,7 @@ inline uint64_t hash64_2(uint64_t x) {
return x;
}

/* Atomic write-add, write-min, and write-max */

template <typename T, typename EV>
inline void write_add(std::atomic<T>* a, EV b) {
@@ -191,66 +181,217 @@ inline size_t granularity(size_t n) {
return (n > 100) ? static_cast<size_t>(ceil(pow(n, 0.5))) : 100;
}

/* For inplace sorting / merging, we need to move values around
rather than making copies. We use tag dispatch to choose between
moving and copying, so that the move algorithm can be written
agnostic to which one it uses */
/* Relocation (a.k.a. "destructive move")
The relocation of object a into memory b is equivalent to a move
construction of a into b, followed by the destruction of what
remains at a. In other words, it is
new (b) T(std::move(*a));
a->~T();
For many types, however, this can be optimized by replacing it
with just
std::memcpy(b, a, sizeof(T));
We call any such types trivially relocatable. This is clearly
true for any trivial type, but it also turns out to be true
for most other standard types, such as vectors, unique_ptrs,
and more. The key observation is that the only reason the
move operations of these types are non-trivial is that
they must clear out the source object after moving from it.
If, however, the source object is treated as uninitialized
memory after relocation, then it does not have to be cleared
out, since its destructor will not run.
A strong motivating use case for relocation is in dynamically
sized containers (e.g. vector, or parlay::sequence). When
performing a resize operation, one has to move all of the
contents of the old buffer into the new one, and destroy
the contents of the old buffer, like so (ignoring allocator
details)
parallel_for(0, n, [&](size_t i) {
new (&new_buffer[i]) value_type(std::move(current_buffer[i]));
current_buffer[i].~value_type();
});
This can be replaced with
parallel_for(0, n, [&](size_t i) {
uninitialized_relocate(&new_buffer[i], &current_buffer[i]);
});
or even, for better performance still,
uninitialized_relocate_n(new_buffer, current_buffer, n);
The uninitialized_relocate functions will use the optimized
memcpy-based approach for any types for which it is suitable,
and otherwise, will fall back to the generic approach.
*/

// Relocate a single object into uninitialized memory, leaving
// the source memory uninitialized afterwards.
template<typename T>
inline void uninitialized_relocate(T* to, T* from) noexcept(is_nothrow_relocatable<T>::value) {
if constexpr (is_trivially_relocatable<T>::value) {
std::memcpy(to, from, sizeof(T));
}
else {
static_assert(std::is_move_constructible<T>::value);
static_assert(std::is_destructible<T>::value);
PARLAY_ASSERT_UNINITIALIZED(*to);
::new (to) T(std::move(*from));
from->~T();
PARLAY_ASSERT_UNINITIALIZED(*from);
}
}

// Relocate the given range of n elements [from, from + n) into uninitialized
// memory at [to, to + n), such that both the source and destination memory
// were allocated by the given allocator.
template<typename It1, typename It2, typename Alloc>
inline void uninitialized_relocate_n_a(It1 to, It2 from, size_t n, Alloc& alloc) {
using T = typename std::iterator_traits<It1>::value_type;
static_assert(std::is_same_v<typename std::iterator_traits<It2>::value_type, T>);
static_assert(std::is_same_v<typename std::allocator_traits<Alloc>::value_type, T>);

constexpr bool trivially_relocatable = is_trivially_relocatable_v<T>;
constexpr bool trivial_alloc = is_trivial_allocator_v<Alloc, T>;
constexpr bool contiguous = is_contiguous_iterator_v<It1> && is_contiguous_iterator_v<It2>;
constexpr bool random_access =
std::is_base_of_v<std::random_access_iterator_tag, typename std::iterator_traits<It1>::iterator_category> &&
std::is_base_of_v<std::random_access_iterator_tag, typename std::iterator_traits<It2>::iterator_category>;

// The most efficient scenario -- The objects are trivially relocatable, the allocator
// has no special behaviour, and the iterators point to contiguous memory so we can
// memcpy chunks of more than one T object at a time.
if constexpr (trivially_relocatable && contiguous && trivial_alloc) {
constexpr size_t chunk_size = 1024 * sizeof(size_t) / sizeof(T);
const size_t n_chunks = (n + chunk_size - 1) / chunk_size;
parallel_for(0, n_chunks, [&](size_t i) {
size_t n_objects = (std::min)(chunk_size, n - i * chunk_size);
size_t n_bytes = sizeof(T) * n_objects;
std::memcpy(to + i * chunk_size, from + i * chunk_size, n_bytes);
}, 1);
// The next best thing -- If the objects are trivially relocatable and the allocator
// has no special behaviour, so long as the iterators are random access, we can still
// relocate everything in parallel, just not by memcpying multiple objects at a time
} else if constexpr (trivially_relocatable && random_access && trivial_alloc) {
constexpr size_t chunk_size = 1024 * sizeof(size_t) / sizeof(T);
const size_t n_chunks = (n + chunk_size - 1) / chunk_size;
parallel_for(0, n_chunks, [&](size_t i) {
for (size_t j = 0; j < chunk_size && (j + i * chunk_size < n); j++) {
std::memcpy(&to[j + i * chunk_size], &from[j + i * chunk_size], sizeof(T));
}
}, 1);
}
// The iterators are not random access, but we can still relocate, just not in parallel
else if constexpr (trivially_relocatable && trivial_alloc) {
for (size_t i = 0; i < n; i++) {
std::memcpy(std::addressof(*to), std::addressof(*from), sizeof(T));
to++;
from++;
}
}
// After this point, the object cannot be trivially relocated, either because it is not
// trivially relocatable, or because the allocator has specialized behaviour. We now fall
// back to just doing a "destructive move" manually.
else if constexpr (random_access) {
static_assert(std::is_move_constructible<T>::value);
static_assert(std::is_destructible<T>::value);
parallel_for(0, n, [&](size_t i) {
PARLAY_ASSERT_UNINITIALIZED(to[i]);
std::allocator_traits<Alloc>::construct(alloc, std::addressof(to[i]), std::move(from[i]));
std::allocator_traits<Alloc>::destroy(alloc, std::addressof(from[i]));
PARLAY_ASSERT_UNINITIALIZED(from[i]);
});
}
// The worst case. No parallelism and no fast relocation.
else {
static_assert(std::is_move_constructible<T>::value);
static_assert(std::is_destructible<T>::value);
for (size_t i = 0; i < n; i++) {
PARLAY_ASSERT_UNINITIALIZED(*to);
std::allocator_traits<Alloc>::construct(alloc, std::addressof(*to), std::move(*from));
std::allocator_traits<Alloc>::destroy(alloc, std::addressof(*from));
PARLAY_ASSERT_UNINITIALIZED(*from);
to++;
from++;
}
}
}

// Relocate the given range of n elements [from, from + n) into uninitialized
// memory at [to, to + n). Relocation is done as if the memory was allocated
// by a standard allocator (e.g. std::allocator, parlay::allocator, malloc)
//
// For an allocator-aware version that respects the construction and destruction
// behaviour of the allocator, use uninitialized_relocate_n_a.
template<typename Iterator1, typename Iterator2>
inline void uninitialized_relocate_n(Iterator1 to, Iterator2 from, size_t n) {
using T = typename std::iterator_traits<Iterator1>::value_type;
std::allocator<T> a;
uninitialized_relocate_n_a(to, from, n, a);
}
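
// Example (illustration only, not part of this diff): growing a raw buffer
// by relocating its contents instead of move-construct + destroy. Ignores
// over-aligned types for brevity; grow_buffer_example is a hypothetical
// helper, not a library function.
template<typename T>
T* grow_buffer_example(T* old_buf, size_t n, size_t new_capacity) {
  T* new_buf = static_cast<T*>(::operator new(new_capacity * sizeof(T)));
  uninitialized_relocate_n(new_buf, old_buf, n);  // old_buf now holds only raw bytes
  ::operator delete(old_buf);                     // no destructors left to run
  return new_buf;
}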


struct move_tag {};
/* For inplace sorting / merging, we sometimes need to move values
around and sometimes we want to make copies. We use tag dispatch
to choose between moving and copying, so that the move algorithm
can be written agnostic to which one it uses. We also account for
moving / copying between uninitialized memory.
Usage:
assign_dispatch(dest, val, tag_type())
Tag types:
move_assign_tag: The value is moved assigned into dest from val
uninitialized_move_tag: The value is move constructed from val into dest
copy_assign_tag: The value is copy assigned into dest from val
uninitialized_copy_tag: The value is copy constructed from val into dest
uninitialized_relocate_tag: The value is destructively moved from val into dest
*/

struct move_assign_tag {};
struct uninitialized_move_tag {};
struct copy_tag {};
struct copy_assign_tag {};
struct uninitialized_copy_tag {};
struct uninitialized_relocate_tag {};

// Move dispatch -- move val into dest
template<typename T>
void assign_dispatch(T& val, T& dest, move_tag) {
void assign_dispatch(T& dest, T& val, move_assign_tag) {
dest = std::move(val);
}

// Copy dispatch -- copy val into dest
template<typename T>
void assign_dispatch(const T& val, T& dest, copy_tag) {
void assign_dispatch(T& dest, const T& val, copy_assign_tag) {
dest = val;
}

// Uninitialized move dispatch -- move construct dest with val
template<typename T>
void assign_dispatch(T& val, T& dest, uninitialized_move_tag) {
void assign_dispatch(T& dest, T& val, uninitialized_move_tag) {
assign_uninitialized(dest, std::move(val));
}

// Uninitialized copy dispatch -- copy initialize dest with val
template<typename T>
void assign_dispatch(const T& val, T& dest, uninitialized_copy_tag) {
void assign_dispatch(T& dest, const T& val, uninitialized_copy_tag) {
assign_uninitialized(dest, val);
}

/* Tag-dispatch-based assignment that selects between
initialized/uninitialized copies/moves.
param 3 = std::true_type if a copy should be made, otherwise moves
param 4 = std::true_type if the destination is uninitialized
*/

// Uninitialized relocate dispatch -- destructively move val into dest
template<typename T>
void assign_dispatch(T& dest, T& val, /* move */ std::false_type, /* initialized */ std::false_type) {
dest = std::move(val);
}

template<typename T>
void assign_dispatch(T& dest, T& val, /* move */ std::false_type, /* uninitialized */ std::true_type) {
assign_uninitialized(dest, std::move(val));
}

template<typename T>
void assign_dispatch(T& dest, const T& val, /* copy */ std::true_type, /* initialized */ std::false_type) {
dest = val;
void assign_dispatch(T& dest, T& val, uninitialized_relocate_tag) {
uninitialized_relocate(&dest, &val);
}

template<typename T>
void assign_dispatch(T& dest, const T& val, /* copy */ std::true_type, /* uninitialized */ std::true_type) {
assign_uninitialized(dest, val);
}
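
// Example (illustration only, not part of this diff): an element-wise
// assignment loop written once against assign_dispatch; the caller picks
// move, copy, or relocation by passing the tag. seq_assign_example is a
// hypothetical helper.
template<typename T, typename assignment_tag>
void seq_assign_example(T* dest, T* src, size_t n, assignment_tag tag) {
  for (size_t i = 0; i < n; i++) {
    assign_dispatch(dest[i], src[i], tag);
  }
}
// e.g. seq_assign_example(out, in, n, uninitialized_relocate_tag());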

} // namespace parlay

10 changes: 10 additions & 0 deletions test/CMakeLists.txt
@@ -17,12 +17,18 @@ add_dtests(NAME test_merge_sort FILES test_merge_sort.cpp LIBS parlay)
add_dtests(NAME test_quicksort FILES test_quicksort.cpp LIBS parlay)
add_dtests(NAME test_bucket_sort FILES test_bucket_sort.cpp LIBS parlay)
add_dtests(NAME test_integer_sort FILES test_integer_sort.cpp LIBS parlay)
add_dtests(NAME test_counting_sort FILES test_counting_sort.cpp LIBS parlay)
add_dtests(NAME test_sample_sort FILES test_sample_sort.cpp LIBS parlay)

# -------------------------------- Primitives ---------------------------------

add_dtests(NAME test_primitives FILES test_primitives.cpp LIBS parlay)
add_dtests(NAME test_random FILES test_random.cpp LIBS parlay)

# -------------------------- Uninitialized memory testing ---------------------------

add_dtests(NAME test_uninitialized_memory FILES test_uninitialized_memory.cpp LIBS parlay NO_SANITIZE)

# -------------------------------- IO ---------------------------------

add_dtests(NAME test_io FILES test_io.cpp LIBS parlay)
@@ -45,6 +51,10 @@ add_dtests(NAME test_seq_scheduling FILES test_seq_scheduling.cpp LIBS parlay)

add_dtests(NAME test_allocator FILES test_allocator.cpp LIBS parlay)

# ----------------------------- Utilities ------------------------------

add_dtests(NAME test_relocate FILES test_relocate.cpp LIBS parlay)

# --------------------- External scheduler integration tests -------------------

# Cilk integration test (requires a Cilk-compatible compiler)
124 changes: 124 additions & 0 deletions test/test_counting_sort.cpp
@@ -0,0 +1,124 @@
#include "gtest/gtest.h"

#include <algorithm>
#include <deque>
#include <memory>
#include <numeric>

#include <parlay/primitives.h>
#include <parlay/sequence.h>
#include <parlay/type_traits.h>
#include <parlay/utilities.h>

#include <parlay/internal/counting_sort.h>

#include "sorting_utils.h"

constexpr size_t num_buckets = 1 << 16;

TEST(TestCountingSort, TestCountingSort) {
auto s = parlay::tabulate(100000, [](unsigned long long i) -> unsigned long long {
return (50021 * i + 61) % num_buckets;
});
auto [sorted, offsets] = parlay::internal::count_sort(parlay::make_slice(s), [](auto x) { return x; }, num_buckets);
ASSERT_EQ(s.size(), sorted.size());
std::sort(std::begin(s), std::end(s));
ASSERT_EQ(s, sorted);
ASSERT_TRUE(std::is_sorted(std::begin(sorted), std::end(sorted)));
}

TEST(TestCountingSort, TestCountingSortUnstable) {
auto s = parlay::tabulate(100000, [](long long i) -> UnstablePair {
UnstablePair x;
x.x = (53 * i + 61) % num_buckets;
x.y = 0;
return x;
});
auto [sorted, offsets] = parlay::internal::count_sort(parlay::make_slice(s), [](const auto& x) -> unsigned long long {
return x.x;
}, num_buckets);
ASSERT_EQ(s.size(), sorted.size());
std::stable_sort(std::begin(s), std::end(s));
ASSERT_EQ(s, sorted);
ASSERT_TRUE(std::is_sorted(std::begin(sorted), std::end(sorted)));
}

TEST(TestCountingSort, TestCountingSortInplaceCustomKey) {
auto s = parlay::tabulate(100000, [](long long i) -> UnstablePair {
UnstablePair x;
x.x = (53 * i + 61) % (1 << 10);
x.y = 0;
return x;
});
auto s2 = s;
ASSERT_EQ(s, s2);
parlay::internal::count_sort_inplace(parlay::make_slice(s), [](const auto& x) -> unsigned long long {
return x.x;
}, num_buckets);
std::stable_sort(std::begin(s2), std::end(s2));
ASSERT_EQ(s, s2);
ASSERT_TRUE(std::is_sorted(std::begin(s), std::end(s)));
}

TEST(TestCountingSort, TestCountingSortInplaceUncopyable) {
auto s = parlay::tabulate(100000, [](int i) -> UncopyableThing {
return UncopyableThing((100000-i) % num_buckets);
});
auto s2 = parlay::tabulate(100000, [](int i) -> UncopyableThing {
return UncopyableThing((100000-i) % num_buckets);
});
ASSERT_EQ(s, s2);
parlay::internal::count_sort_inplace(parlay::make_slice(s), [](const auto& a) { return a.x; }, num_buckets);
std::sort(std::begin(s2), std::end(s2));
ASSERT_EQ(s, s2);
ASSERT_TRUE(std::is_sorted(std::begin(s), std::end(s)));
}

TEST(TestCountingSort, TestCountingSortInplaceNonContiguous) {
auto ss = parlay::tabulate(100000, [](long long i) -> long long {
return (50021 * i + 61) % num_buckets;
});
auto s = std::deque<long long>(ss.begin(), ss.end());
auto s2 = s;
ASSERT_EQ(s, s2);
parlay::internal::count_sort_inplace(parlay::make_slice(s), [](auto x) { return x; }, num_buckets);
std::sort(std::begin(s2), std::end(s2));
ASSERT_EQ(s, s2);
ASSERT_TRUE(std::is_sorted(std::begin(s), std::end(s)));
}

namespace parlay {
// Specialize is_trivially_relocatable so that std::unique_ptr is treated as trivially relocatable
template<typename T>
struct is_trivially_relocatable<std::unique_ptr<T>> : public std::true_type { };
}

TEST(TestCountingSort, TestCountingSortInplaceUniquePtr) {
auto s = parlay::tabulate(100000, [](size_t i) {
return std::make_unique<int>((50021 * i + 61) % num_buckets);
});
auto sorted = parlay::tabulate(100000, [](size_t i) {
return std::make_unique<int>((50021 * i + 61) % num_buckets);
});
std::sort(std::begin(sorted), std::end(sorted), [](const auto& p1, const auto& p2) {
return *p1 < *p2;
});
parlay::internal::count_sort_inplace(parlay::make_slice(s), [](const auto& p) { return *p; }, num_buckets);
ASSERT_EQ(s.size(), sorted.size());
for (size_t i = 0; i < 100000; i++) {
ASSERT_EQ(*s[i], *sorted[i]);
}
}

TEST(TestCountingSort, TestCountingSortInplaceSelfReferential) {
auto s = parlay::tabulate(100000, [](int i) -> SelfReferentialThing {
return SelfReferentialThing(i % num_buckets);
});
auto s2 = parlay::tabulate(100000, [](int i) -> SelfReferentialThing {
return SelfReferentialThing(i % num_buckets);
});
ASSERT_EQ(s, s2);
parlay::internal::count_sort_inplace(parlay::make_slice(s), [](const auto& p) { return p.x; }, num_buckets);
std::stable_sort(std::begin(s2), std::end(s2));
ASSERT_EQ(s, s2);
ASSERT_TRUE(std::is_sorted(std::begin(s), std::end(s)));
}
176 changes: 174 additions & 2 deletions test/test_delayed_sequence.cpp
@@ -1,10 +1,182 @@
#include "gtest/gtest.h"

#include <algorithm>
#include <memory>
#include <numeric>
#include <vector>

#include <parlay/delayed_sequence.h>

TEST(TestDelayedSequence, TestConstruction) {
auto s = parlay::delayed_seq<int>(1000, [](int i) { return i; });
ASSERT_EQ(s.size(), 1000);
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
ASSERT_EQ(s.size(), 100000);
}

struct MyFunctor {
std::unique_ptr<int> f;
MyFunctor(int _f) : f(std::make_unique<int>(_f)) { }
MyFunctor(const MyFunctor& other) : f(std::make_unique<int>(*(other.f))) {}
MyFunctor& operator=(const MyFunctor& other) {
f = std::make_unique<int>(*(other.f));
return *this;
}
MyFunctor(MyFunctor&& other) : f(std::move(other.f)) { }
MyFunctor& operator=(MyFunctor&& other) {
f = std::move(other.f);
return *this;
}
int operator()(int i) const { return *f*i; }
};

TEST(TestDelayedSequence, TestCopyConstruct) {
auto s = parlay::delayed_seq<int>(100000, MyFunctor(1));
auto s2 = s;
ASSERT_EQ(s2.size(), s.size());
ASSERT_TRUE(std::equal(std::begin(s), std::end(s), std::begin(s2)));
}

TEST(TestDelayedSequence, TestMoveConstruct) {
auto s = parlay::delayed_seq<int>(100000, MyFunctor(1));
auto s2 = std::move(s);
ASSERT_EQ(s2.size(), 100000);
for (size_t i = 0; i < 100000; i++) {
ASSERT_EQ(s2[i], i);
}
}

TEST(TestDelayedSequence, TestCopyAssign) {
auto s = parlay::delayed_seq<int>(100000, MyFunctor(1));
ASSERT_EQ(s.size(), 100000);
for (size_t i = 0; i < 10000; i++) {
ASSERT_EQ(s[i], i);
}
s = parlay::delayed_seq<int>(200000, MyFunctor(2));
ASSERT_EQ(s.size(), 200000);
for (size_t i = 0; i < 20000; i++) {
ASSERT_EQ(s[i], i*2);
}
}

TEST(TestDelayedSequence, TestMoveAssign) {
auto s = parlay::delayed_seq<int>(100000, MyFunctor(1));
for (size_t i = 0; i < 10000; i++) {
ASSERT_EQ(s[i], i);
}
auto s2 = parlay::delayed_seq<int>(200000, MyFunctor(2));
s = std::move(s2);
ASSERT_EQ(s.size(), 200000);
for (size_t i = 0; i < 20000; i++) {
ASSERT_EQ(s[i], i*2);
}
}

// I don't know why you would ever want to do this but
// hey, it's possible
TEST(TestDelayedSequence, TestLambdaCapture) {
auto v = std::vector<int>(100000);
std::iota(std::begin(v), std::end(v), 0);
auto s = parlay::delayed_seq<int>(100000, [v = std::move(v)](size_t i) {
return v[i];
});
ASSERT_EQ(s.size(), 100000);
for (size_t i = 0; i < 10000; i++) {
ASSERT_EQ(s[i], i);
}
}

TEST(TestDelayedSequence, TestAsInputIterator) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
std::vector<int> v;
std::copy(std::begin(s), std::end(s), std::back_inserter(v));
ASSERT_TRUE(std::equal(std::begin(s), std::end(s), std::begin(v)));
}

TEST(TestDelayedSequence, TestForwardIterator) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
size_t i = 0;
for (auto it = s.begin(); it != s.end(); it++) {
ASSERT_EQ(*it, i++);
}
ASSERT_EQ(i, 100000);
}

TEST(TestDelayedSequence, TestBackwardIterator) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
size_t i = 100000;
for (auto it = s.end(); it != s.begin(); it--) {
ASSERT_EQ(*(it-1), --i);
}
ASSERT_EQ(i, 0);
}

TEST(TestDelayedSequence, TestAsReverseIterator) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
std::vector<int> v;
std::copy(std::rbegin(s), std::rend(s), std::back_inserter(v));
ASSERT_TRUE(std::equal(std::begin(s), std::end(s), std::rbegin(v)));
}

TEST(TestDelayedSequence, TestAsRandomAccess) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
auto found = std::binary_search(std::begin(s), std::end(s), 49998);
ASSERT_TRUE(found);
auto it = std::lower_bound(std::begin(s), std::end(s), 49998);
ASSERT_NE(it, std::end(s));
ASSERT_EQ(*it, 49998);
}

TEST(TestDelayedSequence, TestSubscript) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
for (size_t i = 0; i < 100000; i++) {
ASSERT_EQ(i, s[i]);
}
}

TEST(TestDelayedSequence, TestAt) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
for (size_t i = 0; i < 100000; i++) {
ASSERT_EQ(i, s.at(i));
}
}

TEST(TestDelayedSequence, TestFront) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
ASSERT_EQ(s.front(), 0);
}

TEST(TestDelayedSequence, TestBack) {
auto s = parlay::delayed_seq<int>(100000, [](size_t i) -> int { return i; });
ASSERT_EQ(s.back(), 99999);
}

// Delayed sequences can be used to return references, so that
// they will not make copies of the things they refer to.
TEST(TestDelayedSequence, TestDelayedSequenceOfReferences) {
std::vector<std::unique_ptr<int>> v;
for (size_t i = 0; i < 100000; i++) {
v.emplace_back(std::make_unique<int>(i));
}
auto s = parlay::delayed_seq<const std::unique_ptr<int>&>(100000, [&](size_t i) -> const std::unique_ptr<int>& {
return v[i];
});
for (size_t i = 0; i < 100000; i++) {
const auto& si = s[i];
ASSERT_EQ(*v[i], *si);
}
}

// Delayed sequences can be used to return references, and
// they can even be mutable references, so we can modify
// the underlying source!
TEST(TestDelayedSequence, TestDelayedSequenceOfMutableReferences) {
std::vector<std::unique_ptr<int>> v;
for (size_t i = 0; i < 100000; i++) {
v.emplace_back(std::make_unique<int>(i));
}
auto s = parlay::delayed_seq<int&>(100000, [&](size_t i) -> int& {
return *v[i];
});
for (size_t i = 0; i < 100000; i++) {
s[i]++;
ASSERT_EQ(*v[i], i+1);
}
}